med-pdf-nmo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +298 -0
- package/README.ru.md +298 -0
- package/dist/bm25.d.ts +47 -0
- package/dist/bm25.js +86 -0
- package/dist/browser-shims/buffer.d.ts +30 -0
- package/dist/browser-shims/buffer.js +31 -0
- package/dist/browser-shims/crypto.d.ts +33 -0
- package/dist/browser-shims/crypto.js +45 -0
- package/dist/browser-shims/fs-promises.d.ts +13 -0
- package/dist/browser-shims/fs-promises.js +25 -0
- package/dist/browser-shims/fs.d.ts +14 -0
- package/dist/browser-shims/fs.js +24 -0
- package/dist/browser-shims/globals.d.ts +9 -0
- package/dist/browser-shims/globals.js +23 -0
- package/dist/browser-shims/path.d.ts +57 -0
- package/dist/browser-shims/path.js +65 -0
- package/dist/browser-shims/process.d.ts +22 -0
- package/dist/browser-shims/process.js +27 -0
- package/dist/browser.d.ts +9 -0
- package/dist/browser.js +12 -0
- package/dist/chunk.d.ts +15 -0
- package/dist/chunk.js +76 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +87 -0
- package/dist/index.d.ts +82 -0
- package/dist/index.js +51 -0
- package/dist/med-pdf-nmo.browser.js +40413 -0
- package/dist/med-pdf-nmo.browser.mjs +40395 -0
- package/dist/normalize.d.ts +73 -0
- package/dist/normalize.js +477 -0
- package/dist/pdf.d.ts +35 -0
- package/dist/pdf.js +396 -0
- package/dist/predictor/config.d.ts +28 -0
- package/dist/predictor/config.js +26 -0
- package/dist/predictor/constants.d.ts +3 -0
- package/dist/predictor/constants.js +59 -0
- package/dist/predictor/runtime.d.ts +15 -0
- package/dist/predictor/runtime.js +59 -0
- package/dist/predictor/scorers/biomedical-symbols.d.ts +36 -0
- package/dist/predictor/scorers/biomedical-symbols.js +347 -0
- package/dist/predictor/scorers/coordinate-table.d.ts +82 -0
- package/dist/predictor/scorers/coordinate-table.js +1210 -0
- package/dist/predictor/scorers/direction.d.ts +71 -0
- package/dist/predictor/scorers/direction.js +345 -0
- package/dist/predictor/scorers/drug-dose.d.ts +6 -0
- package/dist/predictor/scorers/drug-dose.js +221 -0
- package/dist/predictor/scorers/exact-answer.d.ts +10 -0
- package/dist/predictor/scorers/exact-answer.js +75 -0
- package/dist/predictor/scorers/fibrosis-stage.d.ts +6 -0
- package/dist/predictor/scorers/fibrosis-stage.js +103 -0
- package/dist/predictor/scorers/focused.d.ts +40 -0
- package/dist/predictor/scorers/focused.js +204 -0
- package/dist/predictor/scorers/frequency.d.ts +10 -0
- package/dist/predictor/scorers/frequency.js +203 -0
- package/dist/predictor/scorers/numeric.d.ts +77 -0
- package/dist/predictor/scorers/numeric.js +1161 -0
- package/dist/predictor/scorers/recommendation-item.d.ts +27 -0
- package/dist/predictor/scorers/recommendation-item.js +469 -0
- package/dist/predictor/scorers/search.d.ts +41 -0
- package/dist/predictor/scorers/search.js +515 -0
- package/dist/predictor/selection.d.ts +30 -0
- package/dist/predictor/selection.js +370 -0
- package/dist/predictor/text-utils.d.ts +49 -0
- package/dist/predictor/text-utils.js +497 -0
- package/dist/predictor/types.d.ts +23 -0
- package/dist/predictor/types.js +1 -0
- package/dist/predictor.d.ts +52 -0
- package/dist/predictor.js +3834 -0
- package/package.json +82 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
 * Matches the direction (increase/decrease) expressed in the question against
 * the local context of the found answer option and penalizes the opposite
 * polarity.
 *
 * @param options Scoring inputs; every field is currently typed as `any`.
 * @returns The score adjustment together with the evidence record backing it.
 */
export declare function polarityAdjustment(options: {
    pages: any;
    topQuestionPages: any;
    mode: any;
    question: any;
    questionTokens: any;
    answer: any;
}): {
    adjustment: number;
    evidence: any;
};
|
|
16
|
+
/**
 * Distinguishes daytime and night-time cues for single-mode questions and
 * penalizes an option whose time of day is the opposite of the one found near
 * the question focus.
 *
 * @param options Scoring inputs; every field is currently typed as `any`.
 * @returns Supporting evidence (if any), the score adjustment, and the
 *          evidence record that justifies a penalty.
 */
export declare function temporalCueAdjustment(options: {
    mode: any;
    pages: any;
    topQuestionPages: any;
    answer: any;
    focusTokens: any;
    questionTokens: any;
}): {
    support: any;
    adjustment: number;
    evidence: any;
};
|
|
36
|
+
/**
 * Penalizes a multi-mode option whose strongest evidence contains the opposite
 * cue (upper/lower, increase/decrease, distal/proximal order) or an opposite
 * modifier attached to the same target word.
 *
 * @param options Scoring mode and the answer option being judged.
 * @param evidence Evidence records already collected for the option.
 * @returns The score adjustment and the evidence record explaining it.
 */
export declare function contrastCueMismatchAdjustment(options: {
    mode: any;
    answer: any;
}, evidence: any): {
    adjustment: number;
    evidence: {
        answerId: any;
        page: any;
        text: any;
        score: number;
        kind: string;
    };
};
|
|
54
|
+
/**
 * Penalizes an option whose local evidence refers to the excluded subgroup
 * (`при X`, i.e. "with X", next to the answer phrase) when the question is
 * explicitly about `без X` ("without X").
 *
 * @param options Scoring mode, the question text and the answer option.
 * @param evidence Evidence records already collected for the option.
 * @returns The score adjustment and the evidence record explaining it.
 */
export declare function excludedConditionMismatchAdjustment(options: {
    mode: any;
    question: any;
    answer: any;
}, evidence: any): {
    adjustment: number;
    evidence: {
        answerId: any;
        page: any;
        text: any;
        score: number;
        kind: string;
    };
};
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
import { coverage, normalizeForSearch, tokenize, uniqueTokens } from "../../normalize.js";
|
|
2
|
+
import { FOCUS_STOPWORDS } from "../constants.js";
|
|
3
|
+
import { answerSearchPhrases, betterEvidence, cachedLineWindowSegments, containsNormalizedPhrase, evidenceSnippet, findPhraseOccurrences, nearestCueName, pageWindow, tokenHitCount, tokenizeNormalized, } from "../text-utils.js";
|
|
4
|
+
import { latinAnswerTokens } from "./biomedical-symbols.js";
|
|
5
|
+
// Russian word stems that signal an upward direction (raise, increase, grow,
// high, more, above). Each stem goes through normalizeForSearch once at load.
const POLARITY_UP_CUES = [
    "повыш",
    "увелич",
    "возраста",
    "рост",
    "высок",
    "более",
    "выше",
].map((stem) => normalizeForSearch(stem));
// Russian word stems that signal a downward direction (reduce, decrease, low,
// less, below), normalized the same way.
const POLARITY_DOWN_CUES = [
    "сниж",
    "уменьш",
    "низк",
    "менее",
    "ниже",
].map((stem) => normalizeForSearch(stem));
|
|
7
|
+
/**
 * Classifies a text as pointing "up", "down", or neither.
 *
 * Explicit comparative phrases are checked first ("less high"/"below" win over
 * "more high"/"above"); otherwise the text must contain cues from exactly one
 * of the two stem lists.
 *
 * @param text Raw text to inspect.
 * @returns "up", "down", or null when there is no cue or the cues conflict.
 */
function detectPolarity(text) {
    const haystack = normalizeForSearch(text);
    const hasPhrase = (phrase) => containsNormalizedPhrase(haystack, phrase);
    // The "down" phrases are tested before the "up" ones, so a text containing
    // both kinds resolves to "down".
    if (["менее высокий", "менее высок", "ниже"].some((phrase) => hasPhrase(phrase))) {
        return "down";
    }
    if (["более высокий", "более высок", "выше"].some((phrase) => hasPhrase(phrase))) {
        return "up";
    }
    const mentionsUp = POLARITY_UP_CUES.some((cue) => haystack.includes(cue));
    const mentionsDown = POLARITY_DOWN_CUES.some((cue) => haystack.includes(cue));
    // Both present or both absent: the direction is ambiguous.
    if (mentionsUp === mentionsDown) {
        return null;
    }
    return mentionsUp ? "up" : "down";
}
|
|
23
|
+
/**
 * Finds the polarity cue that ends up closest to `hit` within the 140
 * characters preceding it.
 *
 * @param pageNorm Page text in which `hit` is an offset.
 * @param hit Offset of the matched phrase inside `pageNorm`.
 * @returns "up" or "down" for the right-most cue found, or null when the
 *          look-behind window holds no cue. On an exact tie "up" is kept.
 */
function nearestPolarityBefore(pageNorm, hit) {
    const lookBehind = pageNorm.slice(Math.max(0, hit - 140), hit);
    let closestType = null;
    let closestAt = -1;
    const cueGroups = [
        ["up", POLARITY_UP_CUES],
        ["down", POLARITY_DOWN_CUES],
    ];
    for (const [type, cues] of cueGroups) {
        for (const cue of cues) {
            const at = lookBehind.lastIndexOf(cue);
            // Strictly greater: an earlier group keeps the win on equal offsets.
            if (at > closestAt) {
                closestAt = at;
                closestType = type;
            }
        }
    }
    return closestType;
}
|
|
38
|
+
/**
 * Matches the direction (increase/decrease) expressed in the question against
 * the direction cue found just before each occurrence of the answer option in
 * the page text, and penalizes the opposite polarity.
 *
 * The target polarity comes from the question, falling back to the answer
 * text. Up to 14 search phrases derived from the answer are located on the
 * pages; an occurrence counts only when its 180-character window covers at
 * least 16% of the question tokens and a polarity cue precedes it.
 *
 * @param pages Pages to scan; `page`, `normalized` and `text` are read.
 * @param topQuestionPages Optional collection exposing `size` and `has`; when
 *        non-empty, pages not contained in it are skipped.
 * @param mode "single" applies the stronger mismatch penalty.
 * @param question Question text.
 * @param questionTokens Question tokens used for the coverage check.
 * @param answer Answer option; `id` and `text` are read.
 * @returns `{ adjustment, evidence }`: +2.4 with the best matching evidence,
 *          -5.2 (single) or -2.4 (otherwise) with the best mismatching
 *          evidence, or 0 and null when nothing decisive was found.
 */
export function polarityAdjustment({ pages, topQuestionPages, mode, question, questionTokens, answer }) {
    const targetPolarity = detectPolarity(question) ?? detectPolarity(answer.text);
    if (!targetPolarity)
        return { adjustment: 0, evidence: null };
    // Phrase eligibility does not depend on the page, so it is decided once
    // here instead of being re-normalized for every page.
    const phrases = [...new Set([...latinAnswerTokens(answer.text), ...answerSearchPhrases(answer.text)])]
        .slice(0, 14)
        .filter((phrase) => {
            const normalizedPhrase = normalizeForSearch(phrase);
            return Boolean(normalizedPhrase) && normalizedPhrase.length >= 3;
        });
    let bestMatch = null;
    let bestMismatch = null;
    for (const page of pages) {
        if (topQuestionPages?.size && !topQuestionPages.has(page.page))
            continue;
        const pageNorm = page.normalized;
        for (const phrase of phrases) {
            const hits = findPhraseOccurrences(pageNorm, phrase, { textIsNormalized: true });
            for (const hit of hits) {
                const local = pageWindow(page, hit, 180);
                const questionCoverage = coverage(questionTokens, tokenizeNormalized(local));
                // Ignore occurrences whose surroundings barely relate to the question.
                if (questionCoverage < 0.16)
                    continue;
                const found = nearestPolarityBefore(pageNorm, hit);
                if (!found)
                    continue;
                const matches = found === targetPolarity;
                const evidence = {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, phrase),
                    score: (matches ? 4.8 : 4.2) + questionCoverage * 5.0,
                    kind: matches ? "polarity_match" : "polarity_mismatch",
                };
                if (matches)
                    bestMatch = betterEvidence(bestMatch, evidence);
                else
                    bestMismatch = betterEvidence(bestMismatch, evidence);
            }
        }
    }
    // A side wins only when it beats the other by a 0.3 margin.
    if (bestMatch && (!bestMismatch || bestMatch.score >= bestMismatch.score + 0.3))
        return { adjustment: 2.4, evidence: bestMatch };
    if (bestMismatch && (!bestMatch || bestMismatch.score > bestMatch.score + 0.3)) {
        return { adjustment: mode === "single" ? -5.2 : -2.4, evidence: bestMismatch };
    }
    return { adjustment: 0, evidence: null };
}
|
|
87
|
+
/**
 * Detects a time-of-day cue in a text.
 *
 * @param text Raw text to inspect.
 * @returns "night" when the night stem is present (checked first), "day" when
 *          a daytime stem is present, otherwise null.
 */
function temporalCue(text) {
    const haystack = normalizeForSearch(text);
    if (containsNormalizedPhrase(haystack, "ноч")) {
        return "night";
    }
    const mentionsDay = containsNormalizedPhrase(haystack, "днем") || containsNormalizedPhrase(haystack, "днев");
    return mentionsDay ? "day" : null;
}
|
|
95
|
+
/**
 * Asks `nearestCueName` which time-of-day cue ("night" or "day") applies to
 * the given text.
 *
 * @param local Text handed to `nearestCueName` unchanged.
 * @returns Whatever `nearestCueName` yields for the night/day cue table.
 */
function nearestTemporalCue(local) {
    const cueTable = [
        ["night", ["ноч"]],
        ["day", ["днем", "днев"]],
    ];
    return nearestCueName(local, cueTable);
}
|
|
101
|
+
/**
 * Distinguishes daytime and night-time cues for single-mode questions and
 * penalizes an option whose time of day is the opposite of the one found near
 * the question focus.
 *
 * @param mode Only "single" is processed; any other mode yields no signal.
 * @param pages Pages to scan; `page` is read and each page is passed to
 *        `cachedLineWindowSegments`.
 * @param topQuestionPages Optional collection exposing `size` and `has`; when
 *        non-empty, only listed pages and their direct neighbours are scanned.
 * @param answer Answer option; `id` and `text` are read.
 * @param focusTokens Preferred tokens for the coverage check.
 * @param questionTokens Fallback tokens when `focusTokens` is empty.
 * @returns `{ support, adjustment, evidence }`: matching evidence as `support`
 *          with no adjustment, or a -4.8 penalty with the mismatching
 *          evidence, or all-neutral values.
 */
export function temporalCueAdjustment({ mode, pages, topQuestionPages, answer, focusTokens, questionTokens }) {
    const noSignal = () => ({ support: null, adjustment: 0, evidence: null });
    if (mode !== "single") {
        return noSignal();
    }
    const answerCue = temporalCue(answer.text);
    if (!answerCue) {
        return noSignal();
    }
    const focus = focusTokens?.length ? focusTokens : questionTokens;
    let agreeing = null;
    let conflicting = null;
    for (const page of pages) {
        const isTop = topQuestionPages?.has(page.page);
        const isNextToTop = topQuestionPages?.has(page.page - 1) || topQuestionPages?.has(page.page + 1);
        const outOfScope = topQuestionPages?.size && !isTop && !isNextToTop;
        if (outOfScope) {
            continue;
        }
        for (const segment of cachedLineWindowSegments(page)) {
            const focusCoverage = coverage(focus, segment.tokens);
            if (focusCoverage < 0.12) {
                continue;
            }
            const segmentCue = nearestTemporalCue(segment.normalized);
            if (!segmentCue) {
                continue;
            }
            const agrees = segmentCue === answerCue;
            const record = {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                // Coverage contributes at most 0.5 * 5.0 on top of the base score.
                score: 9.8 + Math.min(0.5, focusCoverage) * 5.0,
                kind: agrees ? "temporal_cue_match" : "temporal_cue_mismatch",
            };
            if (agrees) {
                agreeing = betterEvidence(agreeing, record);
            }
            else {
                conflicting = betterEvidence(conflicting, record);
            }
        }
    }
    // A match is accepted even when it trails the mismatch by up to 0.4;
    // a mismatch must lead by more than 0.4 to trigger the penalty.
    if (agreeing && (!conflicting || agreeing.score >= conflicting.score - 0.4)) {
        return { support: agreeing, adjustment: 0, evidence: null };
    }
    if (conflicting && (!agreeing || conflicting.score > agreeing.score + 0.4)) {
        return { support: null, adjustment: -4.8, evidence: conflicting };
    }
    return noSignal();
}
|
|
145
|
+
// Mutually exclusive cue stems. Every pair is expanded into two groups, one
// per direction, so whichever side the answer mentions becomes `answer` and
// the other side becomes `opposite`. Pairs: upper vs lower/basal, raise vs
// lower/reduce, and the two compound direction stems (distal-proximal vs
// proximal-distal).
const CONTRAST_CUE_GROUPS = [
    [["верхн"], ["нижн", "базал"]],
    [["повыш", "увелич"], ["пониж", "сниж", "уменьш"]],
    [["дистальнопроксим"], ["проксимальнодист"]],
]
    .flatMap(([first, second]) => [
        { answer: first, opposite: second },
        { answer: second, opposite: first },
    ])
    .map(({ answer, opposite }) => ({
        answer: answer.map((stem) => normalizeForSearch(stem)),
        opposite: opposite.map((stem) => normalizeForSearch(stem)),
    }));
// Opposite modifiers that may precede the same target word: inflected forms
// of "early" versus inflected forms of "late", stored as token sets.
const MODIFIER_TARGET_CONTRAST_GROUPS = [
    ["ранний ранняя раннее ранней", "поздний поздняя позднее поздней"],
    ["поздний поздняя позднее поздней", "ранний ранняя раннее ранней"],
].map(([answerForms, oppositeForms]) => ({
    answer: new Set(tokenize(answerForms)),
    opposite: new Set(tokenize(oppositeForms)),
}));
|
|
187
|
+
/**
 * Tells whether the source attaches the opposite modifier (e.g. "late" where
 * the answer says "early") to one of the answer's target words.
 *
 * Target words are answer tokens of at least 4 characters that are neither
 * modifiers nor focus stopwords. For each occurrence of a target word in the
 * source, up to 4 preceding tokens are inspected right-to-left: an opposite
 * modifier means a mismatch, while the answer's own modifier stops the scan.
 *
 * @param answerText Answer option text.
 * @param sourceText Evidence text to compare against.
 * @returns true when an opposite modifier precedes a target word.
 */
function modifierTargetContrastMismatch(answerText, sourceText) {
    const answerWords = tokenize(answerText);
    const sourceWords = tokenize(sourceText);
    for (const { answer: ownForms, opposite: otherForms } of MODIFIER_TARGET_CONTRAST_GROUPS) {
        const answerUsesModifier = answerWords.some((word) => ownForms.has(word));
        if (!answerUsesModifier) {
            continue;
        }
        const targetWords = answerWords.filter((word) => word.length >= 4
            && !ownForms.has(word)
            && !otherForms.has(word)
            && !FOCUS_STOPWORDS.has(word));
        if (!targetWords.length) {
            continue;
        }
        for (let at = 0; at < sourceWords.length; at += 1) {
            if (!targetWords.includes(sourceWords[at])) {
                continue;
            }
            const floor = Math.max(0, at - 4);
            for (let back = at - 1; back >= floor; back -= 1) {
                const word = sourceWords[back];
                if (otherForms.has(word)) {
                    return true;
                }
                if (ownForms.has(word)) {
                    break;
                }
            }
        }
    }
    return false;
}
|
|
210
|
+
/**
 * Penalizes a multi-mode option whose strongest evidence contains the opposite
 * cue (upper/lower, increase/decrease, distal/proximal order) or an opposite
 * modifier attached to the same target word.
 *
 * Only the first 4 evidence items with a score of at least 5.5 and a
 * non-empty text are considered; the first offending item decides the result.
 *
 * @param mode Only "multi" is processed.
 * @param answer Answer option; `id` and `text` are read.
 * @param evidence Evidence items; `score`, `text` and `page` are read.
 * @returns `{ adjustment, evidence }`: -6.4 for a modifier/target mismatch,
 *          -5.2 for a contrast-cue mismatch, otherwise 0 and null.
 */
export function contrastCueMismatchAdjustment({ mode, answer }, evidence) {
    if (mode !== "multi" || !evidence?.length) {
        return { adjustment: 0, evidence: null };
    }
    const penalize = (item, magnitude, kind) => ({
        adjustment: -magnitude,
        evidence: {
            answerId: answer.id,
            page: item.page,
            text: item.text,
            score: magnitude,
            kind,
        },
    });
    const answerNorm = normalizeForSearch(answer.text);
    // First contrast group whose "answer" side shows up in the answer text.
    const cueGroup = CONTRAST_CUE_GROUPS.find((candidate) => candidate.answer.some((cue) => answerNorm.includes(cue)));
    for (const item of evidence.slice(0, 4)) {
        const tooWeak = (item.score ?? 0) < 5.5;
        if (tooWeak || !item.text) {
            continue;
        }
        const sourceNorm = normalizeForSearch(item.text);
        if (modifierTargetContrastMismatch(answer.text, item.text)) {
            return penalize(item, 6.4, "modifier_target_mismatch");
        }
        if (!cueGroup) {
            continue;
        }
        const sourceHasOwnCue = cueGroup.answer.some((cue) => sourceNorm.includes(cue));
        const sourceHasOppositeCue = cueGroup.opposite.some((cue) => sourceNorm.includes(cue));
        // Penalize only when the source speaks exclusively about the opposite side.
        if (sourceHasOppositeCue && !sourceHasOwnCue) {
            return penalize(item, 5.2, "contrast_cue_mismatch");
        }
    }
    return { adjustment: 0, evidence: null };
}
|
|
255
|
+
// Stems (reduce, raise, apply, treat, prescribe, carry out, use, concentration,
// test/sample) that disqualify the word following "без" from being treated as
// an excluded condition. Each entry is expanded through uniqueTokens.
const EXCLUDED_CONDITION_START_STOP = [
    "сниж",
    "повыш",
    "примен",
    "лечен",
    "назнач",
    "пров",
    "провед",
    "проведение",
    "использ",
    "концентр",
    "проб",
].flatMap((stem) => uniqueTokens(stem));
// Cues ("with", "against the background of", "presence", "having") indicating
// that a condition is present rather than excluded.
const CONDITION_POSITIVE_CUES = [
    "при",
    "на фоне",
    "налич",
    "имеющ",
].map((cue) => normalizeForSearch(cue));
|
|
274
|
+
/**
 * Extracts a short condition from wording such as `без цирроза` ("without
 * cirrhosis"), so that recommendations for the excluded subgroup can be told
 * apart from recommendations for the subgroup itself.
 *
 * Questions about doing something "without carrying out / applying /
 * prescribing" are ignored. Otherwise the first "без" followed by a usable
 * word wins: from the next three tokens, those with at least 4 characters
 * that are not focus stopwords are kept, and the first two are returned.
 *
 * @param question Question text.
 * @returns Up to two distinct condition tokens, or an empty array.
 */
function excludedConditionTokens(question) {
    const questionNorm = normalizeForSearch(question);
    const procedural = containsNormalizedPhrase(questionNorm, "без проведен")
        || containsNormalizedPhrase(questionNorm, "без применен")
        || containsNormalizedPhrase(questionNorm, "без назнач");
    if (procedural) {
        return [];
    }
    const words = tokenize(question);
    const withoutWord = normalizeForSearch("без");
    for (let at = 0; at < words.length - 1; at += 1) {
        if (words[at] !== withoutWord) {
            continue;
        }
        const candidates = words
            .slice(at + 1, at + 4)
            .filter((word) => word.length >= 4 && !FOCUS_STOPWORDS.has(word));
        if (!candidates.length) {
            continue;
        }
        const head = candidates[0];
        // Skip "без <action>" constructions: the head overlaps a stop stem
        // in either prefix direction.
        const blocked = EXCLUDED_CONDITION_START_STOP.some((stem) => head.startsWith(stem) || stem.startsWith(head));
        if (blocked) {
            continue;
        }
        return [...new Set(candidates.slice(0, 2))];
    }
    return [];
}
|
|
301
|
+
/**
 * Checks whether the evidence mentions the excluded condition, introduced by
 * a positive cue, within the 140 characters preceding the first occurrence of
 * an answer phrase.
 *
 * @param answerText Answer option text; search phrases are derived from it.
 * @param evidenceText Evidence text to inspect.
 * @param conditionTokens Tokens of the condition the question excludes.
 * @returns true when some answer phrase is preceded by both a positive cue
 *          and at least one condition token; false otherwise, including when
 *          there are no condition tokens or no evidence text.
 */
function evidenceHasExcludedConditionBeforeAnswer(answerText, evidenceText, conditionTokens) {
    if (!conditionTokens.length || !evidenceText) {
        return false;
    }
    const haystack = normalizeForSearch(evidenceText);
    return answerSearchPhrases(answerText)
        .map((phrase) => normalizeForSearch(phrase))
        .filter((needle) => needle.length >= 3)
        .some((needle) => {
            const at = haystack.indexOf(needle);
            if (at < 0) {
                return false;
            }
            const lead = haystack.slice(Math.max(0, at - 140), at);
            const hasPositiveCue = CONDITION_POSITIVE_CUES.some((cue) => lead.includes(cue));
            return hasPositiveCue && tokenHitCount(conditionTokens, tokenizeNormalized(lead)) > 0;
        });
}
|
|
320
|
+
/**
 * Penalizes an option whose local evidence refers to the excluded subgroup
 * (`при X`, i.e. "with X", next to the answer phrase) when the question is
 * explicitly about `без X` ("without X").
 *
 * Only the first 5 evidence items with a score of at least 6.5 are
 * considered; the first offending item decides the result.
 *
 * @param mode "single" applies the stronger penalty.
 * @param question Question text the excluded condition is extracted from.
 * @param answer Answer option; `id` and `text` are read.
 * @param evidence Evidence items; `score`, `text` and `page` are read. A
 *        missing or empty list yields the neutral result.
 * @returns `{ adjustment, evidence }`: -12.4 (single) or -4.2 (otherwise)
 *          with an "excluded_condition_mismatch" record, or 0 and null.
 */
export function excludedConditionMismatchAdjustment({ mode, question, answer }, evidence) {
    // Guard first: without evidence there is nothing to penalize, and calling
    // `.slice` on a missing list would throw.
    if (!evidence?.length)
        return { adjustment: 0, evidence: null };
    const conditionTokens = excludedConditionTokens(question);
    if (!conditionTokens.length)
        return { adjustment: 0, evidence: null };
    for (const item of evidence.slice(0, 5)) {
        if ((item.score ?? 0) < 6.5)
            continue;
        if (!evidenceHasExcludedConditionBeforeAnswer(answer.text, item.text, conditionTokens))
            continue;
        return {
            adjustment: mode === "single" ? -12.4 : -4.2,
            evidence: {
                answerId: answer.id,
                page: item.page,
                text: item.text,
                score: 8.4,
                kind: "excluded_condition_mismatch",
            },
        };
    }
    return { adjustment: 0, evidence: null };
}
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import { extractNumbers, normalizeForSearch, uniqueTokens } from "../../normalize.js";
|
|
2
|
+
import { FOCUS_STOPWORDS } from "../constants.js";
|
|
3
|
+
import { betterEvidence, containsNormalizedPhrase, expandNumberToken, rawTokens, softCoverage, tokenizeNormalized } from "../text-utils.js";
|
|
4
|
+
// Generic words that must not be mistaken for a drug name in a dose question.
// Entries are Russian words written as Unicode escapes; each is expanded
// through uniqueTokens before being stored in the set.
const DOSE_DRUG_GENERIC = new Set([
    // "daily" and inflections of "dose"
    "\u0441\u0443\u0442\u043e\u0447\u043d\u0430\u044f",
    "\u0434\u043e\u0437\u0430",
    "\u0434\u043e\u0437\u044b",
    "\u0434\u043e\u0437\u0435",
    // "with/at" and inflections of "treatment"
    "\u043f\u0440\u0438",
    "\u043b\u0435\u0447\u0435\u043d\u0438\u0438",
    "\u043b\u0435\u0447\u0435\u043d\u0438\u044f",
    "\u043b\u0435\u0447\u0435\u043d\u0438\u0435",
    // "localized", "form(s)", "infection", "meningococcal"
    "\u043b\u043e\u043a\u0430\u043b\u0438\u0437\u043e\u0432\u0430\u043d\u043d\u044b\u0445",
    "\u043b\u043e\u043a\u0430\u043b\u0438\u0437\u043e\u0432\u0430\u043d\u043d\u044b\u0435",
    "\u0444\u043e\u0440\u043c",
    "\u0444\u043e\u0440\u043c\u044b",
    "\u0438\u043d\u0444\u0435\u043a\u0446\u0438\u0438",
    "\u0438\u043d\u0444\u0435\u043a\u0446\u0438\u044f",
    "\u043c\u0435\u043d\u0438\u043d\u0433\u043e\u043a\u043e\u043a\u043a\u043e\u0432\u043e\u0439",
    // "patients", "experience", "previous", "therapy", "given", "drug"
    "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u0430\u043c",
    "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u043e\u0432",
    "\u043e\u043f\u044b\u0442\u043e\u043c",
    "\u043f\u0440\u0435\u0434\u0448\u0435\u0441\u0442\u0432\u0443\u044e\u0449\u0435\u0439",
    "\u0442\u0435\u0440\u0430\u043f\u0438\u0438",
    "\u0434\u0430\u043d\u043d\u044b\u043c",
    "\u043f\u0440\u0435\u043f\u0430\u0440\u0430\u0442\u043e\u043c",
    // "amounts to", "day (24 h)" and its abbreviation
    "\u0441\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442",
    "\u0441\u0443\u0442\u043a\u0438",
    "\u0441\u0443\u0442",
].flatMap((word) => uniqueTokens(word)));
// Verb stems (prescribe, recommend, apply, receive, administer, amount to)
// that mark where a dose is being assigned in a question.
const DOSE_ASSIGNMENT_CUES = [
    "\u043d\u0430\u0437\u043d\u0430\u0447",
    "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
    "\u043f\u0440\u0438\u043c\u0435\u043d",
    "\u043f\u043e\u043b\u0443\u0447",
    "\u0432\u0432\u043e\u0434",
    "\u0441\u043e\u0441\u0442\u0430\u0432\u043b",
].map((stem) => normalizeForSearch(stem));
|
|
39
|
+
/**
 * Tests whether a token, after normalizeForSearch, begins with any of the
 * given cue stems.
 *
 * @param token Raw token.
 * @param cues Cue stems to compare against; they are used as given.
 * @returns true when at least one cue is a prefix of the normalized token.
 */
function doseTokenStartsWithAny(token, cues) {
    const candidate = normalizeForSearch(token);
    for (const cue of cues) {
        if (candidate.startsWith(cue)) {
            return true;
        }
    }
    return false;
}
|
|
43
|
+
/**
 * Keeps the tokens of a text that could name a drug: at least 5 characters,
 * not a generic dose word, not a focus stopword, and not starting with a digit.
 *
 * @param text Text to tokenize via uniqueTokens.
 * @returns The surviving tokens in uniqueTokens order.
 */
function doseContentTokens(text) {
    const looksLikeContent = (token) => {
        if (token.length < 5) {
            return false;
        }
        if (DOSE_DRUG_GENERIC.has(token) || FOCUS_STOPWORDS.has(token)) {
            return false;
        }
        return !/^\d/u.test(token);
    };
    return uniqueTokens(text).filter((token) => looksLikeContent(token));
}
|
|
46
|
+
/**
 * Picks up to three tokens that most likely name the drug in a dose question.
 *
 * Questions without the "dose" stem yield nothing. When an assignment verb
 * occurs before the first "dose" token (and is not the very first token), the
 * last three content tokens among the nine raw tokens preceding that verb are
 * preferred; otherwise the first three content tokens of the question are used.
 *
 * @param question Question text.
 * @returns Up to three candidate drug tokens; an empty array when the question
 *          is not about a dose.
 */
function questionDoseDrugTokens(question) {
    const questionNorm = normalizeForSearch(question);
    if (!containsNormalizedPhrase(questionNorm, "\u0434\u043e\u0437")) {
        return [];
    }
    const words = rawTokens(question);
    const doseStem = normalizeForSearch("\u0434\u043e\u0437");
    const doseAt = words.findIndex((word) => doseTokenStartsWithAny(word, [doseStem]));
    // Assignment verbs are only searched for before the first dose token.
    const searchLimit = doseAt < 0 ? words.length : doseAt;
    const assignAt = words
        .slice(0, searchLimit)
        .findIndex((word) => doseTokenStartsWithAny(word, DOSE_ASSIGNMENT_CUES));
    if (assignAt > 0) {
        const leadingText = words.slice(Math.max(0, assignAt - 9), assignAt).join(" ");
        const nearby = doseContentTokens(leadingText).slice(-3);
        if (nearby.length) {
            return nearby;
        }
    }
    return doseContentTokens(question).slice(0, 3);
}
|
|
62
|
+
/**
 * Finds the earliest position at which any drug token occurs in the text.
 * Tokens are matched by their first 9 characters at most, so differing word
 * endings still match.
 *
 * @param normalized Text to search in.
 * @param drugTokens Candidate drug tokens.
 * @returns The smallest match offset, or -1 when no token prefix is found.
 */
function drugTokenIndex(normalized, drugTokens) {
    const offsets = drugTokens
        .map((token) => normalized.indexOf(token.slice(0, Math.min(token.length, 9))))
        .filter((offset) => offset >= 0);
    return offsets.length ? Math.min(...offsets) : -1;
}
|
|
72
|
+
/**
 * Extracts the dose numbers of the first "A/B мг" combination that follows the
 * drug in the source text.
 *
 * A combination counts only when the 150 characters before it cover the drug
 * tokens (soft coverage of at least 0.8) and contain a drug token prefix. A
 * "+" after the drug name selects the first number, a "+" before it selects
 * the second number, and with no "+" both numbers are returned.
 *
 * @param sourceText Raw source text.
 * @param drugTokens Tokens naming the drug of interest.
 * @returns Zero, one or two numbers as strings with "." as decimal separator.
 */
function doseSlashNumbers(sourceText, drugTokens) {
    const slashDose = /(\d+(?:[.,]\d+)?)\s*\/\s*(\d+(?:[.,]\d+)?)\s*мг/giu;
    for (const found of sourceText.matchAll(slashDose)) {
        const at = found.index ?? 0;
        const lead = normalizeForSearch(sourceText.slice(Math.max(0, at - 150), at));
        if (softCoverage(drugTokens, tokenizeNormalized(lead)) < 0.8) {
            continue;
        }
        const drugAt = drugTokenIndex(lead, drugTokens);
        if (drugAt < 0) {
            continue;
        }
        const [firstDose, secondDose] = [found[1], found[2]].map((part) => String(part).replace(",", "."));
        // Only the first qualifying combination is used.
        if (lead.indexOf("+", drugAt) >= 0) {
            return [firstDose];
        }
        if (lead.lastIndexOf("+", drugAt) >= 0) {
            return [secondDose];
        }
        return [firstDose, secondDose];
    }
    return [];
}
|
|
101
|
+
/**
 * Extracts up to two numbers that directly follow the drug name in the
 * normalized source text, provided they plausibly belong to that drug.
 *
 * The 95 characters starting at the drug name must mention the "mg" unit,
 * must not contain an "A/B mg" combination, and must hold a number within the
 * first 55 characters. The text between the drug name and that number must
 * contain neither "+" nor a token that looks like another drug name.
 *
 * @param sourceText Raw source text.
 * @param drugTokens Tokens naming the drug of interest.
 * @returns Up to two numbers as strings with "." as decimal separator, or an
 *          empty array when any of the checks fails.
 */
function doseNearDrugNumbers(sourceText, drugTokens) {
    const sourceNorm = normalizeForSearch(sourceText);
    const drugAt = drugTokenIndex(sourceNorm, drugTokens);
    if (drugAt < 0) {
        return [];
    }
    const vicinity = sourceNorm.slice(drugAt, drugAt + 95);
    if (!containsNormalizedPhrase(vicinity, "\u043c\u0433")) {
        return [];
    }
    // Slash combinations are handled by a dedicated extractor.
    if (/\d+(?:[.,]\d+)?\s*\/\s*\d+(?:[.,]\d+)?\s*mг/iu.test(vicinity)) {
        return [];
    }
    const leadingNumber = vicinity.match(/\d+(?:[.,]\d+)?/u);
    if (!leadingNumber) {
        return [];
    }
    const numberAt = leadingNumber.index ?? 0;
    if (numberAt > 55) {
        return [];
    }
    const gap = vicinity.slice(0, numberAt);
    if (gap.includes("+")) {
        return [];
    }
    // Short unit/form tokens that may sit between a drug name and its dose.
    const harmlessGapTokens = new Set(["taб", "taбл", "paз", "p", "д", "mг"]);
    const belongsToDrug = (token) => drugTokens.some((drugToken) => drugToken.startsWith(token)
        || token.startsWith(drugToken.slice(0, Math.min(8, drugToken.length))));
    const namesAnotherDrug = tokenizeNormalized(gap).some((token) => {
        if (harmlessGapTokens.has(token) || /^\d/.test(token)) {
            return false;
        }
        if (belongsToDrug(token)) {
            return false;
        }
        return token.length >= 3;
    });
    if (namesAnotherDrug) {
        return [];
    }
    return extractNumbers(vicinity)
        .slice(0, 2)
        .map((value) => String(value).replace(",", "."));
}
|
|
129
|
+
/**
 * Canonicalizes a dose number for string comparison: null/undefined become an
 * empty string, the first comma becomes a dot, and one trailing ".0" is
 * removed.
 *
 * @param value Number-like value (string, number, null or undefined).
 * @returns The canonical string form.
 */
function normalizeDoseNumber(value) {
    const dotted = String(value ?? "").replace(",", ".");
    return dotted.endsWith(".0") ? dotted.slice(0, -2) : dotted;
}
|
|
132
|
+
/**
 * Parses the dose information stated in an answer option.
 *
 * @param answerText Answer option text.
 * @returns `{ doseRange, dose, frequency }`: `doseRange` is a `[low, high]`
 *          pair when an "A-B mg" range is present, `dose` is the first number
 *          directly followed by the mg unit, and `frequency` is the number
 *          following a multiplication sign or a "times" marker. Each field is
 *          null when not found.
 */
function answerDoseFact(answerText) {
    const answerNorm = normalizeForSearch(answerText);
    const rangeHit = answerNorm.match(/(\d+(?:[.,]\d+)?)\s*-\s*(\d+(?:[.,]\d+)?)\s*[\u006d\u043c]\u0433/iu);
    const doseHit = answerNorm.match(/(\d+(?:[.,]\d+)?)\s*[\u006d\u043c]\u0433/iu);
    const frequencyHit = answerNorm.match(/(?:[\u0078\u0445]\s*|(?:\u0440\u0430\u0437|\u0440)\s*)(\d+(?:[.,]\d+)?)(?:\s*[\u0070\u0440]\s*\/\s*\u0434|\s*\u0440|\s*\u0440\u0430\u0437)?/iu);
    const fact = { doseRange: null, dose: null, frequency: null };
    if (rangeHit?.[1] && rangeHit?.[2]) {
        fact.doseRange = [normalizeDoseNumber(rangeHit[1]), normalizeDoseNumber(rangeHit[2])];
    }
    if (doseHit?.[1]) {
        fact.dose = normalizeDoseNumber(doseHit[1]);
    }
    if (frequencyHit?.[1]) {
        fact.frequency = normalizeDoseNumber(frequencyHit[1]);
    }
    return fact;
}
|
|
143
|
+
/**
 * Collects the dose facts the source states for the given drug.
 *
 * The first "N mg", "A-B mg" or "N mg x F" expression in the 125 characters
 * starting at the drug name is used when it begins within the first 80
 * characters. Numbers from slash combinations and from the plain
 * number-after-drug extractor are appended as dose-only facts.
 *
 * @param sourceText Raw source text.
 * @param drugTokens Tokens naming the drug of interest.
 * @returns A list of `{ dose, doseRange, frequency }` facts; empty when the
 *          drug is not mentioned.
 */
function sourceDoseFacts(sourceText, drugTokens) {
    const sourceNorm = normalizeForSearch(sourceText);
    const drugAt = drugTokenIndex(sourceNorm, drugTokens);
    if (drugAt < 0) {
        return [];
    }
    const vicinity = sourceNorm.slice(drugAt, drugAt + 125);
    const collected = [];
    const dosePattern = /(\d+(?:[.,]\d+)?)(?:\s*-\s*(\d+(?:[.,]\d+)?))?\s*[\u006d\u043c]\u0433(?:\s*[\u0078\u0445]\s*(\d+(?:[.,]\d+)?))?/giu;
    // Matches arrive in ascending order, so only the first one can start
    // within the 80-character limit.
    const [leadingDose] = vicinity.matchAll(dosePattern);
    if (leadingDose && (leadingDose.index ?? 0) <= 80) {
        const [, low, high, times] = leadingDose;
        collected.push({
            dose: normalizeDoseNumber(high ?? low),
            doseRange: high ? [normalizeDoseNumber(low), normalizeDoseNumber(high)] : null,
            frequency: times ? normalizeDoseNumber(times) : null,
        });
    }
    const slashNumbers = doseSlashNumbers(sourceText, drugTokens);
    const nearbyNumbers = doseNearDrugNumbers(sourceText, drugTokens);
    for (const value of slashNumbers.concat(nearbyNumbers)) {
        collected.push({ dose: normalizeDoseNumber(value), doseRange: null, frequency: null });
    }
    return collected;
}
|
|
167
|
+
function doseFactMatchesAnswer(fact, answerFact, answerNumbers, hasFrequencyFacts = false) {
|
|
168
|
+
if (answerFact.doseRange) {
|
|
169
|
+
if (!fact.doseRange)
|
|
170
|
+
return false;
|
|
171
|
+
if (fact.doseRange[0] !== answerFact.doseRange[0] || fact.doseRange[1] !== answerFact.doseRange[1])
|
|
172
|
+
return false;
|
|
173
|
+
}
|
|
174
|
+
if (answerFact.dose && fact.dose !== answerFact.dose)
|
|
175
|
+
return false;
|
|
176
|
+
if (!answerFact.dose && !answerNumbers.has(fact.dose))
|
|
177
|
+
return false;
|
|
178
|
+
if (answerFact.frequency && hasFrequencyFacts && !fact.frequency)
|
|
179
|
+
return false;
|
|
180
|
+
if (answerFact.frequency && fact.frequency && fact.frequency !== answerFact.frequency)
|
|
181
|
+
return false;
|
|
182
|
+
return true;
|
|
183
|
+
}
|
|
184
|
+
/**
 * Looks for the strongest passage that states, for the drug named in a dose
 * question, the dose given in the answer option.
 *
 * Only single-mode questions with an answer that contains a number and the mg
 * unit are handled. Every window of up to three consecutive page lines that
 * covers the drug tokens (soft coverage of at least 0.8) is parsed for dose
 * facts, and windows with a fact matching the answer become candidates.
 *
 * @param mode Only "single" is processed.
 * @param pages Pages to scan; `page` and `lines` are read.
 * @param question Question text used to identify the drug.
 * @param answer Answer option; `id` and `text` are read.
 * @returns The best "drug_dose_segment" evidence record as chosen by
 *          `betterEvidence`, or null when nothing qualifies.
 */
export function bestDrugDoseSupport({ mode, pages, question, answer }) {
    if (mode !== "single") {
        return null;
    }
    const drugTokens = questionDoseDrugTokens(question);
    if (!drugTokens.length) {
        return null;
    }
    const answerNumbers = new Set(extractNumbers(answer.text).flatMap(expandNumberToken).map((number) => String(number).replace(",", ".")));
    const answerMentionsMg = containsNormalizedPhrase(normalizeForSearch(answer.text), "\u043c\u0433");
    if (!answerNumbers.size || !answerMentionsMg) {
        return null;
    }
    const answerFact = answerDoseFact(answer.text);
    let strongest = null;
    for (const page of pages) {
        const lines = page.lines ?? [];
        for (let start = 0; start < lines.length; start += 1) {
            // Sliding window of up to three lines starting at `start`.
            const text = lines.slice(start, Math.min(lines.length, start + 3)).join(" ");
            const windowTokens = tokenizeNormalized(normalizeForSearch(text));
            if (softCoverage(drugTokens, windowTokens) < 0.8) {
                continue;
            }
            const facts = sourceDoseFacts(text, drugTokens);
            if (!facts.length) {
                continue;
            }
            const hasFrequencyFacts = facts.some((fact) => fact.frequency);
            const supportsAnswer = facts.some((fact) => doseFactMatchesAnswer(fact, answerFact, answerNumbers, hasFrequencyFacts));
            if (!supportsAnswer) {
                continue;
            }
            // Bonus when the window also confirms the answer's frequency.
            const frequencyConfirmed = answerFact.frequency && facts.some((fact) => fact.frequency === answerFact.frequency);
            const score = 16.2 + Math.min(2, facts.length) * 0.7 + (frequencyConfirmed ? 2.1 : 0);
            strongest = betterEvidence(strongest, {
                answerId: answer.id,
                page: page.page,
                text,
                score,
                kind: "drug_dose_segment",
            });
        }
    }
    return strongest;
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Type declaration for the exact-answer support scorer.
 *
 * @param options Scoring inputs; every field is currently typed as `any`.
 * @returns Untyped (`any`) result.
 */
export declare function bestExactAnswerSupport(options: {
    mode: any;
    pages: any;
    topQuestionPages: any;
    question: any;
    answer: any;
    questionTokens: any;
    answerTokens: any;
    focusTokens: any;
}): any;
|