med-pdf-nmo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +298 -0
- package/README.ru.md +298 -0
- package/dist/bm25.d.ts +47 -0
- package/dist/bm25.js +86 -0
- package/dist/browser-shims/buffer.d.ts +30 -0
- package/dist/browser-shims/buffer.js +31 -0
- package/dist/browser-shims/crypto.d.ts +33 -0
- package/dist/browser-shims/crypto.js +45 -0
- package/dist/browser-shims/fs-promises.d.ts +13 -0
- package/dist/browser-shims/fs-promises.js +25 -0
- package/dist/browser-shims/fs.d.ts +14 -0
- package/dist/browser-shims/fs.js +24 -0
- package/dist/browser-shims/globals.d.ts +9 -0
- package/dist/browser-shims/globals.js +23 -0
- package/dist/browser-shims/path.d.ts +57 -0
- package/dist/browser-shims/path.js +65 -0
- package/dist/browser-shims/process.d.ts +22 -0
- package/dist/browser-shims/process.js +27 -0
- package/dist/browser.d.ts +9 -0
- package/dist/browser.js +12 -0
- package/dist/chunk.d.ts +15 -0
- package/dist/chunk.js +76 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +87 -0
- package/dist/index.d.ts +82 -0
- package/dist/index.js +51 -0
- package/dist/med-pdf-nmo.browser.js +40413 -0
- package/dist/med-pdf-nmo.browser.mjs +40395 -0
- package/dist/normalize.d.ts +73 -0
- package/dist/normalize.js +477 -0
- package/dist/pdf.d.ts +35 -0
- package/dist/pdf.js +396 -0
- package/dist/predictor/config.d.ts +28 -0
- package/dist/predictor/config.js +26 -0
- package/dist/predictor/constants.d.ts +3 -0
- package/dist/predictor/constants.js +59 -0
- package/dist/predictor/runtime.d.ts +15 -0
- package/dist/predictor/runtime.js +59 -0
- package/dist/predictor/scorers/biomedical-symbols.d.ts +36 -0
- package/dist/predictor/scorers/biomedical-symbols.js +347 -0
- package/dist/predictor/scorers/coordinate-table.d.ts +82 -0
- package/dist/predictor/scorers/coordinate-table.js +1210 -0
- package/dist/predictor/scorers/direction.d.ts +71 -0
- package/dist/predictor/scorers/direction.js +345 -0
- package/dist/predictor/scorers/drug-dose.d.ts +6 -0
- package/dist/predictor/scorers/drug-dose.js +221 -0
- package/dist/predictor/scorers/exact-answer.d.ts +10 -0
- package/dist/predictor/scorers/exact-answer.js +75 -0
- package/dist/predictor/scorers/fibrosis-stage.d.ts +6 -0
- package/dist/predictor/scorers/fibrosis-stage.js +103 -0
- package/dist/predictor/scorers/focused.d.ts +40 -0
- package/dist/predictor/scorers/focused.js +204 -0
- package/dist/predictor/scorers/frequency.d.ts +10 -0
- package/dist/predictor/scorers/frequency.js +203 -0
- package/dist/predictor/scorers/numeric.d.ts +77 -0
- package/dist/predictor/scorers/numeric.js +1161 -0
- package/dist/predictor/scorers/recommendation-item.d.ts +27 -0
- package/dist/predictor/scorers/recommendation-item.js +469 -0
- package/dist/predictor/scorers/search.d.ts +41 -0
- package/dist/predictor/scorers/search.js +515 -0
- package/dist/predictor/selection.d.ts +30 -0
- package/dist/predictor/selection.js +370 -0
- package/dist/predictor/text-utils.d.ts +49 -0
- package/dist/predictor/text-utils.js +497 -0
- package/dist/predictor/types.d.ts +23 -0
- package/dist/predictor/types.js +1 -0
- package/dist/predictor.d.ts +52 -0
- package/dist/predictor.js +3834 -0
- package/package.json +82 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { coverage, extractNumbers, normalizeForSearch, tokenize } from "../../normalize.js";
|
|
2
|
+
import { answerSearchPhrases, betterEvidence, containsNormalizedPhrase, evidenceSnippet, findPhraseOccurrences, numberCoverage, tokenHitCount, tokenizeNormalized, } from "../text-utils.js";
|
|
3
|
+
/**
 * Collects the search phrases of an answer that are long and complete enough to be
 * treated as a near-verbatim quotation.
 *
 * A phrase is kept only when its normalized form has at least 10 characters, has not
 * been seen before, has at least max(3, ceil(0.72 * answer token count)) tokens and,
 * if the answer contains numbers, is not reported by numberCoverage as covering less
 * than 0.99 of them.
 *
 * @param {string} answerText - Raw answer text.
 * @param {Array} answerTokens - Tokens of the answer; only the length is used.
 * @returns {Array<{raw: string, normalized: string, tokens: Array}>} Accepted phrases
 *   in the order answerSearchPhrases produced them.
 */
function exactAnswerPhrases(answerText, answerTokens) {
    const requiresNumbers = extractNumbers(answerText).length > 0;
    const tokenFloor = Math.max(3, Math.ceil(Math.max(1, answerTokens.length) * 0.72));
    // A Map keyed by the normalized form de-duplicates and keeps insertion order.
    const accepted = new Map();
    for (const candidate of answerSearchPhrases(answerText)) {
        const key = normalizeForSearch(candidate);
        if (!key || key.length < 10) {
            continue;
        }
        if (accepted.has(key)) {
            continue;
        }
        const candidateTokens = tokenize(candidate);
        if (candidateTokens.length < tokenFloor) {
            continue;
        }
        const losesNumbers = requiresNumbers && numberCoverage(answerText, key) < 0.99;
        if (losesNumbers) {
            continue;
        }
        accepted.set(key, { raw: candidate, normalized: key, tokens: candidateTokens });
    }
    return [...accepted.values()];
}
|
|
22
|
+
/**
 * Decides whether the exact-answer scorer should run for this question/answer pair.
 *
 * The answer must contain at least three numbers, and the normalized question must
 * either contain the phrase "внутрь по" or contain all three stems "назнач", "по"
 * and "внутр".
 *
 * @param {string} question - Raw question text.
 * @param {{text: string}} answer - Answer option.
 * @returns {boolean} Result of the phrase checks, or false when the answer has fewer
 *   than three numbers.
 */
function exactAnswerApplicable(question, answer) {
    const questionKey = normalizeForSearch(question);
    if (extractNumbers(answer.text).length < 3) {
        return false;
    }
    const mentions = (stem) => containsNormalizedPhrase(questionKey, stem);
    return (mentions("\u0432\u043d\u0443\u0442\u0440\u044c \u043f\u043e") ||
        (mentions("\u043d\u0430\u0437\u043d\u0430\u0447") &&
            mentions("\u043f\u043e") &&
            mentions("\u0432\u043d\u0443\u0442\u0440")));
}
|
|
33
|
+
/**
 * Searches the pages for near-verbatim occurrences of the answer and returns the
 * evidence record selected by betterEvidence, or null.
 *
 * Runs only in "single" mode and only when exactAnswerApplicable accepts the pair.
 * Every occurrence is judged on a window of 380 characters on each side of the hit in
 * the normalized page text. Hits on pages that are not a top question page (or a
 * direct neighbour of one) need some focus-token presence in that window.
 *
 * @param {object} args
 * @param {string} args.mode - Question mode; anything but "single" yields null.
 * @param {Iterable} args.pages - Pages exposing `page`, `text` and `normalized`.
 * @param {Set} [args.topQuestionPages] - Page numbers ranked best for the question.
 * @param {string} args.question - Raw question text.
 * @param {{id: *, text: string}} args.answer - Answer option being scored.
 * @param {Array} args.questionTokens - Tokens of the question.
 * @param {Array} args.answerTokens - Tokens of the answer.
 * @param {Array} args.focusTokens - Focus tokens of the question.
 * @returns {object|null} Evidence of kind "exact_answer_phrase", or null.
 */
export function bestExactAnswerSupport({ mode, pages, topQuestionPages, question, answer, questionTokens, answerTokens, focusTokens }) {
    if (mode !== "single" || !exactAnswerApplicable(question, answer)) {
        return null;
    }
    const candidates = exactAnswerPhrases(answer.text, answerTokens);
    if (candidates.length === 0) {
        return null;
    }
    const contextRadius = 380;
    let winner = null;
    for (const page of pages) {
        // Without a ranked page set every page counts as "near".
        const nearQuestionPage = !topQuestionPages?.size ||
            topQuestionPages.has(page.page) ||
            topQuestionPages.has(page.page - 1) ||
            topQuestionPages.has(page.page + 1);
        for (const candidate of candidates) {
            const offsets = findPhraseOccurrences(page.normalized, candidate.raw, { textIsNormalized: true });
            for (const offset of offsets) {
                const from = Math.max(0, offset - contextRadius);
                const to = Math.min(page.normalized.length, offset + candidate.normalized.length + contextRadius);
                const context = page.normalized.slice(from, to);
                const contextTokens = tokenizeNormalized(context);
                const focusHits = tokenHitCount(focusTokens, contextTokens);
                const focusShare = focusTokens.length ? coverage(focusTokens, contextTokens) : 0;
                const questionShare = questionTokens.length ? coverage(questionTokens, contextTokens) : 0;
                // Far-away pages must show at least some focus presence.
                if (!nearQuestionPage && focusHits < 2 && focusShare < 0.22) {
                    continue;
                }
                // With focus tokens available, drop windows unrelated to the question.
                if (focusTokens.length && focusHits < 1 && focusShare < 0.16 && questionShare < 0.14) {
                    continue;
                }
                const lengthBonus = Math.min(4.8, candidate.tokens.length * 0.55);
                const score = 20.2 +
                    lengthBonus +
                    Math.min(0.8, focusShare) * 8.2 +
                    Math.min(4, focusHits) * 0.95 +
                    Math.min(0.6, questionShare) * 2.4 +
                    numberCoverage(answer.text, context) * 2.4 +
                    (nearQuestionPage ? 1.2 : 0);
                winner = betterEvidence(winner, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, question, answer.text),
                    score,
                    kind: "exact_answer_phrase",
                });
            }
        }
    }
    return winner;
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import { extractNumbers, normalizeForSearch } from "../../normalize.js";
|
|
2
|
+
import { betterEvidence, containsNormalizedPhrase, rawTokens } from "../text-utils.js";
|
|
3
|
+
/**
 * Maps a fibrosis description to one of the keys "none", "mild", "moderate",
 * "marked", "severe" or "cirrhosis".
 *
 * A leading METAVIR label (F0..F4) is tried first and wins only when the text also
 * contains the wording expected for that grade. Otherwise the text must mention
 * fibrosis or cirrhosis and is classified by the first matching stem of the fallback
 * list.
 *
 * @param {string} text - Raw text (question, answer or table row).
 * @returns {string|null} Descriptor key, or null when nothing matches.
 */
function fibrosisDescriptorKey(text) {
    const haystack = normalizeForSearch(text);
    const has = (stem) => containsNormalizedPhrase(haystack, stem);
    const grade = haystack.match(/^f\s*([0-4])\b/iu)?.[1];
    switch (grade) {
        case "0":
            if (has("\u043e\u0442\u0441\u0443\u0442"))
                return "none";
            break;
        case "1":
            if (has("\u0431\u0435\u0437") && has("\u0441\u0435\u043f\u0442"))
                return "mild";
            break;
        case "2":
            if (has("\u0435\u0434\u0438\u043d\u0438\u0447") && has("\u0441\u0435\u043f\u0442"))
                return "moderate";
            break;
        case "3":
            if (has("\u043c\u043d\u043e\u0433\u043e\u0447\u0438\u0441\u043b") || has("\u0431\u0435\u0437 \u0446\u0438\u0440\u0440\u043e\u0437"))
                return "marked";
            break;
        case "4":
            if (has("\u0446\u0438\u0440\u0440\u043e\u0437"))
                return "cirrhosis";
            break;
        default:
            break;
    }
    const mentionsFibrosis = has("\u0444\u0438\u0431\u0440\u043e\u0437");
    const mentionsCirrhosis = has("\u0446\u0438\u0440\u0440\u043e\u0437");
    if (!mentionsFibrosis && !mentionsCirrhosis) {
        return null;
    }
    // "without cirrhosis" alone says nothing about the fibrosis grade.
    if (has("\u0431\u0435\u0437 \u0446\u0438\u0440\u0440\u043e\u0437") && !mentionsFibrosis) {
        return null;
    }
    if (has("\u0431\u0435\u0437 \u0444\u0438\u0431\u0440\u043e\u0437") || has("\u043e\u0442\u0441\u0443\u0442")) {
        return "none";
    }
    // Order matters: the "mild" stem contains the "marked" stem as a substring.
    const fallback = [
        ["\u0441\u043b\u0430\u0431\u043e\u0432\u044b\u0440\u0430\u0436", "mild"],
        ["\u0443\u043c\u0435\u0440\u0435\u043d", "moderate"],
        ["\u0442\u044f\u0436\u0435\u043b", "severe"],
        ["\u0446\u0438\u0440\u0440\u043e\u0437", "cirrhosis"],
        ["\u0432\u044b\u0440\u0430\u0436", "marked"],
    ];
    for (const [stem, key] of fallback) {
        if (has(stem)) {
            return key;
        }
    }
    return null;
}
|
|
34
|
+
/**
 * Extracts the fibrosis stage (a single digit 0-4) that the question refers to.
 *
 * First looks for a stand-alone digit token in the four-token window that starts at
 * the first token beginning with "стад". If that finds nothing, falls back to a
 * METAVIR-style "F<digit>" label anywhere in the normalized question.
 *
 * The window scan runs only when the stage word is actually present. Previously a
 * missing stage word (findIndex returning -1) still scanned the first three tokens,
 * so a leading count such as "2 пациента" was reported as the stage.
 *
 * @param {string} question - Raw question text.
 * @returns {string|null} Stage digit as a string, or null when none is found.
 */
function questionFibrosisStage(question) {
    const tokens = rawTokens(question);
    const stageIndex = tokens.findIndex((token) => token.startsWith("\u0441\u0442\u0430\u0434"));
    if (stageIndex >= 0) {
        const windowEnd = Math.min(tokens.length, stageIndex + 4);
        for (let index = stageIndex; index < windowEnd; index += 1) {
            const token = tokens[index];
            if (/^[0-4]$/u.test(token)) {
                return token;
            }
        }
    }
    const fStage = normalizeForSearch(question).match(/\bf\s*([0-4])\b/iu);
    return fStage?.[1] ?? null;
}
|
|
46
|
+
/**
 * Extracts the fibrosis stage (digit 0-4) stated by an answer option.
 *
 * An answer whose normalized text is just a digit, optionally prefixed with "f", is
 * taken directly. Otherwise the stage is accepted only when exactly one of the numbers
 * extracted from the answer is a single digit between 0 and 4.
 *
 * @param {string} answerText - Raw answer text.
 * @returns {string|null} Stage digit as a string, or null when absent or ambiguous.
 */
function answerFibrosisStage(answerText) {
    const bare = /^(?:f\s*)?([0-4])$/iu.exec(normalizeForSearch(answerText));
    if (bare) {
        return bare[1];
    }
    const stageDigits = [];
    for (const item of extractNumbers(answerText)) {
        const value = String(item).replace(",", ".");
        if (/^[0-4]$/u.test(value)) {
            stageDigits.push(value);
        }
    }
    if (stageDigits.length !== 1) {
        return null;
    }
    return stageDigits[0];
}
|
|
55
|
+
/**
 * Reads the stage label at the start of a table-like row.
 *
 * Accepts either "<digit> -" (digit 0-4 followed by a hyphen) or a METAVIR label
 * "F<digit>" at the very beginning of the normalized, trimmed line.
 *
 * @param {string} line - Raw line of page text.
 * @returns {string|null} Stage digit as a string, or null when the line has no label.
 */
function fibrosisRowStage(line) {
    const row = normalizeForSearch(line).trim();
    const labelled = row.match(/^([0-4])\s*-/u) ?? row.match(/^f\s*([0-4])\b/iu);
    return labelled?.[1] ?? null;
}
|
|
63
|
+
/**
 * Finds a page line that pairs the expected fibrosis stage with the expected
 * descriptor and returns it as evidence.
 *
 * Runs only in "single" mode and only when the normalized question contains one of the
 * stems "фиброз", "цирроз" or "стад". When the question names a stage, the answer must
 * supply the descriptor. Otherwise the answer supplies the stage and the question the
 * descriptor, which additionally requires the stem "соответ" in the question. Every
 * matching line gets the fixed score 22.4; betterEvidence picks which one is kept.
 *
 * @param {object} args
 * @param {string} args.mode - Question mode; anything but "single" yields null.
 * @param {Iterable} args.pages - Pages exposing `page` and optionally `lines`.
 * @param {string} args.question - Raw question text.
 * @param {{id: *, text: string}} args.answer - Answer option being scored.
 * @returns {object|null} Evidence of kind "fibrosis_stage_row", or null.
 */
export function bestFibrosisStageSupport({ mode, pages, question, answer }) {
    if (mode !== "single") {
        return null;
    }
    const questionNorm = normalizeForSearch(question);
    const onTopic = containsNormalizedPhrase(questionNorm, "\u0444\u0438\u0431\u0440\u043e\u0437") ||
        containsNormalizedPhrase(questionNorm, "\u0446\u0438\u0440\u0440\u043e\u0437") ||
        containsNormalizedPhrase(questionNorm, "\u0441\u0442\u0430\u0434");
    if (!onTopic) {
        return null;
    }
    const qStage = questionFibrosisStage(question);
    const qDescriptor = fibrosisDescriptorKey(question);
    const answerStage = answerFibrosisStage(answer.text);
    const answerDescriptor = fibrosisDescriptorKey(answer.text);
    // A stage taken from the answer is only meaningful for "corresponds to" questions.
    if (!qStage && answerStage && !containsNormalizedPhrase(questionNorm, "\u0441\u043e\u043e\u0442\u0432\u0435\u0442")) {
        return null;
    }
    const wantedStage = qStage ?? answerStage;
    const wantedDescriptor = qStage ? answerDescriptor : qDescriptor;
    if (!wantedStage || !wantedDescriptor) {
        return null;
    }
    let winner = null;
    for (const page of pages) {
        for (const text of page.lines ?? []) {
            if (fibrosisRowStage(text) !== wantedStage) {
                continue;
            }
            if (fibrosisDescriptorKey(text) !== wantedDescriptor) {
                continue;
            }
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text,
                score: 22.4,
                kind: "fibrosis_stage_row",
            });
        }
    }
    return winner;
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
 * Builds the compact list of question tokens that the other scorers use to locate
 * relevant pages, lines and local windows in the PDF.
 *
 * The list holds tokens from explicit cue phrases, numbers from the question and
 * non-generic tokens; this lowers the chance that an answer option is supported by a
 * similar-looking but unrelated fragment.
 */
export declare function questionFocusTokens(question: any): Array<any>;
|
|
9
|
+
/**
 * Looks for the answer option inside a small window around an exact/normalized
 * occurrence and requires the question focus, or matching question numbers, nearby.
 */
export declare function bestFocusedSupport(options: {
    pages: any;
    topQuestionPages: any;
    question: any;
    answer: any;
    answerTokens: any;
    focusTokens: any;
    intent: any;
}): any;
|
|
22
|
+
/**
 * Returns the cached line/pair segments with their normalization and tokens already
 * computed; the cache lives on the page object and is not part of the public result.
 */
export declare function cachedLineTokenSegments(page: any): any;
|
|
27
|
+
/**
 * Compares the answer option with local line/pair segments and reports support only
 * when the question focus, numbers or a sufficiently strong match are nearby.
 */
export declare function bestLineTokenSupport(options: {
    pages: any;
    topQuestionPages: any;
    question: any;
    answer: any;
    questionTokens: any;
    answerTokens: any;
    focusTokens: any;
    intent: any;
}): any;
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
import { coverage, extractNumbers, normalizeForSearch, normalizeText, tokenize, uniqueTokens } from "../../normalize.js";
|
|
2
|
+
import { FOCUS_STOPWORDS } from "../constants.js";
|
|
3
|
+
import { answerSearchPhrases, betterEvidence, cachedPageTokens, containsNormalizedPhrase, evidenceSnippet, expandNumberToken, findPhraseOccurrences, focusedAnswerSearchPhrases, numberCoverage, pageWindow, strictSoftCoverage, tokenizeNormalized, tokenHitCount, } from "../text-utils.js";
|
|
4
|
+
/**
 * Builds the compact list of question tokens that the other scorers use to locate
 * relevant pages, lines and local windows in the PDF.
 *
 * Cue-phrase tokens come first, followed by the remaining question tokens that are
 * numeric (start with a digit or match an expanded question number), or that are longer
 * than two characters and not in FOCUS_STOPWORDS. Duplicates are dropped and the list
 * is capped at 16 entries.
 *
 * @param {string} question - Raw question text.
 * @returns {string[]} Up to 16 unique focus tokens.
 */
export function questionFocusTokens(question) {
    const allTokens = uniqueTokens(question);
    const cueTokens = cueFocusTokens(question);
    const questionNumbers = new Set(extractNumbers(question).flatMap(expandNumberToken));
    const isFocusToken = (token) => {
        if (!token) {
            return false;
        }
        const numeric = questionNumbers.has(token) || /^\d/.test(token);
        return numeric || (!FOCUS_STOPWORDS.has(token) && token.length > 2);
    };
    // A Set keeps first-seen order, so cue tokens keep priority over the rest.
    const ordered = new Set([...cueTokens, ...allTokens.filter(isFocusToken)]);
    return [...ordered].slice(0, 16);
}
|
|
31
|
+
/**
 * Pulls short meaningful tails out of the question that follow fixed phrasings such as
 * "с целью" and "для", age/time limits, and the text after "по".
 *
 * For each pattern the captured group is used when there is one, otherwise the whole
 * match. The collected fragments are tokenized together and FOCUS_STOPWORDS removed.
 *
 * @param {string} question - Raw question text.
 * @returns {string[]} Unique cue tokens.
 */
function cueFocusTokens(question) {
    const text = normalizeText(question);
    const cuePatterns = [
        /с\s+целью\s+(.+?)(?:\s+рекоменд|\s+провод|\s+назнач|$)/u,
        /для\s+(.+?)(?:\s+рекоменд|\s+провод|\s+назнач|$)/u,
        /(?:старше|младше|моложе|до|после)\s+\d+(?:[.,]\d+)?\s+(?:лет|года|месяц|дней|сут)/u,
        /по\s+([а-яa-z0-9 -]{4,48})/u,
    ];
    const fragments = [];
    for (const cuePattern of cuePatterns) {
        const found = text.match(cuePattern);
        if (!found?.[0]) {
            continue;
        }
        // The age pattern has no capture group, so fall back to the full match.
        fragments.push(found[1] ?? found[0]);
    }
    const cueTokens = uniqueTokens(fragments.join(" "));
    return cueTokens.filter((token) => !FOCUS_STOPWORDS.has(token));
}
|
|
51
|
+
/**
 * Looks for the answer option inside a small window around each occurrence of one of
 * its search phrases and requires the question focus, or matching question numbers,
 * to be present in that window.
 *
 * Only the first 24 focused search phrases are tried, phrases whose normalized form is
 * shorter than 5 characters are skipped, and when a ranked page set is given only
 * those pages are searched. Unless the intent is negative or an exception, windows
 * containing limiting wording are penalised. Candidates scoring 2.6 or less are dropped.
 *
 * @param {object} args
 * @param {Iterable} args.pages - Pages exposing `page`, `text` and `normalized`.
 * @param {Set} [args.topQuestionPages] - Page numbers ranked best for the question.
 * @param {string} args.question - Raw question text.
 * @param {{id: *, text: string}} args.answer - Answer option being scored.
 * @param {Array} args.answerTokens - Tokens of the answer.
 * @param {Array} args.focusTokens - Focus tokens of the question; required.
 * @param {object} args.intent - Flags read here: `negative`, `exception`, `numeric`.
 * @returns {object|null} Evidence of kind "focused_answer_window", or null.
 */
export function bestFocusedSupport({ pages, topQuestionPages, question, answer, answerTokens, focusTokens, intent }) {
    if (!focusTokens?.length) {
        return null;
    }
    const candidates = focusedAnswerSearchPhrases(answer.text).slice(0, 24);
    let winner = null;
    for (const page of pages) {
        const offRankedPages = topQuestionPages?.size && !topQuestionPages.has(page.page);
        if (offRankedPages) {
            continue;
        }
        for (const candidate of candidates) {
            const candidateKey = normalizeForSearch(candidate);
            if (!candidateKey || candidateKey.length < 5) {
                continue;
            }
            const offsets = findPhraseOccurrences(page.normalized, candidate, { textIsNormalized: true });
            for (const offset of offsets) {
                const context = pageWindow(page, offset, 260);
                const contextTokens = tokenizeNormalized(context);
                const focusShare = coverage(focusTokens, contextTokens);
                const questionNumberShare = numberCoverage(question, context);
                // The window must be tied to the question by focus words or numbers.
                if (focusShare < 0.22 && questionNumberShare <= 0) {
                    continue;
                }
                const answerShare = coverage(answerTokens, contextTokens);
                const penalty = intent.negative || intent.exception ? 0 : limitedCuePenalty(context);
                const numberWeight = intent.numeric ? 4.0 : 2.2;
                const score = 2.2 +
                    focusShare * 5.2 +
                    answerShare * 1.2 +
                    questionNumberShare * numberWeight -
                    penalty;
                if (score <= 2.6) {
                    continue;
                }
                winner = betterEvidence(winner, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, candidate, question),
                    score,
                    kind: "focused_answer_window",
                });
            }
        }
    }
    return winner;
}
|
|
97
|
+
/**
 * Builds short text segments for a page: every line of at least 8 characters, and
 * every pair of adjacent lines whose whitespace-collapsed join is 16 to 700 characters
 * long. These segments give cheap local evidence without a wide search over the page.
 *
 * @param {{lines?: string[]}} page - Page object; a missing `lines` means no segments.
 * @returns {Array<{text: string, kind: "line"|"line_pair"}>} Segments in page order.
 */
function lineTokenSegments(page) {
    const lines = page.lines ?? [];
    const collected = [];
    for (const [position, line] of lines.entries()) {
        if (line?.length >= 8) {
            collected.push({ text: line, kind: "line" });
        }
        const hasNext = position + 1 < lines.length;
        if (!hasNext) {
            continue;
        }
        const joined = `${line} ${lines[position + 1]}`.replace(/\s+/g, " ").trim();
        if (joined.length >= 16 && joined.length <= 700) {
            collected.push({ text: joined, kind: "line_pair" });
        }
    }
    return collected;
}
|
|
116
|
+
/**
 * Returns the line/pair segments of a page with their normalized text and tokens
 * precomputed.
 *
 * The result is stored on the page as the non-enumerable property
 * `__lineTokenSegments`, so later calls reuse it and it does not show up when the page
 * is enumerated or spread.
 *
 * @param {object} page - Page object; gets the hidden cache property on first call.
 * @returns {Array<{text: string, kind: string, normalized: string, tokens: Array}>}
 */
export function cachedLineTokenSegments(page) {
    if (page.__lineTokenSegments) {
        return page.__lineTokenSegments;
    }
    const enriched = [];
    for (const segment of lineTokenSegments(page)) {
        enriched.push({
            ...segment,
            normalized: normalizeForSearch(segment.text),
            tokens: tokenize(segment.text),
        });
    }
    Object.defineProperty(page, "__lineTokenSegments", { value: enriched, enumerable: false });
    return page.__lineTokenSegments;
}
|
|
133
|
+
/**
 * Compares the answer option with the cached line/pair segments of each page and
 * reports support only when the segment covers the answer well enough and the question
 * focus, or question numbers, appear in the same segment.
 *
 * The required answer support is 0.65 for answers containing numbers, 0.95 for other
 * answers of at most two tokens and 0.62 otherwise. Pages outside the ranked set are
 * pre-filtered at page level. Segments longer than 420 characters are penalised and
 * candidates scoring below 6.2 are dropped.
 *
 * @param {object} args
 * @param {Iterable} args.pages - Pages exposing `page` and `normalized`.
 * @param {Set} [args.topQuestionPages] - Page numbers ranked best for the question.
 * @param {string} args.question - Raw question text.
 * @param {{id: *, text: string}} args.answer - Answer option being scored.
 * @param {Array} args.questionTokens - Used as focus when focusTokens is empty.
 * @param {Array} args.answerTokens - Tokens of the answer; required.
 * @param {Array} [args.focusTokens] - Focus tokens of the question.
 * @param {object} args.intent - Flag read here: `numeric`.
 * @returns {object|null} Evidence of kind "line_token_line"/"line_token_line_pair".
 */
export function bestLineTokenSupport({ pages, topQuestionPages, question, answer, questionTokens, answerTokens, focusTokens, intent }) {
    if (!answerTokens.length) {
        return null;
    }
    let supportFloor;
    if (extractNumbers(answer.text).length > 0) {
        supportFloor = 0.65;
    }
    else if (answerTokens.length <= 2) {
        supportFloor = 0.95;
    }
    else {
        supportFloor = 0.62;
    }
    const focusSource = focusTokens?.length ? focusTokens : questionTokens;
    const focus = focusSource.filter((token) => token.length > 2 || /^\d/.test(token));
    if (!focus.length) {
        return null;
    }
    const exactCandidates = answerSearchPhrases(answer.text);
    let winner = null;
    for (const page of pages) {
        const isTopPage = topQuestionPages?.has(page.page);
        const pageTokens = cachedPageTokens(page);
        const pageFocusHits = tokenHitCount(focus, pageTokens);
        const pageSupport = Math.max(strictSoftCoverage(answerTokens, pageTokens), numberCoverage(answer.text, page.normalized));
        // Unranked pages must look promising as a whole before their lines are read.
        if (!isTopPage && (pageFocusHits < 2 || pageSupport < supportFloor)) {
            continue;
        }
        for (const segment of cachedLineTokenSegments(page)) {
            const segmentTokens = segment.tokens;
            if (!segmentTokens.length) {
                continue;
            }
            const tokenSupport = strictSoftCoverage(answerTokens, segmentTokens);
            const numberSupport = numberCoverage(answer.text, segment.text);
            const answerSupport = Math.max(tokenSupport, numberSupport);
            if (answerSupport < supportFloor) {
                continue;
            }
            const focusHits = tokenHitCount(focus, segmentTokens);
            const focusShare = coverage(focus, segmentTokens);
            const questionNumberShare = numberCoverage(question, segment.text);
            // Ranked pages get a looser focus requirement than the rest.
            const enoughFocus = isTopPage
                ? focusHits >= 1 || focusShare >= 0.16
                : focusHits >= 2 || focusShare >= 0.24;
            if (!enoughFocus && questionNumberShare <= 0) {
                continue;
            }
            const quotedVerbatim = exactCandidates.some((phrase) => containsNormalizedPhrase(segment.normalized, phrase));
            const overlong = segment.text.length > 420;
            const lengthPenalty = overlong ? Math.min(1.4, (segment.text.length - 420) / 220) : 0;
            const score = 3.2 +
                answerSupport * 4.4 +
                Math.min(0.55, focusShare) * 5.2 +
                Math.min(4, focusHits) * 0.42 +
                questionNumberShare * (intent.numeric ? 2.5 : 1.2) +
                (quotedVerbatim ? 0.8 : 0) +
                (isTopPage ? 0.5 : 0) -
                lengthPenalty;
            if (score < 6.2) {
                continue;
            }
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                score,
                kind: `line_token_${segment.kind}`,
            });
        }
    }
    return winner;
}
|
|
192
|
+
/**
 * Penalises local windows that contain limiting wording, so that "not recommended" or
 * "only when impossible" passages do not look like ordinary support.
 *
 * Each cue found adds 0.8; the total is capped at 1.6.
 *
 * @param {string} normalizedText - Window text to inspect.
 * @returns {number} Penalty between 0 and 1.6.
 */
function limitedCuePenalty(normalizedText) {
    const cues = ["не рекомендуется", "не рекомендовано", "только в случаях", "при невозможности", "невозможности", "за исключением"];
    const total = cues.reduce((sum, cue) => (containsNormalizedPhrase(normalizedText, cue) ? sum + 0.8 : sum), 0);
    return Math.min(1.6, total);
}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Tells whether an answer text looks like a frequency or duration statement: it has a
 * digit or a spelled-out number from one to nine together with a time-unit or "раз"
 * marker.
 */
export declare function frequencyAnswer(answerText: any): boolean;
|
|
2
|
+
/**
 * Builds the phrases to search for in the PDF for a frequency answer: the answer text
 * itself plus "<number> <unit form>" variants for the time units the answer mentions.
 */
export declare function frequencySearchPhrases(answerText: any): Array<unknown>;
|
|
3
|
+
/**
 * Scores a frequency-style answer against recommendation passages of the PDF and
 * returns the best supporting evidence, if any.
 */
export declare function bestFrequencyRecommendationSupport(options: {
    mode: any;
    pages: any;
    topQuestionPages: any;
    question: any;
    answer: any;
    focusTokens: any;
}): any;
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
import { coverage, extractNumbers, normalizeForSearch, normalizeText, tokenize, uniqueTokens } from "../../normalize.js";
|
|
2
|
+
import { betterEvidence, containsNormalizedPhrase, numberCoverage, tokenHitCount } from "../text-utils.js";
|
|
3
|
+
/**
 * Tells whether an answer text looks like a frequency or duration statement.
 *
 * The normalized text must contain a digit or a spelled-out number from one to nine,
 * and also one of the time-unit stems (year, month, week, day, hour, stand-alone "ч")
 * or "раз".
 *
 * @param {string} answerText - Raw answer text.
 * @returns {boolean} True when both a quantity and a unit marker are present.
 */
export function frequencyAnswer(answerText) {
    const text = normalizeText(answerText);
    const hasQuantity = /\d|один|два|три|четыре|пять|шесть|семь|восемь|девять/u.test(text);
    if (!hasQuantity) {
        return false;
    }
    return /(год|месяц|недел|дн|сут|час|(?:^|\s)ч\.?(?:\s|$)|раз)/u.test(text);
}
|
|
7
|
+
/**
 * Builds the phrases to search for in the PDF for a frequency answer.
 *
 * The answer text itself comes first (when it has a digit or a unit marker). Then, for
 * every number in the answer and every time unit the answer mentions, the
 * "<number> <unit form>" variants are added. Finally the list is filtered: day, "сут"
 * and hour variants are dropped when the answer does not use that wording, and phrases
 * whose normalized form is shorter than 4 characters are dropped unless they have the
 * shape "<digits> ч".
 *
 * @param {string} answerText - Raw answer text.
 * @returns {string[]} Unique phrases in insertion order.
 */
export function frequencySearchPhrases(answerText) {
    const raw = normalizeText(answerText);
    const numbers = extractNumbers(answerText);
    // Unit groups in the order their variants are emitted for each number.
    const unitGroups = [
        [/год/u, ["год", "раз в год"]],
        [/месяц/u, ["месяц", "месяцев", "месяца"]],
        [/недел/u, ["неделю", "недели", "недель"]],
        [/(дн|сут)/u, ["день", "дня", "дней", "сутки", "суток"]],
        [/час|(?:^|\s)ч\.?(?:\s|$)/u, ["ч", "ч.", "час", "часа", "часов"]],
    ];
    const activeGroups = unitGroups.filter(([unitPattern]) => unitPattern.test(raw));
    const collected = new Set();
    if (answerText && /(год|месяц|недел|дн|сут|час|(?:^|\s)ч\.?(?:\s|$)|раз|\d)/u.test(raw)) {
        collected.add(answerText);
    }
    for (const number of numbers) {
        for (const [, forms] of activeGroups) {
            for (const form of forms) {
                collected.add(`${number} ${form}`);
            }
        }
    }
    const answerUsesSut = /\u0441\u0443\u0442/u.test(raw);
    const answerUsesDays = /\u0434\u043d/u.test(raw);
    const answerUsesHours = /\u0447\u0430\u0441|(?:^|\s)\u0447\.?(?:\s|$)/u.test(raw);
    const keep = (phrase) => {
        const phraseNorm = normalizeForSearch(phrase);
        const has = (stem) => containsNormalizedPhrase(phraseNorm, stem);
        if (!answerUsesSut && has("\u0441\u0443\u0442")) {
            return false;
        }
        if (!answerUsesDays && (has("\u0434\u0435\u043d\u044c") || has("\u0434\u043d\u044f") || has("\u0434\u043d\u0435\u0439"))) {
            return false;
        }
        if (!answerUsesHours && (has("\u0447\u0430\u0441") || has("\u0447."))) {
            return false;
        }
        // "<digits> ч" is shorter than the length floor but still a valid hour phrase.
        if (/^\d+\s+\u0447$/u.test(phraseNorm)) {
            return true;
        }
        return phraseNorm.length >= 4;
    };
    return [...collected].filter((phrase) => keep(phrase));
}
|
|
56
|
+
/**
 * Builds forward-looking line windows for a page: for every line, that line joined with
 * the next `radius` lines, with whitespace collapsed. Windows shorter than 16 or longer
 * than 900 characters are skipped.
 *
 * @param {{lines?: string[]}} page - Page object; a missing `lines` means no segments.
 * @param {number} [radius=2] - Number of following lines to append to each line.
 * @returns {Array<{text: string, normalized: string, tokens: Array}>} Window segments.
 */
function lineWindowSegments(page, radius = 2) {
    const lines = page.lines ?? [];
    const windows = [];
    for (const start of lines.keys()) {
        const stop = Math.min(lines.length, start + radius + 1);
        const text = lines.slice(start, stop).join(" ").replace(/\s+/g, " ").trim();
        const usable = text.length >= 16 && text.length <= 900;
        if (!usable) {
            continue;
        }
        windows.push({
            text,
            normalized: normalizeForSearch(text),
            tokens: tokenize(text),
        });
    }
    return windows;
}
|
|
71
|
+
/**
 * Returns the line windows of a page (each line plus the next three), computing them
 * once and storing them on the page as the non-enumerable property
 * `__lineWindowSegments`.
 *
 * @param {object} page - Page object; gets the hidden cache property on first call.
 * @returns {Array<{text: string, normalized: string, tokens: Array}>} Cached windows.
 */
function cachedLineWindowSegments(page) {
    if (page.__lineWindowSegments) {
        return page.__lineWindowSegments;
    }
    const windows = lineWindowSegments(page, 3);
    Object.defineProperty(page, "__lineWindowSegments", { value: windows, enumerable: false });
    return page.__lineWindowSegments;
}
|
|
80
|
+
/**
 * Tokens of question words that specificFrequencyFocusTokens removes from the focus
 * tokens. Each word is passed through uniqueTokens, so the set holds tokens in the
 * form that function produces.
 */
const FREQUENCY_GENERIC_FOCUS = new Set();
for (const word of [
    "динамическое",
    "динамического",
    "наблюдение",
    "наблюдения",
    "пациент",
    "пациентам",
    "хвгс",
    "хвгв",
    "цп",
    "цирроз",
    "печень",
    "печени",
    "рекомендуется",
    "рекомендовано",
    "выполнение",
    "выполнять",
    "проведение",
    "проводить",
    "контроль",
    "контроля",
    "эффективность",
    "эффективности",
    "исключение",
    "рецидив",
    "раз",
]) {
    for (const token of uniqueTokens(word)) {
        FREQUENCY_GENERIC_FOCUS.add(token);
    }
}
|
|
107
|
+
/**
 * Tokens of answer words (route, dosage form, unit and schedule wording) that
 * frequencyAnswerSubjectTokens excludes when picking the subject of an answer. Each
 * word is passed through uniqueTokens, so the set holds tokens in the form that
 * function produces.
 */
const FREQUENCY_ANSWER_GENERIC = new Set();
for (const word of [
    "внутривенное",
    "внутривенно",
    "внутримышечно",
    "местное",
    "перорально",
    "введение",
    "вводят",
    "назначение",
    "назначают",
    "применение",
    "применяют",
    "дозе",
    "доза",
    "средняя",
    "суточная",
    "содержанием",
    "составе",
    "область",
    "боли",
    "сутки",
    "суток",
    "дней",
    "дня",
    "недель",
    "недели",
    "течение",
    "каждые",
    "каждый",
    "курсом",
    "раствора",
    "раствор",
    "таблеток",
    "крема",
    "геля",
    "мг",
    "мл",
    "кг",
    "раз",
]) {
    for (const token of uniqueTokens(word)) {
        FREQUENCY_ANSWER_GENERIC.add(token);
    }
}
|
|
147
|
+
/**
 * Keeps only the focus tokens that are specific enough to anchor a frequency match:
 * at least 4 characters long, not starting with a digit and not listed in
 * FREQUENCY_GENERIC_FOCUS.
 *
 * @param {string[]} focusTokens - Focus tokens of the question.
 * @returns {string[]} The specific tokens, in their original order.
 */
function specificFrequencyFocusTokens(focusTokens) {
    const isSpecific = (token) => {
        if (token.length < 4) {
            return false;
        }
        if (/^\d/.test(token)) {
            return false;
        }
        return !FREQUENCY_GENERIC_FOCUS.has(token);
    };
    return focusTokens.filter((token) => isSpecific(token));
}
|
|
150
|
+
/**
 * Picks the subject of a numeric answer: the drug, active substance or medical product
 * it talks about. This guards the scorer against false matches where the PDF shows the
 * same duration or frequency next to a different drug.
 *
 * A token qualifies when it has at least 5 characters, does not start with a digit,
 * contains neither "/" nor "%" and is not in FREQUENCY_ANSWER_GENERIC. At most the
 * first five qualifying tokens are returned.
 *
 * @param {string} answerText - Raw answer text.
 * @returns {string[]} Up to five subject tokens.
 */
function frequencyAnswerSubjectTokens(answerText) {
    const subject = [];
    for (const token of uniqueTokens(answerText)) {
        if (token.length < 5 || /^\d/u.test(token)) {
            continue;
        }
        if (/[/%]/u.test(token) || FREQUENCY_ANSWER_GENERIC.has(token)) {
            continue;
        }
        subject.push(token);
    }
    return subject.slice(0, 5);
}
|
|
159
|
+
/**
 * Checks that a segment talks about the same subject as the answer.
 *
 * An answer with no subject tokens is compatible with any segment; otherwise at least
 * one subject token must be counted in the segment by tokenHitCount.
 *
 * @param {string} answerText - Raw answer text.
 * @param {Array} segmentTokens - Tokens of the candidate segment.
 * @returns {boolean} True when the segment may support the answer.
 */
function frequencySubjectCompatible(answerText, segmentTokens) {
    const subject = frequencyAnswerSubjectTokens(answerText);
    return subject.length === 0 || tokenHitCount(subject, segmentTokens) > 0;
}
|
|
165
|
+
/**
 * Scores a frequency-style answer against recommendation passages of the PDF.
 *
 * Runs only in "single" mode, for answers accepted by frequencyAnswer, and for
 * questions containing one of the stems "рекоменд", "наблюден", "контрол", "выполн" or
 * "провод". A line window supports the answer when it contains the stem "рекоменд",
 * contains one of the first ten frequency search phrases, shares a subject token with
 * the answer and hits enough specific focus tokens (two, or all of them if there are
 * fewer). When a ranked page set is given, only those pages are searched.
 *
 * @param {object} args
 * @param {string} args.mode - Question mode; anything but "single" yields null.
 * @param {Iterable} args.pages - Pages exposing `page` and optionally `lines`.
 * @param {Set} [args.topQuestionPages] - Page numbers ranked best for the question.
 * @param {string} args.question - Raw question text.
 * @param {{id: *, text: string}} args.answer - Answer option being scored.
 * @param {Array} args.focusTokens - Focus tokens of the question.
 * @returns {object|null} Evidence of kind "frequency_recommendation_line", or null.
 */
export function bestFrequencyRecommendationSupport({ mode, pages, topQuestionPages, question, answer, focusTokens }) {
    if (mode !== "single" || !frequencyAnswer(answer.text)) {
        return null;
    }
    const questionRaw = normalizeText(question);
    const asksForSchedule = /(рекоменд|наблюден|контрол|выполн|провод)/u.test(questionRaw);
    if (!asksForSchedule) {
        return null;
    }
    const phrases = frequencySearchPhrases(answer.text).slice(0, 10);
    if (phrases.length === 0) {
        return null;
    }
    const specificTokens = specificFrequencyFocusTokens(focusTokens);
    let winner = null;
    for (const page of pages) {
        const offRankedPages = topQuestionPages?.size && !topQuestionPages.has(page.page);
        if (offRankedPages) {
            continue;
        }
        for (const segment of cachedLineWindowSegments(page)) {
            if (!containsNormalizedPhrase(segment.normalized, "рекоменд")) {
                continue;
            }
            const quotesAnswer = phrases.some((phrase) => containsNormalizedPhrase(segment.normalized, phrase));
            if (!quotesAnswer) {
                continue;
            }
            if (!frequencySubjectCompatible(answer.text, segment.tokens)) {
                continue;
            }
            // Require two specific focus hits, or all of them when fewer exist.
            const tooFewSpecificHits = specificTokens.length &&
                tokenHitCount(specificTokens, segment.tokens) < Math.min(2, specificTokens.length);
            if (tooFewSpecificHits) {
                continue;
            }
            const focusShare = coverage(focusTokens, segment.tokens);
            const score = 11.8 + focusShare * 9.0 + numberCoverage(answer.text, segment.normalized) * 1.0;
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                score,
                kind: "frequency_recommendation_line",
            });
        }
    }
    return winner;
}
|