med-pdf-nmo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +298 -0
- package/README.ru.md +298 -0
- package/dist/bm25.d.ts +47 -0
- package/dist/bm25.js +86 -0
- package/dist/browser-shims/buffer.d.ts +30 -0
- package/dist/browser-shims/buffer.js +31 -0
- package/dist/browser-shims/crypto.d.ts +33 -0
- package/dist/browser-shims/crypto.js +45 -0
- package/dist/browser-shims/fs-promises.d.ts +13 -0
- package/dist/browser-shims/fs-promises.js +25 -0
- package/dist/browser-shims/fs.d.ts +14 -0
- package/dist/browser-shims/fs.js +24 -0
- package/dist/browser-shims/globals.d.ts +9 -0
- package/dist/browser-shims/globals.js +23 -0
- package/dist/browser-shims/path.d.ts +57 -0
- package/dist/browser-shims/path.js +65 -0
- package/dist/browser-shims/process.d.ts +22 -0
- package/dist/browser-shims/process.js +27 -0
- package/dist/browser.d.ts +9 -0
- package/dist/browser.js +12 -0
- package/dist/chunk.d.ts +15 -0
- package/dist/chunk.js +76 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +87 -0
- package/dist/index.d.ts +82 -0
- package/dist/index.js +51 -0
- package/dist/med-pdf-nmo.browser.js +40413 -0
- package/dist/med-pdf-nmo.browser.mjs +40395 -0
- package/dist/normalize.d.ts +73 -0
- package/dist/normalize.js +477 -0
- package/dist/pdf.d.ts +35 -0
- package/dist/pdf.js +396 -0
- package/dist/predictor/config.d.ts +28 -0
- package/dist/predictor/config.js +26 -0
- package/dist/predictor/constants.d.ts +3 -0
- package/dist/predictor/constants.js +59 -0
- package/dist/predictor/runtime.d.ts +15 -0
- package/dist/predictor/runtime.js +59 -0
- package/dist/predictor/scorers/biomedical-symbols.d.ts +36 -0
- package/dist/predictor/scorers/biomedical-symbols.js +347 -0
- package/dist/predictor/scorers/coordinate-table.d.ts +82 -0
- package/dist/predictor/scorers/coordinate-table.js +1210 -0
- package/dist/predictor/scorers/direction.d.ts +71 -0
- package/dist/predictor/scorers/direction.js +345 -0
- package/dist/predictor/scorers/drug-dose.d.ts +6 -0
- package/dist/predictor/scorers/drug-dose.js +221 -0
- package/dist/predictor/scorers/exact-answer.d.ts +10 -0
- package/dist/predictor/scorers/exact-answer.js +75 -0
- package/dist/predictor/scorers/fibrosis-stage.d.ts +6 -0
- package/dist/predictor/scorers/fibrosis-stage.js +103 -0
- package/dist/predictor/scorers/focused.d.ts +40 -0
- package/dist/predictor/scorers/focused.js +204 -0
- package/dist/predictor/scorers/frequency.d.ts +10 -0
- package/dist/predictor/scorers/frequency.js +203 -0
- package/dist/predictor/scorers/numeric.d.ts +77 -0
- package/dist/predictor/scorers/numeric.js +1161 -0
- package/dist/predictor/scorers/recommendation-item.d.ts +27 -0
- package/dist/predictor/scorers/recommendation-item.js +469 -0
- package/dist/predictor/scorers/search.d.ts +41 -0
- package/dist/predictor/scorers/search.js +515 -0
- package/dist/predictor/selection.d.ts +30 -0
- package/dist/predictor/selection.js +370 -0
- package/dist/predictor/text-utils.d.ts +49 -0
- package/dist/predictor/text-utils.js +497 -0
- package/dist/predictor/types.d.ts +23 -0
- package/dist/predictor/types.js +1 -0
- package/dist/predictor.d.ts +52 -0
- package/dist/predictor.js +3834 -0
- package/package.json +82 -0
|
@@ -0,0 +1,3834 @@
|
|
|
1
|
+
import { coverage, detectQuestionIntent, extractNumbers, jaccard, normalizeText, normalizeForSearch, phraseTokens, stemToken, tokenize, uniqueTokens, } from "./normalize.js";
|
|
2
|
+
import { FOCUS_STOPWORDS, LABEL_CUES } from "./predictor/constants.js";
|
|
3
|
+
import { DEFAULT_CONFIG } from "./predictor/config.js";
|
|
4
|
+
import { clearPdfRuntimeCache, getPdfRuntime, normalizeAnswers } from "./predictor/runtime.js";
|
|
5
|
+
import { bestDrugDoseSupport } from "./predictor/scorers/drug-dose.js";
|
|
6
|
+
import { bestExactAnswerSupport } from "./predictor/scorers/exact-answer.js";
|
|
7
|
+
import { bestFibrosisStageSupport } from "./predictor/scorers/fibrosis-stage.js";
|
|
8
|
+
import { bestFrequencyRecommendationSupport } from "./predictor/scorers/frequency.js";
|
|
9
|
+
import { bestGeneSentenceSupport, bestLatinFuzzySupport, geneMutationQuestion, latinAnswerTokens, sentenceSegments } from "./predictor/scorers/biomedical-symbols.js";
|
|
10
|
+
import { bestCoordinateMultiCellRowSupport, bestCoordinateTableMembershipSupport, bestCoordinateTableGroupSupport, bestCoordinateTableRowSupport, buildCoordinateMultiCellRowsByPage, buildCoordinateTableMembershipsByPage, buildCoordinateTableGroupsByPage, buildCoordinateTableRowsByPage, hasCoordinateTableCue, hasCoordinateTableGroupCue, } from "./predictor/scorers/coordinate-table.js";
|
|
11
|
+
import { bestFocusedSupport, bestLineTokenSupport, cachedLineTokenSegments, questionFocusTokens } from "./predictor/scorers/focused.js";
|
|
12
|
+
import { bestRecommendationItemSupport, explicitRecommendationTargetAdjustment } from "./predictor/scorers/recommendation-item.js";
|
|
13
|
+
import { contrastCueMismatchAdjustment, excludedConditionMismatchAdjustment, polarityAdjustment, temporalCueAdjustment, } from "./predictor/scorers/direction.js";
|
|
14
|
+
import { bestClozeGapSupport, bestConditionedNumberSupport, bestCountRelationSupport, bestExactHourAliasOptionSupport, bestExactNumericOptionSupport, bestNumericConditionSupport, conditionPairAdjustment, } from "./predictor/scorers/numeric.js";
|
|
15
|
+
import { bestAnchorSupport, bestPhraseSupport, bestPrecedingQuestionLabelSupport, bestRowLabelSupport, bestSectionSupport, findAnchorSegments, findRowSegments, findSectionSegments, } from "./predictor/scorers/search.js";
|
|
16
|
+
import { applyFrozenFeatureRanker, calibrateScores, round4, selectAnswers } from "./predictor/selection.js";
|
|
17
|
+
import { answerSearchPhrases, betterEvidence, cachedLineWindowSegments, containsNormalizedPhrase, escapeRegExp, evidenceFromChunk, evidenceSnippet, expandNumberToken, findPhraseOccurrences, focusedAnswerSearchPhrases, hasSearchBoundaries, numberCoverage, pageWindow, proximityBonus, rawSoftCoverage, rawTokens, softCoverage, strictSoftCoverage, tokenBoundaryIncludes, tokenizeNormalized, tokenHitCount, tokenProximity, tokenSequenceIncludes, } from "./predictor/text-utils.js";
|
|
18
|
+
// Generic wording of "has the following clinical signs" questions. Each entry is passed
// through uniqueTokens() and the resulting tokens are excluded when question focus tokens
// are derived (see clinicalFeatureFocusTokens).
const CLINICAL_FEATURE_GENERIC_TOKENS = new Set([
    "\u0438\u043c\u0435\u0435\u0442", // "имеет" (has)
    "\u0441\u043b\u0435\u0434\u0443\u044e\u0449\u0438\u0435", // "следующие" (the following)
    "\u043a\u043b\u0438\u043d\u0438\u0447\u0435\u0441\u043a\u0438\u0435", // "клинические" (clinical, plural)
    "\u043a\u043b\u0438\u043d\u0438\u0447\u0435\u0441\u043a\u0438", // "клинически" (clinically)
    "\u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438", // "признаки" (signs)
    "\u043f\u0440\u0438\u0437\u043d\u0430\u043a", // "признак" (sign)
    "\u0441\u0438\u043c\u043f\u0442\u043e\u043c\u044b", // "симптомы" (symptoms)
    "\u0441\u0438\u043c\u043f\u0442\u043e\u043c", // "симптом" (symptom)
    "\u043f\u0440\u043e\u044f\u0432\u043b\u0435\u043d\u0438\u044f", // "проявления" (manifestations)
    "\u043f\u0440\u043e\u044f\u0432\u043b\u0435\u043d\u0438\u0435", // "проявление" (manifestation)
    "\u0444\u043e\u0440\u043c\u0430", // "форма" (form)
    "\u0444\u043e\u0440\u043c\u044b", // "формы" (forms)
].flatMap((item) => uniqueTokens(item)));
|
|
32
|
+
// Filler words ignored when answer tokens are derived (see clinicalFeatureAnswerTokens):
// "обычно" (usually), "типично" (typically), "цвета" (of colour), "цвет" (colour).
const CLINICAL_FEATURE_ANSWER_GENERIC_TOKENS = new Set(["\u043e\u0431\u044b\u0447\u043d\u043e", "\u0442\u0438\u043f\u0438\u0447\u043d\u043e", "\u0446\u0432\u0435\u0442\u0430", "\u0446\u0432\u0435\u0442"].flatMap((item) => uniqueTokens(item)));
|
|
33
|
+
/**
 * Tells whether a question is a positive multi-select "has the following clinical signs" question.
 *
 * @param {{ mode: string, question: string, intent: { negative?: boolean, exception?: boolean } }} params
 * @returns {boolean} true only when mode is "multi", the intent is neither negative nor an
 *   exception, and the normalized question contains all four stems
 *   ("име", "следую", "клинич", "призн").
 */
function clinicalFeatureQuestion({ mode, question, intent }) {
    const eligible = mode === "multi" && !intent.negative && !intent.exception;
    if (!eligible) {
        return false;
    }
    const haystack = normalizeForSearch(question);
    const requiredStems = [
        "\u0438\u043c\u0435",
        "\u0441\u043b\u0435\u0434\u0443\u044e",
        "\u043a\u043b\u0438\u043d\u0438\u0447",
        "\u043f\u0440\u0438\u0437\u043d",
    ];
    // every() stops at the first missing stem, matching the original && chain.
    return requiredStems.every((stem) => containsNormalizedPhrase(haystack, stem));
}
|
|
42
|
+
/**
 * Extracts the question tokens that identify the subject of a clinical-feature question.
 *
 * @param {string} question
 * @returns {string[]} unique tokens of length >= 4 that are neither generic clinical-feature
 *   wording nor focus stopwords.
 */
function clinicalFeatureFocusTokens(question) {
    const isFocusToken = (token) => {
        if (token.length < 4) {
            return false;
        }
        return !(CLINICAL_FEATURE_GENERIC_TOKENS.has(token) || FOCUS_STOPWORDS.has(token));
    };
    return uniqueTokens(question).filter(isFocusToken);
}
|
|
45
|
+
/**
 * Extracts the content-bearing tokens of an answer option for clinical-feature matching.
 *
 * @param {string} answerText
 * @returns {string[]} unique tokens of length >= 4 that are neither generic answer filler
 *   nor focus stopwords.
 */
function clinicalFeatureAnswerTokens(answerText) {
    const isContentToken = (token) => {
        if (token.length < 4) {
            return false;
        }
        return !(CLINICAL_FEATURE_ANSWER_GENERIC_TOKENS.has(token) || FOCUS_STOPWORDS.has(token));
    };
    return uniqueTokens(answerText).filter(isContentToken);
}
|
|
48
|
+
/**
 * Checks whether an answer option is itself phrased negatively.
 *
 * The cues searched for in the normalized answer are "не ", "без ", "отсут" and "нетипич"
 * (not / without / absent / atypical).
 *
 * @param {string} answerText
 * @returns {boolean} true when any cue is found by containsNormalizedPhrase.
 */
function answerHasNegativeClinicalCue(answerText) {
    const haystack = normalizeForSearch(answerText);
    const negativeCues = [
        "\u043d\u0435 ",
        "\u0431\u0435\u0437 ",
        "\u043e\u0442\u0441\u0443\u0442",
        "\u043d\u0435\u0442\u0438\u043f\u0438\u0447",
    ];
    return negativeCues.some((cue) => containsNormalizedPhrase(haystack, cue));
}
|
|
55
|
+
/**
 * Checks whether an already-normalized sentence negates the feature it mentions.
 *
 * The cues are "не типич", "нетипич", "не характер", "не явля", "отсут" and "без "
 * (not typical / atypical / not characteristic / is not / absent / without).
 *
 * @param {string} normalizedSentence sentence text after normalizeForSearch
 * @returns {boolean} true when any cue is found by containsNormalizedPhrase.
 */
function clinicalFeatureSentenceNegative(normalizedSentence) {
    const negationCues = [
        "\u043d\u0435 \u0442\u0438\u043f\u0438\u0447",
        "\u043d\u0435\u0442\u0438\u043f\u0438\u0447",
        "\u043d\u0435 \u0445\u0430\u0440\u0430\u043a\u0442\u0435\u0440",
        "\u043d\u0435 \u044f\u0432\u043b\u044f",
        "\u043e\u0442\u0441\u0443\u0442",
        "\u0431\u0435\u0437 ",
    ];
    return negationCues.some((cue) => containsNormalizedPhrase(normalizedSentence, cue));
}
|
|
63
|
+
/**
 * Selects the sentences of a page that may describe clinical features of the question subject.
 *
 * A sentence is an "anchor" when it contains at least one focus token. The result keeps every
 * anchor plus the sentences that follow an anchor by at most four positions; sentences that
 * precede the first anchor are never kept.
 *
 * @param {string} pageText raw page text, split with sentenceSegments
 * @param {string[]} focusTokens tokens identifying the question subject
 * @returns {Array<{ sentence: string, normalized: string, tokens: string[], focusHits: number, distance: number }>}
 *   kept sentences in document order; `distance` is the number of sentences since the nearest
 *   preceding anchor (0 for an anchor itself). Empty when the page has no anchor.
 */
function clinicalFeatureCandidateSentences(pageText, focusTokens) {
    const kept = [];
    // Index of the most recent anchor seen so far. Sentences are visited in order, so the
    // nearest preceding anchor is always the last one recorded; this replaces a per-sentence
    // Math.min(...anchors) scan with a single pass and yields the same distances.
    let lastAnchorIndex = -1;
    for (const [index, sentence] of sentenceSegments(pageText).entries()) {
        const normalized = normalizeForSearch(sentence);
        const tokens = tokenizeNormalized(normalized);
        const focusHits = tokenHitCount(focusTokens, tokens);
        if (focusHits > 0) {
            lastAnchorIndex = index;
        }
        const distance = lastAnchorIndex >= 0 ? index - lastAnchorIndex : Infinity;
        if (focusHits > 0 || distance <= 4) {
            kept.push({ sentence, normalized, tokens, focusHits, distance });
        }
    }
    return kept;
}
|
|
79
|
+
/**
 * Scores one answer option of a clinical-feature question against sentences near the subject.
 *
 * For every candidate sentence (see clinicalFeatureCandidateSentences) that covers at least half
 * of the answer tokens, an evidence record is built. A sentence with a negation cue counts
 * against the answer unless the answer itself is negatively phrased.
 *
 * @param {{ pages: Array<{ page: number, text: string }>, mode: string, question: string,
 *   answer: { id: *, text: string }, intent: object }} context
 * @returns {{ support: object|null, adjustment: number, evidence: object|null }}
 *   - all-neutral ({ null, 0, null }) when the question is not a clinical-feature question,
 *     has no focus tokens, the answer has fewer than two content tokens, or nothing matched;
 *   - `{ support: null, adjustment: -8.4, evidence }` when the best negated evidence exists and
 *     scores no lower than the best support minus 0.8 (or there is no support);
 *   - `{ support, adjustment: 0, evidence: null }` otherwise.
 */
function clinicalFeatureAdjustment(context) {
    const { pages, mode, question, answer, intent } = context;
    const neutral = () => ({ support: null, adjustment: 0, evidence: null });
    if (!clinicalFeatureQuestion({ mode, question, intent })) {
        return neutral();
    }
    const focusTokens = clinicalFeatureFocusTokens(question);
    if (!focusTokens.length) {
        return neutral();
    }
    const answerTokens = clinicalFeatureAnswerTokens(answer.text);
    if (answerTokens.length < 2) {
        return neutral();
    }
    const answerNegative = answerHasNegativeClinicalCue(answer.text);
    let bestSupport = null;
    let bestNegated = null;
    for (const page of pages) {
        const candidates = clinicalFeatureCandidateSentences(page.text, focusTokens);
        for (const { sentence, normalized, tokens, focusHits, distance } of candidates) {
            const answerCoverage = Math.max(
                strictSoftCoverage(answerTokens, tokens),
                softCoverage(answerTokens, tokens),
                rawSoftCoverage(answerTokens, tokens),
            );
            if (answerCoverage < 0.5) {
                continue;
            }
            const negated = clinicalFeatureSentenceNegative(normalized);
            // A negated sentence only contradicts an answer that is not itself negative.
            const contradictsAnswer = negated && !answerNegative;
            const focusBonus = Math.min(2, focusHits) * 1.1;
            const distanceBonus = Math.max(0, 4 - distance) * 0.35;
            const score = 12.4 + answerCoverage * 5.2 + focusBonus + distanceBonus;
            const evidence = {
                answerId: answer.id,
                page: page.page,
                text: sentence,
                score,
                kind: contradictsAnswer ? "clinical_feature_negated" : "clinical_feature_segment",
            };
            if (contradictsAnswer) {
                bestNegated = betterEvidence(bestNegated, evidence);
            }
            else {
                bestSupport = betterEvidence(bestSupport, evidence);
            }
        }
    }
    const negationWins = bestNegated && (!bestSupport || bestNegated.score >= bestSupport.score - 0.8);
    if (negationWins) {
        return { support: null, adjustment: -8.4, evidence: bestNegated };
    }
    if (bestSupport) {
        return { support: bestSupport, adjustment: 0, evidence: null };
    }
    return neutral();
}
|
|
119
|
+
/**
 * Lists the label cues that occur in a question.
 *
 * @param {string} question
 * @returns {string[]} the entries of LABEL_CUES that are substrings of the normalized question,
 *   in LABEL_CUES order.
 */
function questionLabelCues(question) {
    const haystack = normalizeForSearch(question);
    const isMentioned = (cue) => haystack.includes(cue);
    return LABEL_CUES.filter(isMentioned);
}
|
|
123
|
+
/**
 * Finds the best evidence that a numeric answer appears shortly after a label named in the question.
 *
 * @param {{ pages: Array<{ page: number, text: string, normalized: string }>,
 *   topQuestionPages?: Set<number>, question: string, answer: { id: *, text: string } }} params
 *   When `topQuestionPages` is non-empty, only pages whose number it contains are searched.
 * @returns {object|null} the best "label_number_proximity" evidence chosen by betterEvidence, or
 *   null when the question mentions "мкб", names no label cue, the answer has no numbers, or no
 *   answer phrase occurs within 150 characters after a label occurrence.
 */
function bestLabelNumberSupport({ pages, topQuestionPages, question, answer }) {
    const labels = questionLabelCues(question);
    if (/мкб/u.test(normalizeText(question))) {
        return null;
    }
    if (!labels.length || !extractNumbers(answer.text).length) {
        return null;
    }
    const answerPhrases = answerSearchPhrases(answer.text).slice(0, 12);
    // Offsets of every label occurrence on a page, skipping occurrences whose surrounding
    // window (24 chars before, 48 after the start) contains "степени тяжести".
    const labelOffsetsOn = (pageNorm) => {
        const offsets = [];
        for (const label of labels) {
            // Always advance by at least one character so an empty label cannot stall the scan.
            const step = Math.max(1, label.length);
            let cursor = 0;
            while (cursor < pageNorm.length) {
                const found = pageNorm.indexOf(label, cursor);
                if (found < 0) {
                    break;
                }
                const surrounding = pageNorm.slice(Math.max(0, found - 24), found + 48);
                if (!containsNormalizedPhrase(surrounding, "степени тяжести")) {
                    offsets.push(found);
                }
                cursor = found + step;
            }
        }
        return offsets;
    };
    let best = null;
    for (const page of pages) {
        if (topQuestionPages?.size && !topQuestionPages.has(page.page)) {
            continue;
        }
        const pageNorm = page.normalized;
        const labelOffsets = labelOffsetsOn(pageNorm);
        if (!labelOffsets.length) {
            continue;
        }
        for (const phrase of answerPhrases) {
            for (const hit of findPhraseOccurrences(pageNorm, phrase, { textIsNormalized: true })) {
                // Smallest non-negative gap from a label to this hit; stays Infinity when
                // every label occurrence lies after the hit.
                let distance = Infinity;
                for (const offset of labelOffsets) {
                    const gap = hit - offset;
                    if (gap >= 0 && gap < distance) {
                        distance = gap;
                    }
                }
                if (distance > 150) {
                    continue;
                }
                const local = pageWindow(page, hit, 180);
                const score = 6.6 + proximityBonus(distance, 150) * 4.4 + numberCoverage(answer.text, local) * 1.4;
                best = betterEvidence(best, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, phrase, question),
                    score,
                    kind: "label_number_proximity",
                });
            }
        }
    }
    return best;
}
|
|
173
|
+
// Substrings that mark a question as asking for a classification code. Each is normalized with
// normalizeForSearch so it can be compared against a normalized question
// (see bestClassificationCodeSupport).
const CLASSIFICATION_CODE_QUESTION_CUES = [
    "\u043a\u043e\u0434", // "код" (code)
    "\u043a\u043e\u0434\u0438\u0440", // "кодир" (stem of "to code")
    "\u043c\u043a\u0431", // "мкб" (ICD, Russian abbreviation)
].map((item) => normalizeForSearch(item));
|
|
178
|
+
// Boilerplate words of classification-code questions. Their tokens are removed from the focus
// and question tokens before coverage is measured in bestClassificationCodeSupport.
const CLASSIFICATION_CODE_GENERIC_TOKENS = new Set([
    "\u043a\u043e\u0434", // "код" (code)
    "\u043a\u043e\u0434\u0438\u0440\u0443\u0435\u0442\u0441\u044f", // "кодируется" (is coded)
    "\u043a\u043e\u0434\u0438\u0440\u043e\u0432\u043a\u0430", // "кодировка" (coding)
    "\u043c\u043a\u0431", // "мкб" (ICD)
    "\u043c\u0435\u0436\u0434\u0443\u043d\u0430\u0440\u043e\u0434\u043d\u043e\u0439", // "международной" (international)
    "\u0441\u0442\u0430\u0442\u0438\u0441\u0442\u0438\u0447\u0435\u0441\u043a\u043e\u0439", // "статистической" (statistical)
    "\u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438", // "классификации" (classification)
    "\u0431\u043e\u043b\u0435\u0437\u043d\u0435\u0439", // "болезней" (of diseases)
    "\u043f\u0440\u043e\u0431\u043b\u0435\u043c", // "проблем" (of problems)
    "\u0441\u0432\u044f\u0437\u0430\u043d\u043d\u044b\u0445", // "связанных" (related)
    "\u0437\u0434\u043e\u0440\u043e\u0432\u044c\u0435\u043c", // "здоровьем" (with health)
    "\u043a\u0440\u0438\u0442\u0435\u0440\u0438\u0439", // "критерий" (criterion)
    "\u043a\u0440\u0438\u0442\u0435\u0440\u0438\u0438", // "критерии" (criteria)
    "\u0443\u043a\u0430\u0437\u044b\u0432\u0430\u0435\u0442", // "указывает" (indicates)
    "\u0441\u0432\u0438\u0434\u0435\u0442\u0435\u043b\u044c\u0441\u0442\u0432\u0443\u0435\u0442", // "свидетельствует" (is evidence of)
    "\u043e\u0442\u0440\u0430\u0436\u0430\u0435\u0442", // "отражает" (reflects)
    "\u043f\u0440\u0438\u0437\u043d\u0430\u043a\u0438", // "признаки" (signs)
    "\u0441\u0442\u0430\u0434\u0438\u044f", // "стадия" (stage)
].flatMap((item) => uniqueTokens(item)));
|
|
198
|
+
// Maps Cyrillic letters (upper and lower case) to a lower-case Latin letter. Used by
// canonicalClassificationCode to canonicalize the letter of a classification code.
// NOTE(review): the upper-case pairs look like visual homoglyphs (e.g. Cyrillic "Н" -> Latin "h");
// presumably this compensates for codes typed or OCR'd in the wrong alphabet — confirm.
const CYRILLIC_CODE_LETTERS = new Map([
    ["\u0410", "a"], // А
    ["\u0412", "b"], // В
    ["\u0421", "c"], // С
    ["\u0415", "e"], // Е
    ["\u041d", "h"], // Н
    ["\u041a", "k"], // К
    ["\u041c", "m"], // М
    ["\u041e", "o"], // О
    ["\u0420", "p"], // Р
    ["\u0422", "t"], // Т
    ["\u0425", "x"], // Х
    ["\u0430", "a"], // а
    ["\u0432", "b"], // в
    ["\u0441", "c"], // с
    ["\u0435", "e"], // е
    ["\u043d", "h"], // н
    ["\u043a", "k"], // к
    ["\u043c", "m"], // м
    ["\u043e", "o"], // о
    ["\u0440", "p"], // р
    ["\u0442", "t"], // т
    ["\u0445", "x"], // х
]);
|
|
222
|
+
/**
 * Extracts the first classification code from a text and returns it in canonical form.
 *
 * A code is one letter (Latin or Cyrillic), an optional dot and whitespace, 1-3 digits and an
 * optional ".d" / ".dd" sub-code, delimited by non-alphanumeric characters. Cyrillic letters
 * are translated through CYRILLIC_CODE_LETTERS; leading zeros are stripped from both numbers.
 *
 * @param {*} text value to scan; null/undefined are treated as an empty string
 * @returns {string|null} e.g. "j45" or "j45.1", or null when no code is found or the first
 *   match starts with a Cyrillic letter that has no Latin counterpart.
 */
function canonicalClassificationCode(text) {
    const source = String(text ?? "").normalize("NFKC");
    const found = /(?:^|[^\p{L}\p{N}])([A-Za-z\u0410-\u042f\u0430-\u044f])\s*\.?\s*(\d{1,3})(?:\s*[.]\s*(\d{1,2}))?(?![\p{L}\p{N}])/u.exec(source);
    if (found === null) {
        return null;
    }
    const [, rawLetter, rawMain, rawSub] = found;
    const letter = (CYRILLIC_CODE_LETTERS.get(rawLetter) ?? rawLetter).toLowerCase();
    if (!/[a-z]/.test(letter)) {
        return null;
    }
    // Drop leading zeros but keep the final digit ("00" -> "0").
    const stripLeadingZeros = (digits) => digits.replace(/^0+(?=\d)/, "");
    const main = stripLeadingZeros(rawMain);
    if (!rawSub) {
        return `${letter}${main}`;
    }
    return `${letter}${main}.${stripLeadingZeros(rawSub)}`;
}
|
|
234
|
+
function canonicalClassificationCodes(text) {
|
|
235
|
+
const normalized = String(text ?? "").normalize("NFKC");
|
|
236
|
+
const codes = [];
|
|
237
|
+
const pattern = /(?:^|[^\p{L}\p{N}])([A-Za-z\u0410-\u042f\u0430-\u044f])\s*\.?\s*(\d{1,3})(?:\s*[.]\s*(\d{1,2}))?(?![\p{L}\p{N}])/gu;
|
|
238
|
+
let match;
|
|
239
|
+
while ((match = pattern.exec(normalized))) {
|
|
240
|
+
const code = canonicalClassificationCode(match[0]);
|
|
241
|
+
if (code)
|
|
242
|
+
codes.push(code);
|
|
243
|
+
}
|
|
244
|
+
const ocrJPattern = /(?:^|[^\p{L}\p{N}])(?:[.\u041b\u043b])\s*\.?\s*(\d{2,3})(?:\s*[.]\s*(\d{1,2}))?(?![\p{L}\p{N}])/gu;
|
|
245
|
+
while ((match = ocrJPattern.exec(normalized))) {
|
|
246
|
+
const main = match[1].length === 3 && match[1].startsWith("1") ? match[1].slice(1) : match[1];
|
|
247
|
+
if (/^\d{2}$/.test(main)) {
|
|
248
|
+
const sub = match[2]?.replace(/^0+(?=\d)/, "");
|
|
249
|
+
codes.push(sub ? `j${main}.${sub}` : `j${main}`);
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
return codes;
|
|
253
|
+
}
|
|
254
|
+
/**
 * Builds the text windows of a page that are scanned for classification codes.
 *
 * For every line index, up to three windows are produced from the non-empty lines among the
 * line itself and the two that follow: the first line trimmed (kept when >= 4 chars), the
 * first two joined (>= 12 chars) and all three joined (>= 24 chars). Joined windows have
 * whitespace runs collapsed to single spaces.
 *
 * @param {{ lines?: string[] }} page
 * @returns {string[]} distinct windows in first-seen order.
 */
function classificationCodeWindows(page) {
    const lines = page.lines ?? [];
    const squash = (pieces) => pieces.join(" ").replace(/\s+/g, " ").trim();
    const seen = new Set();
    for (let start = 0; start < lines.length; start += 1) {
        // Falsy entries (e.g. empty lines) are dropped before the windows are formed.
        const parts = lines.slice(start, start + 3).filter(Boolean);
        const single = parts[0]?.trim();
        const pair = squash(parts.slice(0, 2));
        const triple = squash(parts);
        if (single && single.length >= 4) {
            seen.add(single);
        }
        if (pair.length >= 12) {
            seen.add(pair);
        }
        if (triple.length >= 24) {
            seen.add(triple);
        }
    }
    return [...seen];
}
|
|
271
|
+
/**
 * Finds the best evidence that the classification code given as an answer is listed next to
 * the subject of the question.
 *
 * @param {{ pages: Array<{ page: number, text: string, lines?: string[] }>,
 *   topQuestionPages?: Set<number>, question: string, answer: { id: *, text: string },
 *   questionTokens: string[], focusTokens: string[] }} params
 *   When `topQuestionPages` is non-empty, only pages whose number it contains are searched.
 * @returns {object|null} the best "classification_code_segment" evidence chosen by
 *   betterEvidence, or null when the answer contains no code, the question has no code cue,
 *   no informative focus/question tokens remain, or no window both contains the code and
 *   covers enough tokens (focus coverage >= 0.22 or question coverage >= 0.18).
 */
function bestClassificationCodeSupport({ pages, topQuestionPages, question, answer, questionTokens, focusTokens }) {
    const code = canonicalClassificationCode(answer.text);
    if (!code) {
        return null;
    }
    const normalizedQuestion = normalizeForSearch(question);
    const asksForCode = CLASSIFICATION_CODE_QUESTION_CUES.some((cue) => normalizedQuestion.includes(cue));
    if (!asksForCode) {
        return null;
    }
    // Tokens worth matching: at least 3 chars, not boilerplate, not starting with a digit.
    const isInformative = (token) => token.length >= 3 && !CLASSIFICATION_CODE_GENERIC_TOKENS.has(token) && !/^\d/.test(token);
    const filteredFocus = focusTokens.filter(isInformative).slice(0, 12);
    const filteredQuestion = questionTokens.filter(isInformative).slice(0, 18);
    if (!filteredFocus.length && !filteredQuestion.length) {
        return null;
    }
    let best = null;
    for (const page of pages) {
        if (topQuestionPages?.size && !topQuestionPages.has(page.page)) {
            continue;
        }
        for (const windowText of classificationCodeWindows(page)) {
            const codes = canonicalClassificationCodes(windowText);
            if (!codes.includes(code)) {
                continue;
            }
            const tokens = tokenize(windowText);
            const focusCoverage = filteredFocus.length ? coverage(filteredFocus, tokens) : 0;
            const questionCoverage = filteredQuestion.length ? coverage(filteredQuestion, tokens) : 0;
            if (focusCoverage < 0.22 && questionCoverage < 0.18) {
                continue;
            }
            // Every additional distinct code in the window costs 0.9; the code appearing
            // first in the window earns 1.2.
            const distinctCodes = new Set(codes).size;
            const codeCountPenalty = Math.max(0, distinctCodes - 1) * 0.9;
            const leadingCodeBonus = codes[0] === code ? 1.2 : 0;
            const score = 12.8 + focusCoverage * 11 + questionCoverage * 6 + leadingCodeBonus - codeCountPenalty;
            best = betterEvidence(best, {
                answerId: answer.id,
                page: page.page,
                text: evidenceSnippet(page.text, answer.text, question),
                score,
                kind: "classification_code_segment",
            });
        }
    }
    return best;
}
|
|
313
|
+
// Words ignored when an answer is compared with the rows of an ICD class section
// (see mkbClassAnswerTokens). Each entry is tokenized with uniqueTokens.
const MKB_CLASS_EXCLUSION_GENERIC_TOKENS = new Set([
    "\u0437\u043b\u043e\u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u044b\u0435", // "злокачественные" (malignant, plural)
    "\u0437\u043b\u043e\u043a\u0430\u0447\u0435\u0441\u0442\u0432\u0435\u043d\u043d\u0430\u044f", // "злокачественная" (malignant, feminine)
    "\u043d\u043e\u0432\u043e\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u044f", // "новообразования" (neoplasms)
    "\u043d\u043e\u0432\u043e\u043e\u0431\u0440\u0430\u0437\u043e\u0432\u0430\u043d\u0438\u0435", // "новообразование" (neoplasm)
    "\u043a\u043e\u0436\u0438", // "кожи" (of skin)
    "\u043a\u043e\u0436\u0430", // "кожа" (skin)
    "\u0434\u0440\u0443\u0433\u0438\u0435", // "другие" (other)
    "\u043a\u043b\u0430\u0441\u0441", // "класс" (class)
    "\u043c\u043a\u0431", // "мкб" (ICD)
].flatMap((item) => uniqueTokens(item)));
|
|
324
|
+
/**
 * Tells whether a multi-select question asks what is NOT part of a given ICD class.
 *
 * @param {string} mode question mode; anything other than "multi" yields false
 * @param {string} question
 * @returns {boolean} true when the normalized question mentions "мкб" and "класс", contains one
 *   of the exclusion stems ("не включ", "исключ", "не относ") and names a class-level code
 *   (see questionMkbClassCode).
 */
function mkbClassExclusionQuestion(mode, question) {
    if (mode !== "multi") {
        return false;
    }
    const haystack = normalizeForSearch(question);
    const mentions = (stem) => containsNormalizedPhrase(haystack, stem);
    const hasMkb = mentions("\u043c\u043a\u0431");
    const hasClass = mentions("\u043a\u043b\u0430\u0441\u0441");
    const exclusionStems = [
        "\u043d\u0435 \u0432\u043a\u043b\u044e\u0447",
        "\u0438\u0441\u043a\u043b\u044e\u0447",
        "\u043d\u0435 \u043e\u0442\u043d\u043e\u0441",
    ];
    const asksExcluded = exclusionStems.some((stem) => mentions(stem));
    return hasMkb && hasClass && asksExcluded && Boolean(questionMkbClassCode(question));
}
|
|
335
|
+
/**
 * Returns the first class-level code (one without a ".sub" part) mentioned in a question.
 *
 * @param {string} question
 * @returns {string|null} canonical code such as "c44", or null when none is present.
 */
function questionMkbClassCode(question) {
    for (const code of canonicalClassificationCodes(question)) {
        if (!code.includes(".")) {
            return code;
        }
    }
    return null;
}
|
|
338
|
+
/**
 * Tells whether a canonical code is the given class code or one of its sub-codes.
 *
 * @param {string} code canonical code, e.g. "c44.1"
 * @param {string} classCode canonical class code, e.g. "c44"
 * @returns {boolean} true for an exact match or when `code` starts with `classCode` + ".".
 */
function sameMkbClass(code, classCode) {
    if (code === classCode) {
        return true;
    }
    const subcodePrefix = `${classCode}.`;
    return code.startsWith(subcodePrefix);
}
|
|
341
|
+
/**
 * Tells whether a line mentions the given class code or any of its sub-codes.
 *
 * @param {string} line
 * @param {string} classCode canonical class code, e.g. "c44"
 * @returns {boolean}
 */
function lineHasMkbClass(line, classCode) {
    for (const code of canonicalClassificationCodes(line)) {
        if (sameMkbClass(code, classCode)) {
            return true;
        }
    }
    return false;
}
|
|
344
|
+
/**
 * Collects the lines of the document section that describes an ICD class.
 *
 * The section starts at the first line mentioning the class code on a candidate page (the top
 * question pages and their immediate neighbours, or every page when no top pages are given).
 * It continues over at most three consecutive pages and ends at 90 lines or just before the
 * first numbered heading ("1.2 ...") that does not mention the class.
 *
 * @param {Array<{ page: number, lines?: string[] }>} pages
 * @param {Set<number>|undefined} topQuestionPages
 * @param {string} classCode canonical class code, e.g. "c44"
 * @returns {string[]} section lines in document order; empty when the class is never mentioned.
 */
function mkbClassSectionLines(pages, topQuestionPages, classCode) {
    const nearTopPage = ({ page }) => topQuestionPages.has(page) || topQuestionPages.has(page - 1) || topQuestionPages.has(page + 1);
    const candidates = topQuestionPages?.size ? pages.filter(nearTopPage) : pages;
    let startPageIndex = -1;
    let startLineIndex = -1;
    for (const candidate of candidates) {
        const firstHit = (candidate.lines ?? []).findIndex((line) => lineHasMkbClass(line, classCode));
        if (firstHit < 0) {
            continue;
        }
        // Positions are resolved against the full page list because the section may run
        // onto pages that are not candidates.
        startPageIndex = pages.findIndex((page) => page.page === candidate.page);
        startLineIndex = firstHit;
        if (startPageIndex >= 0) {
            break;
        }
    }
    if (startPageIndex < 0) {
        return [];
    }
    const section = [];
    const pageLimit = Math.min(pages.length, startPageIndex + 3);
    for (let pageIndex = startPageIndex; pageIndex < pageLimit; pageIndex += 1) {
        const lines = pages[pageIndex].lines ?? [];
        const firstLine = pageIndex === startPageIndex ? startLineIndex : 0;
        for (const line of lines.slice(firstLine)) {
            const opensOtherSection = section.length > 0 &&
                /^\s*\d+(?:\.\d+)+\s+/u.test(normalizeText(line)) &&
                !lineHasMkbClass(line, classCode);
            if (opensOtherSection) {
                return section;
            }
            section.push(line);
            if (section.length >= 90) {
                return section;
            }
        }
    }
    return section;
}
|
|
377
|
+
/**
 * Assembles the rows of a class section that list sub-codes of the class.
 *
 * A row starts at each line containing a sub-code ("c44.x") and absorbs up to three following
 * lines. Absorption stops before a line that mentions the class again or contains "исключ",
 * and after a line whose normalized text ends with ".", ";" or ":".
 *
 * @param {string[]} sectionLines lines returned by mkbClassSectionLines
 * @param {string} classCode canonical class code, e.g. "c44"
 * @returns {string[]} one whitespace-collapsed string per row.
 */
function mkbClassIncludedRows(sectionLines, classCode) {
    const subcodePrefix = `${classCode}.`;
    const rows = [];
    for (let index = 0; index < sectionLines.length; index += 1) {
        const line = sectionLines[index];
        const startsRow = canonicalClassificationCodes(line).some((code) => code.startsWith(subcodePrefix));
        if (!startsRow) {
            continue;
        }
        const row = [line];
        const limit = Math.min(sectionLines.length, index + 4);
        for (const nextLine of sectionLines.slice(index + 1, limit)) {
            if (lineHasMkbClass(nextLine, classCode)) {
                break;
            }
            if (containsNormalizedPhrase(normalizeForSearch(nextLine), "\u0438\u0441\u043a\u043b\u044e\u0447")) {
                break;
            }
            row.push(nextLine);
            if (/[.;:]$/u.test(normalizeText(nextLine))) {
                break;
            }
        }
        rows.push(row.join(" ").replace(/\s+/g, " ").trim());
    }
    return rows;
}
|
|
400
|
+
/**
 * Extracts the distinguishing tokens of an answer for comparison with ICD class rows.
 *
 * @param {string} answerText
 * @returns {string[]} unique tokens of length >= 4 that are neither class-section boilerplate
 *   nor focus stopwords.
 */
function mkbClassAnswerTokens(answerText) {
    const isDistinguishing = (token) => {
        if (token.length < 4) {
            return false;
        }
        return !(MKB_CLASS_EXCLUSION_GENERIC_TOKENS.has(token) || FOCUS_STOPWORDS.has(token));
    };
    return uniqueTokens(answerText).filter(isDistinguishing);
}
|
|
403
|
+
/**
 * Tells whether an included row of an ICD class section matches an answer.
 *
 * @param {string} row row text from mkbClassIncludedRows
 * @param {string} answerText
 * @returns {boolean} false when the answer has no distinguishing tokens; otherwise true when
 *   the best of the strict, soft and raw coverages reaches the threshold (1 for a single
 *   token, 0.58 for several).
 */
function mkbClassIncludedRowHit(row, answerText) {
    const answerTokens = mkbClassAnswerTokens(answerText);
    if (!answerTokens.length) {
        return false;
    }
    const rowTokens = tokenize(row);
    const rawRowTokens = tokenize(row, { keepStopwords: true, stem: false });
    const bestCoverage = Math.max(
        strictSoftCoverage(answerTokens, rowTokens),
        softCoverage(answerTokens, rowTokens),
        rawSoftCoverage(answerTokens, rawRowTokens),
    );
    // A single-token answer must be fully covered; longer answers need 58 %.
    const required = answerTokens.length <= 1 ? 1 : 0.58;
    return bestCoverage >= required;
}
|
|
414
|
+
/**
 * Scores an answer of a "what is NOT included in ICD class X" question.
 *
 * An answer that matches one of the class's included rows is penalized; an answer that matches
 * none is supported by the section as a whole.
 *
 * @param {{ pages: Array<{ page: number, lines?: string[] }>, topQuestionPages?: Set<number>,
 *   mode: string, question: string, answer: { id: *, text: string } }} params
 * @returns {{ support: object|null, adjustment: number, evidence: object|null }}
 *   - all-neutral when the question is not a class-exclusion question, the section has fewer
 *     than 3 lines, or it has fewer than 2 included rows;
 *   - `{ support: null, adjustment: -9.4, evidence }` ("mkb_class_included_mismatch") when the
 *     answer matches an included row;
 *   - `{ support, adjustment: 0, evidence: null }` ("mkb_class_exclusion_absent") otherwise.
 *   The reported page is the first entry of `topQuestionPages`, or 0 when there is none.
 */
function bestMkbClassExclusionSupport({ pages, topQuestionPages, mode, question, answer }) {
    const neutral = () => ({ support: null, adjustment: 0, evidence: null });
    if (!mkbClassExclusionQuestion(mode, question)) {
        return neutral();
    }
    const classCode = questionMkbClassCode(question);
    if (!classCode) {
        return neutral();
    }
    const sectionLines = mkbClassSectionLines(pages, topQuestionPages, classCode);
    if (sectionLines.length < 3) {
        return neutral();
    }
    const includedRows = mkbClassIncludedRows(sectionLines, classCode);
    if (includedRows.length < 2) {
        return neutral();
    }
    const reportedPage = topQuestionPages?.values().next().value ?? 0;
    const includedRow = includedRows.find((row) => mkbClassIncludedRowHit(row, answer.text));
    if (includedRow) {
        const evidence = {
            answerId: answer.id,
            page: reportedPage,
            text: includedRow,
            score: 17.2,
            kind: "mkb_class_included_mismatch",
        };
        return { support: null, adjustment: -9.4, evidence };
    }
    const sectionText = sectionLines.join(" ").replace(/\s+/g, " ").trim();
    const support = {
        answerId: answer.id,
        page: reportedPage,
        text: sectionText.slice(0, 900),
        score: 15.8,
        kind: "mkb_class_exclusion_absent",
    };
    return { support, adjustment: 0, evidence: null };
}
|
|
453
|
+
/**
 * Reduces a short staging label (TNM category, Roman-numeral stage) to a canonical ASCII key.
 *
 * The value is NFKC-normalized and lower-cased; dots, whitespace, underscores and dashes are
 * removed; Cyrillic letters used in place of Latin ones are translated; the letter "o" (Latin
 * or Cyrillic) becomes the digit "0"; every remaining character outside [a-z0-9] is dropped.
 *
 * Cyrillic "н" is translated to "n" so that labels accepted by questionShortLabels with a
 * Cyrillic "Н" prefix (e.g. "Н1") canonicalize to "n1" instead of losing their prefix.
 *
 * @param {*} value label text; null/undefined are treated as an empty string
 * @returns {string} canonical key such as "t1a", "n0", "tis" or "iib" (possibly empty).
 */
function canonicalShortLabel(value) {
    // Character classes list lower-case forms only: the text is lower-cased first.
    return String(value ?? "")
        .normalize("NFKC")
        .toLowerCase()
        .replace(/[.\s_\-–—]+/g, "")
        .replace(/т/g, "t")
        .replace(/м/g, "m")
        .replace(/н/g, "n")
        .replace(/х/g, "x")
        .replace(/[оo]/g, "0")
        .replace(/а/g, "a")
        .replace(/в/g, "b")
        .replace(/[^a-z0-9]/g, "");
}
|
|
466
|
+
/**
 * Finds short staging labels (T/N/M categories and Roman-numeral stages I-X, each with an
 * optional a/b suffix) that appear as standalone tokens in a text.
 *
 * Matching is case-insensitive and accepts Cyrillic look-alikes; each match is canonicalized
 * with canonicalShortLabel and kept only when the key is 2-5 characters long.
 *
 * @param {*} question text to scan; null/undefined are treated as an empty string
 * @returns {string[]} distinct canonical labels, grouped by pattern (T, N, M, Roman) and in
 *   match order within each group.
 */
function questionShortLabels(question) {
    const text = String(question ?? "").normalize("NFKC");
    const patterns = [
        /(?<![\p{L}\p{N}])[TТ]\s*(?:is|[0-4xхoо])\s*[abаАвВ]?(?![\p{L}\p{N}])/giu,
        /(?<![\p{L}\p{N}])[NН]\s*(?:[0-3xхoо])\s*[abаАвВ]?(?![\p{L}\p{N}])/giu,
        /(?<![\p{L}\p{N}])[MМ]\s*(?:[0-1xхoо])\s*[abаАвВ]?(?![\p{L}\p{N}])/giu,
        /(?<![\p{L}\p{N}])(?:I|II|III|IV|V|VI|VII|VIII|IX|X)\s*[abаАвВ]?(?![\p{L}\p{N}])/giu,
    ];
    const labels = patterns
        .flatMap((pattern) => [...text.matchAll(pattern)])
        .map(([matched]) => canonicalShortLabel(matched))
        .filter((label) => label.length >= 2 && label.length <= 5);
    return [...new Set(labels)];
}
|
|
484
|
+
/**
 * Finds the short staging labels present on a line of page text.
 *
 * In addition to the standalone labels found by questionShortLabels, the whole line is
 * canonicalized with canonicalShortLabel and added when it is, in its entirety, a T/N/M
 * category or a Roman-numeral stage (each with an optional a/b suffix).
 *
 * @param {*} text line text; null/undefined are treated as an empty string
 * @returns {string[]} distinct canonical labels.
 */
function lineShortLabels(text) {
    const raw = String(text ?? "").normalize("NFKC");
    const labels = new Set(questionShortLabels(raw));
    const compact = canonicalShortLabel(raw);
    const wholeLineIsLabel = /^[tnm](?:is|[0-4x])(?:[ab])?$/.test(compact) ||
        /^(?:i|ii|iii|iv|v|vi|vii|viii|ix|x)(?:[ab])?$/.test(compact);
    if (wholeLineIsLabel) {
        labels.add(compact);
    }
    return [...labels];
}
|
|
494
|
+
function visualRowText(lines, index) {
|
|
495
|
+
const start = Math.max(0, index - 2);
|
|
496
|
+
const end = Math.min(lines.length, index + 4);
|
|
497
|
+
return lines
|
|
498
|
+
.slice(start, end)
|
|
499
|
+
.map((line) => line.text)
|
|
500
|
+
.join(" ")
|
|
501
|
+
.replace(/\s+/g, " ")
|
|
502
|
+
.trim();
|
|
503
|
+
}
|
|
504
|
+
// Generic question wording that is dropped from the focus tokens used for table-column
// matching (see visualTableColumnFocusTokens). Roughly: "signs, criteria, relate, following,
// indicator(s), table, according to, classification, value(s), characteristic, is/are,
// include(s)".
const VISUAL_TABLE_COLUMN_GENERIC_FOCUS = new Set(uniqueTokens([
    "признаки критерии относятся следующие показатель показатели таблица согласно классификация",
    "значение значения характерны является являются включает включают",
].join(" ")));
// Unit and comparison words: "mm, mg, ml, g, l, h, min, day(s), times, more, less, above,
// below, or, norm".
// NOTE(review): not referenced in the visible part of the file — usage to be confirmed.
const VISUAL_TABLE_METRIC_STOP = new Set(uniqueTokens("мм мг мл г л ч мин сутки день дней раз более менее выше ниже или норма"));
// Tokens that suggest the question refers to a table column: severity words ("mild",
// "moderate", "moderately severe", "severe") and "degree, stage, class, category, group,
// type, form" (see hasVisualTableColumnCue).
const VISUAL_TABLE_COLUMN_CUE_TOKENS = new Set(uniqueTokens("легкая легкой средняя средней среднетяжелая среднетяжелой тяжелая тяжелой степень степени стадия стадии класс класса категория категории группа тип форма"));
|
|
510
|
+
/**
 * Reports whether any focus token or question token is a column cue token.
 *
 * @param {string} question - Question text, tokenized with `uniqueTokens`.
 * @param {string[]|null|undefined} focusTokens - Optional precomputed focus tokens.
 * @returns {boolean} True when at least one token is in VISUAL_TABLE_COLUMN_CUE_TOKENS.
 */
function hasVisualTableColumnCue(question, focusTokens) {
    const candidates = new Set(focusTokens ?? []);
    for (const token of uniqueTokens(question)) {
        candidates.add(token);
    }
    for (const token of candidates) {
        if (VISUAL_TABLE_COLUMN_CUE_TOKENS.has(token)) {
            return true;
        }
    }
    return false;
}
|
|
514
|
+
/**
 * Picks up to ten distinct tokens used to locate a table column.
 *
 * Focus tokens come first, then the question's `uniqueTokens`. A token is
 * dropped when it is falsy, shorter than four characters, or listed in
 * FOCUS_STOPWORDS or VISUAL_TABLE_COLUMN_GENERIC_FOCUS.
 *
 * @param {string[]|null|undefined} focusTokens - Optional precomputed focus tokens.
 * @param {string} question - Question text.
 * @returns {string[]} At most ten tokens in first-seen order.
 */
function visualTableColumnFocusTokens(focusTokens, question) {
    const candidates = [...(focusTokens ?? []), ...uniqueTokens(question)];
    const kept = new Set();
    for (const token of candidates) {
        if (!token || token.length < 4) {
            continue;
        }
        const isGeneric = FOCUS_STOPWORDS.has(token) || VISUAL_TABLE_COLUMN_GENERIC_FOCUS.has(token);
        if (isGeneric) {
            continue;
        }
        // A Set keeps first-seen order and ignores repeats.
        kept.add(token);
    }
    return [...kept].slice(0, 10);
}
|
|
526
|
+
/**
 * Measures the horizontal extent covered by a line's items.
 *
 * @param {{items?: Array<{x?: number}>}|null|undefined} line - Line record.
 * @returns {number} Largest minus smallest item `x` (missing `x` counts as 0),
 *   or 0 when the line has fewer than two items.
 */
function lineXSpread(line) {
    const items = line?.items ?? [];
    const positions = items.map(({ x }) => x ?? 0);
    if (positions.length >= 2) {
        return Math.max(...positions) - Math.min(...positions);
    }
    return 0;
}
|
|
532
|
+
/**
 * Finds items on a page that look like the header of the column the question asks about.
 *
 * A line is considered only when it has at least three items, an x-spread of
 * at least 140, text no longer than 220 characters, and no "рекоменду" /
 * "pekom" marker in its normalized text. Within such a line an item of at
 * most 90 characters becomes a target when it hits enough focus tokens:
 * two when two or more focus tokens exist, otherwise one.
 *
 * @param {{page: *, lineItems?: Array}|null|undefined} page - Page record.
 * @param {string} question - Question text.
 * @param {string[]|null|undefined} focusTokens - Optional precomputed focus tokens.
 * @returns {Array<{x: number, text: *, page: *}>} One target per matching item:
 *   the item's x, the whole line's text and the page number.
 */
function visualTableColumnTargets(page, question, focusTokens) {
    const focus = visualTableColumnFocusTokens(focusTokens, question);
    if (!focus.length) {
        return [];
    }
    const requiredHits = focus.length >= 2 ? 2 : 1;
    const found = [];
    for (const line of page?.lineItems ?? []) {
        const tooFewItems = (line.items?.length ?? 0) < 3;
        if (tooFewItems || lineXSpread(line) < 140) {
            continue;
        }
        if (String(line.text ?? "").length > 220) {
            continue;
        }
        const normalizedLine = normalizeForSearch(line.text);
        const isRecommendationLine = containsNormalizedPhrase(normalizedLine, "рекоменду") || /pekom/iu.test(normalizedLine);
        if (isRecommendationLine) {
            continue;
        }
        for (const item of line.items ?? []) {
            if (String(item.text ?? "").length > 90) {
                continue;
            }
            const hitCount = tokenHitCount(focus, uniqueTokens(item.text));
            if (hitCount < requiredHits) {
                continue;
            }
            found.push({
                x: item.x ?? 0,
                text: line.text,
                page: page.page,
            });
        }
    }
    return found;
}
|
|
563
|
+
/**
 * Gathers column targets from the given page and from the page numbered one less.
 *
 * @param {Iterable<{page: number}>} pages - All page records.
 * @param {{page: number}} page - Page the targets are wanted for.
 * @param {string} question - Question text.
 * @param {string[]|null|undefined} focusTokens - Optional precomputed focus tokens.
 * @returns {Array} Targets from `visualTableColumnTargets`, concatenated in page order.
 */
function visualTableTargetsNearPage(pages, page, question, focusTokens) {
    const collected = [];
    for (const candidate of pages) {
        const isSameOrPrevious = candidate.page === page.page || candidate.page === page.page - 1;
        if (isSameOrPrevious) {
            collected.push(...visualTableColumnTargets(candidate, question, focusTokens));
        }
    }
    return collected;
}
|
|
572
|
+
/**
 * Builds a page-number -> column-targets map for pages near the top question pages.
 *
 * A page qualifies when `topQuestionPages` is missing or empty, or when it
 * contains the page's number or either adjacent number. Pages that yield no
 * targets are left out of the map.
 *
 * @param {Iterable<{page: number}>} pages - All page records.
 * @param {string} question - Question text.
 * @param {string[]|null|undefined} focusTokens - Optional precomputed focus tokens.
 * @param {Set<number>|null|undefined} topQuestionPages - Page numbers to stay close to.
 * @returns {Map<number, Array>} Targets keyed by page number.
 */
function buildVisualTableColumnTargetsByPage(pages, question, focusTokens, topQuestionPages) {
    const targetsByPage = new Map();
    for (const page of pages) {
        const isFarFromTopPages = Boolean(topQuestionPages?.size) &&
            !topQuestionPages.has(page.page) &&
            !topQuestionPages.has(page.page - 1) &&
            !topQuestionPages.has(page.page + 1);
        if (isFarFromTopPages) {
            continue;
        }
        const pageTargets = visualTableTargetsNearPage(pages, page, question, focusTokens);
        if (pageTargets.length) {
            targetsByPage.set(page.page, pageTargets);
        }
    }
    return targetsByPage;
}
|
|
584
|
+
/**
 * Extracts the answer tokens that can name a metric.
 *
 * Tokens are dropped when they are falsy, shorter than three characters,
 * start with a digit, or appear in VISUAL_TABLE_METRIC_STOP or FOCUS_STOPWORDS.
 *
 * @param {string} answerText - Answer text, tokenized with `uniqueTokens`.
 * @returns {string[]} Remaining tokens in their original order.
 */
function answerMetricTokens(answerText) {
    const isMetricToken = (token) => {
        const tooShort = !token || token.length < 3;
        if (tooShort || /^\d/u.test(token)) {
            return false;
        }
        return !(VISUAL_TABLE_METRIC_STOP.has(token) || FOCUS_STOPWORDS.has(token));
    };
    return uniqueTokens(answerText).filter(isMetricToken);
}
|
|
595
|
+
/**
 * Detects which comparison directions appear in a text.
 *
 * `<` and `≤` both map to "<"; `>` and `≥` both map to ">".
 *
 * @param {*} text - Text to inspect; `null`/`undefined` is treated as "".
 * @returns {Set<string>} Set holding "<" and/or ">" ("<" is inserted first).
 */
function comparatorSigns(text) {
    const source = String(text ?? "");
    const detectors = [
        ["<", /[<≤]/u],
        [">", /[>≥]/u],
    ];
    const present = detectors
        .filter(([, pattern]) => pattern.test(source))
        .map(([sign]) => sign);
    return new Set(present);
}
|
|
604
|
+
/**
 * Decides whether a table cell's text carries the numeric value of an answer.
 *
 * The cell must cover at least some of the answer's numbers. When the answer
 * expands to more than one distinct number, coverage must be at least 0.99.
 * If the answer contains a comparison sign, the cell must contain a sign of
 * the same direction.
 *
 * @param {string} itemText - Raw cell text.
 * @param {string} answerText - Raw answer text.
 * @returns {boolean} True when the cell matches the answer's value.
 */
function visualValueMatchesAnswer(itemText, answerText) {
    const coveredShare = numberCoverage(answerText, normalizeForSearch(itemText));
    if (coveredShare <= 0) {
        return false;
    }
    const distinctAnswerNumbers = new Set(extractNumbers(answerText).flatMap(expandNumberToken));
    if (distinctAnswerNumbers.size > 1 && coveredShare < 0.99) {
        return false;
    }
    const wantedSigns = comparatorSigns(answerText);
    if (!wantedSigns.size) {
        return true;
    }
    const cellSigns = comparatorSigns(itemText);
    for (const sign of wantedSigns) {
        if (cellSigns.has(sign)) {
            return true;
        }
    }
    return false;
}
|
|
617
|
+
/**
 * Builds the text of the cell that sits under a column position.
 *
 * @param {{items?: Array<{x?: number, text: string}>}} line - Line record.
 * @param {number} targetX - Column x position.
 * @returns {string} Text of all items whose x is within 52 of `targetX`
 *   (missing x counts as 0), joined with spaces, whitespace-collapsed and trimmed.
 */
function targetCellText(line, targetX) {
    const isInColumn = ({ x }) => Math.abs((x ?? 0) - targetX) <= 52;
    const cellItems = (line.items ?? []).filter(isInColumn);
    const joined = cellItems.map(({ text }) => text).join(" ");
    return joined.replace(/\s+/g, " ").trim();
}
|
|
625
|
+
/**
 * Collects the text found to the left of a column, on and around a line.
 *
 * Looks at the line at `index` and up to two lines on either side. A
 * neighbour is used only when its `y` is within 28 of the anchor line's `y`
 * (a missing neighbour `y` falls back to the anchor's). From each used line,
 * items whose x is more than 45 left of `targetX` contribute their text.
 *
 * @param {Array<{y?: number, items?: Array<{x?: number, text: string}>}>} lines - Page lines.
 * @param {number} index - Position of the anchor line.
 * @param {number} targetX - Column x position.
 * @returns {string} Collected text joined with spaces, whitespace-collapsed and trimmed.
 */
function nearbyMetricText(lines, index, targetX) {
    const anchorY = lines[index]?.y ?? 0;
    const collected = [];
    for (const offset of [-2, -1, 0, 1, 2]) {
        const neighbour = lines[index + offset];
        if (!neighbour) {
            continue;
        }
        const verticalGap = Math.abs((neighbour.y ?? anchorY) - anchorY);
        if (verticalGap > 28) {
            continue;
        }
        for (const item of neighbour.items ?? []) {
            const isLeftOfColumn = (item.x ?? 0) < targetX - 45;
            if (isLeftOfColumn) {
                collected.push(item.text);
            }
        }
    }
    return collected.join(" ").replace(/\s+/g, " ").trim();
}
|
|
641
|
+
/**
 * Searches table-like page lines for a cell that supports a numeric answer
 * in the column the question points at.
 *
 * Applies only in "multi" mode, when the answer contains a number, targets
 * have been precomputed and the answer yields at least one metric token.
 * For each page near the top question pages, each item within 48 of a target
 * column x is a candidate: its cell text must match the answer value and the
 * text to the left of the column must mention the answer's metric (at least
 * one token hit, or coverage of at least 0.34). Candidates are compared with
 * `betterEvidence`.
 *
 * `question` and `focusTokens` are accepted but not read by this function.
 *
 * @param {object} args
 * @param {string} args.mode - Scoring mode.
 * @param {Iterable<object>} args.pages - Page records with `page` and `lineItems`.
 * @param {Set<number>|null|undefined} args.topQuestionPages - Page numbers to stay close to.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @param {Map<number, Array>|null|undefined} args.visualTableColumnTargetsByPage - Targets keyed by page number.
 * @returns {object|null} Best evidence of kind "visual_table_column", or null.
 */
function bestVisualTableColumnSupport({ mode, pages, topQuestionPages, question, answer, focusTokens, visualTableColumnTargetsByPage }) {
    if (mode !== "multi") {
        return null;
    }
    if (!extractNumbers(answer.text).length) {
        return null;
    }
    if (!visualTableColumnTargetsByPage) {
        return null;
    }
    const wantedMetricTokens = answerMetricTokens(answer.text);
    if (!wantedMetricTokens.length) {
        return null;
    }
    let winner = null;
    for (const page of pages) {
        const isFarFromTopPages = Boolean(topQuestionPages?.size) &&
            !topQuestionPages.has(page.page) &&
            !topQuestionPages.has(page.page - 1) &&
            !topQuestionPages.has(page.page + 1);
        if (isFarFromTopPages) {
            continue;
        }
        const columnTargets = visualTableColumnTargetsByPage.get(page.page) ?? [];
        if (!columnTargets.length) {
            continue;
        }
        const lines = page.lineItems ?? [];
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
            const line = lines[lineIndex];
            for (const target of columnTargets) {
                for (const item of line.items ?? []) {
                    const offsetFromColumn = Math.abs((item.x ?? 0) - target.x);
                    if (offsetFromColumn > 48) {
                        continue;
                    }
                    const cellText = targetCellText(line, target.x) || item.text;
                    if (!visualValueMatchesAnswer(cellText, answer.text)) {
                        continue;
                    }
                    const metricText = nearbyMetricText(lines, lineIndex, target.x);
                    const metricDocTokens = uniqueTokens(metricText);
                    const metricHits = tokenHitCount(wantedMetricTokens, metricDocTokens);
                    const metricCoverage = coverage(wantedMetricTokens, metricDocTokens);
                    const lacksMetricSupport = metricHits < 1 && metricCoverage < 0.34;
                    if (lacksMetricSupport) {
                        continue;
                    }
                    const score = 15.2 +
                        proximityBonus(offsetFromColumn, 48) * 3.0 +
                        Math.min(3, metricHits) * 1.8 +
                        Math.min(0.8, metricCoverage) * 4.2 +
                        numberCoverage(answer.text, normalizeForSearch(cellText)) * 2.2;
                    const evidenceText = `${target.text} ${metricText} ${cellText}`.replace(/\s+/g, " ").trim();
                    winner = betterEvidence(winner, {
                        answerId: answer.id,
                        page: page.page,
                        text: evidenceText,
                        score,
                        kind: "visual_table_column",
                    });
                }
            }
        }
    }
    return winner;
}
|
|
692
|
+
/**
 * Returns the x position of a line's first item.
 *
 * @param {{items?: Array<{x?: number}>}|null|undefined} line - Line record.
 * @returns {number} The first item's `x`, or 0 when the line, its items or the x is missing.
 */
function lineStartX(line) {
    const firstItem = line?.items?.[0];
    return firstItem?.x ?? 0;
}
|
|
695
|
+
/**
 * Finds short labels at the start of a line.
 *
 * The probe text is the first three items joined with spaces; when that is
 * empty, the first 24 characters of the line's text are used instead.
 *
 * @param {{items?: Array<{text: string}>, text?: string}|null|undefined} line - Line record.
 * @returns {string[]} Labels reported by `lineShortLabels` for the probe text.
 */
function linePrefixShortLabels(line) {
    const leadingItems = (line?.items ?? []).slice(0, 3);
    const joinedItems = leadingItems.map((item) => item.text).join(" ");
    const probe = joinedItems || String(line?.text ?? "").slice(0, 24);
    return lineShortLabels(probe);
}
|
|
702
|
+
/**
 * Tells whether a line's first item is only the stem of a short label:
 * a lone t/n/m, or a Roman numeral i-x, after `canonicalShortLabel`.
 *
 * @param {{items?: Array<{text?: string}>}|null|undefined} line - Line record.
 * @returns {boolean} True when the first item canonicalizes to a bare stem.
 */
function lineStartsWithShortLabelStem(line) {
    const stem = canonicalShortLabel(line?.items?.[0]?.text ?? "");
    if (/^[tnm]$/.test(stem)) {
        return true;
    }
    return /^(?:i|ii|iii|iv|v|vi|vii|viii|ix|x)$/.test(stem);
}
|
|
706
|
+
/**
 * Returns the canonical form of a line's leading text when it can serve as
 * the second half of a label that was split across two lines.
 *
 * The first item's text is used, falling back to the line's text. Accepted
 * shapes are `is`, `x`, a digit 0-4 with optional a/b, or a Roman numeral
 * i-x with optional a/b.
 *
 * @param {{items?: Array<{text?: string}>, text?: string}|null|undefined} line - Line record.
 * @returns {string|null} The canonical suffix, or null when the shape does not fit.
 */
function splitShortLabelSuffix(line) {
    const compact = canonicalShortLabel(line?.items?.[0]?.text ?? line?.text ?? "");
    const acceptedShapes = [
        /^(?:is|[0-4x]|[0-4][ab]?)$/,
        /^(?:i|ii|iii|iv|v|vi|vii|viii|ix|x)[ab]?$/,
    ];
    const fits = acceptedShapes.some((shape) => shape.test(compact));
    return fits ? compact : null;
}
|
|
714
|
+
/**
 * Collects the short labels that start the line at `index`, including a
 * label split across this line and the next.
 *
 * The split case applies when this line starts with a bare stem, a next line
 * exists, that line yields a suffix, and both lines start within 18 of each
 * other horizontally.
 *
 * @param {Array<object>} lines - Page lines.
 * @param {number} index - Position of the line to inspect.
 * @returns {string[]} Unique labels in insertion order.
 */
function lineExactShortLabels(lines, index) {
    const current = lines[index];
    const collected = new Set(linePrefixShortLabels(current));
    const canContinueOnNextLine = lineStartsWithShortLabelStem(current) && index + 1 < lines.length;
    if (!canContinueOnNextLine) {
        return [...collected];
    }
    const next = lines[index + 1];
    const suffix = splitShortLabelSuffix(next);
    if (suffix && Math.abs(lineStartX(next) - lineStartX(current)) <= 18) {
        const stem = current?.items?.[0]?.text ?? "";
        lineShortLabels(`${stem} ${suffix}`).forEach((label) => collected.add(label));
    }
    return [...collected];
}
|
|
726
|
+
/**
 * Assembles the text of the table row that begins with a label at `index`.
 *
 * Starting from the label line, following lines are appended (up to eight
 * collected pieces) until one of these stops the row:
 *  - the vertical gap to the previously accepted line exceeds 32;
 *  - the line itself starts a label (or label stem) within 18 of the row's start x;
 *  - the line starts left of start x + 18 after more than one piece was collected.
 * Blank lines are skipped. A line consisting of one or two digits placed more
 * than 120 to the right of the row start is skipped too, though it still
 * updates the reference y.
 *
 * @param {Array<object>} lines - Page lines.
 * @param {number} index - Position of the label line.
 * @returns {string} Row text, whitespace-collapsed and trimmed; "" when the
 *   label line is missing or has no text.
 */
function visualExactLabelRowText(lines, index) {
    const labelLine = lines[index];
    if (!labelLine?.text) {
        return "";
    }
    const rowStartX = lineStartX(labelLine);
    const pieces = [];
    let lastY = labelLine.y ?? 0;
    for (let cursor = index; cursor < lines.length && pieces.length < 8; cursor += 1) {
        const line = lines[cursor];
        const lineText = String(line?.text ?? "").replace(/\s+/g, " ").trim();
        if (!lineText) {
            continue;
        }
        if (cursor > index) {
            const verticalGap = Math.abs((line?.y ?? lastY) - lastY);
            if (verticalGap > 32) {
                break;
            }
            const looksLikeLabel = linePrefixShortLabels(line).length > 0 || lineStartsWithShortLabelStem(line);
            if (looksLikeLabel && Math.abs(lineStartX(line) - rowStartX) <= 18) {
                break;
            }
            if (lineStartX(line) < rowStartX + 18 && pieces.length > 1) {
                break;
            }
        }
        lastY = line?.y ?? lastY;
        const isStrayNumber = /^\d{1,2}$/.test(lineText) && lineStartX(line) > rowStartX + 120;
        if (isStrayNumber) {
            continue;
        }
        pieces.push(lineText);
    }
    return pieces.join(" ").replace(/\s+/g, " ").trim();
}
|
|
755
|
+
/**
 * Looks for a table row that starts with a label named in the question and
 * whose text supports the answer.
 *
 * Rows are found with `lineExactShortLabels` and assembled with
 * `visualExactLabelRowText`. Answer support is the maximum of strict soft
 * token coverage, number coverage and a phrase hit. The required minimum is
 * 0.48 for answers containing numbers, 0.84 for answers of at most two
 * tokens, and 0.4 otherwise.
 *
 * @param {object} args
 * @param {Iterable<object>} args.pages - Page records with `page` and `lineItems`.
 * @param {Set<number>|null|undefined} args.topQuestionPages - Page numbers to stay close to.
 * @param {string} args.question - Question text.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @param {string[]|null|undefined} args.focusTokens - Focus tokens; question tokens are used when empty.
 * @returns {object|null} Best evidence of kind "short_label_exact_row", or null.
 */
function bestExactShortLabelRowSupport({ pages, topQuestionPages, question, answer, answerTokens, focusTokens }) {
    const wantedLabels = questionShortLabels(question);
    if (!wantedLabels.length || !answerTokens.length) {
        return null;
    }
    const answerPhrases = answerSearchPhrases(answer.text);
    const focusSource = focusTokens?.length ? focusTokens : uniqueTokens(question);
    const usefulFocusTokens = focusSource.filter((token) => token.length > 2);
    let minSupport;
    if (extractNumbers(answer.text).length > 0) {
        minSupport = 0.48;
    }
    else if (answerTokens.length <= 2) {
        minSupport = 0.84;
    }
    else {
        minSupport = 0.4;
    }
    let winner = null;
    for (const page of pages) {
        const isFarFromTopPages = Boolean(topQuestionPages?.size) &&
            !topQuestionPages.has(page.page) &&
            !topQuestionPages.has(page.page - 1) &&
            !topQuestionPages.has(page.page + 1);
        if (isFarFromTopPages) {
            continue;
        }
        const lines = page.lineItems ?? [];
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
            const localLabels = lineExactShortLabels(lines, lineIndex);
            const mentionsWantedLabel = wantedLabels.some((label) => localLabels.includes(label));
            if (!mentionsWantedLabel) {
                continue;
            }
            const text = visualExactLabelRowText(lines, lineIndex);
            const normalized = normalizeForSearch(text);
            const rowTokens = tokenizeNormalized(normalized);
            const answerCoverage = strictSoftCoverage(answerTokens, rowTokens);
            const numericCoverage = numberCoverage(answer.text, normalized);
            const phraseHit = answerPhrases.some((phrase) => containsNormalizedPhrase(normalized, phrase));
            const answerSupport = Math.max(answerCoverage, numericCoverage, phraseHit ? 1 : 0);
            if (answerSupport < minSupport) {
                continue;
            }
            const focusCoverage = usefulFocusTokens.length ? coverage(usefulFocusTokens, rowTokens) : 0;
            const score = 15.8 +
                answerSupport * 8.6 +
                Math.min(0.42, focusCoverage) * 3.1 +
                numericCoverage * 1.6 +
                (phraseHit ? 1.8 : 0);
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text,
                score,
                kind: "short_label_exact_row",
            });
        }
    }
    return winner;
}
|
|
799
|
+
/**
 * Looks for a line mentioning a label named in the question whose
 * surrounding lines support the answer.
 *
 * Labels are read from the line's text and from the line joined with the
 * next one. The evidence text is the window built by `visualRowText`. Answer
 * support is the maximum of strict soft token coverage, number coverage and
 * a phrase hit. The required minimum is 0.55 for answers containing numbers,
 * 0.86 for answers of at most two tokens, and 0.34 otherwise.
 *
 * @param {object} args
 * @param {Iterable<object>} args.pages - Page records with `page` and `lineItems`.
 * @param {Set<number>|null|undefined} args.topQuestionPages - Page numbers to stay close to.
 * @param {string} args.question - Question text.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @param {string[]|null|undefined} args.focusTokens - Focus tokens; question tokens are used when empty.
 * @returns {object|null} Best evidence of kind "short_label_visual_row", or null.
 */
function bestShortLabelRowSupport({ pages, topQuestionPages, question, answer, answerTokens, focusTokens }) {
    const wantedLabels = questionShortLabels(question);
    if (!wantedLabels.length || !answerTokens.length) {
        return null;
    }
    const answerPhrases = answerSearchPhrases(answer.text);
    const focusSource = focusTokens?.length ? focusTokens : uniqueTokens(question);
    const usefulFocusTokens = focusSource.filter((token) => token.length > 2);
    let minSupport;
    if (extractNumbers(answer.text).length > 0) {
        minSupport = 0.55;
    }
    else if (answerTokens.length <= 2) {
        minSupport = 0.86;
    }
    else {
        minSupport = 0.34;
    }
    let winner = null;
    for (const page of pages) {
        const isFarFromTopPages = Boolean(topQuestionPages?.size) &&
            !topQuestionPages.has(page.page) &&
            !topQuestionPages.has(page.page - 1) &&
            !topQuestionPages.has(page.page + 1);
        if (isFarFromTopPages) {
            continue;
        }
        const lines = page.lineItems ?? [];
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
            const localLabels = new Set(lineShortLabels(lines[lineIndex]?.text));
            if (lineIndex + 1 < lines.length) {
                // A label may be split across this line and the next one.
                const pairText = `${lines[lineIndex].text} ${lines[lineIndex + 1].text}`;
                lineShortLabels(pairText).forEach((label) => localLabels.add(label));
            }
            const mentionsWantedLabel = wantedLabels.some((label) => localLabels.has(label));
            if (!mentionsWantedLabel) {
                continue;
            }
            const text = visualRowText(lines, lineIndex);
            const normalized = normalizeForSearch(text);
            const rowTokens = tokenizeNormalized(normalized);
            const answerCoverage = strictSoftCoverage(answerTokens, rowTokens);
            const numericCoverage = numberCoverage(answer.text, normalized);
            const phraseHit = answerPhrases.some((phrase) => containsNormalizedPhrase(normalized, phrase));
            const answerSupport = Math.max(answerCoverage, numericCoverage, phraseHit ? 1 : 0);
            if (answerSupport < minSupport) {
                continue;
            }
            const focusCoverage = usefulFocusTokens.length ? coverage(usefulFocusTokens, rowTokens) : 0;
            const score = 10.4 +
                answerSupport * 7.2 +
                Math.min(0.35, focusCoverage) * 3.0 +
                numericCoverage * 1.2 +
                (phraseHit ? 1.2 : 0);
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text,
                score,
                kind: "short_label_visual_row",
            });
        }
    }
    return winner;
}
|
|
847
|
+
/**
 * Builds search strings from the start (and, for long questions, the end) of a question.
 *
 * Leading slices of 14, 11, 8 and 6 tokens are produced when the question
 * has at least that many tokens. Questions with more than 12 tokens also
 * contribute their last 10 tokens. Results shorter than 18 characters are dropped.
 *
 * @param {string} question - Question text, tokenized with `phraseTokens`.
 * @returns {string[]} Distinct space-joined token runs, longest leading slice first.
 */
function questionPrefixes(question) {
    const tokens = phraseTokens(question);
    const candidates = new Set(
        [14, 11, 8, 6]
            .filter((size) => tokens.length >= size)
            .map((size) => tokens.slice(0, size).join(" ")),
    );
    if (tokens.length > 12) {
        const tailStart = Math.max(0, tokens.length - 10);
        candidates.add(tokens.slice(tailStart).join(" "));
    }
    return Array.from(candidates).filter((candidate) => candidate.length >= 18);
}
|
|
859
|
+
/**
 * Looks for pages where the beginning of the question is quoted and an
 * answer phrase follows within the next 850 characters.
 *
 * Every occurrence of every normalized question prefix in `page.normalized`
 * is examined. For each answer phrase found in the text after the prefix, a
 * score is computed from how close the phrase is to the prefix, token and
 * number coverage in a local window around the phrase, and a small bonus for
 * numeric intent. Candidates are compared with `betterEvidence`.
 *
 * @param {object} args
 * @param {Iterable<{page: *, normalized: string, text: string}>} args.pages - Page records.
 * @param {string} args.question - Question text.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @param {{numeric?: boolean}} args.intent - Question intent flags.
 * @returns {object|null} Best evidence of kind "question_prefix_continuation", or null.
 */
function bestPrefixSupport({ pages, question, answer, answerTokens, intent }) {
    const prefixes = questionPrefixes(question);
    if (!prefixes.length)
        return null;
    // Normalize once instead of once per page. Empty results are dropped:
    // indexOf("") always succeeds at `start`, so the scan below would never
    // advance and the loop would not terminate.
    const normalizedPrefixes = prefixes
        .map((prefix) => normalizeForSearch(prefix))
        .filter(Boolean);
    if (!normalizedPrefixes.length)
        return null;
    // Phrases that normalize to nothing can never be located; skip them up front.
    const normalizedPhrases = answerSearchPhrases(answer.text)
        .map((phrase) => normalizeForSearch(phrase))
        .filter(Boolean);
    let best = null;
    for (const page of pages) {
        for (const normalizedPrefix of normalizedPrefixes) {
            let start = 0;
            while (start < page.normalized.length) {
                const index = page.normalized.indexOf(normalizedPrefix, start);
                if (index < 0)
                    break;
                const afterStart = index + normalizedPrefix.length;
                const after = page.normalized.slice(afterStart, afterStart + 850);
                for (const normalizedPhrase of normalizedPhrases) {
                    const answerIndex = after.indexOf(normalizedPhrase);
                    if (answerIndex < 0)
                        continue;
                    // Local window: 120 characters before the phrase, 180 after it.
                    const local = after.slice(Math.max(0, answerIndex - 120), answerIndex + normalizedPhrase.length + 180);
                    const score = 5.8 +
                        proximityBonus(answerIndex, 850) * 3.0 +
                        coverage(answerTokens, tokenize(local)) * 1.2 +
                        numberCoverage(answer.text, local) * 0.6 +
                        (intent.numeric ? 0.25 : 0);
                    best = betterEvidence(best, {
                        answerId: answer.id,
                        page: page.page,
                        text: evidenceSnippet(page.text, question, answer.text),
                        score,
                        kind: "question_prefix_continuation",
                    });
                }
                // Resume right after this occurrence (the prefix is non-empty, so this advances).
                start = afterStart;
            }
        }
    }
    return best;
}
|
|
902
|
+
/**
 * Scores an answer against the chunks returned by three index searches and
 * returns the best evidence.
 *
 * The searches use the question tokens (limit DEFAULT_CONFIG.topQuestionChunks),
 * the question+answer tokens (limit 8) and the answer-only tokens (limit 8).
 * Each result list has its own scoring formula and evidence kind
 * ("question_chunk_answer", "bm25_question_answer", "answer_chunk_question").
 * When nothing scores and chunks exist, a zero-score "fallback" evidence is
 * built from the top question result or the first chunk.
 *
 * @param {object} args
 * @param {{search: Function}} args.index - Search index exposing `search(tokens, {limit})`.
 * @param {Array<object>} args.chunks - All chunks; used only for the fallback.
 * @param {string} args.question - Question text.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @param {string[]} args.questionTokens - Tokens of the question.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @returns {object|null} Best evidence, or null when there are no chunks at all.
 */
function bestChunkSupport({ index, chunks, question, answer, questionTokens, answerTokens }) {
    const combinedTokens = tokenize(`${question} ${answer.text}`);
    const answerOnlyTokens = tokenize(answer.text);
    const questionHits = index.search(questionTokens, { limit: DEFAULT_CONFIG.topQuestionChunks });
    const combinedHits = index.search(combinedTokens, { limit: 8 });
    const answerHits = index.search(answerOnlyTokens, { limit: 8 });
    const topQuestionScore = questionHits[0]?.score || 0;
    const topCombinedScore = combinedHits[0]?.score || 0;
    const topAnswerScore = answerHits[0]?.score || 0;
    const lineLikeKinds = ["line", "line_pair", "layout_line", "layout_line_pair"];
    let winner = null;

    // Chunks retrieved by question + answer together.
    for (const { chunk, score: rawScore } of combinedHits) {
        const answerCoverage = coverage(answerTokens, chunk.tokens);
        const questionCoverage = coverage(questionTokens, chunk.tokens);
        const exact = containsNormalizedPhrase(chunk.normalized, answer.text) ? 1 : 0;
        const score = normalizeBm25(rawScore, topCombinedScore) * 2.4 +
            questionCoverage * 1.7 +
            answerCoverage * 1.4 +
            exact * 2.4 +
            numberCoverage(answer.text, chunk.normalized) * 0.9 +
            tokenProximity(questionTokens, answerTokens, chunk.tokens) * 1.1;
        winner = betterEvidence(winner, evidenceFromChunk(answer.id, chunk, score, "bm25_question_answer"));
    }

    // Chunks retrieved by the question alone; they must mention the answer somehow.
    for (const { chunk, score: rawScore } of questionHits) {
        const answerCoverage = coverage(answerTokens, chunk.tokens);
        const phrasePresent = containsNormalizedPhrase(chunk.normalized, answer.text);
        if (answerCoverage <= 0 && !phrasePresent) {
            continue;
        }
        const exact = phrasePresent ? 1 : 0;
        let lineBoost = 0;
        if (lineLikeKinds.includes(chunk.kind)) {
            lineBoost = 0.55;
        }
        else if (chunk.kind === "list") {
            lineBoost = 0.35;
        }
        else if (chunk.kind === "heading") {
            lineBoost = 0.2;
        }
        const score = normalizeBm25(rawScore, topQuestionScore) * 1.6 +
            answerCoverage * 3.2 +
            exact * 3.4 +
            lineBoost +
            jaccard(answerTokens, chunk.tokens) * 0.8 +
            numberCoverage(answer.text, chunk.normalized) * 1.2 +
            tokenProximity(questionTokens, answerTokens, chunk.tokens) * 1.4;
        winner = betterEvidence(winner, evidenceFromChunk(answer.id, chunk, score, "question_chunk_answer"));
    }

    // Chunks retrieved by the answer alone; they must overlap the question a little.
    for (const { chunk, score: rawScore } of answerHits) {
        const questionCoverage = coverage(questionTokens, chunk.tokens);
        if (questionCoverage <= 0.06) {
            continue;
        }
        const score = normalizeBm25(rawScore, topAnswerScore) * 0.8 +
            questionCoverage * 2.2 +
            numberCoverage(answer.text, chunk.normalized) * 0.7 +
            tokenProximity(questionTokens, answerTokens, chunk.tokens) * 0.8;
        winner = betterEvidence(winner, evidenceFromChunk(answer.id, chunk, score, "answer_chunk_question"));
    }

    if (!winner && chunks.length) {
        const fallbackChunk = questionHits[0]?.chunk ?? chunks[0];
        winner = evidenceFromChunk(answer.id, fallbackChunk, 0, "fallback");
    }
    return winner;
}
|
|
964
|
+
/**
 * Expresses a score as a fraction of the top score, capped at 1.
 *
 * @param {number} score - Score of one result.
 * @param {number} topScore - Score of the best result.
 * @returns {number} `score / topScore` limited to at most 1, or 0 when
 *   either argument is falsy (0, NaN, undefined, ...).
 */
function normalizeBm25(score, topScore) {
    const bothUsable = Boolean(score) && Boolean(topScore);
    return bothUsable ? Math.min(1, score / topScore) : 0;
}
|
|
969
|
+
/**
 * Rates how number-heavy an answer is on a 0..1 scale.
 *
 * @param {string} answer - Answer text.
 * @returns {number} Count of numbers found by `extractNumbers` divided by three, capped at 1.
 */
function numberSpecificity(answer) {
    const numbersFound = extractNumbers(answer);
    return Math.min(1, numbersFound.length / 3);
}
|
|
973
|
+
/**
 * Decides whether the line-token scorer applies to this question/answer pair.
 *
 * It applies only in "single" mode, for non-numeric intent and number-free
 * answers, when the normalized question matches one of a fixed set of
 * Russian phrasings.
 *
 * @param {object} args
 * @param {string} args.mode - Scoring mode.
 * @param {string} args.question - Question text.
 * @param {{text: string}} args.answer - Candidate answer.
 * @param {{numeric?: boolean}} args.intent - Question intent flags.
 * @returns {boolean} True when the scorer should run.
 */
function lineTokenApplicable({ mode, question, answer, intent }) {
    if (mode !== "single") {
        return false;
    }
    const involvesNumbers = intent.numeric || extractNumbers(answer.text).length;
    if (involvesNumbers) {
        return false;
    }
    const raw = normalizeText(question);
    const triggerPatterns = [
        /является\s+заболеванием/u,
        /переда[а-яa-z0-9-]*\s+пут/u,
        /рекоменду[а-яa-z0-9-]*\s+(?:применение|назначение|применять|назначать)/u,
        /конкурентно\s+ингибирует/u,
        /фермент/u,
    ];
    return triggerPatterns.some((pattern) => pattern.test(raw));
}
|
|
985
|
+
/**
 * Classifies whether a question is about patients with or without risk factors.
 *
 * The "absent" wording is checked first, so it wins when both patterns match.
 *
 * @param {string} question - Question text, normalized with `normalizeText`.
 * @returns {"risk_absent"|"risk_present"|null} The detected condition, or null.
 */
function questionRiskCondition(question) {
    const raw = normalizeText(question);
    const rules = [
        ["risk_absent", /(?:не\s+имеющ|без|отсутств)[а-яa-z0-9-\s]{0,80}фактор[а-яa-z0-9-\s]{0,40}риска/u],
        ["risk_present", /(?:имеющ|налич)[а-яa-z0-9-\s]{0,80}фактор[а-яa-z0-9-\s]{0,40}риска/u],
    ];
    const matchedRule = rules.find(([, pattern]) => pattern.test(raw));
    return matchedRule ? matchedRule[0] : null;
}
|
|
993
|
+
/**
 * Classifies the risk-factor condition stated in a window of normalized page text.
 *
 * "Absent" phrasings are checked before "present" ones.
 *
 * @param {string} normalizedWindow - Already-normalized text window.
 * @returns {"risk_absent"|"risk_present"|null} The detected condition, or null.
 */
function windowRiskCondition(normalizedWindow) {
    const mentions = (phrase) => containsNormalizedPhrase(normalizedWindow, phrase);
    if (mentions("не имеющих факторов риска") || mentions("без факторов риска")) {
        return "risk_absent";
    }
    const statesPresence = (mentions("при наличии") && mentions("фактор")) ||
        (mentions("имеющих") && mentions("факторов риска"));
    return statesPresence ? "risk_present" : null;
}
|
|
1005
|
+
/**
 * Returns the first number of an answer as a string with a dot decimal separator.
 *
 * @param {string} answerText - Answer text.
 * @returns {string|null} First value from `extractNumbers` with its first
 *   comma replaced by a dot, or null when there is no (truthy) first number.
 */
function primaryNumberPhrase(answerText) {
    const [firstNumber] = extractNumbers(answerText);
    return firstNumber ? String(firstNumber).replace(",", ".") : null;
}
|
|
1011
|
+
/**
 * Rewards or penalizes a numeric answer depending on whether the page states
 * its value under the same risk-factor condition the question asks about.
 *
 * For every occurrence of the answer's first number on a top question page:
 *  - the 50 characters before it must mention "уровн", with no other number
 *    between that mention and the occurrence;
 *  - a window from 70 characters before to 240 after must mention both
 *    "фактор" and "риск";
 *  - the condition is read from the text after the number first, then from
 *    the whole window.
 * A matching condition anywhere gives +4.2; otherwise a mismatch gives -2.1.
 *
 * @param {object} args
 * @param {Iterable<{page: *, normalized: string, text: string}>} args.pages - Page records.
 * @param {Set<number>|null|undefined} args.topQuestionPages - When non-empty, only these pages are scanned.
 * @param {string} args.question - Question text.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @returns {{adjustment: number, evidence: object|null}} Score adjustment and its evidence.
 */
function riskConditionAdjustment({ pages, topQuestionPages, question, answer }) {
    const wantedCondition = questionRiskCondition(question);
    const value = primaryNumberPhrase(answer.text);
    if (!wantedCondition || !value) {
        return { adjustment: 0, evidence: null };
    }
    let bestMatch = null;
    let bestMismatch = null;
    for (const page of pages) {
        const isExcludedPage = topQuestionPages?.size && !topQuestionPages.has(page.page);
        if (isExcludedPage) {
            continue;
        }
        const occurrences = findPhraseOccurrences(page.normalized, value, { textIsNormalized: true });
        for (const position of occurrences) {
            const lead = page.normalized.slice(Math.max(0, position - 50), position);
            if (!containsNormalizedPhrase(lead, "уровн")) {
                continue;
            }
            // Skip when another number sits between the "уровн" mention and this one.
            const levelIndex = lead.lastIndexOf(normalizeForSearch("уровн"));
            if (levelIndex >= 0 && extractNumbers(lead.slice(levelIndex)).length) {
                continue;
            }
            const windowEnd = position + value.length + 240;
            const surrounding = page.normalized.slice(Math.max(0, position - 70), windowEnd);
            const mentionsRiskFactors = containsNormalizedPhrase(surrounding, "фактор") &&
                containsNormalizedPhrase(surrounding, "риск");
            if (!mentionsRiskFactors) {
                continue;
            }
            const trailing = page.normalized.slice(position, windowEnd);
            const statedCondition = windowRiskCondition(trailing) ?? windowRiskCondition(surrounding);
            if (!statedCondition) {
                continue;
            }
            const agrees = statedCondition === wantedCondition;
            const evidence = {
                answerId: answer.id,
                page: page.page,
                text: evidenceSnippet(page.text, value, question),
                score: agrees ? 8.4 : 2.2,
                kind: agrees ? "risk_condition_match" : "risk_condition_mismatch",
            };
            if (agrees) {
                bestMatch = betterEvidence(bestMatch, evidence);
            }
            else {
                bestMismatch = betterEvidence(bestMismatch, evidence);
            }
        }
    }
    if (bestMatch) {
        return { adjustment: 4.2, evidence: bestMatch };
    }
    if (bestMismatch) {
        return { adjustment: -2.1, evidence: bestMismatch };
    }
    return { adjustment: 0, evidence: null };
}
|
|
1055
|
+
/**
 * Tells whether a text opens with an "all patients"-style phrase
 * ("всем"/"все" followed by a word starting with пациент, больн or пострадав).
 *
 * @param {string} answerText - Text to test, normalized with `normalizeText`.
 * @returns {boolean} True when the normalized text starts with such a phrase.
 */
function genericPopulationAnswer(answerText) {
    const genericOpening = /^(?:всем|все)\s+(?:пациент|больн|пострадав)/u;
    return genericOpening.test(normalizeText(answerText));
}
|
|
1059
|
+
/**
 * Penalizes an "all patients" answer in "single" mode when the document
 * follows that phrase with a qualifying condition.
 *
 * Nothing happens unless the answer is generic and the question itself does
 * not open with the same generic phrase. For up to eight answer phrases (at
 * least five normalized characters each), every occurrence on a top question
 * page is checked: the next 520 characters must contain a condition marker
 * and cover at least 0.12 of the focus tokens.
 *
 * @param {object} args
 * @param {string} args.mode - Scoring mode.
 * @param {Iterable<{page: *, normalized: string, text: string}>} args.pages - Page records.
 * @param {Set<number>|null|undefined} args.topQuestionPages - When non-empty, only these pages are scanned.
 * @param {string} args.question - Question text.
 * @param {{id: *, text: string}} args.answer - Candidate answer.
 * @param {string[]} args.focusTokens - Focus tokens of the question.
 * @returns {{adjustment: number, evidence: object|null}} -10.4 with evidence
 *   when a condition is found, otherwise a zero adjustment.
 */
function genericPopulationConditionAdjustment({ mode, pages, topQuestionPages, question, answer, focusTokens }) {
    if (mode !== "single" || !genericPopulationAnswer(answer.text)) {
        return { adjustment: 0, evidence: null };
    }
    const questionIsGeneric = /^(?:всем|все)\s+(?:пациент|больн|пострадав)/u.test(normalizeText(question));
    if (questionIsGeneric) {
        return { adjustment: 0, evidence: null };
    }
    const conditionMarkers = ["при", "с целью", "при наличии", "при развитии"];
    const answerPhrases = answerSearchPhrases(answer.text).slice(0, 8);
    let strongest = null;
    for (const page of pages) {
        const isExcludedPage = topQuestionPages?.size && !topQuestionPages.has(page.page);
        if (isExcludedPage) {
            continue;
        }
        for (const phrase of answerPhrases) {
            const phraseNorm = normalizeForSearch(phrase);
            if (!phraseNorm || phraseNorm.length < 5) {
                continue;
            }
            const occurrences = findPhraseOccurrences(page.normalized, phrase, { textIsNormalized: true });
            for (const position of occurrences) {
                const tailStart = position + phraseNorm.length;
                const tail = page.normalized.slice(tailStart, tailStart + 520);
                const hasCondition = conditionMarkers.some((marker) => containsNormalizedPhrase(tail, marker));
                if (!hasCondition) {
                    continue;
                }
                const focusCoverage = coverage(focusTokens, tokenizeNormalized(tail));
                if (focusCoverage < 0.12) {
                    continue;
                }
                strongest = betterEvidence(strongest, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, answer.text, question),
                    score: 3.0 + focusCoverage * 4.0,
                    kind: "generic_population_condition_penalty",
                });
            }
        }
    }
    if (!strongest) {
        return { adjustment: 0, evidence: null };
    }
    return { adjustment: -10.4, evidence: strongest };
}
|
|
1097
|
+
/**
 * Mode-aware version of the generic-population penalty.
 *
 * Any mode other than "multi" is delegated to
 * `genericPopulationConditionAdjustment`. In "multi" mode the penalty is
 * considered only when the answer is generic, the question is not, the
 * question mentions a recommendation, and another answer names a more
 * specific population. The page scan then mirrors the single-mode one with a
 * different marker list and a -5.2 adjustment.
 *
 * @param {object} context
 * @param {string} context.mode - Scoring mode.
 * @param {Iterable<{page: *, normalized: string, text: string}>} context.pages - Page records.
 * @param {Set<number>|null|undefined} context.topQuestionPages - When non-empty, only these pages are scanned.
 * @param {string} context.question - Question text.
 * @param {{id: *, text: string}} context.answer - Candidate answer.
 * @param {Array<{id: *, text: string}>} context.answers - All candidate answers.
 * @param {string[]} context.focusTokens - Focus tokens of the question.
 * @returns {{adjustment: number, evidence: object|null}} Score adjustment and its evidence.
 */
function genericPopulationConditionAdjustmentForMode(context) {
    const { mode, pages, topQuestionPages, question, answer, answers, focusTokens } = context;
    if (mode !== "multi") {
        return genericPopulationConditionAdjustment(context);
    }
    if (!genericPopulationAnswer(answer.text)) {
        return { adjustment: 0, evidence: null };
    }
    if (genericPopulationAnswer(question)) {
        return { adjustment: 0, evidence: null };
    }
    const asksForRecommendation = containsNormalizedPhrase(normalizeForSearch(question), "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434");
    if (!asksForRecommendation) {
        return { adjustment: 0, evidence: null };
    }
    if (!hasSpecificPopulationAlternative(answers, answer)) {
        return { adjustment: 0, evidence: null };
    }
    const conditionMarkers = [
        "\u043f\u0440\u0438",
        "\u0441 \u0446\u0435\u043b\u044c\u044e",
        "\u0434\u043b\u044f",
        "\u0441\u0442\u0435\u043f\u0435\u043d",
        "\u0442\u044f\u0436\u0435\u043b",
    ];
    const answerPhrases = answerSearchPhrases(answer.text).slice(0, 8);
    let strongest = null;
    for (const page of pages) {
        const isExcludedPage = topQuestionPages?.size && !topQuestionPages.has(page.page);
        if (isExcludedPage) {
            continue;
        }
        for (const phrase of answerPhrases) {
            const phraseNorm = normalizeForSearch(phrase);
            if (!phraseNorm || phraseNorm.length < 5) {
                continue;
            }
            const occurrences = findPhraseOccurrences(page.normalized, phrase, { textIsNormalized: true });
            for (const position of occurrences) {
                const tailStart = position + phraseNorm.length;
                const tail = page.normalized.slice(tailStart, tailStart + 520);
                const hasCondition = conditionMarkers.some((marker) => containsNormalizedPhrase(tail, marker));
                if (!hasCondition) {
                    continue;
                }
                const focusCoverage = coverage(focusTokens, tokenizeNormalized(tail));
                if (focusCoverage < 0.12) {
                    continue;
                }
                strongest = betterEvidence(strongest, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, answer.text, question),
                    score: 3.0 + focusCoverage * 4.0,
                    kind: "generic_population_condition_penalty",
                });
            }
        }
    }
    if (!strongest) {
        return { adjustment: 0, evidence: null };
    }
    return { adjustment: -5.2, evidence: strongest };
}
|
|
1143
|
+
/**
 * Finds the answer token that names the population (patient / victim / sick person).
 *
 * A token qualifies when it starts with the first eight characters (or fewer,
 * for shorter stems) of one of three normalized Russian stems.
 *
 * @param {string} answerText - Answer text, tokenized with `uniqueTokens`.
 * @returns {string|null} The first qualifying token, or null.
 */
function populationStem(answerText) {
    const tokens = uniqueTokens(answerText);
    const stemPrefixes = ["\u043f\u0430\u0446\u0438\u0435\u043d\u0442", "\u043f\u043e\u0441\u0442\u0440\u0430\u0434", "\u0431\u043e\u043b\u044c\u043d"]
        .map((item) => normalizeForSearch(item))
        .map((stem) => stem.slice(0, Math.min(8, stem.length)));
    const namesPopulation = (token) => stemPrefixes.some((prefix) => token.startsWith(prefix));
    return tokens.find(namesPopulation) ?? null;
}
|
|
1148
|
+
/**
 * Checks whether another answer names the same population as the generic
 * answer but narrows it with a severity, degree or "with presence of" qualifier.
 *
 * @param {Array<{id: *, text: string}>|null|undefined} answers - All candidate answers.
 * @param {{id: *, text: string}} genericAnswer - The generic answer under review.
 * @returns {boolean} True when such a more specific alternative exists; false
 *   when the generic answer has no population token.
 */
function hasSpecificPopulationAlternative(answers, genericAnswer) {
    const stem = populationStem(genericAnswer.text);
    if (!stem) {
        return false;
    }
    const stemPrefix = stem.slice(0, Math.min(8, stem.length));
    const qualifierMarkers = [
        "\u0441\u0440\u0435\u0434\u043d",
        "\u0442\u044f\u0436\u0435\u043b",
        "\u0441\u0442\u0435\u043f\u0435\u043d",
        "\u043f\u0440\u0438 \u043d\u0430\u043b\u0438\u0447",
        "\u0441 \u043d\u0430\u043b\u0438\u0447",
    ];
    for (const candidate of answers ?? []) {
        if (candidate.id === genericAnswer.id) {
            continue;
        }
        const normalized = normalizeForSearch(candidate.text);
        const candidateTokens = uniqueTokens(candidate.text);
        const sharesPopulation = candidateTokens.some((token) => token.startsWith(stemPrefix));
        if (!sharesPopulation) {
            continue;
        }
        if (qualifierMarkers.some((marker) => containsNormalizedPhrase(normalized, marker))) {
            return true;
        }
    }
    return false;
}
|
|
1166
|
+
/**
 * Extracts the subject of a "<subject> относят к классу …" question.
 *
 * @param {string} question - Question text (run through `normalizeText` first).
 * @returns {string|null} The trimmed text before the phrase when it is at least
 *   4 characters long, otherwise `null`.
 */
function questionClassSubject(question) {
    const found = normalizeText(question).match(/^(.+?)\s+относят\s+к\s+классу/u);
    const captured = found?.[1];
    if (!captured) {
        return null;
    }
    const subject = captured.trim();
    if (subject.length >= 4) {
        return subject;
    }
    return null;
}
|
|
1174
|
+
/**
 * Builds the spellings (roman and arabic) under which an answer's class number
 * may appear in text, covering classes I–V.
 *
 * @param {string} answerText - Answer text such as a roman numeral or a digit.
 * @returns {string[]} De-duplicated variants after `normalizeForSearch`, with
 *   empty results removed.
 */
function romanClassVariants(answerText) {
    const romanPairs = [
        ["i", "1"],
        ["ii", "2"],
        ["iii", "3"],
        ["iv", "4"],
        ["v", "5"],
    ];
    const compact = normalizeText(answerText).replace(/\s+/g, "");
    const collected = new Set();
    // Case 1: the whole answer (whitespace removed) is itself a roman numeral.
    const byRoman = romanPairs.find(([roman]) => roman === compact);
    if (byRoman) {
        collected.add(compact);
        collected.add(byRoman[1]);
    }
    // Case 2: the first number extracted from the answer maps back to a roman numeral.
    const firstNumber = extractNumbers(answerText)[0];
    if (firstNumber) {
        collected.add(firstNumber);
        const byValue = romanPairs.find(([, value]) => value === firstNumber);
        if (byValue) {
            collected.add(byValue[0]);
        }
    }
    return [...collected].map((item) => normalizeForSearch(item)).filter(Boolean);
}
|
|
1197
|
+
/**
 * Looks for a line that mentions "класс", covers the question's subject and
 * contains the answer's class number, and returns the best such evidence.
 *
 * @param {{pages: Array, question: string, answer: {id: *, text: string}}} params
 * @returns {object|null} Evidence of kind "subject_class_line" chosen by
 *   `betterEvidence`, or `null` when nothing qualifies.
 */
function bestClassSubjectSupport({ pages, question, answer }) {
    const subject = questionClassSubject(question);
    const classVariants = romanClassVariants(answer.text);
    if (!subject || !classVariants.length) {
        return null;
    }
    const subjectTokens = uniqueTokens(subject);
    let winner = null;
    for (const page of pages) {
        for (const line of cachedLineTokenSegments(page)) {
            const mentionsClass = containsNormalizedPhrase(line.normalized, "класс");
            if (!mentionsClass) {
                continue;
            }
            const subjectCoverage = coverage(subjectTokens, line.tokens);
            if (subjectCoverage < 0.65) {
                continue;
            }
            const namesAnswerClass = classVariants.some((variant) => tokenBoundaryIncludes(line.normalized, variant));
            if (!namesAnswerClass) {
                continue;
            }
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text: line.text,
                score: 10.8 + subjectCoverage * 4.0,
                kind: "subject_class_line",
            });
        }
    }
    return winner;
}
|
|
1226
|
+
/**
 * Penalizes an answer whose phrase occurs next to a negative cue ("rare",
 * "not typical", "not recommended", "not indicated", "exclude") in the text.
 *
 * No penalty is applied when the question itself is negative, an exception
 * question, or asks about rarity.
 *
 * @param {{pages: Array, topQuestionPages: *, question: string,
 *   answer: {id: *, text: string}, intent: {negative: *, exception: *}}} params
 *   `topQuestionPages` is accepted but not read by this function.
 * @returns {{adjustment: number, evidence: object|null}} `-5.2` with the
 *   supporting evidence when a cue is found, otherwise a zero adjustment.
 */
function negativeLocalAnswerAdjustment({ pages, topQuestionPages, question, answer, intent }) {
    const questionRaw = normalizeText(question);
    const questionIsNegative = intent.negative || intent.exception || /редк/u.test(questionRaw);
    if (questionIsNegative) {
        return { adjustment: 0, evidence: null };
    }
    const negativeCues = [
        "крайне ред",
        "редк",
        "не характер",
        "не рекоменд",
        "не показ",
        "исключ",
    ];
    const phrases = answerSearchPhrases(answer.text).slice(0, 12);
    let strongest = null;
    for (const page of pages) {
        for (const phrase of phrases) {
            const phraseNorm = normalizeForSearch(phrase);
            if (!phraseNorm || phraseNorm.length < 5) {
                continue;
            }
            const occurrences = findPhraseOccurrences(page.normalized, phrase, { textIsNormalized: true });
            for (const hit of occurrences) {
                // Inspect 80 characters before the hit and 120 after the phrase.
                const from = Math.max(0, hit - 80);
                const to = hit + phraseNorm.length + 120;
                const local = page.normalized.slice(from, to);
                if (!negativeCues.some((cue) => containsNormalizedPhrase(local, cue))) {
                    continue;
                }
                strongest = betterEvidence(strongest, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, answer.text, question),
                    score: 6.6,
                    kind: "negative_local_answer_penalty",
                });
            }
        }
    }
    if (strongest) {
        return { adjustment: -5.2, evidence: strongest };
    }
    return { adjustment: 0, evidence: null };
}
|
|
1260
|
+
/**
 * Decides whether a question asks for a bounded list (clinical manifestations,
 * symptoms, accompanying signs, main effects, "underlying" causes).
 *
 * Only multi-answer questions that are neither negative nor exception
 * questions qualify.
 *
 * @param {{mode: string, question: string, intent: {negative: *, exception: *}}} params
 * @returns {boolean} Result of the cue check; `false` for ineligible questions.
 */
function boundedListQuestion({ mode, question, intent }) {
    const eligible = mode === "multi" && !intent.negative && !intent.exception;
    if (!eligible) {
        return false;
    }
    const normalized = normalizeForSearch(question);
    const mentions = (cue) => containsNormalizedPhrase(normalized, cue);
    return ((mentions("\u043a\u043b\u0438\u043d\u0438\u0447") && mentions("\u043f\u0440\u043e\u044f\u0432\u043b")) ||
        mentions("\u0441\u0438\u043c\u043f\u0442\u043e\u043c") ||
        mentions("\u0441\u043e\u043f\u0440\u043e\u0432\u043e\u0436\u0434") ||
        (mentions("\u043e\u0441\u043d\u043e\u0432\u043d") && mentions("\u044d\u0444\u0444\u0435\u043a\u0442")) ||
        mentions("\u0432 \u043e\u0441\u043d\u043e\u0432\u0435"));
}
|
|
1271
|
+
/**
 * Derives anchor phrases from a question that locate the relevant list in the
 * source text.
 *
 * Two anchors are attempted:
 *  - up to five tokens following a "синдром…" token, cut at the first stop word;
 *  - an age clause starting at "возрасте", kept up to its direction word
 *    (younger/older), provided the clause contains a token starting with a digit.
 *
 * @param {string} question - Question text, tokenized with `rawTokens`.
 * @returns {string[]} At most six distinct anchors, each at least 3 characters long.
 */
function boundedListAnchors(question) {
    const tokens = rawTokens(question);
    const anchors = new Set();
    const remember = (parts) => {
        const phrase = parts.filter(Boolean).join(" ").trim();
        if (phrase.length >= 3) {
            anchors.add(phrase);
        }
    };
    const syndromeAt = tokens.findIndex((token) => token.startsWith("\u0441\u0438\u043d\u0434\u0440\u043e\u043c"));
    if (syndromeAt >= 0) {
        const stopPrefixes = [
            "\u044f\u0432\u043b\u044f",
            "\u0441\u043e\u043f\u0440\u043e\u0432\u043e\u0436\u0434",
            "\u0445\u0430\u0440\u0430\u043a\u0442\u0435\u0440",
            "\u043e\u0441\u043d\u043e\u0432\u043d",
            "\u043e\u0442\u043d\u043e\u0441",
        ];
        const following = tokens.slice(syndromeAt + 1, syndromeAt + 6);
        const stopAt = following.findIndex((token) => stopPrefixes.some((prefix) => token.startsWith(prefix)));
        remember(stopAt < 0 ? following : following.slice(0, stopAt));
    }
    const ageAt = tokens.indexOf("\u0432\u043e\u0437\u0440\u0430\u0441\u0442\u0435");
    if (ageAt >= 0) {
        const ageClause = tokens.slice(ageAt, ageAt + 12);
        const directionPrefixes = [
            "\u043c\u043e\u043b\u043e\u0436",
            "\u0441\u0442\u0430\u0440\u0448",
            "\u043c\u043b\u0430\u0434\u0448",
        ];
        const directionAt = ageClause.findIndex((token) => directionPrefixes.some((prefix) => token.startsWith(prefix)));
        const mentionsNumber = ageClause.some((token) => /^\d/.test(token));
        if (mentionsNumber && directionAt >= 0) {
            remember(ageClause.slice(0, directionAt + 1));
        }
    }
    return [...anchors].slice(0, 6);
}
|
|
1306
|
+
/**
 * Computes how many characters after an anchor belong to the bounded list.
 *
 * The length is capped at 900 characters and shortened to the earliest
 * boundary phrase found at or after offset 70; the result is never below 90.
 *
 * @param {string} after - Normalized text starting at the list.
 * @returns {number} Length of the list segment in characters.
 */
function boundedListBoundary(after) {
    const markers = [
        "\u0438 \u0441",
        "\u043e\u0431\u0449\u0438\u0435 \u0441\u0438\u043c\u043f\u0442\u043e\u043c\u044b",
        "\u044d\u0442\u043e \u0440\u0430\u0437\u0434\u0435\u043b\u0435\u043d\u0438\u0435",
        "\u0443\u0440\u043e\u0432\u0435\u043d\u044c \u0443\u0431\u0435\u0434\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u0438",
        "\u043a\u043e\u043c\u043c\u0435\u043d\u0442\u0430\u0440\u0438\u0438",
        "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434\u0430\u0446\u0438\u0438",
    ].map((item) => normalizeForSearch(item));
    const cap = Math.min(after.length, 900);
    const end = markers.reduce((limit, marker) => {
        // Markers must be space-delimited and are ignored in the first 70 characters.
        const at = after.indexOf(` ${marker} `, 70);
        return at > 0 ? Math.min(limit, at) : limit;
    }, cap);
    return Math.max(90, end);
}
|
|
1323
|
+
/**
 * Collects text segments that hold the bounded list a question refers to.
 *
 * For every line window containing one of the question's anchors, the segment
 * starts at the anchor (or right after a "доминирует триада" cue found within
 * 260 characters of it) and ends at `boundedListBoundary`. Text of the window
 * outside that span is kept separately as `outside`.
 *
 * @param {Array} pages - Pages to scan.
 * @param {string} question - Question text.
 * @param {{has: Function}|null|undefined} topQuestionPages - Page numbers that get priority 1.
 * @param {string} mode - Question mode, forwarded to `boundedListQuestion`.
 * @param {object} intent - Question intent, forwarded to `boundedListQuestion`.
 * @returns {Array<object>} Up to 40 segments, priority pages first.
 */
function findBoundedListSegments(pages, question, topQuestionPages, mode, intent) {
    if (!boundedListQuestion({ mode, question, intent })) {
        return [];
    }
    const anchors = boundedListAnchors(question);
    if (!anchors.length) {
        return [];
    }
    const triadCue = normalizeForSearch("\u0434\u043e\u043c\u0438\u043d\u0438\u0440\u0443\u0435\u0442 \u0442\u0440\u0438\u0430\u0434\u0430");
    const collected = [];
    const seenKeys = new Set();
    for (const page of pages) {
        for (const source of cachedLineWindowSegments(page)) {
            const haystack = source.normalized;
            for (const anchor of anchors) {
                const anchorIndex = haystack.indexOf(normalizeForSearch(anchor));
                if (anchorIndex < 0) {
                    continue;
                }
                const triadOffset = haystack.slice(anchorIndex).indexOf(triadCue);
                const triadIsNear = triadOffset >= 0 && triadOffset <= 260;
                const start = triadIsNear ? anchorIndex + triadOffset + triadCue.length : anchorIndex;
                const end = start + boundedListBoundary(haystack.slice(start));
                const included = haystack.slice(start, end);
                const outside = `${haystack.slice(0, start)} ${haystack.slice(end)}`.trim();
                // De-duplicate on page plus the first 220 characters of the segment.
                const key = `${page.page}:${included.slice(0, 220)}`;
                if (seenKeys.has(key)) {
                    continue;
                }
                seenKeys.add(key);
                collected.push({
                    page: page.page,
                    text: source.text,
                    normalized: included,
                    outside,
                    anchor,
                    priority: topQuestionPages?.has(page.page) ? 1 : 0,
                });
            }
        }
    }
    collected.sort((a, b) => b.priority - a.priority);
    return collected.slice(0, 40);
}
|
|
1366
|
+
/**
 * Scores an answer against the bounded-list segments of a question.
 *
 * An answer found inside a segment (phrase hit or coverage >= 0.66) yields
 * supporting evidence. An answer found only outside the segment (phrase hit or
 * coverage >= 0.72) yields a penalty, which is reported only when no segment
 * supports the answer.
 *
 * @param {{boundedListSegments: Array<object>|null|undefined,
 *   answer: {id: *, text: string}, answerTokens: string[]}} params
 * @returns {{support: object|null, adjustment: number, evidence: object|null}}
 *   Support evidence, or a `-4.8` adjustment with penalty evidence, or neutral.
 */
function bestBoundedListSupport({ boundedListSegments, answer, answerTokens }) {
    if (!boundedListSegments?.length) {
        return { support: null, adjustment: 0, evidence: null };
    }
    const answerPhrases = answerSearchPhrases(answer.text).slice(0, 16);
    const hasAnswerPhrase = (text) => answerPhrases.some((phrase) => containsNormalizedPhrase(text, phrase));
    let bestSupport = null;
    let bestPenalty = null;
    for (const segment of boundedListSegments) {
        const answerCoverage = strictSoftCoverage(answerTokens, tokenizeNormalized(segment.normalized));
        const outsideCoverage = strictSoftCoverage(answerTokens, tokenizeNormalized(segment.outside));
        const insidePhrase = hasAnswerPhrase(segment.normalized);
        const outsidePhrase = hasAnswerPhrase(segment.outside);
        if (insidePhrase || answerCoverage >= 0.66) {
            const score = 10.8 + (insidePhrase ? 2.6 : 0) + answerCoverage * 3.2 + numberCoverage(answer.text, segment.normalized) * 0.8;
            bestSupport = betterEvidence(bestSupport, {
                answerId: answer.id,
                page: segment.page,
                text: segment.text,
                score,
                kind: "bounded_list_segment",
            });
            continue;
        }
        if (outsidePhrase || outsideCoverage >= 0.72) {
            bestPenalty = betterEvidence(bestPenalty, {
                answerId: answer.id,
                page: segment.page,
                text: segment.text,
                score: 6.0 + outsideCoverage * 2.0,
                kind: "bounded_list_outside_penalty",
            });
        }
    }
    // Any in-segment support overrides penalties collected from other segments.
    if (bestSupport) {
        return { support: bestSupport, adjustment: 0, evidence: null };
    }
    if (bestPenalty) {
        return { support: null, adjustment: -4.8, evidence: bestPenalty };
    }
    return { support: null, adjustment: 0, evidence: null };
}
|
|
1405
|
+
/**
 * Detects which ordinal item (stage, line, step or degree) a question asks about.
 *
 * Resolution order:
 *  1. a digit followed by the step cue            -> kind "step";
 *  2. a digit or roman numeral before the degree cue -> kind "degree";
 *  3. an ordinal word (first..fourth) anywhere    -> kind "degree" if the degree
 *     cue is present, else "stage" if the stage cue is present, else "line".
 *
 * @param {string} question - Question text.
 * @returns {{number: number, kind: string}|null} The target, or `null` when the
 *   question has no ordinal cue or no number could be resolved.
 */
function ordinalTarget(question) {
    const normalized = normalizeForSearch(question);
    const mentions = (cue) => containsNormalizedPhrase(normalized, cue);
    const hasStage = mentions("\u044d\u0442\u0430\u043f");
    const hasLine = mentions("\u043b\u0438\u043d\u0438");
    const hasStep = mentions("\u0441\u0442\u0443\u043f\u0435\u043d");
    const hasDegree = mentions("\u0441\u0442\u0435\u043f\u0435\u043d");
    if (!(hasStage || hasLine || hasStep || hasDegree)) {
        return null;
    }
    if (hasStep) {
        const stepCue = escapeRegExp(normalizeForSearch("\u0441\u0442\u0443\u043f\u0435\u043d"));
        const stepMatch = normalized.match(new RegExp(`(?:^|\\s)(\\d{1,2})(?:\\s*-?\\s*\\S{0,2})?\\s+${stepCue}`, "iu"));
        if (stepMatch) {
            return { number: Number(stepMatch[1]), kind: "step" };
        }
    }
    if (hasDegree) {
        const degreeCue = escapeRegExp(normalizeForSearch("\u0441\u0442\u0435\u043f\u0435\u043d"));
        const degreeMatch = normalized.match(new RegExp(`(?:^|\\s)(\\d{1,2}|[ivx]{1,7})(?:\\s*-?\\s*\\S{0,2})?\\s+${degreeCue}`, "iu"));
        const number = degreeMatch ? ordinalValueToNumber(degreeMatch[1]) : null;
        if (number) {
            return { number, kind: "degree" };
        }
    }
    const ordinalWords = [
        [1, ["\u043f\u0435\u0440\u0432"]],
        [2, ["\u0432\u0442\u043e\u0440"]],
        [3, ["\u0442\u0440\u0435\u0442", "\u0442\u0440\u0435\u0442\u044c"]],
        [4, ["\u0447\u0435\u0442\u0432\u0435\u0440"]],
    ];
    const hit = ordinalWords.find(([, cues]) => cues.some((cue) => mentions(cue)));
    if (!hit) {
        return null;
    }
    return { number: hit[0], kind: hasDegree ? "degree" : hasStage ? "stage" : "line" };
}
|
|
1441
|
+
/**
 * Returns the spelled-out Russian phrases for an ordinal "line" or "degree"
 * (genitive, nominative and accusative forms) for numbers 1 to 4.
 *
 * @param {number} number - Ordinal number to look up.
 * @param {string} [kind="line"] - "line" or "degree"; any other kind, or a
 *   number missing for the kind, falls back to the "line" table.
 * @returns {string[]} The phrases, or an empty array when the number is unknown.
 */
function ordinalWordForms(number, kind = "line") {
    const lineForms = {
        1: ["\u043f\u0435\u0440\u0432\u043e\u0439 \u043b\u0438\u043d\u0438\u0438", "\u043f\u0435\u0440\u0432\u0430\u044f \u043b\u0438\u043d\u0438\u044f", "\u043f\u0435\u0440\u0432\u0443\u044e \u043b\u0438\u043d\u0438\u044e"],
        2: ["\u0432\u0442\u043e\u0440\u043e\u0439 \u043b\u0438\u043d\u0438\u0438", "\u0432\u0442\u043e\u0440\u0430\u044f \u043b\u0438\u043d\u0438\u044f", "\u0432\u0442\u043e\u0440\u0443\u044e \u043b\u0438\u043d\u0438\u044e"],
        3: ["\u0442\u0440\u0435\u0442\u044c\u0435\u0439 \u043b\u0438\u043d\u0438\u0438", "\u0442\u0440\u0435\u0442\u044c\u044f \u043b\u0438\u043d\u0438\u044f", "\u0442\u0440\u0435\u0442\u044c\u044e \u043b\u0438\u043d\u0438\u044e"],
        4: ["\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u043e\u0439 \u043b\u0438\u043d\u0438\u0438", "\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u0430\u044f \u043b\u0438\u043d\u0438\u044f", "\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u0443\u044e \u043b\u0438\u043d\u0438\u044e"],
    };
    const degreeForms = {
        1: ["\u043f\u0435\u0440\u0432\u043e\u0439 \u0441\u0442\u0435\u043f\u0435\u043d\u0438", "\u043f\u0435\u0440\u0432\u0430\u044f \u0441\u0442\u0435\u043f\u0435\u043d\u044c", "\u043f\u0435\u0440\u0432\u0443\u044e \u0441\u0442\u0435\u043f\u0435\u043d\u044c"],
        2: ["\u0432\u0442\u043e\u0440\u043e\u0439 \u0441\u0442\u0435\u043f\u0435\u043d\u0438", "\u0432\u0442\u043e\u0440\u0430\u044f \u0441\u0442\u0435\u043f\u0435\u043d\u044c", "\u0432\u0442\u043e\u0440\u0443\u044e \u0441\u0442\u0435\u043f\u0435\u043d\u044c"],
        3: ["\u0442\u0440\u0435\u0442\u044c\u0435\u0439 \u0441\u0442\u0435\u043f\u0435\u043d\u0438", "\u0442\u0440\u0435\u0442\u044c\u044f \u0441\u0442\u0435\u043f\u0435\u043d\u044c", "\u0442\u0440\u0435\u0442\u044c\u044e \u0441\u0442\u0435\u043f\u0435\u043d\u044c"],
        4: ["\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u043e\u0439 \u0441\u0442\u0435\u043f\u0435\u043d\u0438", "\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u0430\u044f \u0441\u0442\u0435\u043f\u0435\u043d\u044c", "\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u0443\u044e \u0441\u0442\u0435\u043f\u0435\u043d\u044c"],
    };
    const formsByKind = { line: lineForms, degree: degreeForms };
    const forRequestedKind = formsByKind[kind]?.[number];
    if (forRequestedKind != null) {
        return forRequestedKind;
    }
    // Unknown kind or number: fall back to the "line" wording, then to nothing.
    return formsByKind.line[number] ?? [];
}
|
|
1490
|
+
/**
 * Finds where the next numbered item (number + 1 or number + 2) begins.
 *
 * A number counts only when it stands alone, delimited by a space, a dot or
 * the edge of the searched text.
 *
 * @param {string} normalized - Normalized text to search.
 * @param {number} start - Offset at which the search begins.
 * @param {number} number - Ordinal of the current item.
 * @returns {number} Absolute offset of the earliest match (pointing at the
 *   leading delimiter when there is one), or `-1` when neither number occurs.
 */
function nextOrdinalIndex(normalized, start, number) {
    const tail = normalized.slice(start);
    return [number + 1, number + 2].reduce((earliest, candidate) => {
        const found = tail.match(new RegExp(`(?:^|[ .])${candidate}(?:[ .]|$)`, "u"));
        if (found?.index == null) {
            return earliest;
        }
        const position = start + found.index;
        return earliest < 0 || position < earliest ? position : earliest;
    }, -1);
}
|
|
1503
|
+
/**
 * Finds where the next therapy step (number + 1, + 2 or + 3) begins, i.e. the
 * earliest "<number> … ступен" occurrence at or after `start`.
 *
 * @param {string} normalized - Normalized text to search.
 * @param {number} start - Offset at which the search begins.
 * @param {number} number - Ordinal of the current step.
 * @returns {number} Absolute offset of the earliest match, or `-1` when none is found.
 */
function nextStepOrdinalIndex(normalized, start, number) {
    const stepCue = escapeRegExp(normalizeForSearch("\u0441\u0442\u0443\u043f\u0435\u043d"));
    const tail = normalized.slice(start);
    return [number + 1, number + 2, number + 3].reduce((earliest, candidate) => {
        const found = tail.match(new RegExp(`(?:^|\\s)${candidate}(?:\\s*-?\\s*\\S{0,2})?\\s+${stepCue}`, "iu"));
        if (found?.index == null) {
            return earliest;
        }
        const position = start + found.index;
        return earliest < 0 || position < earliest ? position : earliest;
    }, -1);
}
|
|
1517
|
+
/**
 * Converts an ordinal written as 1–2 digits or as a roman numeral (i–x) to a number.
 *
 * @param {string} value - Raw ordinal text; it is passed through `normalizeForSearch`.
 * @returns {number|null} The numeric value, or `null` when the text is neither form.
 */
function ordinalValueToNumber(value) {
    const normalized = normalizeForSearch(value);
    if (/^\d{1,2}$/.test(normalized)) {
        return Number(normalized);
    }
    // Position in this list plus one is the numeral's value.
    const romanNumerals = ["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"];
    const position = romanNumerals.indexOf(normalized);
    return position >= 0 ? position + 1 : null;
}
|
|
1535
|
+
/**
 * Finds where the next degree (number + 1, + 2 or + 3) begins.
 *
 * Every spelling returned by `romanStageVariants` is tried; a hit counts only
 * if the degree cue appears within 180 characters before it or 80 characters
 * from it onwards.
 *
 * @param {string} normalized - Normalized text to search.
 * @param {number} start - Offset at which the search begins.
 * @param {number} number - Ordinal of the current degree.
 * @returns {number} Absolute offset of the earliest accepted hit, or `-1`.
 */
function nextDegreeOrdinalIndex(normalized, start, number) {
    const degreeCue = normalizeForSearch("\u0441\u0442\u0435\u043f\u0435\u043d");
    const tail = normalized.slice(start);
    let earliest = -1;
    for (const nextNumber of [number + 1, number + 2, number + 3]) {
        for (const variant of romanStageVariants(String(nextNumber))) {
            const found = tail.match(new RegExp(`(?:^|\\s)${escapeRegExp(variant)}(?:\\s|-|$)`, "iu"));
            if (typeof found?.index !== "number") {
                continue;
            }
            const position = start + found.index;
            const before = normalized.slice(Math.max(0, position - 180), position);
            const after = normalized.slice(position, Math.min(normalized.length, position + 80));
            const nearDegreeCue = before.includes(degreeCue) || after.includes(degreeCue);
            if (!nearDegreeCue) {
                continue;
            }
            if (earliest < 0 || position < earliest) {
                earliest = position;
            }
        }
    }
    return earliest;
}
|
|
1555
|
+
/**
 * Yields every offset at which `needle` occurs in `haystack`, scanning left to
 * right and resuming after each hit.
 *
 * The scan always advances by at least one character, so an empty needle
 * cannot stall it.
 *
 * @param {string} haystack - Text to scan.
 * @param {string} needle - Text to look for.
 * @returns {Generator<number>} Offsets in ascending order.
 */
function* ordinalNeedlePositions(haystack, needle) {
    const step = Math.max(1, needle.length);
    let from = 0;
    while (from < haystack.length) {
        const at = haystack.indexOf(needle, from);
        if (at < 0) {
            return;
        }
        yield at;
        from = at + step;
    }
}
/**
 * Cuts out the text windows of a source that describe the targeted ordinal item.
 *
 * The strategy depends on `target.kind`:
 *  - "degree": numeral spellings next to the degree cue, bare numerals preceded
 *    by the cue within 220 characters, and spelled-out ordinal phrases;
 *  - "step": "<number> … ступен" matches, up to the next step or 700 characters;
 *  - "stage": stand-alone numbers with "этап" nearby, up to the next number or
 *    520 characters;
 *  - anything else: spelled-out "line" phrases.
 *
 * @param {{normalized: string}} source - Source whose normalized text is searched.
 * @param {{number: number, kind: string}} target - Ordinal target from `ordinalTarget`.
 * @returns {string[]} Window texts in discovery order (duplicates are possible).
 */
function ordinalWindows(source, target) {
    const normalized = source.normalized;
    const windows = [];
    if (target.kind === "degree") {
        const degreeCue = normalizeForSearch("\u0441\u0442\u0435\u043f\u0435\u043d");
        for (const variant of romanStageVariants(String(target.number))) {
            const directPatterns = [
                new RegExp(`(?:^|\\s)${escapeRegExp(variant)}(?:\\s*-?\\s*\\S{0,3})?\\s+${escapeRegExp(degreeCue)}`, "giu"),
                new RegExp(`${escapeRegExp(degreeCue)}\\s+(?:\\S+\\s+){0,2}${escapeRegExp(variant)}(?:\\s|$)`, "giu"),
            ];
            for (const pattern of directPatterns) {
                for (const match of normalized.matchAll(pattern)) {
                    const index = match.index ?? 0;
                    const afterStart = index + match[0].length;
                    const afterLimit = nextDegreeOrdinalIndex(normalized, afterStart + 8, target.number);
                    const end = afterLimit > 0 ? afterLimit : Math.min(normalized.length, afterStart + 520);
                    windows.push(normalized.slice(Math.max(0, index - 160), end));
                }
            }
            // Bare numerals: accepted only with clean boundaries and the degree
            // cue somewhere in the preceding 220 characters.
            for (const index of ordinalNeedlePositions(normalized, variant)) {
                if (!hasSearchBoundaries(normalized, index, variant.length)) {
                    continue;
                }
                const before = normalized.slice(Math.max(0, index - 220), index);
                if (!before.includes(degreeCue)) {
                    continue;
                }
                const afterLimit = nextDegreeOrdinalIndex(normalized, index + variant.length + 8, target.number);
                const end = afterLimit > 0 ? afterLimit : Math.min(normalized.length, index + 520);
                windows.push(normalized.slice(Math.max(0, index - 160), end));
            }
        }
        for (const form of ordinalWordForms(target.number, "degree")) {
            const formNorm = normalizeForSearch(form);
            // An empty form would match at every offset; it carries no signal.
            if (!formNorm) {
                continue;
            }
            for (const index of ordinalNeedlePositions(normalized, formNorm)) {
                windows.push(normalized.slice(Math.max(0, index - 220), Math.min(normalized.length, index + formNorm.length + 480)));
            }
        }
        return windows;
    }
    if (target.kind === "step") {
        const stepCue = normalizeForSearch("\u0441\u0442\u0443\u043f\u0435\u043d");
        const pattern = new RegExp(`(?:^|\\s)${target.number}(?:\\s*-?\\s*\\S{0,2})?\\s+${escapeRegExp(stepCue)}`, "giu");
        for (const match of normalized.matchAll(pattern)) {
            const index = match.index ?? 0;
            const afterStart = index + match[0].length;
            const afterLimit = nextStepOrdinalIndex(normalized, afterStart + 12, target.number);
            const end = afterLimit > 0 ? afterLimit : Math.min(normalized.length, afterStart + 700);
            windows.push(normalized.slice(index, end));
        }
        return windows;
    }
    if (target.kind === "stage") {
        if (!containsNormalizedPhrase(normalized, "\u044d\u0442\u0430\u043f")) {
            return windows;
        }
        const pattern = new RegExp(`(?:^|[ .])${target.number}(?:[ .]|$)`, "gu");
        for (const match of normalized.matchAll(pattern)) {
            const index = match.index ?? 0;
            const before = normalized.slice(Math.max(0, index - 180), index);
            const afterStart = index + match[0].length;
            const afterLimit = nextOrdinalIndex(normalized, afterStart + 12, target.number);
            const end = afterLimit > 0 ? afterLimit : Math.min(normalized.length, afterStart + 520);
            const local = normalized.slice(index, end);
            if (!containsNormalizedPhrase(`${before} ${local}`, "\u044d\u0442\u0430\u043f")) {
                continue;
            }
            windows.push(local);
        }
        return windows;
    }
    for (const form of ordinalWordForms(target.number, "line")) {
        const formNorm = normalizeForSearch(form);
        // Same guard as for degree forms: skip forms that normalize to nothing.
        if (!formNorm) {
            continue;
        }
        for (const index of ordinalNeedlePositions(normalized, formNorm)) {
            windows.push(normalized.slice(lineOrdinalWindowStart(normalized, index), Math.min(normalized.length, index + formNorm.length + 420)));
        }
    }
    return windows;
}
|
|
1649
|
+
/**
 * Chooses where a "line of therapy" window starts relative to the phrase hit.
 *
 * When "терап" occurs in the 80 characters before the hit, the window starts
 * 24 characters before it; otherwise 110 characters before it.
 *
 * @param {string} normalized - Normalized text containing the hit.
 * @param {number} index - Offset of the ordinal phrase.
 * @returns {number} Window start offset, never negative.
 */
function lineOrdinalWindowStart(normalized, index) {
    const lookBehind = normalized.slice(Math.max(0, index - 80), index);
    const backOff = containsNormalizedPhrase(lookBehind, "\u0442\u0435\u0440\u0430\u043f") ? 24 : 110;
    return Math.max(0, index - backOff);
}
|
|
1655
|
+
/**
 * Reports whether a window supports an answer through a known abbreviation.
 *
 * The only pair handled is "сгкс" in the window with "кортикостероид" in the answer.
 *
 * @param {string} answerText - Answer text (normalized here).
 * @param {string} window - Normalized window text.
 * @returns {number} `1` when the pair is present, otherwise `0`.
 */
function abbreviationSupport(answerText, window) {
    const answerNorm = normalizeForSearch(answerText);
    if (!containsNormalizedPhrase(window, "\u0441\u0433\u043a\u0441")) {
        return 0;
    }
    return containsNormalizedPhrase(answerNorm, "\u043a\u043e\u0440\u0442\u0438\u043a\u043e\u0441\u0442\u0435\u0440\u043e\u0438\u0434") ? 1 : 0;
}
|
|
1661
|
+
/**
 * Tokens treated as generic in ordinal questions (ordinal words, stage/degree/
 * class/line/therapy/drug wording, "is/are", and "sarcoidosis"). Each word is
 * expanded with `uniqueTokens`; `specificOrdinalFocusTokens` filters these out.
 */
const ORDINAL_GENERIC_FOCUS = new Set([
    "\u043f\u0435\u0440\u0432\u044b\u0439",
    "\u0432\u0442\u043e\u0440\u043e\u0439",
    "\u0442\u0440\u0435\u0442\u0438\u0439",
    "\u0447\u0435\u0442\u0432\u0435\u0440\u0442\u044b\u0439",
    "\u0441\u0442\u0430\u0434\u0438\u044f",
    "\u0441\u0442\u0430\u0434\u0438\u0438",
    "\u0441\u0442\u0435\u043f\u0435\u043d\u044c",
    "\u0441\u0442\u0435\u043f\u0435\u043d\u0438",
    "\u043a\u043b\u0430\u0441\u0441",
    "\u043a\u043b\u0430\u0441\u0441\u0430",
    "\u043b\u0438\u043d\u0438\u044f",
    "\u043b\u0438\u043d\u0438\u0438",
    "\u044d\u0442\u0430\u043f",
    "\u044d\u0442\u0430\u043f\u043e\u043c",
    "\u0442\u0435\u0440\u0430\u043f\u0438\u044f",
    "\u0442\u0435\u0440\u0430\u043f\u0438\u0438",
    "\u043b\u0435\u0447\u0435\u043d\u0438\u0435",
    "\u043b\u0435\u0447\u0435\u043d\u0438\u044f",
    "\u043f\u0440\u0435\u043f\u0430\u0440\u0430\u0442",
    "\u043f\u0440\u0435\u043f\u0430\u0440\u0430\u0442\u043e\u043c",
    "\u043f\u0440\u0435\u043f\u0430\u0440\u0430\u0442\u0430\u043c\u0438",
    "\u044f\u0432\u043b\u044f\u0435\u0442\u0441\u044f",
    "\u044f\u0432\u043b\u044f\u044e\u0442\u0441\u044f",
    "\u0441\u0430\u0440\u043a\u043e\u0438\u0434\u043e\u0437",
    "\u0441\u0430\u0440\u043a\u043e\u0438\u0434\u043e\u0437\u0430",
]
    .map((item) => uniqueTokens(item))
    .flat());
|
|
1688
|
+
/**
 * Keeps only the focus tokens that carry question-specific meaning.
 *
 * A token is kept when it has at least 4 characters, does not start with a
 * digit, and is not listed in `ORDINAL_GENERIC_FOCUS`.
 *
 * @param {string[]|null|undefined} focusTokens - Focus tokens of the question.
 * @returns {string[]} The specific tokens (empty when the input is nullish).
 */
function specificOrdinalFocusTokens(focusTokens) {
    const isSpecific = (token) => token.length >= 4 && !/^\d/.test(token) && !ORDINAL_GENERIC_FOCUS.has(token);
    const candidates = focusTokens ?? [];
    return candidates.filter((token) => isSpecific(token));
}
|
|
1691
|
+
/**
 * Checks whether a window mentions a specific focus term in a negated context.
 *
 * Each token of at least 6 characters is reduced to a stem of up to 8
 * characters. The window is negating when any stem occurrence has "без",
 * "отсут" or "нет" within the 58 characters before it.
 *
 * @param {string} window - Normalized window text.
 * @param {string[]|null|undefined} specificTokens - Question-specific focus tokens.
 * @returns {boolean} `true` on the first negated occurrence, otherwise `false`.
 */
function ordinalWindowNegatesSpecificFocus(window, specificTokens) {
    const negationCues = ["\u0431\u0435\u0437", "\u043e\u0442\u0441\u0443\u0442", "\u043d\u0435\u0442"];
    const isNegatedAt = (position) => {
        const before = window.slice(Math.max(0, position - 58), position);
        return negationCues.some((cue) => containsNormalizedPhrase(before, cue));
    };
    for (const token of specificTokens ?? []) {
        if (token.length < 6) {
            continue;
        }
        const stem = token.slice(0, Math.min(8, token.length));
        for (let at = window.indexOf(stem); at >= 0; at = window.indexOf(stem, at + stem.length)) {
            if (isNegatedAt(at)) {
                return true;
            }
        }
    }
    return false;
}
|
|
1712
|
+
/**
 * Finds the best evidence that an answer belongs to the ordinal item (stage,
 * line, step or degree) the question asks about.
 *
 * Only "degree" targets are evaluated outside single-answer mode. For "step"
 * targets each page is additionally searched together with the following page.
 *
 * @param {{mode: string, pages: Array, question: string,
 *   answer: {id: *, text: string}, answerTokens: string[],
 *   focusTokens: string[]}} params
 * @returns {object|null} Evidence of kind "ordinal_list_segment" chosen by
 *   `betterEvidence`, or `null` when there is no target or no qualifying window.
 */
function bestOrdinalListSupport({ mode, pages, question, answer, answerTokens, focusTokens }) {
    const target = ordinalTarget(question);
    if (!target || (mode !== "single" && target.kind !== "degree")) {
        return null;
    }
    const answerPhrases = answerSearchPhrases(answer.text).slice(0, 16);
    const specificTokens = specificOrdinalFocusTokens(focusTokens);
    let winner = null;
    for (const page of pages) {
        const followingPage = target.kind === "step" ? pages.find((candidate) => candidate.page === page.page + 1) : null;
        const sources = [...cachedLineWindowSegments(page), { normalized: page.normalized, text: page.text }];
        if (followingPage) {
            // A step description may continue onto the next page.
            const text = `${page.text}\n${followingPage.text}`;
            sources.push({ normalized: normalizeForSearch(text), text });
        }
        for (const source of sources) {
            for (const window of ordinalWindows(source, target)) {
                const windowTokens = tokenizeNormalized(window);
                const focusHits = tokenHitCount(specificTokens, windowTokens);
                const focusCoverage = strictSoftCoverage(specificTokens, windowTokens);
                const lacksFocus = target.kind !== "step" && specificTokens.length && focusHits <= 0 && focusCoverage < 0.72;
                if (lacksFocus) {
                    continue;
                }
                if (target.kind === "line" && ordinalWindowNegatesSpecificFocus(window, specificTokens)) {
                    continue;
                }
                const answerCoverage = strictSoftCoverage(answerTokens, windowTokens);
                const phraseHit = answerPhrases.some((phrase) => containsNormalizedPhrase(window, phrase));
                const abbreviation = abbreviationSupport(answer.text, window);
                if (!phraseHit && answerCoverage < 0.58 && abbreviation <= 0) {
                    continue;
                }
                const score = 12.2 +
                    (phraseHit ? 2.4 : 0) +
                    Math.max(answerCoverage, abbreviation) * 4.4 +
                    Math.min(2, focusHits) * 1.1 +
                    Math.min(1, focusCoverage) * 0.8;
                winner = betterEvidence(winner, {
                    answerId: answer.id,
                    page: page.page,
                    text: source.text,
                    score,
                    kind: "ordinal_list_segment",
                });
            }
        }
    }
    return winner;
}
|
|
1759
|
+
/**
 * Detects which numbered "type" (first, second or third) a question is about.
 *
 * The question must mention "тип" together with "характер" or "механизм".
 *
 * @param {string} question - Question text.
 * @returns {number|null} 1, 2 or 3 for the first ordinal cue found (checked in
 *   that order), or `null` when the question does not qualify.
 */
function typeOrdinalNumber(question) {
    const normalized = normalizeForSearch(question);
    const mentions = (cue) => containsNormalizedPhrase(normalized, cue);
    if (!mentions("\u0442\u0438\u043f")) {
        return null;
    }
    if (!mentions("\u0445\u0430\u0440\u0430\u043a\u0442\u0435\u0440") && !mentions("\u043c\u0435\u0445\u0430\u043d\u0438\u0437\u043c")) {
        return null;
    }
    const ordinalCues = [
        ["\u043f\u0435\u0440\u0432", 1],
        ["\u0432\u0442\u043e\u0440", 2],
        ["\u0442\u0440\u0435\u0442", 3],
    ];
    const hit = ordinalCues.find(([cue]) => mentions(cue));
    return hit ? hit[1] : null;
}
|
|
1775
|
+
/**
 * Returns three inflected Russian word forms for an ordinal.
 *
 * @param {number} number - 1 selects the "первый" forms, 2 the "второй" forms;
 *   every other value (including 3) falls through to the "третий" forms.
 * @returns {string[]} A freshly allocated array of three word forms.
 */
function typeOrdinalForms(number) {
    switch (number) {
        case 1:
            return ["\u043f\u0435\u0440\u0432\u044b\u0439", "\u043f\u0435\u0440\u0432\u043e\u0433\u043e", "\u043f\u0435\u0440\u0432\u044b\u043c"];
        case 2:
            return ["\u0432\u0442\u043e\u0440\u043e\u0439", "\u0432\u0442\u043e\u0440\u043e\u0433\u043e", "\u0432\u0442\u043e\u0440\u044b\u043c"];
        default:
            return ["\u0442\u0440\u0435\u0442\u0438\u0439", "\u0442\u0440\u0435\u0442\u044c\u0435\u0433\u043e", "\u0442\u0440\u0435\u0442\u044c\u0438\u043c"];
    }
}
|
|
1782
|
+
/**
 * Finds the earliest position, at or after `start`, where a *different*
 * ordinal begins, so the caller can cut a window before it.
 *
 * For each word form of the other ordinals, occurrences are scanned left to
 * right. An occurrence qualifies when the form contains a digit, when "тип"
 * appears in the 20 characters before plus the 40 characters from the match,
 * or when "и" appears in the 20 characters before. Only the first qualifying
 * occurrence per form is considered.
 *
 * @param {string} normalized - Normalized text to scan.
 * @param {number} start - Index to start searching from.
 * @param {number} number - The ordinal being extracted (its own forms are ignored).
 * @returns {number} Smallest qualifying index, or -1 when there is none.
 */
function nextTypeOrdinalBoundary(normalized, start, number) {
    let earliest = -1;
    const competingForms = [1, 2, 3]
        .filter((candidate) => candidate !== number)
        .flatMap((candidate) => typeOrdinalForms(candidate));
    for (const form of competingForms) {
        const needle = normalizeForSearch(form);
        for (let at = normalized.indexOf(needle, start); at >= 0; at = normalized.indexOf(needle, at + needle.length)) {
            const lead = normalized.slice(Math.max(0, at - 20), at);
            const tail = normalized.slice(at, Math.min(normalized.length, at + 40));
            const qualifies = /\d/u.test(form) ||
                containsNormalizedPhrase(`${lead} ${tail}`, "\u0442\u0438\u043f") ||
                containsNormalizedPhrase(lead, "\u0438");
            if (qualifies) {
                earliest = earliest < 0 ? at : Math.min(earliest, at);
                break;
            }
        }
    }
    return earliest;
}
|
|
1803
|
+
/**
 * Cuts text windows that start at each mention of the requested ordinal.
 *
 * A mention counts only when "тип" occurs in the 180 characters before it plus
 * the 90 characters starting at it. The window runs from the mention to the
 * next other-ordinal boundary (searched from 8 characters past the mention),
 * or, when no boundary is found, to 360 characters past the mention.
 *
 * @param {{ normalized: string }} source - Segment carrying normalized text.
 * @param {number} number - Ordinal to look for.
 * @returns {string[]} Normalized window strings, in discovery order.
 */
function typeOrdinalWindows(source, number) {
    const text = source.normalized;
    const found = [];
    for (const form of typeOrdinalForms(number)) {
        const needle = normalizeForSearch(form);
        // Always advance by at least one character so the scan terminates.
        const stride = Math.max(1, needle.length);
        let cursor = 0;
        while (cursor < text.length) {
            const hit = text.indexOf(needle, cursor);
            if (hit < 0) {
                break;
            }
            cursor = hit + stride;
            const lead = text.slice(Math.max(0, hit - 180), hit);
            const vicinity = text.slice(hit, Math.min(text.length, hit + 90));
            if (!containsNormalizedPhrase(`${lead} ${vicinity}`, "\u0442\u0438\u043f")) {
                continue;
            }
            const tailStart = hit + needle.length;
            const boundary = nextTypeOrdinalBoundary(text, tailStart + 8, number);
            const stop = boundary > tailStart ? boundary : Math.min(text.length, tailStart + 360);
            found.push(text.slice(hit, stop));
        }
    }
    return found;
}
|
|
1826
|
+
/**
 * Scores how well abbreviations in a window stand in for terms spelled out in
 * the answer.
 *
 * Two rules are applied and their weights summed:
 *  - answer mentions "аорт" and "клапан", window mentions "АК": +0.28
 *  - answer mentions "восход" and "аорт", window mentions "ВА": +0.22
 *
 * @param {string} answerText - Raw answer text (normalized here).
 * @param {string} window - Window text, passed to containsNormalizedPhrase as is.
 * @returns {number} Accumulated support (0, 0.22, 0.28 or their sum).
 */
function typeAbbreviationSupport(answerText, window) {
    const answerNorm = normalizeForSearch(answerText);
    const rules = [
        {
            answerStems: ["\u0430\u043e\u0440\u0442", "\u043a\u043b\u0430\u043f\u0430\u043d"],
            abbreviation: "\u0410\u041a",
            weight: 0.28,
        },
        {
            answerStems: ["\u0432\u043e\u0441\u0445\u043e\u0434", "\u0430\u043e\u0440\u0442"],
            abbreviation: "\u0412\u0410",
            weight: 0.22,
        },
    ];
    return rules.reduce((total, rule) => {
        const applies = rule.answerStems.every((stem) => containsNormalizedPhrase(answerNorm, stem)) &&
            containsNormalizedPhrase(window, rule.abbreviation);
        return applies ? total + rule.weight : total;
    }, 0);
}
|
|
1841
|
+
/**
 * Tokens regarded as generic for type-ordinal answers (valve/aortic/
 * regurgitation/flow vocabulary). Each listed word is run through
 * uniqueTokens and all resulting tokens are collected into one Set.
 */
const TYPE_ORDINAL_GENERIC_ANSWER = (() => {
    const genericWords = [
        "\u0441\u0442\u0432\u043e\u0440\u043a\u0438",
        "\u0441\u0442\u0432\u043e\u0440\u043e\u043a",
        "\u0430\u043e\u0440\u0442\u0430\u043b\u044c\u043d\u043e\u0433\u043e",
        "\u0430\u043e\u0440\u0442\u0430\u043b\u044c\u043d\u044b\u0439",
        "\u043a\u043b\u0430\u043f\u0430\u043d",
        "\u043a\u043b\u0430\u043f\u0430\u043d\u0430",
        "\u0440\u0435\u0433\u0443\u0440\u0433\u0438\u0442\u0430\u0446\u0438\u0438",
        "\u043f\u043e\u0442\u043e\u043a",
        "\u043f\u043e\u0442\u043e\u043a\u043e\u043c",
    ];
    const collected = new Set();
    for (const word of genericWords) {
        for (const token of uniqueTokens(word)) {
            collected.add(token);
        }
    }
    return collected;
})();
|
|
1852
|
+
/**
 * Keeps only the answer tokens that can discriminate between type-ordinal
 * answers: at least 4 characters long and not in TYPE_ORDINAL_GENERIC_ANSWER.
 *
 * @param {string[]} answerTokens - Tokens of the candidate answer.
 * @returns {string[]} The distinctive subset, in original order.
 */
function typeDistinctiveAnswerTokens(answerTokens) {
    const isDistinctive = (token) => !(token.length < 4 || TYPE_ORDINAL_GENERIC_ANSWER.has(token));
    return answerTokens.filter(isDistinctive);
}
|
|
1855
|
+
/**
 * Looks for the best evidence that an answer matches the description of the
 * ordinal type ("first/second/third type") the question asks about.
 *
 * Every page contributes its cached line-window segments plus the whole page.
 * Each ordinal window in those sources is scored; a window is rejected when
 * the answer has distinctive tokens and none is covered, or when there is no
 * phrase hit and combined support is below 0.5.
 *
 * @param {object} args
 * @param {Array} args.pages - Page records (`page`, `normalized`, `text`).
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @returns {object | null} Evidence of kind "type_ordinal_segment" chosen by
 *   betterEvidence, or null when the question is not ordinal or nothing matched.
 */
function bestTypeOrdinalSupport({ pages, question, answer, answerTokens }) {
    const ordinal = typeOrdinalNumber(question);
    if (!ordinal) {
        return null;
    }
    const phrases = answerSearchPhrases(answer.text).slice(0, 16);
    const distinctive = typeDistinctiveAnswerTokens(answerTokens);

    // Scores one window; returns null when the window gives too little support.
    const scoreWindow = (window) => {
        const windowTokens = tokenizeNormalized(window);
        const phraseHit = phrases.some((phrase) => containsNormalizedPhrase(window, phrase));
        const coverageScore = strictSoftCoverage(answerTokens, windowTokens);
        const distinctiveCoverage = distinctive.length ? softCoverage(distinctive, windowTokens) : 0;
        if (distinctive.length && distinctiveCoverage <= 0) {
            return null;
        }
        const abbreviation = typeAbbreviationSupport(answer.text, window);
        const support = Math.min(1, coverageScore + abbreviation + Math.min(0.2, distinctiveCoverage * 0.2));
        if (!phraseHit && support < 0.5) {
            return null;
        }
        return 13.4 + (phraseHit ? 2.6 : 0) + support * 5.2;
    };

    let winner = null;
    for (const page of pages) {
        const wholePage = { normalized: page.normalized, text: page.text };
        for (const source of [...cachedLineWindowSegments(page), wholePage]) {
            for (const window of typeOrdinalWindows(source, ordinal)) {
                const score = scoreWindow(window);
                if (score === null) {
                    continue;
                }
                winner = betterEvidence(winner, {
                    answerId: answer.id,
                    page: page.page,
                    text: source.text,
                    score,
                    kind: "type_ordinal_segment",
                });
            }
        }
    }
    return winner;
}
|
|
1889
|
+
/**
 * Tokens that terminate an indication label extracted from a question:
 * patient words ("пациент", "больной", ...) and the prepositions
 * "за", "с", "при", "по". Each word is expanded through rawTokens.
 */
const INDICATION_LABEL_STOPS = (() => {
    const stopWords = [
        "\u043f\u0430\u0446\u0438\u0435\u043d\u0442",
        "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u0430",
        "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u043e\u0432",
        "\u0431\u043e\u043b\u044c\u043d\u043e\u0439",
        "\u0431\u043e\u043b\u044c\u043d\u044b\u0445",
        "\u0437\u0430",
        "\u0441",
        "\u043f\u0440\u0438",
        "\u043f\u043e",
    ];
    const collected = new Set();
    for (const word of stopWords) {
        for (const token of rawTokens(word)) {
            collected.add(token);
        }
    }
    return collected;
})();
|
|
1900
|
+
/**
 * Extracts the label an indication question is about.
 *
 * The question must contain a token starting with "показан". The label is the
 * run of up to five tokens following the first "для" or "к" token, cut short
 * at the first token found in INDICATION_LABEL_STOPS.
 *
 * @param {string} question - Question text.
 * @returns {string | null} Space-joined label tokens, or null when the
 *   question does not qualify or the label would be empty.
 */
function questionIndicationLabel(question) {
    const words = rawTokens(question);
    const mentionsIndication = words.some((word) => word.startsWith("\u043f\u043e\u043a\u0430\u0437\u0430\u043d"));
    if (!mentionsIndication) {
        return null;
    }
    const anchor = words.findIndex((word) => word === "\u0434\u043b\u044f" || word === "\u043a");
    if (anchor < 0) {
        return null;
    }
    const collected = [];
    for (const word of words.slice(anchor + 1)) {
        if (collected.length >= 5 || INDICATION_LABEL_STOPS.has(word)) {
            break;
        }
        collected.push(word);
    }
    return collected.length ? collected.join(" ") : null;
}
|
|
1916
|
+
/**
 * Decides whether a line (or short run of lines) is about the indication
 * label taken from the question.
 *
 * The label tokens must be softly covered by the line: at least 0.9 coverage
 * for labels of up to three tokens, 0.72 for longer ones. On top of that the
 * label must have two or more tokens, or the line must mention "показан" or
 * "рекоменд".
 *
 * The line is normalized once and reused for both tokenization and the phrase
 * checks; this function runs for every line of every page.
 *
 * @param {string} line - Raw line text.
 * @param {string[]} labelTokens - Tokens of the indication label.
 * @returns {boolean} True when the line matches the label.
 */
function indicationLineMatches(line, labelTokens) {
    const normalized = normalizeForSearch(line);
    const lineTokens = tokenizeNormalized(normalized);
    const requiredCoverage = labelTokens.length <= 3 ? 0.9 : 0.72;
    if (softCoverage(labelTokens, lineTokens) < requiredCoverage) {
        return false;
    }
    // Multi-token labels are specific enough on their own; single-token labels
    // additionally need an indication/recommendation cue on the line.
    return (labelTokens.length >= 2 ||
        containsNormalizedPhrase(normalized, "\u043f\u043e\u043a\u0430\u0437\u0430\u043d") ||
        containsNormalizedPhrase(normalized, "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434"));
}
|
|
1925
|
+
/**
 * Assembles the text segment belonging to an indication line.
 *
 * The segment normally starts at `index`. It starts two lines earlier when
 * the current line does not mention "госпитал" and the two preceding lines
 * mention "отсут". Lines are appended up to (but excluding) `index + 5`,
 * stopping early at a following line that mentions "планов", "экстрен" or
 * "показания к".
 *
 * @param {string[]} lines - Lines of one page.
 * @param {number} index - Index of the matched line.
 * @returns {string} The collected lines joined with single spaces.
 */
function buildIndicationSegment(lines, index) {
    const currentNorm = normalizeForSearch(lines[index]);
    const precedingNorm = normalizeForSearch(lines.slice(Math.max(0, index - 2), index).join(" "));
    const includePreceding = !containsNormalizedPhrase(currentNorm, "\u0433\u043e\u0441\u043f\u0438\u0442\u0430\u043b") &&
        containsNormalizedPhrase(precedingNorm, "\u043e\u0442\u0441\u0443\u0442");
    const first = includePreceding ? Math.max(0, index - 2) : index;
    const limit = Math.min(lines.length, index + 5);
    // A later line mentioning any of these ends the segment.
    const terminators = [
        "\u043f\u043b\u0430\u043d\u043e\u0432",
        "\u044d\u043a\u0441\u0442\u0440\u0435\u043d",
        "\u043f\u043e\u043a\u0430\u0437\u0430\u043d\u0438\u044f \u043a",
    ];
    const collected = [];
    for (let at = first; at < limit; at += 1) {
        if (at > index) {
            const lineNorm = normalizeForSearch(lines[at]);
            if (terminators.some((stem) => containsNormalizedPhrase(lineNorm, stem))) {
                break;
            }
        }
        collected.push(lines[at]);
    }
    return collected.join(" ");
}
|
|
1946
|
+
/**
 * Gives credit for two hard-coded paraphrase patterns between an answer and
 * an indication segment.
 *
 *  - answer has "сохран" + "функц" and the segment has "отсут" + "сниж" +
 *    "функц": 0.78
 *  - answer and segment both have "остр" + "прогрес": 0.86
 *
 * @param {string} answerText - Raw answer text.
 * @param {string} segment - Raw segment text.
 * @returns {number} 0.78, 0.86, or 0 when neither pattern applies (the first
 *   matching pattern wins).
 */
function indicationSemanticSupport(answerText, segment) {
    const answerNorm = normalizeForSearch(answerText);
    const segmentNorm = normalizeForSearch(segment);
    const answerHas = (stem) => containsNormalizedPhrase(answerNorm, stem);
    const segmentHas = (stem) => containsNormalizedPhrase(segmentNorm, stem);
    const patterns = [
        {
            answerStems: ["\u0441\u043e\u0445\u0440\u0430\u043d", "\u0444\u0443\u043d\u043a\u0446"],
            segmentStems: ["\u043e\u0442\u0441\u0443\u0442", "\u0441\u043d\u0438\u0436", "\u0444\u0443\u043d\u043a\u0446"],
            support: 0.78,
        },
        {
            answerStems: ["\u043e\u0441\u0442\u0440", "\u043f\u0440\u043e\u0433\u0440\u0435\u0441"],
            segmentStems: ["\u043e\u0441\u0442\u0440", "\u043f\u0440\u043e\u0433\u0440\u0435\u0441"],
            support: 0.86,
        },
    ];
    for (const pattern of patterns) {
        const matches = pattern.answerStems.every((stem) => answerHas(stem)) &&
            pattern.segmentStems.every((stem) => segmentHas(stem));
        if (matches) {
            return pattern.support;
        }
    }
    return 0;
}
|
|
1964
|
+
/**
 * Detects an answer that asserts something the segment states as absent.
 *
 * Returns true only when all hold: the segment mentions "отсут" (absence),
 * the answer does not, the answer mentions "угроз" or "недостат", and the
 * segment mentions "недостат".
 *
 * @param {string} answerText - Raw answer text.
 * @param {string} segment - Raw segment text.
 * @returns {boolean} True when the answer contradicts the segment this way.
 */
function indicationContrastMismatch(answerText, segment) {
    const answerNorm = normalizeForSearch(answerText);
    const segmentNorm = normalizeForSearch(segment);
    if (!containsNormalizedPhrase(segmentNorm, "\u043e\u0442\u0441\u0443\u0442")) {
        return false;
    }
    if (containsNormalizedPhrase(answerNorm, "\u043e\u0442\u0441\u0443\u0442")) {
        return false;
    }
    const answerAsserts = containsNormalizedPhrase(answerNorm, "\u0443\u0433\u0440\u043e\u0437") ||
        containsNormalizedPhrase(answerNorm, "\u043d\u0435\u0434\u043e\u0441\u0442\u0430\u0442");
    if (!answerAsserts) {
        return false;
    }
    return Boolean(containsNormalizedPhrase(segmentNorm, "\u043d\u0435\u0434\u043e\u0441\u0442\u0430\u0442"));
}
|
|
1975
|
+
/**
 * Finds the best evidence for an answer inside the document segment that
 * belongs to the indication label named in the question.
 *
 * Each line, joined with the following line, is tested against the label.
 * For matching lines a segment is built, rejected on a contrast mismatch,
 * and then scored from phrase hits, strict token coverage and the hard-coded
 * semantic patterns. Segments with no phrase hit and support below 0.45 are
 * dropped.
 *
 * @param {object} args
 * @param {Array} args.pages - Page records; `lines` may be missing.
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @returns {object | null} Evidence of kind "indication_label_segment" chosen
 *   by betterEvidence, or null.
 */
function bestIndicationSegmentSupport({ pages, question, answer, answerTokens }) {
    const label = questionIndicationLabel(question);
    if (!label) {
        return null;
    }
    const labelTokens = uniqueTokens(label);
    if (!labelTokens.length) {
        return null;
    }
    const phrases = answerSearchPhrases(answer.text).slice(0, 16);

    // Builds evidence for the line at `index`, or null when it does not qualify.
    const evidenceForLine = (page, lines, index) => {
        const neighborhood = lines.slice(index, Math.min(lines.length, index + 2)).join(" ");
        if (!indicationLineMatches(neighborhood, labelTokens)) {
            return null;
        }
        const segment = buildIndicationSegment(lines, index);
        if (indicationContrastMismatch(answer.text, segment)) {
            return null;
        }
        const segmentNorm = normalizeForSearch(segment);
        const segmentTokens = tokenizeNormalized(segmentNorm);
        const phraseHit = phrases.some((phrase) => containsNormalizedPhrase(segmentNorm, phrase));
        const answerCoverage = strictSoftCoverage(answerTokens, segmentTokens);
        const semantic = indicationSemanticSupport(answer.text, segment);
        const support = Math.max(answerCoverage, semantic);
        if (!phraseHit && support < 0.45) {
            return null;
        }
        const score = 13.8 + (phraseHit ? 2.6 : 0) + support * 5.4;
        return {
            answerId: answer.id,
            page: page.page,
            text: segment,
            score,
            kind: "indication_label_segment",
        };
    };

    let winner = null;
    for (const page of pages) {
        const lines = page.lines ?? [];
        for (let index = 0; index < lines.length; index += 1) {
            const candidate = evidenceForLine(page, lines, index);
            if (candidate !== null) {
                winner = betterEvidence(winner, candidate);
            }
        }
    }
    return winner;
}
|
|
2013
|
+
/**
 * Penalizes a child-oriented answer when the document contains a segment
 * that pairs children with a contraindication or an adults-only statement.
 *
 * Applies only when the question mentions "показ", "назнач" or "рекоменд",
 * and the answer mentions children ("детск", "детям", "дети", "детей")
 * without mentioning "взрос". The first cached line-window segment that
 * mentions "дет" together with "противопоказ" or "только взрос" triggers
 * the penalty.
 *
 * @param {object} args
 * @param {Array} args.pages - Page records.
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @returns {{ adjustment: number, evidence: object | null }} -4.2 with
 *   evidence of kind "age_eligibility_contraindication", or a zero adjustment.
 */
function ageEligibilityAdjustment({ pages, question, answer }) {
    const noAdjustment = () => ({ adjustment: 0, evidence: null });
    const questionNorm = normalizeForSearch(question);
    const answerNorm = normalizeForSearch(answer.text);

    const eligibilityStems = [
        "\u043f\u043e\u043a\u0430\u0437",
        "\u043d\u0430\u0437\u043d\u0430\u0447",
        "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
    ];
    if (!eligibilityStems.some((stem) => containsNormalizedPhrase(questionNorm, stem))) {
        return noAdjustment();
    }

    const childStems = [
        "\u0434\u0435\u0442\u0441\u043a",
        "\u0434\u0435\u0442\u044f\u043c",
        "\u0434\u0435\u0442\u0438",
        "\u0434\u0435\u0442\u0435\u0439",
    ];
    const childAnswer = childStems.some((stem) => containsNormalizedPhrase(answerNorm, stem));
    if (!childAnswer || containsNormalizedPhrase(answerNorm, "\u0432\u0437\u0440\u043e\u0441")) {
        return noAdjustment();
    }

    for (const page of pages) {
        for (const source of cachedLineWindowSegments(page)) {
            const segmentNorm = source.normalized;
            if (!containsNormalizedPhrase(segmentNorm, "\u0434\u0435\u0442")) {
                continue;
            }
            const restricted = containsNormalizedPhrase(segmentNorm, "\u043f\u0440\u043e\u0442\u0438\u0432\u043e\u043f\u043e\u043a\u0430\u0437") ||
                containsNormalizedPhrase(segmentNorm, "\u0442\u043e\u043b\u044c\u043a\u043e \u0432\u0437\u0440\u043e\u0441");
            if (!restricted) {
                continue;
            }
            return {
                adjustment: -4.2,
                evidence: {
                    answerId: answer.id,
                    page: page.page,
                    text: source.text,
                    score: 4.2,
                    kind: "age_eligibility_contraindication",
                },
            };
        }
    }
    return noAdjustment();
}
|
|
2048
|
+
/**
 * Pulls the term being defined out of a definition-style question.
 *
 * Two shapes are recognized:
 *  - "под <term> поним..." - the tokens strictly between "под" and the first
 *    token starting with "поним" (at least one token is required);
 *  - "<term> назыв..." - all tokens before the first token starting with
 *    "назыв", provided it is not the first token.
 *
 * @param {string} question - Question text.
 * @returns {string | null} Space-joined term tokens, or null.
 */
function questionDefinitionTerm(question) {
    const words = rawTokens(question);
    const underAt = words.indexOf("\u043f\u043e\u0434");
    const understoodAt = words.findIndex((word) => word.startsWith("\u043f\u043e\u043d\u0438\u043c"));
    const hasUnderShape = underAt >= 0 && understoodAt > underAt + 1;
    if (hasUnderShape) {
        return words.slice(underAt + 1, understoodAt).join(" ");
    }
    const calledAt = words.findIndex((word) => word.startsWith("\u043d\u0430\u0437\u044b\u0432"));
    return calledAt > 0 ? words.slice(0, calledAt).join(" ") : null;
}
|
|
2060
|
+
/**
 * Locates a term in normalized text.
 *
 * The normalized term is searched verbatim first. Failing that, the first
 * token of the term that is at least five characters long is truncated to
 * six characters and searched as a prefix.
 *
 * @param {string} normalized - Normalized text.
 * @param {string} term - Term taken from the question.
 * @returns {number} Index of the match, or -1.
 */
function definitionTermIndex(normalized, term) {
    const exactAt = normalized.indexOf(normalizeForSearch(term));
    if (exactAt >= 0) {
        return exactAt;
    }
    const firstLongToken = uniqueTokens(term).find((token) => token.length >= 5);
    if (firstLongToken === undefined) {
        return -1;
    }
    const stem = firstLongToken.slice(0, Math.min(6, firstLongToken.length));
    return normalized.indexOf(stem);
}
|
|
2070
|
+
/**
 * Extracts the window of text that appears to define a term.
 *
 * Candidate needles are the normalized term followed by six-character
 * prefixes of its tokens of five or more characters. For each needle, each
 * occurrence is accepted when the 56 characters from the match mention
 * "это" or "поним", or contain a hyphen. The accepted window spans up to 300
 * characters and is shortened to end 24 characters before the next "это"
 * found at least 64 characters later, but never shorter than 80 characters.
 *
 * When no occurrence is accepted, a 260-character window from
 * definitionTermIndex is used instead.
 *
 * @param {string} normalized - Normalized text to search.
 * @param {string} term - Term taken from the question.
 * @returns {string | null} The window, or null when the term is not found.
 */
function definitionTermWindow(normalized, term) {
    const exact = normalizeForSearch(term);
    const tokenStems = uniqueTokens(term)
        .filter((token) => token.length >= 5)
        .map((token) => token.slice(0, Math.min(6, token.length)));
    const needles = [exact, ...tokenStems].filter(Boolean);
    const searchList = needles.length ? needles : [normalizeForSearch(term)];

    for (const needle of searchList) {
        const stride = Math.max(1, needle.length);
        let from = 0;
        while (from < normalized.length) {
            const at = normalized.indexOf(needle, from);
            if (at < 0) {
                break;
            }
            const lookahead = normalized.slice(at, Math.min(normalized.length, at + 56));
            const looksLikeDefinition = containsNormalizedPhrase(lookahead, "\u044d\u0442\u043e") ||
                containsNormalizedPhrase(lookahead, "\u043f\u043e\u043d\u0438\u043c") ||
                lookahead.includes("-");
            if (looksLikeDefinition) {
                const hardStop = Math.min(normalized.length, at + 300);
                const nextMarker = normalized.indexOf(normalizeForSearch("\u044d\u0442\u043e"), at + 64);
                const stop = nextMarker > at
                    ? Math.min(hardStop, Math.max(at + 80, nextMarker - 24))
                    : hardStop;
                return normalized.slice(at, stop);
            }
            from = at + stride;
        }
    }

    const fallbackAt = definitionTermIndex(normalized, term);
    if (fallbackAt < 0) {
        return null;
    }
    return normalized.slice(fallbackAt, Math.min(normalized.length, fallbackAt + 260));
}
|
|
2100
|
+
/**
 * Collects upper-case abbreviations from an answer.
 *
 * An abbreviation is a run of two or more upper-case Latin or Cyrillic
 * letters, optionally followed by a hyphen and another such run. Each match
 * is normalized and kept when the normalized form is at least two characters.
 *
 * @param {*} answerText - Answer text; null/undefined is treated as "".
 * @returns {string[]} Normalized abbreviations in order of appearance.
 */
function answerAbbreviations(answerText) {
    const pattern = /[A-ZА-ЯЁ]{2,}(?:-[A-ZА-ЯЁ]{2,})?/gu;
    const rawMatches = String(answerText ?? "").match(pattern) ?? [];
    const kept = [];
    for (const raw of rawMatches) {
        const normalizedAbbr = normalizeForSearch(raw);
        if (normalizedAbbr.length >= 2) {
            kept.push(normalizedAbbr);
        }
    }
    return kept;
}
|
|
2105
|
+
/**
 * Finds the best evidence that an answer matches the document's definition
 * of the term the question asks about.
 *
 * Skipped when no term is found or its normalized form is shorter than four
 * characters. For each source (cached line windows plus the whole page) a
 * definition window is extracted; it is discarded when the answer carries
 * abbreviations and none appears in the window, or when strict token
 * coverage is below 0.52.
 *
 * @param {object} args
 * @param {Array} args.pages - Page records.
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @returns {object | null} Evidence of kind "term_definition_segment" chosen
 *   by betterEvidence, or null.
 */
function bestTermDefinitionSupport({ pages, question, answer, answerTokens }) {
    const term = questionDefinitionTerm(question);
    if (!term) {
        return null;
    }
    if (normalizeForSearch(term).length < 4) {
        return null;
    }
    const abbreviations = answerAbbreviations(answer.text);

    // Scores one source; returns null when it does not define the term well enough.
    const scoreSource = (source) => {
        const window = definitionTermWindow(source.normalized, term);
        if (!window) {
            return null;
        }
        const abbreviationMissing = abbreviations.length &&
            !abbreviations.some((abbr) => window.includes(abbr));
        if (abbreviationMissing) {
            return null;
        }
        const answerCoverage = strictSoftCoverage(answerTokens, tokenizeNormalized(window));
        if (answerCoverage < 0.52) {
            return null;
        }
        return 14.2 + answerCoverage * 6.2 + numberCoverage(answer.text, window) * 0.8;
    };

    let winner = null;
    for (const page of pages) {
        const wholePage = { normalized: page.normalized, text: page.text };
        for (const source of [...cachedLineWindowSegments(page), wholePage]) {
            const score = scoreSource(source);
            if (score === null) {
                continue;
            }
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: page.page,
                text: source.text,
                score,
                kind: "term_definition_segment",
            });
        }
    }
    return winner;
}
|
|
2137
|
+
/**
 * Penalizes an answer whose first word appears in the document in a negated
 * form while one of its other words is also present on that page.
 *
 * Applies only in "single" mode, to answers with at least two tokens, when
 * the question mentions "образов" or "характер". Answers whose first token
 * already starts with "he" or "не" are left alone. The negated form searched
 * for is "he" followed by the first four characters of the first token.
 * NOTE(review): the Latin "he" prefix presumably mirrors how normalizeForSearch
 * renders Cyrillic "не" - confirm against that helper.
 *
 * @param {object} args
 * @param {string} args.mode - Question mode.
 * @param {Array} args.pages - Page records.
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @returns {{ adjustment: number, evidence: object | null }} -3.8 with
 *   evidence of kind "negated_answer_prefix_mismatch" for the first matching
 *   page, or a zero adjustment.
 */
function negatedAnswerPrefixAdjustment({ mode, pages, question, answer, answerTokens }) {
    const noAdjustment = () => ({ adjustment: 0, evidence: null });
    if (mode !== "single" || answerTokens.length < 2) {
        return noAdjustment();
    }
    const questionNorm = normalizeForSearch(question);
    const relevantQuestion = containsNormalizedPhrase(questionNorm, "\u043e\u0431\u0440\u0430\u0437\u043e\u0432") ||
        containsNormalizedPhrase(questionNorm, "\u0445\u0430\u0440\u0430\u043a\u0442\u0435\u0440");
    if (!relevantQuestion) {
        return noAdjustment();
    }
    const first = answerTokens[0];
    const alreadyNegated = first.startsWith("he") || first.startsWith("\u043d\u0435");
    if (alreadyNegated) {
        return noAdjustment();
    }
    const negatedPrefix = `he${first.slice(0, Math.min(first.length, 4))}`;
    const remaining = answerTokens.slice(1);
    const offendingPage = pages.find((page) => page.normalized.includes(negatedPrefix) &&
        remaining.some((token) => page.normalized.includes(token.slice(0, Math.min(token.length, 8)))));
    if (offendingPage === undefined) {
        return noAdjustment();
    }
    return {
        adjustment: -3.8,
        evidence: {
            answerId: answer.id,
            page: offendingPage.page,
            text: evidenceSnippet(offendingPage.text, first, question),
            score: 3.8,
            kind: "negated_answer_prefix_mismatch",
        },
    };
}
|
|
2164
|
+
/**
 * Penalizes an answer that the document allows only as a fallback, i.e. it
 * is mentioned next to "только в случаях невозмож" or
 * "невозможности проведен".
 *
 * Applies only in "single" mode when the question mentions "динамич" or
 * "эффектив". Per page, two probes run in order:
 *  1. each of the first 12 answer search phrases is located on the page and a
 *     230-character page window around each hit is inspected;
 *  2. each cached line-window segment whose strict coverage of the answer's
 *     long, non-stopword tokens is at least 0.45 is inspected.
 * The first hit returns the penalty.
 *
 * @param {object} args
 * @param {string} args.mode - Question mode.
 * @param {Array} args.pages - Page records.
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @returns {{ adjustment: number, evidence: object | null }} -3.6 with
 *   evidence of kind "impossibility_only_penalty", or a zero adjustment.
 */
function impossibilityOnlyAdjustment({ mode, pages, question, answer }) {
    const noAdjustment = () => ({ adjustment: 0, evidence: null });
    if (mode !== "single") {
        return noAdjustment();
    }
    const questionNorm = normalizeForSearch(question);
    const relevantQuestion = containsNormalizedPhrase(questionNorm, "\u0434\u0438\u043d\u0430\u043c\u0438\u0447") ||
        containsNormalizedPhrase(questionNorm, "\u044d\u0444\u0444\u0435\u043a\u0442\u0438\u0432");
    if (!relevantQuestion) {
        return noAdjustment();
    }

    // True when the text restricts something to cases where an alternative is impossible.
    const mentionsImpossibility = (local) => containsNormalizedPhrase(local, "\u0442\u043e\u043b\u044c\u043a\u043e \u0432 \u0441\u043b\u0443\u0447\u0430\u044f\u0445 \u043d\u0435\u0432\u043e\u0437\u043c\u043e\u0436") ||
        containsNormalizedPhrase(local, "\u043d\u0435\u0432\u043e\u0437\u043c\u043e\u0436\u043d\u043e\u0441\u0442\u0438 \u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043d");
    const penalty = (page, text) => ({
        adjustment: -3.6,
        evidence: {
            answerId: answer.id,
            page: page.page,
            text,
            score: 3.6,
            kind: "impossibility_only_penalty",
        },
    });

    const answerTokens = uniqueTokens(answer.text).filter((token) => token.length >= 5 && !FOCUS_STOPWORDS.has(token));
    const phrases = answerSearchPhrases(answer.text).slice(0, 12);

    for (const page of pages) {
        // Probe 1: windows around literal occurrences of the answer phrases.
        for (const phrase of phrases) {
            const hits = findPhraseOccurrences(page.normalized, phrase, { textIsNormalized: true });
            for (const hit of hits) {
                if (mentionsImpossibility(pageWindow(page, hit, 230))) {
                    return penalty(page, evidenceSnippet(page.text, phrase, question));
                }
            }
        }
        // Probe 2: line-window segments that cover enough of the answer tokens.
        if (!answerTokens.length) {
            continue;
        }
        for (const source of cachedLineWindowSegments(page)) {
            const local = source.normalized;
            const answerCoverage = strictSoftCoverage(answerTokens, tokenizeNormalized(local));
            if (answerCoverage < 0.45) {
                continue;
            }
            if (mentionsImpossibility(local)) {
                return penalty(page, source.text);
            }
        }
    }
    return noAdjustment();
}
|
|
2219
|
+
/**
 * Penalizes answers that do not look like an indication for starting active
 * therapy.
 *
 * Applies only when the question mentions all of "начал", "актив" and
 * "терап". An answer is accepted without penalty when it mentions any of
 * "угроз", "недостат", "потер", "качест" or "жизн". The document is not
 * consulted, so the evidence carries page 0 and the answer text itself.
 *
 * @param {object} args
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @returns {{ adjustment: number, evidence: object | null }} -4.2 with
 *   evidence of kind "active_therapy_indication_mismatch", or a zero adjustment.
 */
function activeTherapyIndicationAdjustment({ question, answer }) {
    const noAdjustment = () => ({ adjustment: 0, evidence: null });
    const questionNorm = normalizeForSearch(question);
    const questionStems = [
        "\u043d\u0430\u0447\u0430\u043b",
        "\u0430\u043a\u0442\u0438\u0432",
        "\u0442\u0435\u0440\u0430\u043f",
    ];
    const asksAboutStartingTherapy = questionStems.every((stem) => containsNormalizedPhrase(questionNorm, stem));
    if (!asksAboutStartingTherapy) {
        return noAdjustment();
    }
    const answerNorm = normalizeForSearch(answer.text);
    const supportiveStems = [
        "\u0443\u0433\u0440\u043e\u0437",
        "\u043d\u0435\u0434\u043e\u0441\u0442\u0430\u0442",
        "\u043f\u043e\u0442\u0435\u0440",
        "\u043a\u0430\u0447\u0435\u0441\u0442",
        "\u0436\u0438\u0437\u043d",
    ];
    if (supportiveStems.some((stem) => containsNormalizedPhrase(answerNorm, stem))) {
        return noAdjustment();
    }
    return {
        adjustment: -4.2,
        evidence: {
            answerId: answer.id,
            page: 0,
            text: answer.text,
            score: 4.2,
            kind: "active_therapy_indication_mismatch",
        },
    };
}
|
|
2245
|
+
/**
 * Extracts the label that follows "счита..." in a question.
 *
 * The label consists of up to four tokens after the first token starting
 * with "счита", cut short at the first "при" token.
 *
 * @param {string} question - Question text.
 * @returns {string | null} Space-joined label tokens, or null when the verb
 *   is missing or no token follows it.
 */
function questionDefinitionLabel(question) {
    const words = rawTokens(question);
    const verbAt = words.findIndex((word) => word.startsWith("\u0441\u0447\u0438\u0442\u0430"));
    if (verbAt < 0) {
        return null;
    }
    const following = words.slice(verbAt + 1, Math.min(words.length, verbAt + 5));
    const stopAt = following.indexOf("\u043f\u0440\u0438");
    const labelWords = stopAt < 0 ? following : following.slice(0, stopAt);
    return labelWords.length ? labelWords.join(" ") : null;
}
|
|
2258
|
+
/**
 * Cuts one answer/context window pair for every occurrence of a label.
 *
 * The answer window starts at the label and extends 220 characters past it,
 * but ends earlier at the first competing result label ("отрицател",
 * "сомнител", "положител") found at least 18 characters after the label.
 * Competing labels contained in the searched label itself are ignored. The
 * context window additionally reaches 240 characters back and 80 forward.
 *
 * An empty label yields no windows. Without that guard `indexOf("")` would
 * match at the current position forever and the scan would never advance.
 *
 * @param {string} normalized - Normalized text to scan.
 * @param {string} labelNorm - Normalized label to look for.
 * @returns {Array<{ answerWindow: string, contextWindow: string }>} Windows
 *   in order of occurrence.
 */
function labelDefinitionWindows(normalized, labelNorm) {
    if (!labelNorm) {
        return [];
    }
    // Only boundaries that are not part of the label can end a window; this
    // does not depend on the scan position, so compute it once.
    const labelBoundaries = [
        "\u043e\u0442\u0440\u0438\u0446\u0430\u0442\u0435\u043b",
        "\u0441\u043e\u043c\u043d\u0438\u0442\u0435\u043b",
        "\u043f\u043e\u043b\u043e\u0436\u0438\u0442\u0435\u043b",
    ]
        .map((item) => normalizeForSearch(item))
        .filter((boundary) => !labelNorm.includes(boundary));
    const windows = [];
    let start = 0;
    while (start < normalized.length) {
        const labelIndex = normalized.indexOf(labelNorm, start);
        if (labelIndex < 0) {
            break;
        }
        const afterLabel = labelIndex + labelNorm.length;
        let end = Math.min(normalized.length, afterLabel + 220);
        for (const boundary of labelBoundaries) {
            const index = normalized.indexOf(boundary, afterLabel + 18);
            if (index >= 0) {
                end = Math.min(end, index);
            }
        }
        windows.push({
            answerWindow: normalized.slice(labelIndex, end),
            contextWindow: normalized.slice(Math.max(0, labelIndex - 240), Math.min(normalized.length, end + 80)),
        });
        start = afterLabel;
    }
    return windows;
}
|
|
2287
|
+
/**
 * Focus tokens that carry no topic information in label-definition questions
 * ("проба", "считается", "при" and the result adjectives). Each word is
 * expanded through uniqueTokens before being added to the Set.
 */
const LABEL_DEFINITION_GENERIC_FOCUS = (() => {
    const genericWords = [
        "\u043f\u0440\u043e\u0431\u0430",
        "\u0441\u0447\u0438\u0442\u0430\u0435\u0442\u0441\u044f",
        "\u043f\u0440\u0438",
        "\u043f\u043e\u043b\u043e\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439",
        "\u0441\u043e\u043c\u043d\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0439",
        "\u043e\u0442\u0440\u0438\u0446\u0430\u0442\u0435\u043b\u044c\u043d\u043e\u0439",
    ];
    const collected = new Set();
    for (const word of genericWords) {
        for (const token of uniqueTokens(word)) {
            collected.add(token);
        }
    }
    return collected;
})();
|
|
2295
|
+
/**
 * Narrows focus tokens to those useful for label-definition matching: at
 * least 3 characters long and not in LABEL_DEFINITION_GENERIC_FOCUS.
 *
 * @param {string[] | null | undefined} focusTokens - Focus tokens, if any.
 * @returns {string[]} Filtered tokens (empty when none were supplied).
 */
function labelDefinitionFocusTokens(focusTokens) {
    const candidates = focusTokens ?? [];
    const isSpecific = (token) => !(token.length < 3 || LABEL_DEFINITION_GENERIC_FOCUS.has(token));
    return candidates.filter(isSpecific);
}
|
|
2298
|
+
/**
 * Finds the best evidence that an answer matches what the document says
 * about the label named after "счита..." in the question.
 *
 * Runs only in "single" mode. Only cached line-window segments containing
 * the label are examined. A window is skipped when specific focus tokens
 * exist and none hits the context window, or when there is no phrase hit and
 * strict coverage of the answer within the answer window is below 0.55.
 *
 * @param {object} args
 * @param {string} args.mode - Question mode.
 * @param {Array} args.pages - Page records.
 * @param {string} args.question - Question text.
 * @param {{ id: *, text: string }} args.answer - Candidate answer.
 * @param {string[]} args.answerTokens - Tokens of the answer.
 * @param {string[]} [args.focusTokens] - Focus tokens of the question.
 * @returns {object | null} Evidence of kind "label_definition_segment" chosen
 *   by betterEvidence, or null.
 */
function bestLabelDefinitionSupport({ mode, pages, question, answer, answerTokens, focusTokens }) {
    if (mode !== "single") {
        return null;
    }
    const label = questionDefinitionLabel(question);
    if (!label) {
        return null;
    }
    const labelNorm = normalizeForSearch(label);
    const phrases = answerSearchPhrases(answer.text).slice(0, 16);
    const specificTokens = labelDefinitionFocusTokens(focusTokens);

    // Scores one window pair; returns null when it does not support the answer.
    const scoreWindows = ({ answerWindow, contextWindow }) => {
        const offTopic = specificTokens.length &&
            tokenHitCount(specificTokens, tokenizeNormalized(contextWindow)) <= 0;
        if (offTopic) {
            return null;
        }
        const windowTokens = tokenizeNormalized(answerWindow);
        const answerCoverage = strictSoftCoverage(answerTokens, windowTokens);
        const phraseHit = phrases.some((phrase) => containsNormalizedPhrase(answerWindow, phrase));
        if (!phraseHit && answerCoverage < 0.55) {
            return null;
        }
        return 13.0 + (phraseHit ? 2.8 : 0) + answerCoverage * 4.2 + numberCoverage(answer.text, answerWindow) * 1.2;
    };

    let winner = null;
    for (const page of pages) {
        for (const source of cachedLineWindowSegments(page)) {
            if (!containsNormalizedPhrase(source.normalized, label)) {
                continue;
            }
            for (const windowPair of labelDefinitionWindows(source.normalized, labelNorm)) {
                const score = scoreWindows(windowPair);
                if (score === null) {
                    continue;
                }
                winner = betterEvidence(winner, {
                    answerId: answer.id,
                    page: page.page,
                    text: source.text,
                    score,
                    kind: "label_definition_segment",
                });
            }
        }
    }
    return winner;
}
|
|
2333
|
+
/**
 * Focus tokens too generic to identify the subject of a recommendation
 * question (forms of "рекомендовать", "лечение", "пациент", "препарат",
 * "проводить"). Each word is expanded through uniqueTokens.
 */
const RECOMMENDATION_GENERIC_FOCUS = (() => {
    const genericWords = [
        "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434\u0443\u0435\u0442\u0441\u044f",
        "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434\u043e\u0432\u0430\u043d",
        "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434\u043e\u0432\u0430\u043d\u043d\u044b\u043c",
        "\u043b\u0435\u0447\u0435\u043d\u0438\u0435",
        "\u043b\u0435\u0447\u0435\u043d\u0438\u044e",
        "\u043f\u0430\u0446\u0438\u0435\u043d\u0442",
        "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u0430\u043c",
        "\u043f\u0440\u0435\u043f\u0430\u0440\u0430\u0442",
        "\u043f\u0440\u043e\u0432\u043e\u0434\u0438\u0442\u044c",
        "\u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043d\u0438\u0435",
    ];
    const collected = new Set();
    for (const word of genericWords) {
        for (const token of uniqueTokens(word)) {
            collected.add(token);
        }
    }
    return collected;
})();
|
|
2345
|
+
/**
 * Narrows focus tokens to those specific to the subject of a recommendation:
 * at least 4 characters long and not in RECOMMENDATION_GENERIC_FOCUS.
 *
 * @param {string[] | null | undefined} focusTokens - Focus tokens, if any.
 * @returns {string[]} Filtered tokens (empty when none were supplied).
 */
function specificRecommendationFocusTokens(focusTokens) {
    const candidates = focusTokens ?? [];
    const isSpecific = (token) => !(token.length < 4 || RECOMMENDATION_GENERIC_FOCUS.has(token));
    return candidates.filter(isSpecific);
}
|
|
2348
|
+
/**
 * Tells whether a question is about a recommendation, i.e. its normalized
 * text mentions the stem "рекоменд".
 *
 * @param {string} question - Question text.
 * @returns {*} Whatever containsNormalizedPhrase returns for that check.
 */
function recommendationQuestion(question) {
    const questionNorm = normalizeForSearch(question);
    return containsNormalizedPhrase(questionNorm, "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434");
}
|
|
2351
|
+
/**
 * Classifies a normalized text segment by the polarity of its recommendation.
 *
 * @param {string} normalized - already-normalized segment text.
 * @returns {"negative" | "positive" | null} "negative" when a "not recommend"
 *   form is present, "positive" when only the plain stem is present, and
 *   `null` when the segment does not mention a recommendation at all.
 */
function segmentRecommendationPolarity(normalized) {
    // Negated forms are checked first because they also contain the plain stem.
    const negatedForms = [
        "\u043d\u0435 \u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
        "\u043d\u0435\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
    ];
    if (negatedForms.some((form) => containsNormalizedPhrase(normalized, form))) {
        return "negative";
    }
    return containsNormalizedPhrase(normalized, "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434") ? "positive" : null;
}
|
|
2360
|
+
/**
 * Determines which recommendation polarity the question is asking about.
 *
 * @param {string} question - raw question text.
 * @param {{ negative?: unknown }} intent - parsed question intent; a truthy `negative` forces "negative".
 * @returns {"negative" | "positive"} "negative" when the intent is negative or
 *   the question contains a "not recommend" form, otherwise "positive".
 */
function recommendationQuestionPolarity(question, intent) {
    const normalizedQuestion = normalizeForSearch(question);
    const negatedForms = [
        "\u043d\u0435 \u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
        "\u043d\u0435\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
    ];
    const asksNegative = intent.negative || negatedForms.some((form) => containsNormalizedPhrase(normalizedQuestion, form));
    return asksNegative ? "negative" : "positive";
}
|
|
2367
|
+
/**
 * Checks whether a segment mentions the given answer.
 *
 * @param {{ normalized: string, tokens: string[] }} segment - segment to inspect.
 * @param {{ text: string }} answer - answer option.
 * @param {string[]} answerTokens - tokens of the answer, used for coverage.
 * @returns {{ phraseHit: boolean, answerCoverage: number, hit: boolean }}
 *   `phraseHit` is true when one of the first 16 search phrases of the answer
 *   is found in the segment; `hit` is true on a phrase hit or when the
 *   coverage reported by `strictSoftCoverage` is at least 0.6.
 */
function recommendationAnswerHit(segment, answer, answerTokens) {
    const candidatePhrases = answerSearchPhrases(answer.text).slice(0, 16);
    let phraseHit = false;
    for (const phrase of candidatePhrases) {
        if (containsNormalizedPhrase(segment.normalized, phrase)) {
            phraseHit = true;
            break;
        }
    }
    const answerCoverage = strictSoftCoverage(answerTokens, segment.tokens);
    const hit = phraseHit || answerCoverage >= 0.6;
    return { phraseHit, answerCoverage, hit };
}
|
|
2373
|
+
/**
 * For single-answer questions that ask what is NOT recommended, looks for
 * segments that mention the answer together with a recommendation and reports
 * whether their polarity agrees with the question.
 *
 * Only active when `mode` is "single", the question mentions a recommendation,
 * and the question polarity is "negative"; otherwise a neutral result is
 * returned.
 *
 * @returns {{ support: object | null, adjustment: number, evidence: object | null }}
 *   - a matching-polarity segment was found: `support` holds the best one, no adjustment;
 *   - only opposite-polarity segments were found: `adjustment` is -7.5 and `evidence` holds the best mismatch;
 *   - nothing relevant was found: all-neutral result.
 */
function recommendationPolarityAdjustment({ mode, pages, question, answer, answerTokens, focusTokens, intent }) {
    const neutral = () => ({ support: null, adjustment: 0, evidence: null });
    if (mode !== "single" || !recommendationQuestion(question)) {
        return neutral();
    }
    const target = recommendationQuestionPolarity(question, intent);
    if (target !== "negative") {
        return neutral();
    }
    const specificTokens = specificRecommendationFocusTokens(focusTokens);
    // With two or more specific focus tokens the segment must hit at least one of them.
    const focusRequired = specificTokens.length >= 2;
    let bestMatch = null;
    let bestMismatch = null;
    for (const page of pages) {
        for (const segment of cachedLineWindowSegments(page)) {
            const polarity = segmentRecommendationPolarity(segment.normalized);
            if (!polarity) {
                continue;
            }
            const focusHits = tokenHitCount(specificTokens, segment.tokens);
            if (focusRequired && focusHits <= 0) {
                continue;
            }
            const answerHit = recommendationAnswerHit(segment, answer, answerTokens);
            if (!answerHit.hit) {
                continue;
            }
            const agrees = polarity === target;
            const evidence = {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                score: 11.8 + (answerHit.phraseHit ? 2.5 : 0) + answerHit.answerCoverage * 3.2 + Math.min(2, focusHits) * 1.0,
                kind: agrees ? "recommendation_polarity_match" : "recommendation_polarity_mismatch",
            };
            if (agrees) {
                bestMatch = betterEvidence(bestMatch, evidence);
            }
            else {
                bestMismatch = betterEvidence(bestMismatch, evidence);
            }
        }
    }
    if (bestMatch) {
        return { support: bestMatch, adjustment: 0, evidence: null };
    }
    if (bestMismatch) {
        return { support: null, adjustment: -7.5, evidence: bestMismatch };
    }
    return neutral();
}
|
|
2410
|
+
/** Evidence kinds whose text may be reused as a shared source segment for several answers. */
const SHARED_MULTI_SOURCE_KINDS = new Set([
    "question_anchor_segment",
    "question_chunk_answer",
    "bm25_question_answer",
    "section_list_segment",
    "bounded_list_segment",
    "ordinal_list_segment",
    "latin_fuzzy_ocr",
]);
/**
 * Answer tokens ignored when matching an answer against a shared segment.
 * Each seed is expanded through `uniqueTokens`.
 */
const SHARED_MULTI_GENERIC_TOKENS = new Set([
    "\u0434\u0430\u043d\u043d\u044b\u0435",
    "\u0434\u0430\u043d\u043d\u044b\u0445",
    "\u0446\u0435\u043b\u044c",
    "\u0446\u0435\u043b\u044c\u044e",
    "\u043f\u0430\u0446\u0438\u0435\u043d\u0442",
    "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u0430",
    "\u043f\u0430\u0446\u0438\u0435\u043d\u0442\u0430\u043c",
    "\u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043d",
    "\u043f\u0440\u043e\u0432\u043e\u0434",
    "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434",
    "\u043e\u0442\u043d\u043e\u0441",
    "\u044f\u0432\u043b\u044f",
    "\u0432\u044b\u043f\u043e\u043b\u043d",
    "\u043b\u0435\u0447\u0435\u043d",
    "\u0442\u0435\u0440\u0430\u043f",
].flatMap((seed) => uniqueTokens(seed)));
/** Normalized "by <criterion>" headings that delimit sub-sections inside a shared segment. */
const SHARED_MULTI_SECTION_CUES = [
    "\u043f\u043e \u043b\u043e\u043a\u0430\u043b\u0438\u0437\u0430\u0446\u0438\u0438",
    "\u043f\u043e \u044d\u0442\u0438\u043e\u043b\u043e\u0433\u0438\u0438",
    "\u043f\u043e \u0441\u0442\u0435\u043f\u0435\u043d\u0438",
    "\u043f\u043e \u043e\u0441\u043e\u0431\u0435\u043d\u043d\u043e\u0441\u0442\u044f\u043c \u0442\u0435\u0447\u0435\u043d\u0438\u044f",
    "\u043f\u043e \u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438",
].map((heading) => normalizeForSearch(heading));
/**
 * Direction cue groups ("less/lower/younger" and "more/higher/older").
 * When an answer contains a cue from `answer`, the source segment must contain
 * a cue from the same group's `source` list.
 */
const SHARED_MULTI_REQUIRED_CUE_GROUPS = [
    {
        answer: ["\u043c\u0435\u043d\u0435\u0435", "\u043d\u0438\u0436\u0435", "\u0441\u043d\u0438\u0436", "\u043d\u0438\u0437\u043a", "\u043c\u043e\u043b\u043e\u0436\u0435", "\u043f\u043e\u043d\u0438\u0436"],
        source: ["\u043c\u0435\u043d\u0435\u0435", "\u043d\u0438\u0436\u0435", "\u0441\u043d\u0438\u0436", "\u043d\u0438\u0437\u043a", "\u043c\u043e\u043b\u043e\u0436\u0435", "\u043f\u043e\u043d\u0438\u0436"],
    },
    {
        answer: ["\u0431\u043e\u043b\u0435\u0435", "\u0432\u044b\u0448\u0435", "\u043f\u043e\u0432\u044b\u0448", "\u0432\u044b\u0441\u043e\u043a", "\u0441\u0442\u0430\u0440\u0448\u0435"],
        source: ["\u0431\u043e\u043b\u0435\u0435", "\u0432\u044b\u0448\u0435", "\u043f\u043e\u0432\u044b\u0448", "\u0432\u044b\u0441\u043e\u043a", "\u0441\u0442\u0430\u0440\u0448\u0435"],
    },
].map(({ answer, source }) => ({
    answer: answer.map((cue) => normalizeForSearch(cue)),
    source: source.map((cue) => normalizeForSearch(cue)),
}));
/** Short abbreviations that are matched by plain substring search instead of phrase search. */
const SHARED_MULTI_SHORT_ALIAS_PHRASES = new Set(["\u0441\u043f\u044f", "\u0440\u044d"].map((alias) => normalizeForSearch(alias)));
|
|
2457
|
+
/**
 * Lists the known short aliases that belong to an answer only implicitly:
 * the alias is among the answer's focused search phrases but does not occur
 * literally in the normalized answer text.
 *
 * @param {string} answerText - raw answer text.
 * @returns {string[]} matching aliases from SHARED_MULTI_SHORT_ALIAS_PHRASES.
 */
function answerShortMedicalAliases(answerText) {
    const ownPhrases = new Set();
    for (const phrase of focusedAnswerSearchPhrases(answerText)) {
        ownPhrases.add(normalizeForSearch(phrase));
    }
    const normalizedAnswer = normalizeForSearch(answerText);
    const aliases = [];
    for (const alias of SHARED_MULTI_SHORT_ALIAS_PHRASES) {
        if (ownPhrases.has(alias) && !normalizedAnswer.includes(alias)) {
            aliases.push(alias);
        }
    }
    return aliases;
}
|
|
2462
|
+
/**
 * For multi-answer questions, finds the best segment that contains a short
 * alias of the answer and also overlaps with the question.
 *
 * Pages are considered only when `topQuestionPages` is empty/absent or the
 * page (or one of its immediate neighbours) is listed in it. A segment needs
 * a question coverage of at least 0.18 to count.
 *
 * @returns {object | null} the best "short_medical_alias_segment" evidence, or `null`.
 */
function bestShortMedicalAliasSupport({ mode, pages, topQuestionPages, questionTokens, answer }) {
    if (mode !== "multi") {
        return null;
    }
    const aliases = answerShortMedicalAliases(answer.text);
    if (!aliases.length) {
        return null;
    }
    const isNearTopPage = (pageNumber) => !topQuestionPages?.size ||
        topQuestionPages.has(pageNumber) ||
        topQuestionPages.has(pageNumber - 1) ||
        topQuestionPages.has(pageNumber + 1);
    let best = null;
    for (const page of pages) {
        if (!isNearTopPage(page.page)) {
            continue;
        }
        for (const segment of cachedLineWindowSegments(page)) {
            const mentionsAlias = aliases.some((alias) => segment.normalized.includes(alias));
            if (!mentionsAlias) {
                continue;
            }
            const questionCoverage = coverage(questionTokens, segment.tokens);
            if (questionCoverage < 0.18) {
                continue;
            }
            best = betterEvidence(best, {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                // Question coverage contributes up to a cap of 0.65.
                score: 10.8 + Math.min(0.65, questionCoverage) * 5.4,
                kind: "short_medical_alias_segment",
            });
        }
    }
    return best;
}
|
|
2491
|
+
/**
 * Extracts the answer tokens used for shared-segment matching: unique tokens
 * of at least three characters that are neither focus stopwords nor generic
 * shared-multi tokens.
 *
 * @param {string} answerText - raw answer text.
 * @returns {string[]} the retained tokens.
 */
function sharedMultiTokens(answerText) {
    const isIgnored = (token) => FOCUS_STOPWORDS.has(token) || SHARED_MULTI_GENERIC_TOKENS.has(token);
    return uniqueTokens(answerText).filter((token) => token.length >= 3 && !isIgnored(token));
}
|
|
2494
|
+
/**
 * Question tokens that are too generic to identify the heading of a
 * parenthetical group. Each seed is expanded through `uniqueTokens`.
 */
const PARENTHETICAL_GROUP_GENERIC_FOCUS = new Set([
    "\u0430\u043c\u043a",
    "\u0430\u043d\u043e\u043c\u0430\u043b\u044c\u043d",
    "\u043c\u0430\u0442\u043e\u0447",
    "\u043a\u0440\u043e\u0432\u043e\u0442\u0435\u0447",
    "\u043a\u0430\u0442\u0435\u0433\u043e\u0440",
    "\u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438",
    "\u043e\u0442\u043d\u043e\u0441",
    "\u044f\u0432\u043b\u044f",
    "\u044f\u0432\u043b\u044f\u044e\u0442",
].flatMap((seed) => uniqueTokens(seed)));
|
|
2505
|
+
/**
 * Extracts the question tokens that can identify a parenthetical-group
 * heading: unique tokens of at least four characters that are neither focus
 * stopwords nor in PARENTHETICAL_GROUP_GENERIC_FOCUS.
 *
 * @param {string} question - raw question text.
 * @returns {string[]} the retained tokens.
 */
function parentheticalGroupFocusTokens(question) {
    const isGeneric = (token) => FOCUS_STOPWORDS.has(token) || PARENTHETICAL_GROUP_GENERIC_FOCUS.has(token);
    return uniqueTokens(question).filter((token) => token.length >= 4 && !isGeneric(token));
}
|
|
2508
|
+
/**
 * Tells whether any search phrase of the answer (normalized, at least three
 * characters long) occurs in the normalized parenthetical group text.
 *
 * @param {string} groupNormalized - normalized text inside the parentheses.
 * @param {{ text: string }} answer - answer option.
 * @returns {boolean}
 */
function answerInParentheticalGroup(groupNormalized, answer) {
    const normalizedPhrases = answerSearchPhrases(answer.text).map((phrase) => normalizeForSearch(phrase));
    for (const phrase of normalizedPhrases) {
        if (phrase.length >= 3 && containsNormalizedPhrase(groupNormalized, phrase)) {
            return true;
        }
    }
    return false;
}
|
|
2514
|
+
/**
 * Decides whether an answer is represented inside a parenthetical group,
 * either by a phrase match or by token coverage.
 *
 * @param {string} groupNormalized - normalized text inside the parentheses.
 * @param {string[]} groupTokens - tokens of the same text.
 * @param {{ text: string }} answer - answer option.
 * @returns {boolean} true on a phrase match, or when `strictSoftCoverage`
 *   reaches 0.95 for answers with at most one token and 0.68 otherwise.
 */
function parentheticalGroupAnswerHit(groupNormalized, groupTokens, answer) {
    const answerTokens = uniqueTokens(answer.text);
    if (answerInParentheticalGroup(groupNormalized, answer)) {
        return true;
    }
    const requiredCoverage = answerTokens.length <= 1 ? 0.95 : 0.68;
    return strictSoftCoverage(answerTokens, groupTokens) >= requiredCoverage;
}
|
|
2518
|
+
/**
 * Detects an inline list context around a parenthetical group: the text
 * before the group must contain a list cue (the stems for "series" or
 * "group") and both the text before and the text after must hit at least one
 * specific focus token.
 *
 * @param {{ beforeText: string, afterText: string, specificFocus: string[] }} context
 * @returns {boolean}
 */
function inlineParentheticalGroupContext({ beforeText, afterText, specificFocus }) {
    const headTokens = tokenize(beforeText);
    const tailTokens = tokenize(afterText);
    const headHits = tokenHitCount(specificFocus, headTokens);
    const tailHits = tokenHitCount(specificFocus, tailTokens);
    const listCueWords = ["\u0440\u044f\u0434", "\u0433\u0440\u0443\u043f\u043f"];
    const hasListCue = listCueWords.some((word) => headTokens.includes(stemToken(normalizeForSearch(word))));
    if (!hasListCue) {
        return false;
    }
    return headHits >= 1 && tailHits >= 1;
}
|
|
2526
|
+
/**
 * Links answer options to the nearest parenthetical group that follows a
 * relevant heading, such as `organic causes (...)`, `risk factors (...)` and
 * similar constructions. This helps keep neighbouring groups on the same line
 * from being mixed together.
 *
 * Only active for `mode === "multi"`. Returns `null` early when the question
 * tokens contain both the stems for "factor" and "risk", or when the question
 * yields fewer than two specific focus tokens.
 *
 * For every `( ... )` group of 6-260 characters without nested parentheses on
 * a page, the up-to-180 characters before it (cut after the previous `)`) act
 * as the heading and the 180 characters after it as the tail. A group is used
 * when the heading has a "category" context or an inline list context, and the
 * answer is found in the group by phrase or by token coverage.
 *
 * @param {object} params
 * @param {string} params.mode - question mode; anything other than "multi" disables the scorer.
 * @param {Array<{ page: unknown, text?: string }>} params.pages - pages to scan.
 * @param {string} params.question - raw question text.
 * @param {{ id: unknown, text: string }} params.answer - answer being scored.
 * @param {Array<{ text: string }>} [params.answers] - all answer options; used to count how many are in a group.
 * @param {string[]} params.answerTokens - tokens of `answer`.
 * @returns {object | null} the best "parenthetical_group_segment" evidence, or `null`.
 */
function bestParentheticalGroupSupport({ mode, pages, question, answer, answers, answerTokens }) {
    if (mode !== "multi")
        return null;
    const questionTokenSet = new Set(tokenize(question));
    if (questionTokenSet.has(stemToken(normalizeForSearch("\u0444\u0430\u043a\u0442\u043e\u0440"))) && questionTokenSet.has(stemToken(normalizeForSearch("\u0440\u0438\u0441\u043a")))) {
        return null;
    }
    const specificFocus = parentheticalGroupFocusTokens(question);
    if (specificFocus.length < 2)
        return null;
    // The category stem does not depend on the page or the match, so compute it once.
    // NOTE(review): assumes stemToken/normalizeForSearch are deterministic - confirm.
    const categoryStem = stemToken(normalizeForSearch("\u043a\u0430\u0442\u0435\u0433\u043e\u0440\u0438\u0438"));
    let best = null;
    for (const page of pages) {
        const text = String(page.text ?? "");
        const matches = text.matchAll(/\(([^()]{6,260})\)/gu);
        for (const match of matches) {
            const groupText = match[1] ?? "";
            const groupStart = match.index ?? 0;
            let beforeText = text.slice(Math.max(0, groupStart - 180), groupStart);
            // Do not let the heading reach back into a previous parenthetical group.
            const previousGroupEnd = beforeText.lastIndexOf(")");
            if (previousGroupEnd >= 0)
                beforeText = beforeText.slice(previousGroupEnd + 1);
            const afterText = text.slice(groupStart + match[0].length, groupStart + match[0].length + 180);
            const beforeTokens = tokenize(beforeText);
            const categoryContext = beforeTokens.includes(categoryStem);
            const inlineContext = inlineParentheticalGroupContext({ beforeText, afterText, specificFocus });
            if (!categoryContext && !inlineContext)
                continue;
            const specificHits = tokenHitCount(specificFocus, beforeTokens);
            const specificCoverage = coverage(specificFocus, beforeTokens);
            if (categoryContext && specificHits < 2 && specificCoverage < 0.34)
                continue;
            const groupNormalized = normalizeForSearch(groupText);
            const groupTokens = tokenize(groupText);
            const groupAnswerHits = (answers ?? []).filter((candidate) => parentheticalGroupAnswerHit(groupNormalized, groupTokens, candidate)).length;
            // An inline list is only trusted when it contains at least two of the options.
            if (inlineContext && groupAnswerHits < 2)
                continue;
            const answerCoverage = strictSoftCoverage(answerTokens, groupTokens);
            if (!answerInParentheticalGroup(groupNormalized, answer) && answerCoverage < (answerTokens.length <= 1 ? 0.95 : 0.68))
                continue;
            const score = (inlineContext ? 14.6 : 13.8) +
                Math.min(4, specificHits) * 1.15 +
                Math.min(0.75, specificCoverage) * 5.2 +
                answerCoverage * 2.2 +
                Math.min(3, groupAnswerHits) * 0.8;
            best = betterEvidence(best, {
                answerId: answer.id,
                page: page.page,
                text: `${beforeText}(${groupText})`.replace(/\s+/g, " ").trim(),
                score,
                kind: "parenthetical_group_segment",
            });
        }
    }
    return best;
}
|
|
2587
|
+
/** Normalized stems ("based") that mark a question as a "... are based on ..." continuation question. */
const CONTINUATION_LIST_QUESTION_CUES = [
    "\u043e\u0441\u043d\u043e\u0432\u0430\u043d",
].map((cue) => normalizeForSearch(cue));
/** Normalized stems ("based", "data") of which a segment must contain at least one to be considered. */
const CONTINUATION_LIST_SEGMENT_CUES = [
    "\u043e\u0441\u043d\u043e\u0432\u0430\u043d",
    "\u0434\u0430\u043d\u043d",
].map((cue) => normalizeForSearch(cue));
|
|
2594
|
+
/**
 * Tells whether the question has the "... are based on ..." continuation form.
 *
 * Exception-style questions (`intent.exception`) and questions containing the
 * "not includ..." phrase are rejected. Otherwise the normalized question must
 * contain one of CONTINUATION_LIST_QUESTION_CUES and the preposition "on".
 *
 * @param {string} question - raw question text.
 * @param {{ exception?: unknown } | null | undefined} intent - parsed question intent.
 * @returns `false`, or the result of the final `containsNormalizedPhrase` check.
 */
function continuationListQuestion(question, intent) {
    if (intent?.exception) {
        return false;
    }
    const normalizedQuestion = normalizeForSearch(question);
    if (containsNormalizedPhrase(normalizedQuestion, "\u043d\u0435 \u0432\u043a\u043b\u044e\u0447")) {
        return false;
    }
    const mentionsCue = CONTINUATION_LIST_QUESTION_CUES.some((cue) => normalizedQuestion.includes(cue));
    return mentionsCue && containsNormalizedPhrase(normalizedQuestion, "\u043d\u0430");
}
|
|
2602
|
+
/**
 * Checks whether a continuation segment mentions the answer.
 *
 * If the answer contains numbers, all of them must be covered by the segment
 * (`numberCoverage` of at least 1); otherwise the result is a miss regardless
 * of phrase matches.
 *
 * @param {{ normalized: string, tokens: string[] }} segment - segment to inspect.
 * @param {{ text: string }} answer - answer option.
 * @param {string[]} answerTokens - tokens of the answer.
 * @returns {{ phraseHit: boolean, answerCoverage: number, hit: boolean }}
 *   `hit` is true on a phrase match (phrases of at least five characters) or
 *   when coverage reaches 0.86 for answers of up to two tokens, 0.68 otherwise.
 */
function answerContinuationListHit(segment, answer, answerTokens) {
    const segmentText = segment.normalized;
    const searchablePhrases = answerSearchPhrases(answer.text)
        .map((phrase) => normalizeForSearch(phrase))
        .filter((phrase) => phrase.length >= 5);
    const phraseHit = searchablePhrases.some((phrase) => containsNormalizedPhrase(segmentText, phrase));
    const answerCoverage = strictSoftCoverage(answerTokens, segment.tokens);
    const answerNumbers = extractNumbers(answer.text);
    if (answerNumbers.length && numberCoverage(answer.text, segmentText) < 1) {
        return { phraseHit: false, answerCoverage, hit: false };
    }
    const requiredCoverage = answerTokens.length <= 2 ? 0.86 : 0.68;
    const hit = phraseHit || answerCoverage >= requiredCoverage;
    return { phraseHit, answerCoverage, hit };
}
|
|
2615
|
+
/**
 * Builds sliding windows of up to seven consecutive lines of a page.
 *
 * Each window is joined with spaces, whitespace-collapsed and trimmed; only
 * windows whose text is 40-1500 characters long are kept.
 *
 * @param {{ lines?: string[] }} page - page whose `lines` are windowed; a missing list yields no segments.
 * @returns {Array<{ text: string, normalized: string, tokens: string[] }>}
 */
function continuationLineSegments(page) {
    const pageLines = page.lines ?? [];
    const windows = [];
    for (let first = 0; first < pageLines.length; first += 1) {
        // slice() clamps the end index, so the last windows are simply shorter.
        const windowText = pageLines
            .slice(first, first + 7)
            .join(" ")
            .replace(/\s+/g, " ")
            .trim();
        if (windowText.length < 40 || windowText.length > 1500) {
            continue;
        }
        windows.push({
            text: windowText,
            normalized: normalizeForSearch(windowText),
            tokens: tokenize(windowText),
        });
    }
    return windows;
}
|
|
2630
|
+
/**
 * Looks for answer options in a line that continues the question, of the form
 * `criteria are based on...`.
 *
 * Unlike the general BM25 scorer, this scorer requires the line itself to
 * contain the question wording and a structural list cue, so neighbouring
 * discussions of the options do not receive the same weight.
 *
 * Only active for `mode === "multi"` on questions accepted by
 * `continuationListQuestion`. Pages are considered only when
 * `topQuestionPages` is empty/absent or the page (or an immediate neighbour)
 * is listed in it.
 *
 * @returns {object | null} the best "question_continuation_list" evidence, or `null`.
 */
function bestQuestionContinuationListSupport({ mode, pages, topQuestionPages, question, answer, answerTokens, questionTokens, focusTokens, intent }) {
    if (mode !== "multi" || !continuationListQuestion(question, intent)) {
        return null;
    }
    // Fall back to the question tokens when no focus tokens were supplied.
    const focusSource = focusTokens?.length ? focusTokens : questionTokens;
    const usefulFocus = focusSource.filter((token) => token.length >= 4 && !FOCUS_STOPWORDS.has(token));
    const isNearTopPage = (pageNumber) => !topQuestionPages?.size ||
        topQuestionPages.has(pageNumber) ||
        topQuestionPages.has(pageNumber - 1) ||
        topQuestionPages.has(pageNumber + 1);
    let best = null;
    for (const page of pages) {
        if (!isNearTopPage(page.page)) {
            continue;
        }
        for (const segment of continuationLineSegments(page)) {
            const hasSegmentCue = CONTINUATION_LIST_SEGMENT_CUES.some((cue) => segment.normalized.includes(cue));
            if (!hasSegmentCue) {
                continue;
            }
            const questionCoverage = coverage(questionTokens, segment.tokens);
            const focusHits = tokenHitCount(usefulFocus, segment.tokens);
            if (questionCoverage < 0.5) {
                continue;
            }
            if (usefulFocus.length >= 2 && focusHits < 2) {
                continue;
            }
            const answerHit = answerContinuationListHit(segment, answer, answerTokens);
            if (!answerHit.hit) {
                continue;
            }
            const score = 11.6 +
                Math.min(0.72, questionCoverage) * 5.4 +
                Math.min(3, focusHits) * 0.8 +
                answerHit.answerCoverage * 2.6 +
                (answerHit.phraseHit ? 1.6 : 0);
            best = betterEvidence(best, {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                score,
                kind: "question_continuation_list",
            });
        }
    }
    return best;
}
|
|
2674
|
+
/**
 * Returns the first section cue from SHARED_MULTI_SECTION_CUES that occurs in
 * the normalized question.
 *
 * @param {string} question - raw question text.
 * @returns {string | null} the matching cue, or `null` when none occurs.
 */
function sharedMultiSectionCue(question) {
    const normalizedQuestion = normalizeForSearch(question);
    for (const cue of SHARED_MULTI_SECTION_CUES) {
        if (normalizedQuestion.includes(cue)) {
            return cue ?? null;
        }
    }
    return null;
}
|
|
2678
|
+
/**
 * Normalizes a segment and, when the question names a section cue, narrows
 * the text to that section only.
 *
 * The section starts at the first occurrence of the question's cue and ends
 * at the nearest other section cue found at least 20 characters after the end
 * of the starting cue (or at the end of the text). Without a cue in the
 * question, or when the cue is absent from the segment, the whole normalized
 * text is returned.
 *
 * @param {string} segmentText - raw segment text.
 * @param {string} question - raw question text.
 * @returns {string} normalized (and possibly narrowed) segment text.
 */
function sharedMultiFocusedNormalized(segmentText, question) {
    const normalized = normalizeForSearch(segmentText);
    const cue = sharedMultiSectionCue(question);
    if (!cue) {
        return normalized;
    }
    const sectionStart = normalized.indexOf(cue);
    if (sectionStart < 0) {
        return normalized;
    }
    const searchFrom = sectionStart + cue.length + 20;
    const sectionEnd = SHARED_MULTI_SECTION_CUES
        .filter((otherCue) => otherCue !== cue)
        .map((otherCue) => normalized.indexOf(otherCue, searchFrom))
        .filter((position) => position > sectionStart)
        .reduce((nearest, position) => Math.min(nearest, position), normalized.length);
    return normalized.slice(sectionStart, sectionEnd);
}
|
|
2696
|
+
/**
 * Detects a direction mismatch between an answer and a segment: the answer
 * uses a cue from one of SHARED_MULTI_REQUIRED_CUE_GROUPS but the segment
 * contains none of that group's source cues.
 *
 * @param {string} answerText - raw answer text.
 * @param {string} normalizedSegment - normalized segment text.
 * @returns {boolean} true when at least one group is mismatched.
 */
function sharedMultiRequiredCueMismatch(answerText, normalizedSegment) {
    const normalizedAnswer = normalizeForSearch(answerText);
    return SHARED_MULTI_REQUIRED_CUE_GROUPS.some((group) => {
        const answerUsesGroup = group.answer.some((cue) => normalizedAnswer.includes(cue));
        return answerUsesGroup && !group.source.some((cue) => normalizedSegment.includes(cue));
    });
}
|
|
2705
|
+
/**
 * Finds where a token (or a shortened prefix of it) first occurs in a segment.
 *
 * The full token is tried first, then its 10-, 8- and 6-character prefixes;
 * probes shorter than four characters are never tried.
 *
 * @param {string} normalizedSegment - text to search in.
 * @param {string} token - token to locate.
 * @returns {number} index of the first probe that matches, or -1.
 */
function sharedMultiTokenPosition(normalizedSegment, token) {
    const candidates = [token];
    for (const prefixLength of [10, 8, 6]) {
        candidates.push(token.slice(0, prefixLength));
    }
    for (const candidate of candidates) {
        if (candidate.length < 4) {
            continue;
        }
        const position = normalizedSegment.indexOf(candidate);
        if (position >= 0) {
            return position;
        }
    }
    return -1;
}
|
|
2714
|
+
/**
 * Measures how tightly the answer tokens are packed in a segment: the
 * distance between the first and the last located token position.
 *
 * @param {string} normalizedSegment - text to search in.
 * @param {string[]} tokens - tokens to locate via `sharedMultiTokenPosition`.
 * @returns {number} the span in characters, or `Infinity` when fewer than
 *   `min(2, tokens.length)` tokens could be located.
 */
function sharedMultiCompactSpan(normalizedSegment, tokens) {
    const found = [];
    for (const token of tokens) {
        const position = sharedMultiTokenPosition(normalizedSegment, token);
        if (position >= 0) {
            found.push(position);
        }
    }
    const required = Math.min(2, tokens.length);
    if (found.length < required) {
        return Infinity;
    }
    found.sort((left, right) => left - right);
    const first = found[0];
    const last = found[found.length - 1];
    return last - first;
}
|
|
2723
|
+
/**
 * Detects a numeric threshold mismatch between an answer and a segment.
 *
 * Applies only when the answer contains exactly one plain number (digits with
 * an optional `.`/`,` fraction) and the segment contains at least one number
 * preceded by `<`, `<=`, `>` or `>=`. The answer mismatches when none of the
 * segment thresholds equals the answer number.
 *
 * Numbers are compared by value rather than by spelling, so "5", "5.0" and
 * "5,0" are treated as the same threshold.
 *
 * @param {string} answerText - raw answer text.
 * @param {string | null | undefined} normalizedSegment - normalized segment text.
 * @returns {boolean} true when the segment states thresholds but none matches the answer.
 */
function sharedMultiNumericComparatorMismatch(answerText, normalizedSegment) {
    const answerNumbers = extractNumbers(answerText).filter((number) => /^\d+(?:[.,]\d+)?$/u.test(number));
    if (answerNumbers.length !== 1)
        return false;
    // Decimal comma -> decimal point, then parse so that equal values compare equal.
    const toValue = (raw) => Number(String(raw ?? "").replace(",", "."));
    const answerValue = toValue(answerNumbers[0]);
    const thresholdValues = [...String(normalizedSegment ?? "").matchAll(/(?:<=|<|>=|>)\s*(\d+(?:[.,]\d+)?)/gu)].map((match) => toValue(match[1]));
    if (!thresholdValues.length)
        return false;
    return !thresholdValues.includes(answerValue);
}
|
|
2733
|
+
/**
 * Checks whether a shared source segment supports an answer.
 *
 * The segment is first narrowed to the section named by the question and must
 * then be at least 30 characters long. Direction-cue and numeric-threshold
 * mismatches reject the segment. Support is granted on a phrase match, or on a
 * strong token match: at least two tokens, coverage of at least 0.78, and a
 * compact span within `min(520, 150 + 45 * tokens.length)` characters.
 *
 * @param {string} segmentText - raw segment text.
 * @param {{ text: string }} answer - answer option.
 * @param {string} question - raw question text.
 * @returns {{ phraseHit: boolean, tokenCoverage: number, tokens: string[], compactSpan: number } | null}
 */
function sharedMultiSegmentHit(segmentText, answer, question) {
    const focused = sharedMultiFocusedNormalized(segmentText, question);
    if (!focused || focused.length < 30) {
        return null;
    }
    if (sharedMultiRequiredCueMismatch(answer.text, focused) || sharedMultiNumericComparatorMismatch(answer.text, focused)) {
        return null;
    }
    const tokens = sharedMultiTokens(answer.text);
    const isShortAlias = (phrase) => SHARED_MULTI_SHORT_ALIAS_PHRASES.has(phrase);
    // Short phrases are only trusted for known aliases or single-token answers.
    const isEligible = (phrase) => phrase.length >= 9 || isShortAlias(phrase) || (tokens.length === 1 && phrase.length >= 5);
    const occursInSegment = (phrase) => (isShortAlias(phrase) ? focused.includes(phrase) : containsNormalizedPhrase(focused, phrase));
    const phraseHit = focusedAnswerSearchPhrases(answer.text)
        .map((phrase) => normalizeForSearch(phrase))
        .filter(isEligible)
        .some(occursInSegment);
    const tokenCoverage = tokens.length ? strictSoftCoverage(tokens, tokenizeNormalized(focused)) : 0;
    const compactSpan = sharedMultiCompactSpan(focused, tokens);
    const spanLimit = Math.min(520, 150 + tokens.length * 45);
    const strongTokenHit = tokens.length >= 2 && tokenCoverage >= 0.78 && compactSpan <= spanLimit;
    if (phraseHit || strongTokenHit) {
        return { phraseHit, tokenCoverage, tokens, compactSpan };
    }
    return null;
}
|
|
2754
|
+
/**
 * Lifts answers that are supported by the same source segment as the
 * top-scoring answers.
 *
 * Skipped for negative or exception questions, for fewer than three answers,
 * and when the best raw score is below 5. Source segments are taken from the
 * first four evidence items of the three best answers (allowed kinds only,
 * text of at least 50 characters, score of at least 4.8), de-duplicated by
 * page, kind and the first 220 characters of text, and capped at eight.
 *
 * An answer supported by one of these segments is raised to a fraction of the
 * top raw score, provided it already reaches a minimum share of that score.
 *
 * @param {Array<{ raw: number, answer: object, evidence: object[] }>} answerScores - per-answer scores.
 * @param {{ negative?: unknown, exception?: unknown }} intent - parsed question intent.
 * @param {string} question - raw question text.
 * @returns {Array} `answerScores` itself when nothing applies, otherwise a new
 *   array in which boosted items are replaced by copies with a higher `raw`
 *   and the extra "shared_multi_segment" evidence appended.
 */
function addSharedMultiSegmentSupport(answerScores, intent, question) {
    if (intent.negative || intent.exception || answerScores.length < 3) {
        return answerScores;
    }
    const ranked = [...answerScores].sort((left, right) => right.raw - left.raw);
    const topRaw = ranked[0]?.raw ?? 0;
    if (topRaw < 5) {
        return answerScores;
    }
    // Collect the strongest evidence per (page, kind, text prefix) from the leaders.
    const strongestByKey = new Map();
    for (const leader of ranked.slice(0, 3)) {
        for (const candidate of leader.evidence.slice(0, 4)) {
            if (!SHARED_MULTI_SOURCE_KINDS.has(candidate.kind) ||
                !candidate.text ||
                candidate.text.length < 50 ||
                (candidate.score ?? 0) < 4.8) {
                continue;
            }
            const key = `${candidate.page}:${candidate.kind}:${candidate.text.slice(0, 220)}`;
            if (!strongestByKey.has(key) || strongestByKey.get(key).score < candidate.score) {
                strongestByKey.set(key, candidate);
            }
        }
    }
    const sources = [...strongestByKey.values()].slice(0, 8);
    if (!sources.length) {
        return answerScores;
    }
    const boost = (item) => {
        let best = null;
        for (const source of sources) {
            const hit = sharedMultiSegmentHit(source.text, item.answer, question);
            if (!hit) {
                continue;
            }
            const evidenceScore = 9.2 +
                Math.min(3.2, source.score * 0.18) +
                hit.tokenCoverage * 2.6 +
                (hit.phraseHit ? 1.4 : 0);
            best = betterEvidence(best, {
                answerId: item.answer.id,
                page: source.page,
                text: source.text,
                score: evidenceScore,
                kind: "shared_multi_segment",
            });
        }
        if (!best) {
            return item;
        }
        // Answers far below the leader are not lifted, even with shared support.
        const minPriorRatio = topRaw < 10 ? 0.48 : 0.38;
        if (item.raw < topRaw * minPriorRatio) {
            return item;
        }
        let supportRatio;
        if (topRaw < 13) {
            supportRatio = 0.96;
        }
        else if (best.score >= 12) {
            supportRatio = 0.82;
        }
        else {
            supportRatio = 0.76;
        }
        const boostedRaw = Math.max(item.raw, topRaw * supportRatio);
        if (boostedRaw <= item.raw + 0.05) {
            return item;
        }
        return { ...item, raw: boostedRaw, evidence: [...item.evidence, best] };
    };
    return answerScores.map((item) => boost(item));
}
|
|
2809
|
+
/**
 * Re-balances scores for multi-answer gene-mutation questions once at least
 * two answers carry "gene_sentence_segment" evidence.
 *
 * Answers with such evidence are raised to at least 93% of the top raw score;
 * answers without it that contain Latin tokens are scaled down to 56% of
 * their raw score; all other answers are returned unchanged.
 *
 * @param {Array<{ raw: number, answer: { text: string }, evidence: object[] }>} answerScores - per-answer scores.
 * @param {string} mode - question mode; only "multi" is processed.
 * @param {string} question - raw question text.
 * @returns {Array} `answerScores` itself when the rule does not apply, otherwise a new array.
 */
function applyGeneSentenceSetSupport(answerScores, mode, question) {
    if (mode !== "multi" || !geneMutationQuestion(question)) {
        return answerScores;
    }
    const hasGeneEvidence = (item) => item.evidence.some((evidenceItem) => evidenceItem.kind === "gene_sentence_segment");
    const supportedCount = answerScores.filter(hasGeneEvidence).length;
    if (supportedCount < 2) {
        return answerScores;
    }
    const topRaw = Math.max(...answerScores.map((item) => item.raw), 0);
    return answerScores.map((item) => {
        if (hasGeneEvidence(item)) {
            return { ...item, raw: Math.max(item.raw, topRaw * 0.93) };
        }
        if (latinAnswerTokens(item.answer.text).length) {
            return { ...item, raw: item.raw * 0.56 };
        }
        return item;
    });
}
|
|
2825
|
+
/**
 * Extracts the age-form label cues from a question about the age of a disease
 * form.
 *
 * The question must mention both the stems for "age" and "form". The first
 * matching rule wins:
 *   1. "adolescent" or "adult" -> both stems;
 *   2. "late" and "infant"     -> both stems;
 *   3. "early" and "infant"    -> both stems;
 *   4. "juvenile"              -> that stem.
 *
 * @param {string} question - raw question text.
 * @returns {string[] | null} the normalized cue stems, or `null` when the
 *   question is not an age-form question or names no known form.
 */
function questionAgeFormCues(question) {
    const normalized = normalizeForSearch(question);
    const mentions = (stem) => containsNormalizedPhrase(normalized, stem);
    if (!mentions("\u0432\u043e\u0437\u0440\u0430\u0441\u0442") || !mentions("\u0444\u043e\u0440\u043c")) {
        return null;
    }
    const anyOf = (stems) => stems.some((stem) => mentions(stem));
    const allOf = (stems) => stems.every((stem) => mentions(stem));
    const rules = [
        { stems: ["\u043f\u043e\u0434\u0440\u043e\u0441\u0442", "\u0432\u0437\u0440\u043e\u0441\u043b"], applies: anyOf },
        { stems: ["\u043f\u043e\u0437\u0434", "\u043c\u043b\u0430\u0434\u0435\u043d"], applies: allOf },
        { stems: ["\u0440\u0430\u043d", "\u043c\u043b\u0430\u0434\u0435\u043d"], applies: allOf },
        { stems: ["\u044e\u0432\u0435\u043d"], applies: allOf },
    ];
    const matched = rules.find((rule) => rule.applies(rule.stems));
    return matched ? matched.stems.map((item) => normalizeForSearch(item)) : null;
}
|
|
2843
|
+
/**
 * Locates the start of an age-form label in normalized text.
 *
 * With a single cue this is simply its first occurrence. With several cues,
 * every occurrence of the first (primary) cue is examined: each other cue must
 * occur within 42 characters of it, and all cue positions together must span
 * at most 48 characters. The smallest start among all qualifying occurrences
 * is returned.
 *
 * @param {string} normalized - normalized text to search in.
 * @param {string[]} cues - cue stems; the first one is the primary cue.
 * @returns {number} start index of the label, or -1 when no label is found
 *   (including when `cues` is empty).
 */
function ageFormLabelIndex(normalized, cues) {
    // Without cues there is nothing to locate; do not search for "undefined".
    if (!cues.length)
        return -1;
    if (cues.length === 1)
        return normalized.indexOf(cues[0]);
    let best = -1;
    const primary = cues[0];
    let start = 0;
    while (start < normalized.length) {
        const index = normalized.indexOf(primary, start);
        if (index < 0)
            break;
        const positions = [index];
        let ok = true;
        for (const cue of cues.slice(1)) {
            // Prefer the closest occurrence at or before index + 42, then the
            // first occurrence from just before the primary cue onwards.
            const before = normalized.lastIndexOf(cue, index + 42);
            const after = normalized.indexOf(cue, Math.max(0, index - 8));
            const candidate = before >= 0 && Math.abs(before - index) <= 42
                ? before
                : after >= 0 && Math.abs(after - index) <= 42
                    ? after
                    : -1;
            if (candidate < 0) {
                ok = false;
                break;
            }
            positions.push(candidate);
        }
        if (ok && Math.max(...positions) - Math.min(...positions) <= 48) {
            const labelStart = Math.min(...positions);
            best = best < 0 ? labelStart : Math.min(best, labelStart);
        }
        // Always advance by at least one character so that an empty primary
        // cue cannot stall the scan.
        start = index + Math.max(1, primary.length);
    }
    return best;
}
|
|
2877
|
+
/**
 * Normalized stems of age-form labels ("perinatal", "early", "late",
 * "juvenile", "adolescent", "adult"); an occurrence of one of them marks where
 * the description of the next age form begins.
 */
const AGE_FORM_BOUNDARY_CUES = [
    "\u043f\u0435\u0440\u0438\u043d\u0430\u0442",
    "\u0440\u0430\u043d",
    "\u043f\u043e\u0437\u0434",
    "\u044e\u0432\u0435\u043d",
    "\u043f\u043e\u0434\u0440\u043e\u0441\u0442",
    "\u0432\u0437\u0440\u043e\u0441\u043b",
].map((stem) => normalizeForSearch(stem));
|
|
2885
|
+
/**
 * Finds where the next age-form label begins after the current one.
 *
 * For every boundary cue, the first occurrence at or after `labelIndex + 8`
 * is taken, skipping occurrences that belong to the current label (the cue is
 * one of `cues` and lies within 48 characters of `labelIndex`). The smallest
 * such position over all boundary cues is returned.
 *
 * @param {string} normalized - normalized text to search in.
 * @param {number} labelIndex - start index of the current label.
 * @param {string[]} cues - cue stems of the current label.
 * @returns {number} index of the next boundary, or -1 when there is none.
 */
function nextAgeFormBoundary(normalized, labelIndex, cues) {
    let nearest = -1;
    for (const cue of AGE_FORM_BOUNDARY_CUES) {
        for (let at = normalized.indexOf(cue, labelIndex + 8); at >= 0; at = normalized.indexOf(cue, at + cue.length)) {
            const partOfCurrentLabel = cues.includes(cue) && Math.abs(at - labelIndex) <= 48;
            if (partOfCurrentLabel) {
                continue;
            }
            nearest = nearest < 0 ? at : Math.min(nearest, at);
            break;
        }
    }
    return nearest;
}
|
|
2900
|
+
/**
 * Detects a comparator mismatch between a numeric answer and a text window.
 *
 * A "less than" answer (contains `<`, starts with the word for "up to", or
 * contains "less", "fewer" or "younger") requires the window to contain one of
 * the "up to / less / fewer / younger / below" words directly followed by the
 * answer's first number. A "greater than" answer (contains `>`, or "older",
 * "more", "above") requires one of the "older / more / above / after" words
 * followed by that number. Answers without numbers or without a comparator
 * never mismatch.
 *
 * @param {string} answerText - raw answer text.
 * @param {string} window - normalized text window to check against.
 * @returns {boolean} true when the answer's comparator is not confirmed by the window.
 */
function answerComparatorMismatch(answerText, window) {
    const numbers = extractNumbers(answerText);
    if (!numbers.length) {
        return false;
    }
    const firstNumber = expandNumberToken(numbers[0])[0] ?? numbers[0];
    const normalizedAnswer = normalizeForSearch(answerText);
    const answerMentions = (word) => containsNormalizedPhrase(normalizedAnswer, word);
    const windowConfirms = (cueWords) => cueWords.some((cue) => containsNormalizedPhrase(window, `${cue} ${firstNumber}`));
    const startsWithDo = normalizedAnswer.startsWith(normalizeForSearch("\u0434\u043e "));
    const lessAnswer = answerText.includes("<") ||
        startsWithDo ||
        answerMentions("\u043c\u0435\u043d\u0435\u0435") ||
        answerMentions("\u043c\u0435\u043d\u044c\u0448\u0435") ||
        answerMentions("\u043c\u043e\u043b\u043e\u0436\u0435");
    if (lessAnswer) {
        const lessCues = [
            "\u0434\u043e",
            "\u043c\u0435\u043d\u0435\u0435",
            "\u043c\u0435\u043d\u044c\u0448\u0435",
            "\u043c\u043e\u043b\u043e\u0436\u0435",
            "\u043d\u0438\u0436\u0435",
        ];
        return !windowConfirms(lessCues);
    }
    const greaterAnswer = answerText.includes(">") ||
        answerMentions("\u0441\u0442\u0430\u0440\u0448\u0435") ||
        answerMentions("\u0431\u043e\u043b\u0435\u0435") ||
        answerMentions("\u0432\u044b\u0448\u0435");
    if (greaterAnswer) {
        const greaterCues = [
            "\u0441\u0442\u0430\u0440\u0448\u0435",
            "\u0431\u043e\u043b\u0435\u0435",
            "\u0432\u044b\u0448\u0435",
            "\u043f\u043e\u0441\u043b\u0435",
        ];
        return !windowConfirms(greaterCues);
    }
    return false;
}
|
|
2935
|
+
/**
 * Measures how well an answer is supported by an age-form text window.
 *
 * @param window Normalized window text.
 * @param answer Answer object; only `answer.text` is read here.
 * @param answerTokens Tokens of the answer; tokens shorter than 2 characters are ignored.
 * @returns `null` when the comparator direction mismatches or when no signal
 *   is strong enough (no phrase hit, token coverage below 0.7 and number
 *   coverage below 0.9); otherwise `{ phraseHit, tokenCoverage, numberHit }`.
 */
function ageAnswerSupport(window, answer, answerTokens) {
    if (answerComparatorMismatch(answer.text, window)) {
        return null;
    }
    const normalizedPhrases = answerSearchPhrases(answer.text).map((phrase) => normalizeForSearch(phrase));
    const usablePhrases = normalizedPhrases.filter((phrase) => phrase.length >= 2);
    const phraseHit = usablePhrases.some((phrase) => containsNormalizedPhrase(window, phrase));

    const longTokens = answerTokens.filter((token) => token.length >= 2);
    let tokenCoverage = 0;
    if (longTokens.length) {
        tokenCoverage = strictSoftCoverage(longTokens, tokenizeNormalized(window));
    }

    const numberHit = numberCoverage(answer.text, window);
    const unsupported = !phraseHit && tokenCoverage < 0.7 && numberHit < 0.9;
    return unsupported ? null : { phraseHit, tokenCoverage, numberHit };
}
|
|
2949
|
+
/**
 * Looks for the best "age form" evidence for an answer (single-choice mode only).
 *
 * Each line is examined together with its immediate neighbours. When the
 * joined text contains the question's age-form label, the text from the label
 * up to the next boundary cue (or at most 145 characters) is scored against
 * the answer.
 *
 * @param context Scoring context; reads `mode`, `pages`, `question`, `answer`, `answerTokens`.
 * @returns The best evidence object of kind `"age_form_segment"` as chosen by
 *   `betterEvidence`, or `null` when the scorer does not apply or finds nothing.
 */
function bestAgeFormSupport({ mode, pages, question, answer, answerTokens }) {
    if (mode !== "single") {
        return null;
    }
    const cues = questionAgeFormCues(question);
    if (!cues) {
        return null;
    }
    const answerNorm = normalizeForSearch(answer.text);
    // The answer must carry a number or an "older"/"younger" stem to be age-related.
    const answerLooksAgeRelated = extractNumbers(answer.text).length ||
        containsNormalizedPhrase(answerNorm, "\u0441\u0442\u0430\u0440\u0448") ||
        containsNormalizedPhrase(answerNorm, "\u043c\u043e\u043b\u043e\u0436");
    if (!answerLooksAgeRelated) {
        return null;
    }
    let best = null;
    for (const page of pages) {
        const lines = page.lines ?? [];
        for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
            const from = Math.max(0, lineIndex - 1);
            const to = Math.min(lines.length, lineIndex + 2);
            const text = lines.slice(from, to).join(" ");
            const normalized = normalizeForSearch(text);
            const labelIndex = ageFormLabelIndex(normalized, cues);
            if (labelIndex < 0) {
                continue;
            }
            const boundary = nextAgeFormBoundary(normalized, labelIndex, cues);
            const windowEnd = boundary > labelIndex
                ? boundary
                : Math.min(normalized.length, labelIndex + 145);
            const support = ageAnswerSupport(normalized.slice(labelIndex, windowEnd), answer, answerTokens);
            if (!support) {
                continue;
            }
            const { numberHit, tokenCoverage, phraseHit } = support;
            const score = 15.4 + numberHit * 3.8 + tokenCoverage * 2.4 + (phraseHit ? 2.0 : 0);
            best = betterEvidence(best, {
                answerId: answer.id,
                page: page.page,
                text,
                score,
                kind: "age_form_segment",
            });
        }
    }
    return best;
}
|
|
2985
|
+
/**
 * Extracts the stage number mentioned next to a "stage" word in the question.
 *
 * The first token starting with the Russian stem for "stage" is located; the
 * token right after it is preferred, the token right before it is the
 * fallback. A token qualifies when it consists only of `i`/`v`/`x` letters or
 * only of digits.
 *
 * @param question Question text.
 * @returns The qualifying token in lower case, or `null`.
 */
function questionRomanStage(question) {
    const words = rawTokens(question);
    const stagePos = words.findIndex((word) => word.startsWith("\u0441\u0442\u0430\u0434\u0438"));
    const numeral = /^(?:[ivx]+|\d+)$/iu;
    const candidates = [
        stagePos >= 0 ? words[stagePos + 1] : null,
        stagePos > 0 ? words[stagePos - 1] : null,
    ];
    for (const candidate of candidates) {
        if (numeral.test(candidate ?? "")) {
            return candidate.toLowerCase();
        }
    }
    return null;
}
|
|
2996
|
+
/**
 * Lists the spellings under which a stage value may appear in text.
 *
 * The given value always comes first. For values 1-6 the Roman spelling is
 * added for a digit input and the digit spelling for a Roman input.
 *
 * @param stage Stage value as a string (e.g. `"2"` or `"ii"`).
 * @returns Each spelling passed through `normalizeForSearch`.
 */
function romanStageVariants(stage) {
    const digitRomanPairs = [
        ["1", "i"],
        ["2", "ii"],
        ["3", "iii"],
        ["4", "iv"],
        ["5", "v"],
        ["6", "vi"],
    ];
    const spellings = [stage];
    for (const [digit, roman] of digitRomanPairs) {
        if (stage === digit) {
            spellings.push(roman);
        }
    }
    for (const [digit, roman] of digitRomanPairs) {
        if (stage === roman) {
            spellings.push(digit);
        }
    }
    return [...new Set(spellings)].map((spelling) => normalizeForSearch(spelling));
}
|
|
3013
|
+
/**
 * Finds the next standalone row label (1-5 letters of `i`/`v`/`x`, or 1-2
 * digits, delimited by whitespace or the string edges) at or after `start`.
 *
 * @param normalized Text to scan.
 * @param start Position from which the search begins.
 * @returns Index where the match begins (this is the preceding whitespace
 *   character when there is one), or -1 when nothing matches.
 */
function nextRomanStageRowIndex(normalized, start) {
    // A fresh global regex per call, so setting lastIndex cannot leak state between calls.
    const rowLabel = /(?:^|\s)(?:[ivx]{1,5}|\d{1,2})(?:\s|$)/giu;
    rowLabel.lastIndex = start;
    const hit = rowLabel.exec(normalized);
    if (hit === null) {
        return -1;
    }
    return hit.index;
}
|
|
3019
|
+
/**
 * Cuts out the part of `normalized` that describes the requested stage.
 *
 * Pass 1 looks for an explicit label ("stage N" or "N stage", in Russian) with
 * search boundaries on both sides; the window runs from the label for at most
 * 520 characters and is cut at the next "stage" word found at least 20
 * characters after the label.
 *
 * Pass 2 runs only when the text contains the "stage" word at all. It looks
 * for a bare stage value that has the "stage" word within the preceding 220
 * characters; the window runs for at most 420 characters and is cut at the
 * next row label.
 *
 * @param normalized Normalized text of a segment.
 * @param stage Stage value taken from the question.
 * @returns The window text, or `null` when the stage is not found.
 */
function romanStageWindow(normalized, stage) {
    const stageCue = normalizeForSearch("\u0441\u0442\u0430\u0434\u0438\u044f");

    // First occurrence of `needle` accepted by hasSearchBoundaries, or -1.
    const firstBoundedHit = (needle) => {
        let from = 0;
        while (from < normalized.length) {
            const found = normalized.indexOf(needle, from);
            if (found < 0) {
                return -1;
            }
            if (hasSearchBoundaries(normalized, found, needle.length)) {
                return found;
            }
            // Resume one character past the end of the rejected occurrence.
            from = found + needle.length + 1;
        }
        return -1;
    };

    for (const variant of romanStageVariants(stage)) {
        const explicitLabels = [
            normalizeForSearch(`\u0441\u0442\u0430\u0434\u0438\u044f ${variant}`),
            normalizeForSearch(`${variant} \u0441\u0442\u0430\u0434\u0438\u044f`),
        ];
        for (const labelText of explicitLabels) {
            const labelAt = firstBoundedHit(labelText);
            if (labelAt < 0) {
                continue;
            }
            const hardEnd = Math.min(normalized.length, labelAt + 520);
            const followingStage = normalized.indexOf(stageCue, labelAt + labelText.length + 20);
            const windowEnd = followingStage > 0 ? Math.min(hardEnd, followingStage) : hardEnd;
            return normalized.slice(labelAt, windowEnd);
        }
    }

    if (!normalized.includes(stageCue)) {
        return null;
    }

    for (const variant of romanStageVariants(stage)) {
        for (let from = 0; from < normalized.length;) {
            const at = normalized.indexOf(variant, from);
            if (at < 0) {
                break;
            }
            from = at + variant.length;
            if (!hasSearchBoundaries(normalized, at, variant.length)) {
                continue;
            }
            const preceding = normalized.slice(Math.max(0, at - 220), at);
            if (!preceding.includes(stageCue)) {
                continue;
            }
            const nextRow = nextRomanStageRowIndex(normalized, at + variant.length + 1);
            const cap = at + 420;
            const windowEnd = nextRow > at ? Math.min(nextRow, cap) : Math.min(normalized.length, cap);
            return normalized.slice(at, windowEnd);
        }
    }
    return null;
}
|
|
3068
|
+
/**
 * Scores an answer against the text describing the stage named in the
 * question (single-choice mode only).
 *
 * A segment contributes when its stage window contains one of the first 16
 * answer search phrases or covers the answer tokens with at least 0.58.
 *
 * @param context Scoring context; reads `mode`, `pages`, `question`, `answer`, `answerTokens`.
 * @returns The best evidence of kind `"roman_stage_segment"` as chosen by
 *   `betterEvidence`, or `null`.
 */
function bestRomanStageSupport({ mode, pages, question, answer, answerTokens }) {
    if (mode !== "single") {
        return null;
    }
    const stage = questionRomanStage(question);
    if (!stage) {
        return null;
    }
    const phrases = answerSearchPhrases(answer.text).slice(0, 16);
    let best = null;
    for (const page of pages) {
        for (const segment of cachedLineWindowSegments(page)) {
            const stageText = romanStageWindow(segment.normalized, stage);
            if (!stageText) {
                continue;
            }
            const stageTokens = tokenizeNormalized(stageText);
            const answerCoverage = strictSoftCoverage(answerTokens, stageTokens);
            const phraseHit = phrases.some((phrase) => containsNormalizedPhrase(stageText, phrase));
            if (!phraseHit && answerCoverage < 0.58) {
                continue;
            }
            const phraseBonus = phraseHit ? 2.4 : 0;
            const score = 12.8 + phraseBonus + answerCoverage * 4.0 + numberCoverage(answer.text, stageText) * 0.8;
            best = betterEvidence(best, {
                answerId: answer.id,
                page: page.page,
                text: segment.text,
                score,
                kind: "roman_stage_segment",
            });
        }
    }
    return best;
}
|
|
3098
|
+
/**
 * Recognizes answers of the form "<kind word> <ordinal>" such as
 * "stage II", "degree 3" or "type 1" (kind words are Russian stems).
 *
 * @param answerText Raw answer text.
 * @returns `{ kind, cue, number }` when the answer contains a kind word and
 *   exactly one distinct ordinal value in the range 1-10; otherwise `null`.
 */
function answerOrdinalLabel(answerText) {
    const normalized = normalizeForSearch(answerText);
    const tokens = normalized.split(/\s+/u).filter(Boolean);
    const kinds = [
        { kind: "stage", cue: normalizeForSearch("\u0441\u0442\u0430\u0434\u0438") },
        { kind: "degree", cue: normalizeForSearch("\u0441\u0442\u0435\u043f\u0435\u043d") },
        { kind: "type", cue: normalizeForSearch("\u0442\u0438\u043f") },
    ];
    const kind = kinds.find((item) => tokens.some((token) => token.startsWith(item.cue)));
    if (!kind) {
        return null;
    }
    const values = new Set();
    // The trailing delimiter is a lookahead, not a consumed character: with a
    // consuming `(?:\s|$)` the single space between two adjacent ordinals
    // ("2 3") was eaten by the first match, the second ordinal was never seen,
    // and an ambiguous answer slipped through the "exactly one value" check.
    for (const match of normalized.matchAll(/(?:^|\s)(\d{1,2}|[ivx]{1,7})(?=\s|$)/giu)) {
        const number = ordinalValueToNumber(match[1]);
        if (number && number > 0 && number <= 10) {
            values.add(number);
        }
    }
    if (values.size !== 1) {
        return null;
    }
    return { kind: kind.kind, cue: kind.cue, number: [...values][0] };
}
|
|
3119
|
+
/**
 * Maps an ordinal kind to the normalized Russian word stem used to detect it.
 *
 * @param kind `"stage"`, `"degree"` or `"type"`; any other value falls back
 *   to the stem for "class".
 * @returns The stem passed through `normalizeForSearch`.
 */
function ordinalKindCue(kind) {
    switch (kind) {
        case "stage":
            return normalizeForSearch("\u0441\u0442\u0430\u0434\u0438");
        case "degree":
            return normalizeForSearch("\u0441\u0442\u0435\u043f\u0435\u043d");
        case "type":
            return normalizeForSearch("\u0442\u0438\u043f");
        default:
            return normalizeForSearch("\u043a\u043b\u0430\u0441\u0441");
    }
}
|
|
3128
|
+
/**
 * Checks whether the text contains a word that starts with the stem of the
 * given ordinal kind (case-insensitively, delimited by whitespace or the
 * string edges).
 *
 * @param normalized Text to test.
 * @param kind Ordinal kind understood by `ordinalKindCue`.
 * @returns `true` when such a word is present.
 */
function hasOrdinalKindCue(normalized, kind) {
    const stem = escapeRegExp(ordinalKindCue(kind));
    const wordWithStem = new RegExp(`(?:^|\\s)${stem}\\S*(?:\\s|$)`, "iu");
    return wordWithStem.test(normalized);
}
|
|
3132
|
+
/**
 * Finds where the row of a different ordinal value (1-10, except
 * `label.number`) begins at or after `start`.
 *
 * For every spelling of every other value only the first occurrence is
 * examined. It is accepted when it is not a conjunction-like lone "i" and a
 * word of the label's kind occurs within the 180 characters before or the 90
 * characters after it.
 *
 * @param normalized Normalized text to scan.
 * @param start Position from which the search begins.
 * @param label Ordinal label of the answer; reads `kind` and `number`.
 * @returns The smallest accepted match index (the position of the whitespace
 *   preceding the numeral when there is one), or -1.
 */
function nextAnswerOrdinalIndex(normalized, start, label) {
    // Loop-invariant: every pattern is matched against the same tail.
    const tail = normalized.slice(start);
    let best = -1;
    for (let number = 1; number <= 10; number += 1) {
        if (number === label.number) {
            continue;
        }
        for (const variant of romanStageVariants(String(number))) {
            const pattern = new RegExp(`(?:^|\\s)${escapeRegExp(variant)}(?:\\s|-|$)`, "iu");
            const match = pattern.exec(tail);
            if (!match) {
                continue;
            }
            const index = start + match.index;
            // The match may begin with the separating whitespace. The
            // conjunction check needs the position of the numeral itself,
            // otherwise its "token after" is the numeral and the check
            // degenerates to "is there an ordinal before".
            const numeralIndex = /^\s/u.test(match[0]) ? index + 1 : index;
            if (isRomanOneConjunctionMatch(normalized, numeralIndex, variant)) {
                continue;
            }
            const before = normalized.slice(Math.max(0, index - 180), index);
            const after = normalized.slice(index, Math.min(normalized.length, index + 90));
            if (!hasOrdinalKindCue(before, label.kind) && !hasOrdinalKindCue(after, label.kind)) {
                continue;
            }
            if (best < 0 || index < best) {
                best = index;
            }
        }
    }
    return best;
}
|
|
3156
|
+
/**
 * Returns the last whitespace-delimited token that lies entirely or partly
 * before `index`.
 *
 * @param normalized Text to inspect.
 * @param index Exclusive end of the inspected prefix.
 * @returns The token, or an empty string when the prefix has none.
 */
function nearestTokenBefore(normalized, index) {
    const words = normalized.slice(0, index).split(/\s+/u).filter(Boolean);
    return words.length > 0 ? words[words.length - 1] : "";
}
|
|
3160
|
+
/**
 * Returns the first whitespace-delimited token found after the span that
 * starts at `index` and is `length` characters long.
 *
 * @param normalized Text to inspect.
 * @param index Start of the span.
 * @param length Length of the span.
 * @returns The token, or an empty string when nothing but whitespace follows.
 */
function nearestTokenAfter(normalized, index, length) {
    const rest = normalized.slice(index + length);
    const firstWord = /\S+/u.exec(rest);
    return firstWord ? firstWord[0] : "";
}
|
|
3164
|
+
/**
 * Detects a lone "i" that sits between two ordinal tokens (e.g. "ii i iii"),
 * which the name suggests is treated as a conjunction rather than as Roman one.
 *
 * @param normalized Text containing the match.
 * @param index Position of the matched variant.
 * @param variant Matched spelling; only `"i"` can yield `true`.
 * @returns `true` when `variant` is `"i"` and both neighbouring tokens convert
 *   to a truthy value via `ordinalValueToNumber`.
 */
function isRomanOneConjunctionMatch(normalized, index, variant) {
    if (variant === "i") {
        const left = ordinalValueToNumber(nearestTokenBefore(normalized, index));
        const right = ordinalValueToNumber(nearestTokenAfter(normalized, index, variant.length));
        return Boolean(left) && Boolean(right);
    }
    return false;
}
|
|
3171
|
+
/**
 * Collects the text windows of `source` that describe the ordinal value of
 * the answer (e.g. the row for "stage II").
 *
 * When the source mentions the label's kind word, windows start at
 *  - direct labels ("<value> <kind>" / "<kind> ... <value>"), and
 *  - bare occurrences of the value that have a kind word within the 220
 *    characters before or the 100 characters after them.
 * Each window ends at the next row of another ordinal value, or after 520
 * characters when no such row is found.
 *
 * When the source does not mention the kind word, the first 520 characters
 * are used if the source begins with the value.
 *
 * @param source Segment with a `normalized` text property.
 * @param label Ordinal label of the answer; reads `kind` and `number`.
 * @returns Array of window strings (possibly empty, possibly overlapping).
 */
function answerOrdinalRowWindows(source, label) {
    const normalized = source.normalized;
    const cue = ordinalKindCue(label.kind);
    const escapedCue = escapeRegExp(cue);
    // Loop-invariant: does not depend on the spelling being examined.
    const sourceHasKindCue = hasOrdinalKindCue(normalized, label.kind);
    const windows = [];
    for (const variant of romanStageVariants(String(label.number))) {
        const escapedVariant = escapeRegExp(variant);
        if (!sourceHasKindCue) {
            const match = normalized.match(new RegExp(`^\\s*${escapedVariant}(?:\\s|$)`, "iu"));
            if (match?.[0]) {
                windows.push(normalized.slice(0, Math.min(normalized.length, 520)));
            }
            continue;
        }
        // Each pattern knows where the numeral sits inside its own match, so
        // the conjunction check is given the numeral's position rather than
        // the start of the match (leading whitespace or the kind word).
        const directPatterns = [
            {
                pattern: new RegExp(`(?:^|\\s)${escapedVariant}(?:\\s|$)(?:-?\\s*\\S{0,3}\\s+)?${escapedCue}`, "giu"),
                numeralOffset: (matched) => (/^\s/u.test(matched) ? 1 : 0),
            },
            {
                pattern: new RegExp(`${escapedCue}\\s+(?:\\S+\\s+){0,2}${escapedVariant}(?:\\s|$)`, "giu"),
                // The numeral is the last thing in the match apart from at
                // most one trailing whitespace character.
                numeralOffset: (matched) => matched.trimEnd().length - variant.length,
            },
        ];
        for (const { pattern, numeralOffset } of directPatterns) {
            for (const match of normalized.matchAll(pattern)) {
                const index = match.index ?? 0;
                if (isRomanOneConjunctionMatch(normalized, index + numeralOffset(match[0]), variant)) {
                    continue;
                }
                const afterStart = index + match[0].length;
                const next = nextAnswerOrdinalIndex(normalized, afterStart + 8, label);
                const end = next > 0 ? next : Math.min(normalized.length, afterStart + 520);
                windows.push(normalized.slice(index, end));
            }
        }
        let start = 0;
        while (start < normalized.length) {
            const index = normalized.indexOf(variant, start);
            if (index < 0) {
                break;
            }
            // Always advance by at least one character so the scan terminates.
            start = index + Math.max(1, variant.length);
            if (!hasSearchBoundaries(normalized, index, variant.length)) {
                continue;
            }
            if (isRomanOneConjunctionMatch(normalized, index, variant)) {
                continue;
            }
            const before = normalized.slice(Math.max(0, index - 220), index);
            const after = normalized.slice(index, Math.min(normalized.length, index + 100));
            if (!hasOrdinalKindCue(before, label.kind) && !hasOrdinalKindCue(after, label.kind)) {
                continue;
            }
            const next = nextAnswerOrdinalIndex(normalized, index + variant.length + 8, label);
            const end = next > 0 ? next : Math.min(normalized.length, index + 520);
            windows.push(normalized.slice(index, end));
        }
    }
    return windows;
}
|
|
3227
|
+
/**
 * Checks whether the text mentions the label's kind word together with a
 * range ("2-4", "2/4", "ii-iv") that contains the label's number.
 *
 * @param normalized Normalized text to inspect.
 * @param label Ordinal label of the answer; reads `kind` and `number`.
 * @returns `true` when a containing range is found, `false` otherwise
 *   (including when the kind word is absent).
 */
function ordinalRangeIncludesValue(normalized, label) {
    if (!hasOrdinalKindCue(normalized, label.kind)) {
        return false;
    }
    const number = label.number;
    const contains = (left, right) => number >= Math.min(left, right) && number <= Math.max(left, right);
    // The trailing delimiter is a lookahead so that the separator between two
    // adjacent ranges ("1-2 3-4") stays available as the leading delimiter of
    // the second one; a consuming `(?:\s|$)` made the scan skip it.
    const digitPatterns = [
        /(?:^|\s)(\d{1,2})\s*-\s*(\d{1,2})(?=\s|$)/giu,
        /(?:^|\s)(\d{1,2})\s*\/\s*(\d{1,2})(?=\s|$)/giu,
    ];
    for (const pattern of digitPatterns) {
        for (const match of normalized.matchAll(pattern)) {
            if (contains(Number(match[1]), Number(match[2]))) {
                return true;
            }
        }
    }
    const romanPattern = /(?:^|\s)(i|ii|iii|iv|v|vi|vii|viii|ix|x)\s*-\s*(i|ii|iii|iv|v|vi|vii|viii|ix|x)(?=\s|$)/giu;
    for (const match of normalized.matchAll(romanPattern)) {
        const left = ordinalValueToNumber(match[1]);
        const right = ordinalValueToNumber(match[2]);
        // Both ends must convert to a truthy number before the range is trusted.
        if (left && right && contains(left, right)) {
            return true;
        }
    }
    return false;
}
|
|
3252
|
+
/**
 * Tokens of generic Russian words ("according to", "classification",
 * "characteristic", "stage", "degree", "type", "class" and some inflections)
 * that are excluded from the specific focus tokens of ordinal answers.
 */
const ANSWER_ORDINAL_GENERIC_FOCUS = (() => {
    const genericWords = [
        "\u0441\u043e\u0433\u043b\u0430\u0441\u043d\u043e",
        "\u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u044f",
        "\u043a\u043b\u0430\u0441\u0441\u0438\u0444\u0438\u043a\u0430\u0446\u0438\u0438",
        "\u0445\u0430\u0440\u0430\u043a\u0442\u0435\u0440\u043d\u043e",
        "\u0445\u0430\u0440\u0430\u043a\u0442\u0435\u0440\u043d\u044b",
        "\u0441\u0442\u0430\u0434\u0438\u044f",
        "\u0441\u0442\u0430\u0434\u0438\u0438",
        "\u0441\u0442\u0435\u043f\u0435\u043d\u044c",
        "\u0441\u0442\u0435\u043f\u0435\u043d\u0438",
        "\u0442\u0438\u043f",
        "\u0442\u0438\u043f\u0430",
        "\u043a\u043b\u0430\u0441\u0441",
        "\u043a\u043b\u0430\u0441\u0441\u0430",
    ];
    const genericTokens = genericWords.flatMap((word) => uniqueTokens(word));
    return new Set(genericTokens);
})();
|
|
3267
|
+
/**
 * Keeps the focus tokens that are specific enough to identify a table row.
 *
 * A token is kept when it is at least 4 characters long, does not start with
 * a digit, is not one of the answer tokens and is not a generic focus word.
 *
 * @param focusTokens Focus tokens (may be nullish).
 * @param answerTokens Answer tokens (may be nullish).
 * @returns The filtered tokens in their original order.
 */
function specificAnswerOrdinalFocusTokens(focusTokens, answerTokens) {
    const fromAnswer = new Set(answerTokens ?? []);
    const isSpecific = (token) => {
        if (!(token.length >= 4)) {
            return false;
        }
        if (/^\d/.test(token)) {
            return false;
        }
        return !fromAnswer.has(token) && !ANSWER_ORDINAL_GENERIC_FOCUS.has(token);
    };
    return (focusTokens ?? []).filter((token) => isSpecific(token));
}
|
|
3271
|
+
/**
 * Counts the distinct pairs of neighbouring focus tokens for which
 * `tokenSequenceIncludes(documentTokens, [left, right])` holds.
 *
 * Pairs with an empty or repeated member are skipped and every distinct pair
 * is tested only once.
 *
 * @param focusTokens Focus tokens in question order (may be nullish).
 * @param documentTokens Tokens of the inspected text (may be nullish).
 * @returns Number of matching pairs; 0 when there are fewer than two focus
 *   tokens or no document tokens.
 */
function orderedFocusPairHits(focusTokens, documentTokens) {
    const tooFewFocusTokens = (focusTokens?.length ?? 0) < 2;
    if (tooFewFocusTokens || !documentTokens?.length) {
        return 0;
    }
    const checkedPairs = new Set();
    let found = 0;
    for (let position = 1; position < focusTokens.length; position += 1) {
        const first = focusTokens[position - 1];
        const second = focusTokens[position];
        if (!first || !second || first === second) {
            continue;
        }
        const pairKey = `${first}\u0000${second}`;
        if (checkedPairs.has(pairKey)) {
            continue;
        }
        checkedPairs.add(pairKey);
        if (tokenSequenceIncludes(documentTokens, [first, second])) {
            found += 1;
        }
    }
    return found;
}
|
|
3290
|
+
/**
 * Scores answers that name an ordinal ("stage II", "degree 3", ...) by finding
 * the row of that ordinal in the document and checking how many specific
 * focus tokens the row contains.
 *
 * Only pages equal or adjacent to a page in `topQuestionPages` are inspected
 * (all pages when that set is missing or empty). In `multi` mode a source
 * that mentions a range containing the ordinal is added as a window as a whole.
 *
 * @param context Scoring context; reads `mode`, `pages`, `topQuestionPages`,
 *   `answer`, `answerTokens`, `focusTokens`.
 * @returns The best evidence of kind `"answer_ordinal_row"` as chosen by
 *   `betterEvidence`, or `null`.
 */
function bestAnswerOrdinalRowSupport({ mode, pages, topQuestionPages, answer, answerTokens, focusTokens }) {
    const label = answerOrdinalLabel(answer.text);
    if (!label) {
        return null;
    }
    const specificTokens = specificAnswerOrdinalFocusTokens(focusTokens, answerTokens);
    if (specificTokens.length < 2) {
        return null;
    }
    const isNearTopPage = (pageNumber) => !topQuestionPages?.size ||
        [pageNumber, pageNumber - 1, pageNumber + 1].some((candidate) => topQuestionPages.has(candidate));
    let best = null;
    for (const page of pages) {
        if (!isNearTopPage(page.page)) {
            continue;
        }
        // Line-window segments first, then the page as a whole.
        const sources = [...cachedLineWindowSegments(page), { normalized: page.normalized, text: page.text }];
        for (const source of sources) {
            const windows = answerOrdinalRowWindows(source, label);
            if (mode === "multi" && ordinalRangeIncludesValue(source.normalized, label)) {
                windows.push(source.normalized);
            }
            for (const window of windows) {
                const windowTokens = tokenizeNormalized(window);
                const focusHits = tokenHitCount(specificTokens, windowTokens);
                if (focusHits < 2) {
                    continue;
                }
                const focusCoverage = coverage(specificTokens, windowTokens);
                const pairHits = orderedFocusPairHits(specificTokens, windowTokens);
                const answerCoverage = strictSoftCoverage(answerTokens, windowTokens);
                const rangeBonus = ordinalRangeIncludesValue(window, label) ? 1.0 : 0;
                const score = 13.4 +
                    Math.min(5, focusHits) * 1.45 +
                    Math.min(0.7, focusCoverage) * 5.4 +
                    Math.min(4, pairHits) * 1.8 +
                    answerCoverage * 2.2 +
                    rangeBonus;
                best = betterEvidence(best, {
                    answerId: answer.id,
                    page: page.page,
                    text: source.text,
                    score,
                    kind: "answer_ordinal_row",
                });
            }
        }
    }
    return best;
}
|
|
3334
|
+
/**
 * Computes the raw score of one answer option and collects its evidence.
 *
 * Every scorer is run once, in a fixed order. "Support" scorers return an
 * evidence object (or a nullish value) whose `score` is weighted; "adjustment"
 * scorers return an object with a signed `adjustment` and optional
 * `evidence` / `support`. The raw score is the left-to-right sum of all
 * weighted terms, damped for list-like questions that have anchor/section
 * segments but no matching support, and finally corrected by the contrast-cue
 * and excluded-condition adjustments.
 *
 * @param context Scoring context shared by all scorers; this function itself
 *   reads `mode`, `config`, `intent`, `anchorSegments`, `sectionSegments`,
 *   `answer` and `answerTokens`.
 * @returns `{ raw, evidence }` where `evidence` is sorted by descending score.
 */
function scoreAnswer(context) {
    const { mode } = context;
    const scoreOf = (support) => support?.score ?? 0;
    const byScoreDesc = (a, b) => b.score - a.score;

    // Run every scorer. The order is kept stable on purpose.
    const anchor = bestAnchorSupport(context);
    const section = bestSectionSupport(context);
    const rowLabel = bestRowLabelSupport(context);
    const focused = bestFocusedSupport(context);
    const lineToken = lineTokenApplicable(context) ? bestLineTokenSupport(context) : null;
    const prefix = bestPrefixSupport(context);
    const phrase = bestPhraseSupport(context);
    const precedingLabel = bestPrecedingQuestionLabelSupport(context);
    const exactAnswer = bestExactAnswerSupport(context);
    const chunk = bestChunkSupport(context);
    const polarity = polarityAdjustment(context);
    const temporal = temporalCueAdjustment(context);
    const conditionPair = conditionPairAdjustment(context);
    const riskCondition = riskConditionAdjustment(context);
    const genericPopulation = genericPopulationConditionAdjustmentForMode(context);
    const classSubject = bestClassSubjectSupport(context);
    const frequency = bestFrequencyRecommendationSupport(context);
    // Placeholder: contributes nothing to the score or the evidence.
    const negativeLocal = { adjustment: 0, evidence: null };
    const boundedList = bestBoundedListSupport(context);
    const ordinalList = bestOrdinalListSupport(context);
    const typeOrdinal = bestTypeOrdinalSupport(context);
    const indicationLabel = bestIndicationSegmentSupport(context);
    const labelDefinition = bestLabelDefinitionSupport(context);
    const recommendationPolarity = recommendationPolarityAdjustment(context);
    const exactNumericOption = bestExactNumericOptionSupport(context);
    const exactHourAlias = bestExactHourAliasOptionSupport(context);
    const ageEligibility = ageEligibilityAdjustment(context);
    const drugDose = bestDrugDoseSupport(context);
    const termDefinition = bestTermDefinitionSupport(context);
    const negatedAnswerPrefix = negatedAnswerPrefixAdjustment(context);
    const impossibilityOnly = impossibilityOnlyAdjustment(context);
    const activeTherapyIndication = activeTherapyIndicationAdjustment(context);
    const recommendationItem = bestRecommendationItemSupport(context);
    const explicitRecommendationTarget = explicitRecommendationTargetAdjustment(context);
    const conditionedNumber = bestConditionedNumberSupport(context);
    const numericCondition = bestNumericConditionSupport(context);
    const countRelation = context.config?.countRelationBoost ? bestCountRelationSupport(context) : null;
    const ageForm = bestAgeFormSupport(context);
    const fibrosisStage = bestFibrosisStageSupport(context);
    // Placeholder: always null, so its term is always 0.
    const conditionNumber = null;
    const romanStage = bestRomanStageSupport(context);
    const answerOrdinalRow = bestAnswerOrdinalRowSupport(context);
    const clozeGap = bestClozeGapSupport(context);
    const visualTableColumn = bestVisualTableColumnSupport(context);
    const coordinateTableRow = bestCoordinateTableRowSupport(context);
    const coordinateTableGroup = bestCoordinateTableGroupSupport(context);
    const coordinateMultiCellRow = bestCoordinateMultiCellRowSupport(context);
    const coordinateTableMembership = bestCoordinateTableMembershipSupport(context);
    const parentheticalGroup = bestParentheticalGroupSupport(context);
    const questionContinuationList = bestQuestionContinuationListSupport(context);
    const shortMedicalAlias = bestShortMedicalAliasSupport(context);
    const latinFuzzy = bestLatinFuzzySupport(context);
    const geneSentence = bestGeneSentenceSupport(context);
    const clinicalFeature = clinicalFeatureAdjustment(context);
    const mkbClassExclusion = bestMkbClassExclusionSupport(context);
    const labelNumber = bestLabelNumberSupport(context);
    const classificationCode = bestClassificationCodeSupport(context);
    const exactShortLabelRow = bestExactShortLabelRowSupport(context);
    const shortLabelRow = bestShortLabelRowSupport(context);

    const answerTokens = context.answerTokens;
    const numbers = extractNumbers(context.answer.text);

    // Mode- and kind-dependent weights.
    const answerPhraseFound = ["answer_window", "answer_after_question", "question_answer_phrase"].includes(phrase?.kind);
    let phraseWeight = 0;
    if (phrase?.kind === "answer_window") {
        phraseWeight = 0.55;
    }
    else if (phrase?.kind === "answer_directional_window") {
        phraseWeight = 0.95;
    }
    else if (phrase) {
        phraseWeight = 1.15;
    }
    const isMulti = mode === "multi";
    const focusedWeight = isMulti ? 0.15 : 0.9;
    const lineTokenWeight = mode === "single" ? 0.85 : 0;
    const latinFuzzyWeight = isMulti && polarity.evidence?.kind !== "polarity_mismatch" ? 1.15 : 0;

    // Terms are listed and summed strictly left to right so that the
    // floating-point result does not depend on regrouping.
    const terms = [
        scoreOf(anchor) * 1.35,
        scoreOf(section) * 1.2,
        scoreOf(rowLabel) * 0.95,
        scoreOf(focused) * focusedWeight,
        scoreOf(lineToken) * lineTokenWeight,
        scoreOf(prefix) * 1.15,
        scoreOf(phrase) * phraseWeight,
        scoreOf(precedingLabel) * 1.3,
        scoreOf(exactAnswer) * 1.08,
        scoreOf(chunk) * 1.0,
        polarity.adjustment,
        scoreOf(temporal.support) * 1.0,
        temporal.adjustment,
        conditionPair.adjustment,
        riskCondition.adjustment,
        genericPopulation.adjustment,
        scoreOf(classSubject) * 1.15,
        scoreOf(frequency) * 1.1,
        negativeLocal.adjustment,
        scoreOf(boundedList.support) * 1.15,
        boundedList.adjustment,
        scoreOf(ordinalList) * 1.15,
        scoreOf(typeOrdinal) * 1.15,
        scoreOf(indicationLabel) * 1.15,
        scoreOf(labelDefinition) * 1.15,
        scoreOf(recommendationPolarity.support) * 1.05,
        recommendationPolarity.adjustment,
        scoreOf(exactNumericOption) * 1.04,
        scoreOf(exactHourAlias) * 1.08,
        ageEligibility.adjustment,
        scoreOf(drugDose) * 1.15,
        scoreOf(termDefinition) * 1.15,
        negatedAnswerPrefix.adjustment,
        impossibilityOnly.adjustment,
        activeTherapyIndication.adjustment,
        scoreOf(recommendationItem) * 1.1,
        scoreOf(explicitRecommendationTarget.support) * 1.05,
        explicitRecommendationTarget.adjustment,
        scoreOf(conditionedNumber) * 1.1,
        scoreOf(numericCondition) * 1.05,
        scoreOf(countRelation) * 1.1,
        scoreOf(ageForm) * 1.15,
        scoreOf(fibrosisStage) * 1.15,
        scoreOf(conditionNumber) * 1.15,
        scoreOf(romanStage) * 1.15,
        scoreOf(answerOrdinalRow) * 1.15,
        scoreOf(clozeGap) * 1.12,
        scoreOf(visualTableColumn) * 1.18,
        scoreOf(coordinateTableRow) * 1.12,
        scoreOf(coordinateTableGroup) * 1.16,
        scoreOf(coordinateMultiCellRow) * 1.16,
        scoreOf(coordinateTableMembership) * 1.1,
        scoreOf(parentheticalGroup) * 1.16,
        scoreOf(questionContinuationList) * 1.1,
        scoreOf(shortMedicalAlias) * 0.35,
        scoreOf(latinFuzzy) * latinFuzzyWeight,
        scoreOf(geneSentence) * 1.18,
        scoreOf(clinicalFeature.support) * 1.12,
        clinicalFeature.adjustment,
        scoreOf(mkbClassExclusion.support) * 1.12,
        mkbClassExclusion.adjustment,
        scoreOf(labelNumber) * 1.15,
        scoreOf(classificationCode) * 1.15,
        scoreOf(exactShortLabelRow) * 1.2,
        scoreOf(shortLabelRow) * 1.15,
        answerPhraseFound ? 0.35 : 0,
        numbers.length ? numberSpecificity(context.answer.text) * 0.35 : 0,
        Math.min(0.35, answerTokens.length * 0.015),
    ];
    let raw = terms.reduce((total, term) => total + term);

    // List-like questions: damp answers that miss the anchor/section segments.
    const { listLike } = context.intent;
    if (listLike && context.anchorSegments?.length && !anchor) {
        raw *= 0.62;
    }
    if (listLike && context.sectionSegments?.length && !section) {
        raw *= 0.72;
    }

    const evidence = [
        anchor,
        section,
        rowLabel,
        focused,
        lineToken,
        prefix,
        phrase,
        precedingLabel,
        exactAnswer,
        chunk,
        polarity.evidence,
        temporal.support,
        temporal.evidence,
        conditionPair.evidence,
        riskCondition.evidence,
        genericPopulation.evidence,
        classSubject,
        frequency,
        negativeLocal.evidence,
        boundedList.support,
        boundedList.evidence,
        ordinalList,
        typeOrdinal,
        indicationLabel,
        labelDefinition,
        recommendationPolarity.support,
        recommendationPolarity.evidence,
        exactNumericOption,
        exactHourAlias,
        ageEligibility.evidence,
        drugDose,
        termDefinition,
        negatedAnswerPrefix.evidence,
        impossibilityOnly.evidence,
        activeTherapyIndication.evidence,
        recommendationItem,
        explicitRecommendationTarget.support,
        explicitRecommendationTarget.evidence,
        conditionedNumber,
        numericCondition,
        countRelation,
        ageForm,
        fibrosisStage,
        conditionNumber,
        romanStage,
        answerOrdinalRow,
        clozeGap,
        visualTableColumn,
        coordinateTableRow,
        coordinateTableGroup,
        coordinateMultiCellRow,
        coordinateTableMembership,
        parentheticalGroup,
        questionContinuationList,
        shortMedicalAlias,
        latinFuzzy,
        geneSentence,
        clinicalFeature.support,
        clinicalFeature.evidence,
        mkbClassExclusion.support,
        mkbClassExclusion.evidence,
        labelNumber,
        classificationCode,
        exactShortLabelRow,
        shortLabelRow,
    ].filter(Boolean);

    // Both late adjustments receive the evidence list sorted in place
    // (best first) and may append one evidence item of their own.
    const contrastCue = contrastCueMismatchAdjustment(context, evidence.sort(byScoreDesc));
    raw += contrastCue.adjustment;
    if (contrastCue.evidence) {
        evidence.push(contrastCue.evidence);
    }
    const excludedCondition = excludedConditionMismatchAdjustment(context, evidence.sort(byScoreDesc));
    raw += excludedCondition.adjustment;
    if (excludedCondition.evidence) {
        evidence.push(excludedCondition.evidence);
    }
    evidence.sort(byScoreDesc);
    return { raw, evidence };
}
|
|
3553
|
+
/**
 * Runs the local, non-LLM predictor that picks the answer(s) to a question.
 *
 * The PDF source is the first populated field among `pdfData`, `pdfBuffer`,
 * `pdf`, `file`, `blob`, `pdfUrl`, `url` and `pdfPath`. The PDF runtime (page
 * text, chunks and search index) is obtained through `getPdfRuntime`, every
 * answer option is scored against the document, the raw scores are
 * post-processed and calibrated, and the ids of the selected answers are
 * returned together with the highest-scoring evidence items.
 *
 * @param input Request carrying the PDF source, `question`, `answers` (or
 *   `variants`), an optional `cacheKey` and `mode`. Any mode other than
 *   "multi" is treated as "single".
 * @param options Optional overrides merged over DEFAULT_CONFIG, plus
 *   `pdfjsLib` (forwarded to getPdfRuntime) and `diagnostics` (adds per-answer
 *   evidence diagnostics to the result).
 * @returns Selected answer ids, mode, confidence, calibrated scores, raw
 *   scores, evidence and document metadata.
 * @throws {Error} When no PDF source or no answers are supplied.
 */
export async function predict(input, options = {}) {
    const config = { ...DEFAULT_CONFIG, ...options };

    // First non-nullish PDF source wins, in this fixed priority order.
    const pdfInput = input.pdfData
        ?? input.pdfBuffer
        ?? input.pdf
        ?? input.file
        ?? input.blob
        ?? input.pdfUrl
        ?? input.url
        ?? input.pdfPath;
    if (!pdfInput) {
        throw new Error("predict input requires pdfData, pdfUrl, file/blob, or pdfPath-compatible data");
    }

    const mode = input.mode === "multi" ? "multi" : "single";
    const isMulti = mode === "multi";
    const answers = normalizeAnswers(input.answers ?? input.variants ?? []);
    if (!answers.length) {
        throw new Error("predict input requires answers");
    }

    const runtime = await getPdfRuntime(pdfInput, {
        cacheKey: input.cacheKey ?? input.pdfPath ?? input.pdfUrl ?? input.url,
        pdfjsLib: options.pdfjsLib,
    });
    const { pdfText, chunks, index } = runtime;
    const { pages } = pdfText;

    // Question-level features, computed once and shared by every answer.
    const question = String(input.question ?? "");
    const questionTokens = uniqueTokens(question);
    const focusTokens = questionFocusTokens(question);
    const intent = detectQuestionIntent(question);
    const anchorSegments = findAnchorSegments(pages, question);
    const sectionSegments = findSectionSegments(pages, question);
    const topQuestionPages = new Set(index.search(questionTokens, { limit: 6 }).map((hit) => hit.chunk.page));
    const rowSegments = findRowSegments(pages, question, topQuestionPages);
    const boundedListSegments = findBoundedListSegments(pages, question, topQuestionPages, mode, intent);

    // Optional table indexes: each one is built only when its cue fires and
    // stays null otherwise.
    let visualTableColumnTargetsByPage = null;
    if (isMulti && hasVisualTableColumnCue(question, focusTokens)) {
        visualTableColumnTargetsByPage = buildVisualTableColumnTargetsByPage(pages, question, focusTokens, topQuestionPages);
    }
    let coordinateTableRowsByPage = null;
    if (hasCoordinateTableCue(question, focusTokens)) {
        coordinateTableRowsByPage = buildCoordinateTableRowsByPage(pages, topQuestionPages);
    }
    let coordinateTableGroupsByPage = null;
    if (isMulti && hasCoordinateTableGroupCue(question, focusTokens, intent)) {
        coordinateTableGroupsByPage = buildCoordinateTableGroupsByPage(pages, topQuestionPages);
    }
    let coordinateMultiCellRowsByPage = null;
    if (isMulti && hasCoordinateTableGroupCue(question, focusTokens, intent)) {
        coordinateMultiCellRowsByPage = buildCoordinateMultiCellRowsByPage(pages, topQuestionPages);
    }
    let coordinateTableMembershipsByPage = null;
    if (isMulti && hasCoordinateTableGroupCue(question, focusTokens, intent)) {
        coordinateTableMembershipsByPage = buildCoordinateTableMembershipsByPage(pages, topQuestionPages);
    }

    const sharedScoringContext = {
        pages,
        chunks,
        index,
        config,
        mode,
        question,
        answers,
        questionTokens,
        topQuestionPages,
        focusTokens,
        intent,
        anchorSegments,
        sectionSegments,
        rowSegments,
        boundedListSegments,
        visualTableColumnTargetsByPage,
        coordinateTableRowsByPage,
        coordinateTableGroupsByPage,
        coordinateMultiCellRowsByPage,
        coordinateTableMembershipsByPage,
    };

    let answerScores = answers.map((answer) => {
        const answerTokens = uniqueTokens(answer.text);
        const { raw, evidence: answerEvidence } = scoreAnswer({ ...sharedScoringContext, answer, answerTokens });
        return { answer, raw, evidence: answerEvidence };
    });

    // True when the scored answer carries at least one evidence item of the given kinds.
    const hasEvidenceKind = (item, ...kinds) => item.evidence.some((evidenceItem) => kinds.includes(evidenceItem.kind));

    if (isMulti && config.sharedMultiSegmentBoost) {
        answerScores = addSharedMultiSegmentSupport(answerScores, intent, question);
    }
    answerScores = applyGeneSentenceSetSupport(answerScores, mode, question);

    // Single-choice definition questions: once any answer has a label-definition
    // hit, answers without one are scaled down.
    if (!isMulti
        && questionDefinitionLabel(question)
        && answerScores.some((item) => hasEvidenceKind(item, "label_definition_segment"))) {
        answerScores = answerScores.map((item) => {
            if (hasEvidenceKind(item, "label_definition_segment")) {
                return item;
            }
            return { ...item, raw: item.raw * 0.48 };
        });
    }

    // Multi-choice with fuzzy Latin matches present: answers that contain Latin
    // tokens but have no Latin/gene-sentence support are scaled down.
    if (isMulti && answerScores.some((item) => hasEvidenceKind(item, "latin_fuzzy_ocr"))) {
        answerScores = answerScores.map((item) => {
            const hasLatin = latinAnswerTokens(item.answer.text).length > 0;
            const hasLatinSupport = hasEvidenceKind(item, "latin_fuzzy_ocr", "gene_sentence_segment");
            if (!hasLatin || hasLatinSupport) {
                return item;
            }
            return { ...item, raw: item.raw * 0.68 };
        });
    }

    answerScores = applyFrozenFeatureRanker(answerScores, mode, config, { question });
    const calibrated = calibrateScores(answerScores);
    const selected = selectAnswers(calibrated, mode, config);
    const confidence = predictionConfidence(calibrated, selected, mode);

    const scores = Object.fromEntries(calibrated.map((item) => [item.answer.id, item.score]));
    const rawScores = Object.fromEntries(calibrated.map((item) => [item.answer.id, round4(item.raw)]));

    // Flatten all per-answer evidence, tag it with its answer id, rank by score
    // and keep only the configured number of items.
    const rankedEvidence = [];
    for (const item of calibrated) {
        for (const evidenceItem of item.evidence) {
            rankedEvidence.push({ ...evidenceItem, answerId: item.answer.id, score: round4(evidenceItem.score) });
        }
    }
    rankedEvidence.sort((a, b) => b.score - a.score);
    const evidence = rankedEvidence.slice(0, config.evidenceLimit);

    const diagnostics = options.diagnostics
        ? { answerEvidence: buildAnswerEvidenceDiagnostics(calibrated) }
        : undefined;

    return {
        selected,
        mode,
        confidence: round4(confidence),
        scores,
        rawScores,
        evidence,
        ...(diagnostics ? { diagnostics } : {}),
        meta: {
            pageCount: pdfText.pageCount,
            chunks: chunks.length,
            ocrNeeded: pdfText.ocrNeeded,
            intent,
        },
    };
}
|
|
3677
|
+
// Evidence kinds that confidenceEvidenceSummary counts as structural support.
const CONFIDENCE_STRUCTURAL_KINDS = new Set([
    "coordinate_table_row",
    "coordinate_table_group",
    "coordinate_table_group_inverse",
    "coordinate_table_multicell_row",
    "coordinate_table_membership",
    "parenthetical_group_segment",
    "preceding_question_label",
    "question_continuation_list",
    "exact_numeric_option_segment",
    "exact_hour_alias_segment",
    "visual_table_column",
    "exact_short_label_visual_row",
    "short_label_visual_row",
    "answer_ordinal_row",
    "fibrosis_stage_row",
    "gene_sentence_segment",
    "clinical_feature_segment",
    "mkb_class_exclusion_absent",
    "classification_code_segment",
    "label_number_proximity",
    "label_definition_segment",
    "row_label_segment",
    "bounded_list_segment",
    "ordinal_list_segment",
    "drug_dose_segment",
    "recommendation_item_segment",
    "explicit_recommendation_target_segment",
    "numeric_condition_less_than",
    "numeric_condition_more_than",
    "numeric_condition_equal",
    "conditioned_number_segment",
    "cloze_gap_local",
]);

// Evidence kinds that confidenceEvidenceSummary counts as broad (search-style) support.
const CONFIDENCE_BROAD_KINDS = new Set([
    "bm25_question_answer",
    "question_chunk_answer",
    "answer_chunk_question",
    "answer_window",
    "focused_answer_window",
    "shared_multi_segment",
]);

/**
 * Summarizes the evidence attached to one scored answer.
 *
 * @param item Scored answer; `item.evidence` may be missing, in which case it
 *   is treated as empty.
 * @returns `bestScore`/`bestKind` of the highest-scoring evidence item with a
 *   score above 0, `structuralScore` (the largest score among structural
 *   kinds), `hasStructural` (structuralScore above 0) and `broadOnly` (at
 *   least one broad-kind item and no positive structural score).
 */
function confidenceEvidenceSummary(item) {
    let topScore = 0;
    let topKind = "";
    let strongestStructural = 0;
    let broadHits = 0;

    for (const entry of item.evidence ?? []) {
        const entryScore = Number(entry.score ?? 0);
        if (entryScore > topScore) {
            topScore = entryScore;
            topKind = String(entry.kind ?? "");
        }
        if (CONFIDENCE_STRUCTURAL_KINDS.has(entry.kind)) {
            strongestStructural = Math.max(strongestStructural, entryScore);
        }
        if (CONFIDENCE_BROAD_KINDS.has(entry.kind)) {
            broadHits += 1;
        }
    }

    const hasStructural = strongestStructural > 0;
    const broadOnly = broadHits > 0 && strongestStructural <= 0;
    return {
        bestScore: topScore,
        bestKind: topKind,
        structuralScore: strongestStructural,
        hasStructural,
        broadOnly,
    };
}
|
|
3736
|
+
/**
 * Clamps a confidence value into the closed range [0.05, 0.99].
 *
 * Math.min/Math.max return NaN as soon as any argument is NaN, so a NaN (or
 * non-numeric) input would otherwise pass through unclamped. Such inputs are
 * mapped to the lower bound.
 *
 * @param value Candidate confidence.
 * @returns A number within [0.05, 0.99].
 */
function clampConfidence(value) {
    // Same numeric coercion Math.min would apply, done once up front.
    const numeric = Number(value);
    if (Number.isNaN(numeric)) {
        return 0.05;
    }
    return Math.max(0.05, Math.min(0.99, numeric));
}
|
|
3739
|
+
/**
 * Computes the final confidence of a prediction; it never changes which
 * answers are selected.
 *
 * The base value is the best calibrated score of the selected answers in
 * "single" mode, or their mean in any other mode. Penalties are subtracted
 * when selected answers rest only on broad search evidence, when no selected
 * answer has structural evidence, when the raw-score gap (top vs. runner-up in
 * single mode, weakest selected vs. strongest unselected otherwise) is small,
 * or when almost every option is selected without structural support. A small
 * bonus is added for structural evidence, and the result is clamped.
 *
 * @param calibrated Scored answers with `answer.id`, `score`, `raw` and `evidence`.
 * @param selected Ids of the selected answers.
 * @param mode "single" or "multi".
 * @returns The value returned by clampConfidence.
 */
function predictionConfidence(calibrated, selected, mode) {
    const isSingle = mode === "single";

    // Three-step ladder: the smaller the gap, the larger the penalty.
    const gapPenalty = (gap, tight, medium, loose) => {
        if (gap < tight) {
            return 0.095;
        }
        if (gap < medium) {
            return 0.06;
        }
        if (gap < loose) {
            return 0.03;
        }
        return 0;
    };

    const selectedScores = selected.map((id) => calibrated.find((item) => item.answer.id === id)?.score ?? 0);
    let baseConfidence;
    if (isSingle) {
        baseConfidence = Math.max(...selectedScores, 0);
    }
    else {
        let total = 0;
        for (const value of selectedScores) {
            total += value;
        }
        baseConfidence = total / (selectedScores.length || 1);
    }

    const selectedIds = new Set(selected);
    const chosen = calibrated.filter((item) => selectedIds.has(item.answer.id));
    const summaries = chosen.map((item) => confidenceEvidenceSummary(item));
    const broadOnlyCount = summaries.filter((summary) => summary.broadOnly).length;
    const structuralCount = summaries.filter((summary) => summary.hasStructural).length;
    const chosenCount = Math.max(1, chosen.length);

    let penalty = 0;
    if (broadOnlyCount) {
        penalty += Math.min(0.16, 0.045 * broadOnlyCount);
    }
    if (!structuralCount && chosen.length) {
        penalty += isSingle ? 0.035 : 0.055;
    }

    if (isSingle) {
        const [best, runnerUp] = [...calibrated].sort((a, b) => b.raw - a.raw);
        if (best && runnerUp) {
            const gap = best.raw - runnerUp.raw;
            penalty += gapPenalty(gap, 0.35, 0.85, 1.5);
            if (confidenceEvidenceSummary(best).broadOnly && gap < 2.2) {
                penalty += 0.045;
            }
        }
    }
    else {
        const chosenRaw = chosen.map((item) => item.raw);
        const otherRaw = calibrated.filter((item) => !selectedIds.has(item.answer.id)).map((item) => item.raw);
        const weakestChosen = chosenRaw.length ? Math.min(...chosenRaw) : 0;
        const strongestOther = otherRaw.length ? Math.max(...otherRaw) : 0;
        penalty += gapPenalty(weakestChosen - strongestOther, 0.2, 0.7, 1.4);
        // Nearly every option selected, none of them structurally supported.
        const nearlyAllSelected = selected.length >= calibrated.length - 1 && calibrated.length >= 4;
        if (nearlyAllSelected && !structuralCount) {
            penalty += 0.04;
        }
        penalty += Math.min(0.08, (broadOnlyCount / chosenCount) * 0.07);
    }

    let structuralBonus = 0;
    if (structuralCount) {
        let cappedTotal = 0;
        for (const summary of summaries) {
            cappedTotal += Math.min(18, summary.structuralScore);
        }
        structuralBonus = Math.min(0.035, cappedTotal / chosenCount / 600);
    }
    return clampConfidence(baseConfidence - penalty + structuralBonus);
}
|
|
3797
|
+
/**
 * Builds per-answer evidence diagnostics keyed by answer id.
 *
 * For every calibrated answer the entry holds the number of evidence items,
 * the number of distinct finite pages, the best evidence score, per-kind
 * counts, per-kind best scores and a compact `refs` list (page, kind, score).
 * Missing kinds are reported as "unknown", missing scores as 0 and non-finite
 * pages as 0 in `refs`.
 *
 * @param calibrated Scored answers with `answer.id` and optional `evidence`.
 * @returns Plain object mapping answer id to its diagnostics record.
 */
function buildAnswerEvidenceDiagnostics(calibrated) {
    const kindOf = (evidenceItem) => String(evidenceItem.kind ?? "unknown");
    const scoreOf = (evidenceItem) => Number(evidenceItem.score ?? 0);

    const entries = calibrated.map((item) => {
        const evidenceList = item.evidence ?? [];
        const kindCounts = {};
        const kindBestScores = {};
        const seenPages = new Set();
        let topScore = 0;

        for (const evidenceItem of evidenceList) {
            const kind = kindOf(evidenceItem);
            const score = scoreOf(evidenceItem);
            kindCounts[kind] = (kindCounts[kind] ?? 0) + 1;
            kindBestScores[kind] = round4(Math.max(kindBestScores[kind] ?? 0, score));
            topScore = Math.max(topScore, score);
            if (Number.isFinite(evidenceItem.page)) {
                seenPages.add(evidenceItem.page);
            }
        }

        const refs = evidenceList.map((evidenceItem) => ({
            page: Number.isFinite(evidenceItem.page) ? evidenceItem.page : 0,
            kind: kindOf(evidenceItem),
            score: round4(scoreOf(evidenceItem)),
        }));

        const record = {
            evidenceCount: evidenceList.length,
            uniqueEvidencePages: seenPages.size,
            bestEvidenceScore: round4(topScore),
            kindCounts,
            kindBestScores,
            refs,
        };
        return [item.answer.id, record];
    });

    return Object.fromEntries(entries);
}
|
|
3829
|
+
/**
 * Clears the predictor's in-memory caches by delegating to
 * `clearPdfRuntimeCache`.
 */
export function clearPredictorCache() {
    clearPdfRuntimeCache();
}
|