med-pdf-nmo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +298 -0
  3. package/README.ru.md +298 -0
  4. package/dist/bm25.d.ts +47 -0
  5. package/dist/bm25.js +86 -0
  6. package/dist/browser-shims/buffer.d.ts +30 -0
  7. package/dist/browser-shims/buffer.js +31 -0
  8. package/dist/browser-shims/crypto.d.ts +33 -0
  9. package/dist/browser-shims/crypto.js +45 -0
  10. package/dist/browser-shims/fs-promises.d.ts +13 -0
  11. package/dist/browser-shims/fs-promises.js +25 -0
  12. package/dist/browser-shims/fs.d.ts +14 -0
  13. package/dist/browser-shims/fs.js +24 -0
  14. package/dist/browser-shims/globals.d.ts +9 -0
  15. package/dist/browser-shims/globals.js +23 -0
  16. package/dist/browser-shims/path.d.ts +57 -0
  17. package/dist/browser-shims/path.js +65 -0
  18. package/dist/browser-shims/process.d.ts +22 -0
  19. package/dist/browser-shims/process.js +27 -0
  20. package/dist/browser.d.ts +9 -0
  21. package/dist/browser.js +12 -0
  22. package/dist/chunk.d.ts +15 -0
  23. package/dist/chunk.js +76 -0
  24. package/dist/cli.d.ts +2 -0
  25. package/dist/cli.js +87 -0
  26. package/dist/index.d.ts +82 -0
  27. package/dist/index.js +51 -0
  28. package/dist/med-pdf-nmo.browser.js +40413 -0
  29. package/dist/med-pdf-nmo.browser.mjs +40395 -0
  30. package/dist/normalize.d.ts +73 -0
  31. package/dist/normalize.js +477 -0
  32. package/dist/pdf.d.ts +35 -0
  33. package/dist/pdf.js +396 -0
  34. package/dist/predictor/config.d.ts +28 -0
  35. package/dist/predictor/config.js +26 -0
  36. package/dist/predictor/constants.d.ts +3 -0
  37. package/dist/predictor/constants.js +59 -0
  38. package/dist/predictor/runtime.d.ts +15 -0
  39. package/dist/predictor/runtime.js +59 -0
  40. package/dist/predictor/scorers/biomedical-symbols.d.ts +36 -0
  41. package/dist/predictor/scorers/biomedical-symbols.js +347 -0
  42. package/dist/predictor/scorers/coordinate-table.d.ts +82 -0
  43. package/dist/predictor/scorers/coordinate-table.js +1210 -0
  44. package/dist/predictor/scorers/direction.d.ts +71 -0
  45. package/dist/predictor/scorers/direction.js +345 -0
  46. package/dist/predictor/scorers/drug-dose.d.ts +6 -0
  47. package/dist/predictor/scorers/drug-dose.js +221 -0
  48. package/dist/predictor/scorers/exact-answer.d.ts +10 -0
  49. package/dist/predictor/scorers/exact-answer.js +75 -0
  50. package/dist/predictor/scorers/fibrosis-stage.d.ts +6 -0
  51. package/dist/predictor/scorers/fibrosis-stage.js +103 -0
  52. package/dist/predictor/scorers/focused.d.ts +40 -0
  53. package/dist/predictor/scorers/focused.js +204 -0
  54. package/dist/predictor/scorers/frequency.d.ts +10 -0
  55. package/dist/predictor/scorers/frequency.js +203 -0
  56. package/dist/predictor/scorers/numeric.d.ts +77 -0
  57. package/dist/predictor/scorers/numeric.js +1161 -0
  58. package/dist/predictor/scorers/recommendation-item.d.ts +27 -0
  59. package/dist/predictor/scorers/recommendation-item.js +469 -0
  60. package/dist/predictor/scorers/search.d.ts +41 -0
  61. package/dist/predictor/scorers/search.js +515 -0
  62. package/dist/predictor/selection.d.ts +30 -0
  63. package/dist/predictor/selection.js +370 -0
  64. package/dist/predictor/text-utils.d.ts +49 -0
  65. package/dist/predictor/text-utils.js +497 -0
  66. package/dist/predictor/types.d.ts +23 -0
  67. package/dist/predictor/types.js +1 -0
  68. package/dist/predictor.d.ts +52 -0
  69. package/dist/predictor.js +3834 -0
  70. package/package.json +82 -0
@@ -0,0 +1,1161 @@
1
+ import { coverage, extractNumbers, normalizeForSearch, normalizeText, phraseTokens, tokenize, uniqueTokens, } from "../../normalize.js";
2
+ import { FOCUS_STOPWORDS } from "../constants.js";
3
+ import { frequencyAnswer, frequencySearchPhrases } from "./frequency.js";
4
+ import { answerSearchPhrases, betterEvidence, cachedLineWindowSegments, containsNormalizedPhrase, escapeRegExp, evidenceSnippet, expandNumberToken, findPhraseOccurrences, hasSearchBoundaries, nearestCueName, numberCoverage, proximityBonus, strictSoftCoverage, tokenBoundaryIncludes, tokenizeNormalized, tokenHitCount, } from "../text-utils.js";
5
/**
 * Generic question vocabulary that is excluded from cloze focus tokens.
 * The word lists are joined with single spaces and passed through
 * `uniqueTokens`, so the set holds whatever token forms that helper produces.
 */
const CLOZE_GENERIC_FOCUS = (() => {
    const vocabulary = [
        // patient / population words
        "пациент пациенты пациентам больной больных дети детей ребенок ребенка",
        // recommendation verbs
        "рекомендуется проводится применяется назначается принимается используют",
        // generic statement words
        "составляет относятся следующие критерии показатель значение терапия лечение",
        // guideline boilerplate
        "клинический рекомендации заболевание диагноз подтвержденный форма",
        // modal words and prepositions
        "обычно необходимо следует возможно после перед при для",
    ];
    return new Set(uniqueTokens(vocabulary.join(" ")));
})();
12
/**
 * Tokens that must appear to the right of the blank for the cloze gap scorer
 * to run (see the gate in `bestClozeGapSupport`).
 */
const CLOZE_COUNT_RIGHT_TOKENS = new Set(
    uniqueTokens("раз сутки прием приём день"),
);
13
/**
 * Phrases that each add one penalty point in `clozeContrastPenalty` when found
 * in the text just before a candidate answer. Stored in the
 * `normalizeForSearch` form so they can be compared against normalized text.
 */
const CLOZE_CONTRAST_PHRASES = Array.from([
    "при менее",
    "менее выраж",
    "далее",
    "после",
    "либо",
    "или",
    "для декрет",
    "декретирован",
    "старше",
    "от 1 года",
    "через",
], (phrase) => normalizeForSearch(phrase));
26
/**
 * Written-out and abbreviated Russian aliases for the numbers 1 to 6, keyed by
 * the digit string. Used to find a single numeric answer that the source text
 * spells differently.
 */
const SMALL_NUMBER_ALIASES = new Map([
    ["1", ["один", "одна", "одно", "однократно", "однократное", "однократный", "однократная", "1 раз", "1 р"]],
    ["2", ["два", "две", "дважды", "двукратно", "двукратное", "двукратный", "двукратная", "2 раза", "2 р"]],
    ["3", ["три", "трижды", "трехкратно", "трёхкратно", "3 раза", "3 р"]],
    ["4", ["четыре", "четырехкратно", "четырёхкратно", "4 раза", "4 р"]],
    ["5", ["пять", "5 раз", "5 р"]],
    ["6", ["шесть", "6 раз", "6 р"]],
]);
34
/**
 * Splits a cloze question around its first blank marker.
 *
 * A blank is a run of two or more underscores or one or more ellipsis
 * characters (`…`).
 *
 * @param {unknown} question - Question text; `null`/`undefined` become "".
 * @returns {{ left: string, right: string }} Text before and after the blank.
 *   When there is no blank, `left` is the whole question and `right` is "".
 */
function clozeQuestionParts(question) {
    const raw = String(question ?? "");
    const blank = raw.match(/_{2,}|…+/u);
    // Test the match object itself, not its index: a blank at offset 0 is still
    // a blank, and its right-hand context must not be discarded.
    if (!blank)
        return { left: raw, right: "" };
    const start = blank.index ?? 0;
    return {
        left: raw.slice(0, start),
        right: raw.slice(start + blank[0].length),
    };
}
44
/**
 * Tells whether the cloze scorer applies: single-choice mode and a question
 * that contains a blank (two or more underscores, or an ellipsis character).
 *
 * @param {{ mode: string, question: unknown, answer: unknown }} args -
 *   `answer` is accepted but not consulted.
 * @returns {boolean}
 */
function clozeApplicable({ mode, question, answer }) {
    const questionText = String(question ?? "");
    return mode === "single" && /_{2,}|…+/u.test(questionText);
}
52
/**
 * Collects up to 18 distinctive focus tokens for a cloze question.
 *
 * Candidates are the supplied focus tokens followed by the question's own
 * unique tokens. A candidate is kept when it is at least three characters
 * long, is not an answer token, and is in neither `FOCUS_STOPWORDS` nor
 * `CLOZE_GENERIC_FOCUS`. First-seen order is preserved.
 *
 * @param {string} question
 * @param {string[] | null | undefined} focusTokens
 * @param {string[] | null | undefined} answerTokens
 * @returns {string[]}
 */
function clozeFocusTokens(question, focusTokens, answerTokens) {
    const excluded = new Set(answerTokens ?? []);
    const candidates = [...(focusTokens ?? []), ...uniqueTokens(question)];
    // A Set keeps insertion order, so it deduplicates without reordering.
    const kept = new Set();
    for (const candidate of candidates) {
        if (!candidate || candidate.length < 3) {
            continue;
        }
        const isNoise = excluded.has(candidate) || FOCUS_STOPWORDS.has(candidate) || CLOZE_GENERIC_FOCUS.has(candidate);
        if (!isNoise) {
            kept.add(candidate);
        }
    }
    return [...kept].slice(0, 18);
}
67
/**
 * Extracts up to six "core" tokens from the part of the question before the
 * blank.
 *
 * The left part is cut at the first patient/age/dosing connective matched by
 * the split pattern. The remaining tokens are kept when they are at least four
 * characters long, are not stopwords or generic focus words, and are not
 * answer tokens.
 *
 * @param {string} question
 * @param {string[] | null | undefined} answerTokens
 * @returns {string[]}
 */
function clozeCoreTokens(question, answerTokens) {
    const { left } = clozeQuestionParts(question);
    const [subject] = left.split(/\s+(?:у|для|при|с|со|в)\s+пациент|\s+пациентам|\s+пациентов|\s+больным|\s+детям|\s+младше|\s+старше|\s+кажд|\s+принима|\s+провод|\s+составля|\s+равн|\s+в\s+дозе/iu);
    const excluded = new Set(answerTokens ?? []);
    const core = [];
    for (const token of uniqueTokens(subject)) {
        if (!(token.length >= 4)) {
            continue;
        }
        if (FOCUS_STOPWORDS.has(token) || CLOZE_GENERIC_FOCUS.has(token)) {
            continue;
        }
        if (!excluded.has(token)) {
            core.push(token);
        }
    }
    return core.slice(0, 6);
}
75
/**
 * Builds the list of phrases to look for in the source text for one answer.
 *
 * Entries are added in this order, deduplicated by their `normalizeForSearch`
 * form:
 *   1. the first 18 phrases from `answerSearchPhrases` (not aliases);
 *   2. every `expandNumberToken` expansion of each number in the answer;
 *   3. `SMALL_NUMBER_ALIASES` for the number, when the answer has exactly one;
 *   4. the abbreviations "мес" / "нед" when the answer mentions months / weeks.
 *
 * @param {string} answerText
 * @returns {{ phrase: string, alias: boolean, bareNumber: boolean }[]}
 *   `bareNumber` is true when the normalized phrase is only an integer or
 *   decimal number.
 */
function clozeAnswerPhraseEntries(answerText) {
    const collected = [];
    const knownKeys = new Set();
    const register = (value, alias = false) => {
        const key = normalizeForSearch(value);
        if (!key || key.length < 1 || knownKeys.has(key)) {
            return;
        }
        knownKeys.add(key);
        const bareNumber = /^\d+(?:[.,]\d+)?$/u.test(key);
        collected.push({ phrase: String(value), alias, bareNumber });
    };

    for (const phrase of answerSearchPhrases(answerText).slice(0, 18)) {
        register(phrase, false);
    }

    const numbers = extractNumbers(answerText);
    for (const number of numbers) {
        for (const variant of expandNumberToken(number)) {
            register(variant, true);
        }
    }
    if (numbers.length === 1) {
        // Drop a trailing ".0"/",0" so "2.0" finds the aliases stored under "2".
        const aliasKey = numbers[0].replace(/[.,]0+$/u, "");
        for (const alias of SMALL_NUMBER_ALIASES.get(aliasKey) ?? []) {
            register(alias, true);
        }
    }

    const answerNorm = normalizeForSearch(answerText);
    const mentions = (...words) => words.some((word) => containsNormalizedPhrase(answerNorm, normalizeForSearch(word)));
    if (mentions("месяц", "месяцев")) {
        register("мес", true);
    }
    if (mentions("неделя", "недели")) {
        register("нед", true);
    }
    return collected;
}
110
/**
 * Checks whether the local text window or the question contains a
 * measurement-unit fragment (mg, months, days, times, %, °, mm, g/l, years…).
 * Used to decide whether a bare-number answer phrase may count as a hit.
 *
 * @param {string} local - Text window around the candidate hit.
 * @param {string} question
 * @returns {boolean}
 */
function clozeHasUnitCue(local, question) {
    const combined = normalizeForSearch(`${local} ${question}`);
    const unitMatch = /(?:мг|мес|месяц|сут|дн|раз|р |%|°|мм|г\/л|лет|год)/u.exec(combined);
    return unitMatch !== null;
}
114
/**
 * Measures how far the end of `before` is from the start of the last
 * occurrence of any focus token.
 *
 * @param {string} before - Text that precedes a candidate hit.
 * @param {Iterable<string>} focusTokens - Tokens to look for; falsy entries
 *   are skipped.
 * @returns {number} `before.length` minus the largest `lastIndexOf` offset, or
 *   `Infinity` when no token occurs.
 */
function lastTokenDistance(before, focusTokens) {
    const offsets = [...focusTokens]
        .filter((token) => Boolean(token))
        .map((token) => before.lastIndexOf(token));
    const latest = Math.max(-1, ...offsets);
    return latest < 0 ? Number.POSITIVE_INFINITY : before.length - latest;
}
127
/**
 * Scores how strongly the text just before a candidate suggests a different
 * clause or condition.
 *
 * One point per contrast phrase found in `tail`, plus one point when the
 * question has numbers and `tail` contains a number that is not among them.
 * The total is capped at 3.
 *
 * @param {string} tail - Normalized text immediately preceding the hit.
 * @param {string[]} questionNumbers
 * @returns {number} Integer in the range 0 to 3.
 */
function clozeContrastPenalty(tail, questionNumbers) {
    const phraseHits = CLOZE_CONTRAST_PHRASES
        .filter((phrase) => phrase && containsNormalizedPhrase(tail, phrase))
        .length;
    const tailNumbers = extractNumbers(tail);
    const hasForeignNumber = questionNumbers.length && tailNumbers.some((number) => !questionNumbers.includes(number));
    return Math.min(3, phraseHits + (hasForeignNumber ? 1 : 0));
}
139
/**
 * Lists the numbers in the question that look like conditions: a comparison
 * sign or a unit/dose/frequency fragment appears within 24 characters on
 * either side.
 *
 * Each such number (leading `<`/`>` removed) is expanded with
 * `expandNumberToken`; expansions are returned deduplicated, in first-seen
 * order.
 *
 * @param {unknown} question
 * @returns {string[]}
 */
function relevantClozeQuestionNumbers(question) {
    const raw = String(question ?? "");
    const found = new Set();
    const numberPattern = /(?<![\p{L}])([<>]?\d+(?:[.,]\d+)?)(?![\p{L}])/giu;
    for (const match of raw.matchAll(numberPattern)) {
        const start = match.index ?? 0;
        const context = raw
            .slice(Math.max(0, start - 24), start + match[0].length + 24)
            .toLowerCase();
        const hasUnitContext = /[<>]|мг|мм|мес|меся|лет|год|сут|дн|%|°|температур|доз|кажд|раз/u.test(context);
        if (!hasUnitContext) {
            continue;
        }
        const digits = match[1].replace(/^[<>]/u, "");
        for (const variant of expandNumberToken(digits)) {
            found.add(variant);
        }
    }
    return [...found];
}
156
/**
 * Checks that the local window mentions at least one of the question's
 * condition numbers. Passes trivially when the question has none.
 *
 * @param {string} local
 * @param {string[]} relevantNumbers - Expanded question numbers.
 * @returns {boolean}
 */
function clozeLocalHasRelevantQuestionNumber(local, relevantNumbers) {
    if (!relevantNumbers.length) {
        return true;
    }
    const present = new Set(extractNumbers(local).flatMap(expandNumberToken));
    for (const wanted of relevantNumbers) {
        if (present.has(wanted)) {
            return true;
        }
    }
    return false;
}
162
/**
 * Detects a number in `tail` that is not one of the answer's own numbers
 * (after `expandNumberToken` expansion). Returns false when the answer has no
 * numbers at all.
 *
 * @param {string} tail
 * @param {string} answerText
 * @returns {boolean}
 */
function clozeTailHasConflictingNumber(tail, answerText) {
    const allowed = new Set(extractNumbers(answerText).flatMap(expandNumberToken));
    if (!allowed.size) {
        return false;
    }
    const tailNumbers = extractNumbers(tail).flatMap(expandNumberToken);
    for (const candidate of tailNumbers) {
        if (!allowed.has(candidate)) {
            return true;
        }
    }
    return false;
}
170
/**
 * Tells whether `tail` contains the timing word "через" or "после".
 *
 * @param {string} tail - Normalized text.
 * @returns {boolean}
 */
function clozeTailHasTimingCue(tail) {
    const timingCues = ["через", "после"];
    return timingCues.some((cue) => containsNormalizedPhrase(tail, cue));
}
173
/**
 * Finds the best local evidence that `answer` fills the blank of a cloze
 * question.
 *
 * The scorer runs only for single-choice questions with a blank, at least two
 * specific focus tokens, at least one answer phrase, and a count word
 * (`CLOZE_COUNT_RIGHT_TOKENS`) to the right of the blank. It scans line-window
 * segments of pages at or adjacent to the top question pages, looks for answer
 * phrases, and scores each hit by how well the question context precedes and
 * follows it.
 *
 * @param {object} args
 * @param {string} args.mode
 * @param {Iterable<object>} args.pages - Pages; each is read for `page` and
 *   passed to `cachedLineWindowSegments`.
 * @param {Set<number> | undefined} args.topQuestionPages - When non-empty,
 *   limits the scan to these pages and their direct neighbours.
 * @param {string} args.question
 * @param {{ id: unknown, text: string }} args.answer
 * @param {string[]} args.answerTokens
 * @param {string[]} args.focusTokens
 * @returns {object | null} The winning evidence chosen by `betterEvidence`
 *   (kind "cloze_gap_local"), or null when nothing qualifies.
 */
export function bestClozeGapSupport({ mode, pages, topQuestionPages, question, answer, answerTokens, focusTokens }) {
    if (!clozeApplicable({ mode, question, answer })) {
        return null;
    }
    const questionFocus = clozeFocusTokens(question, focusTokens, answerTokens);
    if (questionFocus.length < 2) {
        return null;
    }
    const candidates = clozeAnswerPhraseEntries(answer.text);
    if (!candidates.length) {
        return null;
    }
    const { right: rightOfBlank } = clozeQuestionParts(question);
    const rightTokens = clozeFocusTokens(rightOfBlank, uniqueTokens(rightOfBlank), answerTokens);
    const rightHasCountWord = rightTokens.some((token) => CLOZE_COUNT_RIGHT_TOKENS.has(token));
    if (!rightHasCountWord) {
        return null;
    }
    // NOTE(review): clozeApplicable() already requires this same pattern to
    // match, so hasBlank is always true here and the `!hasBlank` guards below
    // never fire. Likewise rightTokens is non-empty after the gate above.
    const hasBlank = /_{2,}|…+/u.test(String(question ?? ""));
    const coreTokens = clozeCoreTokens(question, answerTokens);
    const questionNumbers = extractNumbers(question);
    const relevantQuestionNumbers = relevantClozeQuestionNumbers(question);
    const enforceCore = hasBlank && coreTokens.length >= 2;

    let winner = null;
    for (const page of pages) {
        const pageNo = page.page;
        const nearTopPage = !topQuestionPages?.size
            || topQuestionPages.has(pageNo)
            || topQuestionPages.has(pageNo - 1)
            || topQuestionPages.has(pageNo + 1);
        if (!nearTopPage) {
            continue;
        }
        for (const source of cachedLineWindowSegments(page)) {
            const text = source.normalized;
            // Overlong windows are skipped.
            if (!(text.length <= 760)) {
                continue;
            }
            const sourceTokens = tokenizeNormalized(text);
            const focusHits = tokenHitCount(questionFocus, sourceTokens);
            const focusCoverage = coverage(questionFocus, sourceTokens);
            if (focusHits < 2 && focusCoverage < 0.24) {
                continue;
            }
            for (const entry of candidates) {
                // NOTE(review): offsets come from normalized text but the span is
                // the raw phrase length — confirm the two always agree.
                const span = entry.phrase.length;
                const hits = findPhraseOccurrences(text, entry.phrase, { textIsNormalized: true });
                for (const hit of hits) {
                    const hitEnd = hit + span;
                    const local = text.slice(Math.max(0, hit - 80), hitEnd + 90);
                    if (entry.bareNumber && !clozeHasUnitCue(local, question)) {
                        continue;
                    }
                    const numberWindow = text.slice(Math.max(0, hit - 220), hitEnd + 140);
                    if (!clozeLocalHasRelevantQuestionNumber(numberWindow, relevantQuestionNumbers)) {
                        continue;
                    }
                    const before = text.slice(Math.max(0, hit - 300), hit);
                    const after = text.slice(hitEnd, hitEnd + 180);
                    const beforeTokens = tokenizeNormalized(before);
                    if (enforceCore) {
                        // The core subject must be covered shortly before the hit...
                        const recentCoreCoverage = coverage(coreTokens, tokenizeNormalized(before.slice(-180)));
                        const overallCoreCoverage = coverage(coreTokens, beforeTokens);
                        if (recentCoreCoverage < 0.45 && overallCoreCoverage < 0.75) {
                            continue;
                        }
                        // ...and its last mention must be within 110 characters.
                        if (lastTokenDistance(before, coreTokens) > 110) {
                            continue;
                        }
                    }
                    const beforeFocusHits = tokenHitCount(questionFocus, beforeTokens);
                    const beforeCoverage = coverage(questionFocus, beforeTokens);
                    if (beforeFocusHits < 2 && beforeCoverage < 0.18) {
                        continue;
                    }
                    const distance = lastTokenDistance(before, questionFocus);
                    if (!Number.isFinite(distance) || distance > 220) {
                        continue;
                    }
                    // Text from just before the last focus token up to the hit,
                    // limited to 140 characters.
                    const tailLength = Math.min(140, distance + 28);
                    const tail = before.slice(Math.max(0, before.length - tailLength));
                    const contrastPenalty = clozeContrastPenalty(tail, questionNumbers);
                    if (!hasBlank && ((entry.bareNumber && clozeTailHasTimingCue(tail)) || clozeTailHasConflictingNumber(tail, answer.text))) {
                        continue;
                    }
                    if (contrastPenalty >= 2 && !rightTokens.length) {
                        continue;
                    }
                    const rightCoverage = rightTokens.length ? coverage(rightTokens, tokenizeNormalized(after)) : 0;
                    const numeric = numberCoverage(answer.text, local);
                    const score = 12.1 +
                        Math.min(6, focusHits) * 0.65 +
                        Math.min(6, beforeFocusHits) * 0.85 +
                        Math.min(0.7, beforeCoverage) * 4.0 +
                        proximityBonus(distance, 180) * 6.0 +
                        Math.min(0.75, rightCoverage) * 4.0 +
                        (entry.alias ? 1.4 : 0) +
                        numeric * 1.1 -
                        contrastPenalty * 5.2;
                    if (score < 10.8) {
                        continue;
                    }
                    winner = betterEvidence(winner, {
                        answerId: answer.id,
                        page: pageNo,
                        text: source.text,
                        score,
                        kind: "cloze_gap_local",
                    });
                }
            }
        }
    }
    return winner;
}
263
/**
 * Classifies text into a severity family by the first matching word stem,
 * checked in the order heavy → moderate → mild.
 *
 * @param {string} text
 * @returns {"heavy" | "moderate" | "mild" | null}
 */
function conditionFamily(text) {
    const normalized = normalizeForSearch(text);
    const familyStems = [
        ["heavy", ["тяжел"]],
        ["moderate", ["умерен", "средн"]],
        ["mild", ["легк"]],
    ];
    for (const [family, stems] of familyStems) {
        if (stems.some((stem) => containsNormalizedPhrase(normalized, stem))) {
            return family;
        }
    }
    return null;
}
273
/**
 * Returns the severity family whose cue stem appears earliest in the text.
 * On equal offsets the cue listed first wins.
 *
 * @param {string} normalizedText - Already-normalized text to scan.
 * @returns {"heavy" | "moderate" | "mild" | null} Null when no cue occurs.
 */
function nearestConditionFamily(normalizedText) {
    const cuePairs = [
        ["heavy", "тяжел"],
        ["moderate", "умерен"],
        ["moderate", "средн"],
        ["mild", "легк"],
    ];
    let winner = null;
    let winnerOffset = Number.POSITIVE_INFINITY;
    for (const [family, stem] of cuePairs) {
        const offset = normalizedText.indexOf(normalizeForSearch(stem));
        if (offset >= 0 && offset < winnerOffset) {
            winner = family;
            winnerOffset = offset;
        }
    }
    return winner;
}
289
/**
 * Parses an answer of the form "<value> для <condition>" into its parts.
 *
 * The value must contain a number or a time/frequency fragment, and the
 * condition must map to a severity family; otherwise null is returned.
 *
 * @param {string} answerText
 * @returns {{ value: string, condition: string, family: string } | null}
 */
function answerValueCondition(answerText) {
    const parsed = normalizeText(answerText).match(/^(.{2,90}?)\s+для\s+(.{3,120})$/u);
    if (!parsed) {
        return null;
    }
    const value = parsed[1].trim();
    const condition = parsed[2].trim();
    const looksLikeValue = extractNumbers(value).length || /(год|месяц|дн|сут|раз)/u.test(value);
    if (!looksLikeValue) {
        return null;
    }
    const family = conditionFamily(condition);
    return family ? { value, condition, family } : null;
}
303
/**
 * Adjusts an answer's score when it pairs a value with a severity condition
 * ("<value> для <condition>").
 *
 * Each occurrence of the value on the top question pages is checked for the
 * nearest severity cue within the following 120 characters. A cue of the
 * answer's family counts as a match, any other cue as a mismatch.
 *
 * @param {object} args
 * @param {Iterable<object>} args.pages - Pages read for `page`, `normalized`
 *   and `text`.
 * @param {Set<number> | undefined} args.topQuestionPages - When non-empty,
 *   only these exact pages are scanned.
 * @param {{ id: unknown, text: string }} args.answer
 * @returns {{ adjustment: number, evidence: object | null }} +4.6 with the
 *   best match, else -2.4 with the best mismatch, else 0 with no evidence.
 */
export function conditionPairAdjustment({ pages, topQuestionPages, answer }) {
    const pair = answerValueCondition(answer.text);
    if (!pair) {
        return { adjustment: 0, evidence: null };
    }
    let bestMatch = null;
    let bestMismatch = null;
    const valuePhrases = answerSearchPhrases(pair.value).slice(0, 8);
    for (const page of pages) {
        const onTopPage = !topQuestionPages?.size || topQuestionPages.has(page.page);
        if (!onTopPage) {
            continue;
        }
        const pageText = page.normalized;
        for (const phrase of valuePhrases) {
            const phraseNorm = normalizeForSearch(phrase);
            if (!phraseNorm || phraseNorm.length < 3) {
                continue;
            }
            for (const hit of findPhraseOccurrences(pageText, phrase, { textIsNormalized: true })) {
                const valueEnd = hit + phraseNorm.length;
                const after = pageText.slice(valueEnd, valueEnd + 120);
                const actual = nearestConditionFamily(after);
                if (!actual) {
                    continue;
                }
                const local = pageText.slice(Math.max(0, hit - 80), valueEnd + 160);
                const agrees = actual === pair.family;
                const evidence = {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, pair.value, pair.condition),
                    score: agrees ? 8.8 : 2.4,
                    kind: agrees ? "condition_pair_match" : "condition_pair_mismatch",
                };
                if (agrees) {
                    // Offset of the condition's first five characters after the value.
                    const proximity = after.indexOf(normalizeForSearch(pair.condition).slice(0, 5));
                    const boosted = { ...evidence, score: evidence.score + proximityBonus(proximity, 120) };
                    bestMatch = betterEvidence(bestMatch, boosted);
                }
                else if (local) {
                    bestMismatch = betterEvidence(bestMismatch, evidence);
                }
            }
        }
    }
    if (bestMatch) {
        return { adjustment: 4.6, evidence: bestMatch };
    }
    if (bestMismatch) {
        return { adjustment: -2.4, evidence: bestMismatch };
    }
    return { adjustment: 0, evidence: null };
}
347
/**
 * Unit words that mark an answer option as a numeric regimen (dose, duration,
 * frequency). Each word is passed through `uniqueTokens`, so the set holds the
 * token forms that helper produces.
 */
const NUMERIC_OPTION_UNIT_TOKENS = new Set();
for (const unitWord of [
    "\u043c\u0433", // мг
    "\u043c\u043a\u0433", // мкг
    "\u043c\u043b", // мл
    "\u043c\u0435", // ме
    "\u043a\u0433", // кг
    "\u0434\u0435\u043d\u044c", // день
    "\u0434\u043d\u044f", // дня
    "\u0434\u043d\u0435\u0439", // дней
    "\u0441\u0443\u0442\u043a\u0438", // сутки
    "\u0441\u0443\u0442\u043e\u043a", // суток
    "\u043d\u0435\u0434\u0435\u043b\u044e", // неделю
    "\u043d\u0435\u0434\u0435\u043b\u0438", // недели
    "\u043c\u0435\u0441\u044f\u0446", // месяц
    "\u043c\u0435\u0441\u044f\u0446\u0430", // месяца
    "\u043c\u0435\u0441\u044f\u0446\u0435\u0432", // месяцев
    "\u0433\u043e\u0434", // год
    "\u0433\u043e\u0434\u0430", // года
    "\u043b\u0435\u0442", // лет
    "\u0440\u0430\u0437", // раз
    "\u0447\u0430\u0441", // час
    "\u0447", // ч
]) {
    for (const token of uniqueTokens(unitWord)) {
        NUMERIC_OPTION_UNIT_TOKENS.add(token);
    }
}
370
/**
 * Tells whether an answer is a numeric option: it contains a number and either
 * a percent sign or one of the `NUMERIC_OPTION_UNIT_TOKENS`.
 *
 * @param {string} answerText
 * @returns {boolean}
 */
function numericOptionAnswer(answerText) {
    if (!extractNumbers(answerText).length) {
        return false;
    }
    if (normalizeForSearch(answerText).includes("%")) {
        return true;
    }
    const unitHits = tokenHitCount(Array.from(NUMERIC_OPTION_UNIT_TOKENS), tokenize(answerText));
    return unitHits > 0;
}
376
/**
 * Tells whether a single-choice question has at least two numeric options.
 *
 * @param {string} mode
 * @param {{ text: string }[]} answers
 * @returns {boolean}
 */
function denseNumericSingleQuestion(mode, answers) {
    if (mode !== "single") {
        return false;
    }
    const numericOptions = answers.filter((option) => numericOptionAnswer(option.text));
    return numericOptions.length >= 2;
}
379
/**
 * Tells whether the question asks about a recommendation, prescription, dose,
 * duration or frequency, judged by word stems in the normalized question.
 *
 * @param {string} question
 * @returns {boolean}
 */
function exactNumericOptionQuestion(question) {
    const normalized = normalizeForSearch(question);
    const stems = [
        "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434", // рекоменд
        "\u043d\u0430\u0437\u043d\u0430\u0447", // назнач
        "\u0434\u043e\u0437", // доз
        "\u0432 \u0442\u0435\u0447\u0435\u043d", // в течен
        "\u0440\u0430\u0437 \u0432", // раз в
        "\u043a\u0430\u0436\u0434", // кажд
        "\u043f\u0440\u043e\u0432\u043e\u0434", // провод
    ];
    return stems.some((stem) => containsNormalizedPhrase(normalized, stem));
}
389
/**
 * Produces normalized variants of a numeric answer to look up verbatim: the
 * normalized answer, the same without parenthesised parts, and the raw answer
 * with hyphens/dashes replaced by spaces. Duplicates are dropped; variants
 * shorter than five characters or without a number are discarded.
 *
 * @param {unknown} answerText
 * @returns {string[]}
 */
function numericExactPhrases(answerText) {
    const normalized = normalizeForSearch(answerText);
    const variants = new Set([
        normalized,
        normalizeForSearch(normalized.replace(/\([^)]*\)/g, " ")),
        normalizeForSearch(String(answerText ?? "").replace(/\s*[-\u2010-\u2015]\s*/g, " ")),
    ]);
    const usable = [];
    for (const variant of variants) {
        if (variant.length >= 5 && extractNumbers(variant).length) {
            usable.push(variant);
        }
    }
    return usable;
}
396
/**
 * Builds abbreviated hour phrases ("<n> ч", "<n> ч.") for an answer that
 * contains a number and a standalone hour word. Phrases whose normalized form
 * equals the normalized answer are left out.
 *
 * @param {string} answerText
 * @returns {string[]} Empty when the answer has no number or no hour word.
 */
function hourAliasPhrases(answerText) {
    const raw = normalizeText(answerText);
    const numbers = extractNumbers(answerText);
    if (!numbers.length || !/(?:^|\s)(?:\u0447|\u0447\.|\u0447\u0430\u0441|\u0447\u0430\u0441\u0430|\u0447\u0430\u0441\u043e\u0432)(?:\s|$)/u.test(raw)) {
        return [];
    }
    const aliases = new Set(numbers.flatMap((number) => [`${number} \u0447`, `${number} \u0447.`]));
    const answerNorm = normalizeForSearch(answerText);
    const distinct = [];
    for (const alias of aliases) {
        if (normalizeForSearch(alias) !== answerNorm) {
            distinct.push(alias);
        }
    }
    return distinct;
}
409
/**
 * Checks whether a normalized segment contains the phrase at a position that
 * `hasSearchBoundaries` accepts.
 *
 * @param {string} normalizedSegment
 * @param {string} phrase - Normalized here before searching.
 * @returns {boolean} False when the phrase normalizes to an empty string.
 */
function segmentContainsBoundedPhrase(normalizedSegment, phrase) {
    const needle = normalizeForSearch(phrase);
    if (!needle) {
        return false;
    }
    const offsets = findPhraseOccurrences(normalizedSegment, needle, { textIsNormalized: true });
    for (const offset of offsets) {
        if (hasSearchBoundaries(normalizedSegment, offset, needle.length)) {
            return true;
        }
    }
    return false;
}
415
/**
 * Supports single-choice questions with a dense family of numeric options.
 *
 * When several options differ only in dose, duration, frequency or percentage,
 * the full numeric regimen found in a relevant line must weigh more than a
 * generic chunk where several alternative values may occur side by side.
 *
 * @param {object} args
 * @param {string} args.mode
 * @param {Iterable<object>} args.pages
 * @param {Set<number> | undefined} args.topQuestionPages - When non-empty,
 *   limits the scan to these pages and their direct neighbours.
 * @param {string} args.question
 * @param {{ id: unknown, text: string }} args.answer
 * @param {{ text: string }[]} args.answers - All options of the question.
 * @param {string[]} args.answerTokens
 * @param {string[]} args.questionTokens
 * @param {string[] | undefined} args.focusTokens - Falls back to
 *   `questionTokens` when empty.
 * @returns {object | null} Best evidence of kind
 *   "exact_numeric_option_segment", or null.
 */
export function bestExactNumericOptionSupport({ mode, pages, topQuestionPages, question, answer, answers, answerTokens, questionTokens, focusTokens }) {
    const eligible = denseNumericSingleQuestion(mode, answers) && numericOptionAnswer(answer.text);
    if (!eligible) {
        return null;
    }
    if (!exactNumericOptionQuestion(question)) {
        return null;
    }
    const phrases = numericExactPhrases(answer.text).slice(0, 12);
    if (!phrases.length) {
        return null;
    }
    const focusSource = focusTokens?.length ? focusTokens : questionTokens;
    const usefulFocus = focusSource.filter((token) => token.length >= 4 && !FOCUS_STOPWORDS.has(token));
    const requiredFocusHits = Math.min(2, usefulFocus.length);

    let winner = null;
    for (const page of pages) {
        const pageNo = page.page;
        const nearTopPage = !topQuestionPages?.size
            || topQuestionPages.has(pageNo)
            || topQuestionPages.has(pageNo - 1)
            || topQuestionPages.has(pageNo + 1);
        if (!nearTopPage) {
            continue;
        }
        for (const segment of cachedLineWindowSegments(page)) {
            const hasPhrase = phrases.some((phrase) => containsNormalizedPhrase(segment.normalized, phrase));
            if (!hasPhrase) {
                continue;
            }
            const numericCoverage = numberCoverage(answer.text, segment.normalized);
            const focusHits = tokenHitCount(usefulFocus, segment.tokens);
            const questionCoverage = coverage(questionTokens, segment.tokens);
            // The segment must relate to the question by coverage or focus hits.
            if (questionCoverage < 0.14 && focusHits < requiredFocusHits) {
                continue;
            }
            const answerCoverage = strictSoftCoverage(answerTokens, segment.tokens);
            const score = 12.8 +
                4.2 +
                numericCoverage * 3.0 +
                answerCoverage * 2.8 +
                Math.min(0.52, questionCoverage) * 5.6 +
                Math.min(2, focusHits) * 0.8;
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: pageNo,
                text: segment.text,
                score,
                kind: "exact_numeric_option_segment",
            });
        }
    }
    return winner;
}
463
/**
 * Narrowly supports time options where the PDF uses an abbreviation (`6 ч`)
 * while the answer option is spelled out (`6 часов`). This is a separate layer
 * so that the general numeric scorer is not widened and neighbouring
 * dosages/durations are not boosted.
 *
 * @param {object} args
 * @param {string} args.mode
 * @param {Iterable<object>} args.pages
 * @param {Set<number> | undefined} args.topQuestionPages - When non-empty,
 *   limits the scan to these pages and their direct neighbours.
 * @param {string} args.question
 * @param {{ id: unknown, text: string }} args.answer
 * @param {{ text: string }[]} args.answers - All options of the question.
 * @param {string[]} args.answerTokens
 * @param {string[]} args.questionTokens
 * @param {string[] | undefined} args.focusTokens - Falls back to
 *   `questionTokens` when empty.
 * @returns {object | null} Best evidence of kind "exact_hour_alias_segment",
 *   or null.
 */
export function bestExactHourAliasOptionSupport({ mode, pages, topQuestionPages, question, answer, answers, answerTokens, questionTokens, focusTokens }) {
    if (mode !== "single") {
        return null;
    }
    const numberedOptions = answers.filter((candidate) => extractNumbers(candidate.text).length > 0);
    if (numberedOptions.length < 2) {
        return null;
    }
    if (!exactNumericOptionQuestion(question)) {
        return null;
    }
    const phrases = hourAliasPhrases(answer.text);
    if (!phrases.length) {
        return null;
    }
    // Question numbers that are not answer numbers act as conditions the
    // segment must also mention.
    const answerNumbers = new Set(extractNumbers(answer.text));
    const questionConditionNumbers = extractNumbers(question).filter((number) => !answerNumbers.has(number));
    const focusSource = focusTokens?.length ? focusTokens : questionTokens;
    const usefulFocus = focusSource.filter((token) => token.length >= 4 && !FOCUS_STOPWORDS.has(token));
    const requiredFocusHits = Math.min(2, usefulFocus.length);

    let winner = null;
    for (const page of pages) {
        const pageNo = page.page;
        const nearTopPage = !topQuestionPages?.size
            || topQuestionPages.has(pageNo)
            || topQuestionPages.has(pageNo - 1)
            || topQuestionPages.has(pageNo + 1);
        if (!nearTopPage) {
            continue;
        }
        for (const segment of cachedLineWindowSegments(page)) {
            const hasAlias = phrases.some((phrase) => segmentContainsBoundedPhrase(segment.normalized, phrase));
            if (!hasAlias) {
                continue;
            }
            const missesCondition = questionConditionNumbers.length
                && !questionConditionNumbers.some((number) => containsNormalizedPhrase(segment.normalized, number));
            if (missesCondition) {
                continue;
            }
            const focusHits = tokenHitCount(usefulFocus, segment.tokens);
            const questionCoverage = coverage(questionTokens, segment.tokens);
            if (questionCoverage < 0.14 && focusHits < requiredFocusHits) {
                continue;
            }
            const answerCoverage = strictSoftCoverage(answerTokens, segment.tokens);
            const numericCoverage = numberCoverage(answer.text, segment.normalized);
            const score = 15.2 +
                numericCoverage * 3.2 +
                answerCoverage * 2.2 +
                Math.min(0.52, questionCoverage) * 5.2 +
                Math.min(2, focusHits) * 0.9;
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: pageNo,
                text: segment.text,
                score,
                kind: "exact_hour_alias_segment",
            });
        }
    }
    return winner;
}
512
/**
 * Checks that a local text window satisfies every condition cue the question
 * states, and that the question states at least one.
 *
 * Cues checked, in order: HBeAg status (negative/positive), cirrhosis
 * (without/with), severity family, and question numbers that are not answer
 * numbers. Any stated cue that the window fails yields false.
 *
 * @param {string} local - Normalized text window around a candidate hit.
 * @param {string} question
 * @param {{ text: string }} answer
 * @returns {boolean} True only when the question has a cue and all cues hold.
 */
function conditionNumberCueHit(local, question, answer) {
    const questionNorm = normalizeForSearch(question);
    const asks = (stem) => containsNormalizedPhrase(questionNorm, stem);
    const localHas = (stem) => containsNormalizedPhrase(local, stem);
    let hasCue = false;

    if (asks("hbeag")) {
        const nearestStatus = nearestCueName(local, [
            ["negative", ["\u043e\u0442\u0440\u0438\u0446"]], // отриц
            ["positive", ["\u043f\u043e\u043b\u043e\u0436"]], // полож
        ]);
        let expectedStatus = null;
        if (asks("\u043e\u0442\u0440\u0438\u0446")) {
            expectedStatus = "negative";
        }
        else if (asks("\u043f\u043e\u043b\u043e\u0436")) {
            expectedStatus = "positive";
        }
        if (expectedStatus !== null) {
            hasCue = true;
            if (!(localHas("hbeag") && nearestStatus === expectedStatus)) {
                return false;
            }
        }
    }

    const withoutCirrhosis = "\u0431\u0435\u0437 \u0446\u0438\u0440\u0440\u043e\u0437"; // без цирроз
    if (asks(withoutCirrhosis)) {
        hasCue = true;
        if (!localHas(withoutCirrhosis)) {
            return false;
        }
    }
    else if (asks("\u043f\u0440\u0438 \u0446\u0438\u0440\u0440\u043e\u0437") || asks("\u0441 \u0446\u0438\u0440\u0440\u043e\u0437")) { // при цирроз / с цирроз
        hasCue = true;
        if (localHas(withoutCirrhosis) || !localHas("\u0446\u0438\u0440\u0440\u043e\u0437")) {
            return false;
        }
    }

    const family = conditionFamily(question);
    if (family) {
        const nearestFamily = nearestCueName(local, [
            ["heavy", ["\u0442\u044f\u0436\u0435\u043b"]], // тяжел
            ["moderate", ["\u0443\u043c\u0435\u0440\u0435\u043d", "\u0441\u0440\u0435\u0434\u043d"]], // умерен, средн
            ["mild", ["\u043b\u0435\u0433\u043a"]], // легк
        ]);
        hasCue = true;
        if (nearestFamily !== family) {
            return false;
        }
    }

    const answerNumbers = new Set(extractNumbers(answer.text).flatMap(expandNumberToken));
    const conditionNumbers = extractNumbers(question)
        .flatMap(expandNumberToken)
        .filter((number) => !answerNumbers.has(number));
    if (conditionNumbers.length) {
        hasCue = true;
        const mentionsAny = conditionNumbers.some((number) => tokenBoundaryIncludes(local, number));
        if (!mentionsAny) {
            return false;
        }
    }
    return hasCue;
}
563
/**
 * Collects up to 20 search phrases for a numeric or frequency answer: the
 * general answer phrases, the frequency phrases and, for percentage answers,
 * the bare expanded numbers.
 *
 * @param {string} answerText
 * @returns {string[]} Phrases whose normalized form is non-empty.
 */
function conditionNumberAnswerPhrases(answerText) {
    const phrases = new Set([
        ...answerSearchPhrases(answerText),
        ...frequencySearchPhrases(answerText),
    ]);
    if (/%/.test(answerText)) {
        const plainNumbers = extractNumbers(answerText)
            .flatMap(expandNumberToken)
            .filter((number) => /^\d+(?:\.\d+)?$/.test(number));
        for (const number of plainNumbers) {
            phrases.add(number);
        }
    }
    const searchable = [];
    for (const phrase of phrases) {
        if (normalizeForSearch(phrase).length >= 1) {
            searchable.push(phrase);
        }
    }
    return searchable.slice(0, 20);
}
573
/**
 * Focus tokens that are too generic, or already covered by the condition
 * cues, to count as specific evidence in `bestConditionNumberSupport`. Each
 * word is passed through `uniqueTokens`.
 */
const CONDITION_NUMBER_GENERIC_FOCUS = new Set();
for (const genericWord of [
    "\u0440\u0438\u0441\u043a", // риск
    "\u0441\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442", // составляет
    "\u0440\u0435\u043a\u043e\u043c\u0435\u043d\u0434", // рекоменд
    "\u043f\u0430\u0446\u0438\u0435\u043d\u0442", // пациент
    "\u043c\u0430\u0442\u0435\u0440\u0435\u0439", // матерей
    "\u043f\u043e\u043b\u043e\u0436\u0438\u0442\u0435\u043b\u044c\u043d", // положительн
    "\u043e\u0442\u0440\u0438\u0446\u0430\u0442\u0435\u043b\u044c\u043d", // отрицательн
    "\u0442\u044f\u0436\u0435\u043b\u044b\u043c", // тяжелым
    "\u0442\u044f\u0436\u0435\u043b\u043e\u043c", // тяжелом
    "\u0441\u0440\u0435\u0434\u043d\u0435\u0439", // средней
    "\u0446\u0438\u0440\u0440\u043e\u0437", // цирроз
    "hbeag",
]) {
    for (const token of uniqueTokens(genericWord)) {
        CONDITION_NUMBER_GENERIC_FOCUS.add(token);
    }
}
587
/**
 * Keeps focus tokens that are at least four characters long, do not start
 * with a digit and are not in `CONDITION_NUMBER_GENERIC_FOCUS`.
 *
 * @param {string[] | null | undefined} focusTokens
 * @returns {string[]}
 */
function specificConditionNumberFocusTokens(focusTokens) {
    const specific = [];
    for (const token of focusTokens ?? []) {
        if (!(token.length >= 4)) {
            continue;
        }
        if (/^\d/.test(token) || CONDITION_NUMBER_GENERIC_FOCUS.has(token)) {
            continue;
        }
        specific.push(token);
    }
    return specific;
}
590
/**
 * Finds the best evidence for a numeric or frequency answer whose surrounding
 * text satisfies the question's condition cues (see `conditionNumberCueHit`).
 *
 * Every page is scanned for each answer phrase. A hit counts when its local
 * window (160 characters before, 180 after) passes the cue check and, if there
 * are at least two specific focus tokens, contains at least two of them.
 *
 * @param {object} args
 * @param {string} args.mode - Only "single" is supported.
 * @param {Iterable<object>} args.pages - Pages read for `page`, `normalized`
 *   and `text`.
 * @param {string} args.question
 * @param {{ id: unknown, text: string }} args.answer
 * @param {string[]} args.answerTokens
 * @param {string[] | undefined} args.focusTokens
 * @returns {object | null} Best evidence of kind "condition_number_segment",
 *   or null.
 */
function bestConditionNumberSupport({ mode, pages, question, answer, answerTokens, focusTokens }) {
    if (mode !== "single") {
        return null;
    }
    const isNumericOrFrequency = extractNumbers(answer.text).length || frequencyAnswer(answer.text);
    if (!isNumericOrFrequency) {
        return null;
    }
    const phrases = conditionNumberAnswerPhrases(answer.text);
    if (!phrases.length) {
        return null;
    }
    const specificTokens = specificConditionNumberFocusTokens(focusTokens);
    const answerIsPercent = /%/.test(answer.text);

    let winner = null;
    for (const page of pages) {
        const pageText = page.normalized;
        for (const phrase of phrases) {
            const phraseNorm = normalizeForSearch(phrase);
            if (!phraseNorm) {
                continue;
            }
            // For percentage answers a bare number only counts when a "%" follows
            // within 14 characters of the hit start.
            const needsPercentNearby = answerIsPercent && /^\d+(?:\.\d+)?$/.test(phraseNorm);
            for (const hit of findPhraseOccurrences(pageText, phrase, { textIsNormalized: true })) {
                if (needsPercentNearby && !pageText.slice(hit, hit + 14).includes("%")) {
                    continue;
                }
                const local = pageText.slice(Math.max(0, hit - 160), hit + phraseNorm.length + 180);
                if (!conditionNumberCueHit(local, question, answer)) {
                    continue;
                }
                const localTokens = tokenizeNormalized(local);
                const focusHits = tokenHitCount(specificTokens, localTokens);
                if (specificTokens.length >= 2 && focusHits < 2) {
                    continue;
                }
                const score = 12.6 +
                    strictSoftCoverage(answerTokens, localTokens) * 2.2 +
                    numberCoverage(answer.text, local) * 3.4 +
                    focusHits * 1.1;
                winner = betterEvidence(winner, {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, phrase, question),
                    score,
                    kind: "condition_number_segment",
                });
            }
        }
    }
    return winner;
}
632
/**
 * Derives marker conditions from the question text.
 *
 * When the normalized question mentions "hbeag", a negative and/or positive
 * HBeAg condition is emitted depending on which status fragment is present.
 * A cirrhosis condition is emitted as "without" when the "without cirrhosis"
 * fragment is present, otherwise as "with" when either of the two
 * "with cirrhosis" fragments is present.
 *
 * @param {string} question
 * @returns {Array<{type: string, value: string}>} the conditions found, possibly empty.
 */
function questionMarkerConditions(question) {
    const haystack = normalizeForSearch(question);
    const mentions = (fragment) => containsNormalizedPhrase(haystack, fragment);
    const found = [];
    if (mentions("hbeag")) {
        if (mentions("\u043e\u0442\u0440\u0438\u0446"))
            found.push({ type: "hbeag", value: "negative" });
        if (mentions("\u043f\u043e\u043b\u043e\u0436"))
            found.push({ type: "hbeag", value: "positive" });
    }
    if (mentions("\u0431\u0435\u0437 \u0446\u0438\u0440\u0440\u043e\u0437")) {
        found.push({ type: "cirrhosis", value: "without" });
        return found;
    }
    if (mentions("\u043f\u0440\u0438 \u0446\u0438\u0440\u0440\u043e\u0437") || mentions("\u0441 \u0446\u0438\u0440\u0440\u043e\u0437")) {
        found.push({ type: "cirrhosis", value: "with" });
    }
    return found;
}
649
/**
 * Checks that a local text window satisfies every marker condition.
 *
 * - "hbeag": the window must contain "hbeag" and the status returned by
 *   nearestCueName for the negative/positive fragments must equal the
 *   condition's value.
 * - "cirrhosis" / "without": the window must contain the "without cirrhosis" fragment.
 * - "cirrhosis" / any other value: the window must contain the cirrhosis
 *   fragment and must not contain the "without cirrhosis" fragment.
 * Conditions of any other type are ignored.
 *
 * @param {string} local - normalized text window.
 * @param {Array<{type: string, value: string}>} conditions
 * @returns {boolean} true when no condition is violated.
 */
function markerConditionsMatch(local, conditions) {
    const hbeagHolds = (expected) => {
        const nearestStatus = nearestCueName(local, [
            ["negative", ["\u043e\u0442\u0440\u0438\u0446"]],
            ["positive", ["\u043f\u043e\u043b\u043e\u0436"]],
        ]);
        return containsNormalizedPhrase(local, "hbeag") && nearestStatus === expected;
    };
    const cirrhosisHolds = (expected) => {
        if (expected === "without")
            return containsNormalizedPhrase(local, "\u0431\u0435\u0437 \u0446\u0438\u0440\u0440\u043e\u0437");
        return containsNormalizedPhrase(local, "\u0446\u0438\u0440\u0440\u043e\u0437") &&
            !containsNormalizedPhrase(local, "\u0431\u0435\u0437 \u0446\u0438\u0440\u0440\u043e\u0437");
    };
    for (const { type, value } of conditions) {
        switch (type) {
            case "hbeag":
                if (!hbeagHolds(value))
                    return false;
                break;
            case "cirrhosis":
                if (!cirrhosisHolds(value))
                    return false;
                break;
            default:
                break;
        }
    }
    return true;
}
671
/**
 * Builds the list of normalized search phrases for a numeric answer.
 *
 * For each number extracted from the answer the raw number, every form
 * returned by expandNumberToken, and the number with its first "%" removed
 * are collected; the frequency search phrases are appended afterwards.
 * Duplicates are dropped before normalization, empty normalized phrases are
 * discarded, and at most 18 phrases are returned.
 *
 * @param {string} answerText
 * @returns {string[]} up to 18 normalized phrases in collection order.
 */
function conditionedNumberPhrases(answerText) {
    const collected = new Set();
    for (const value of extractNumbers(answerText)) {
        collected.add(value);
        for (const variant of expandNumberToken(value))
            collected.add(variant);
        const bare = String(value).replace("%", "");
        if (bare)
            collected.add(bare);
    }
    for (const frequencyPhrase of frequencySearchPhrases(answerText))
        collected.add(frequencyPhrase);
    const normalizedPhrases = [];
    for (const entry of collected) {
        const searchable = normalizeForSearch(entry);
        if (searchable.length >= 1)
            normalizedPhrases.push(searchable);
    }
    return normalizedPhrases.slice(0, 18);
}
685
/**
 * Lists the exact textual forms under which each number in `text` may appear.
 *
 * For every extracted number the normalized form is kept together with the
 * form that has a trailing ".0…0" removed and, for values containing a dot,
 * the form with trailing zeros (and a then-dangling dot) stripped.
 *
 * @param {string} text
 * @returns {string[]} distinct non-empty forms in discovery order.
 */
function exactNumericForms(text) {
    const seen = new Set();
    for (const rawNumber of extractNumbers(text)) {
        const base = normalizeForSearch(rawNumber);
        if (!base)
            continue;
        const withoutZeroFraction = base.replace(/\.0+$/u, "");
        seen.add(base);
        seen.add(withoutZeroFraction);
        if (base.includes(".")) {
            const trimmedZeros = base.replace(/0+$/u, "");
            seen.add(trimmedZeros.replace(/\.$/u, ""));
        }
    }
    return Array.from(seen).filter(Boolean);
}
698
/**
 * Decides whether a numeric match at `hit` stands on its own in the text.
 *
 * The match is rejected when the character directly before or after it is a
 * Latin/Cyrillic letter, a digit, or one of "%", ".", "+", "/", and also
 * when it is joined to another digit through a single "-" on either side.
 *
 * @param {string} normalizedText
 * @param {number} hit - start index of the match.
 * @param {number} length - length of the match.
 * @returns {boolean} true when the match has clean boundaries.
 */
function numericSearchBoundary(normalizedText, hit, length) {
    // Character at position `i`, or "" when `i` is outside the text.
    const charAtOrEmpty = (i) => (i >= 0 && i < normalizedText.length ? normalizedText[i] : "");
    const tokenChar = /[a-zа-я0-9%.+/]/iu;
    const digit = /\d/u;
    const end = hit + length;
    const prev = charAtOrEmpty(hit - 1);
    const next = charAtOrEmpty(end);
    const prevPrev = charAtOrEmpty(hit - 2);
    const nextNext = charAtOrEmpty(end + 1);
    const gluedBefore = Boolean(prev) && tokenChar.test(prev);
    const gluedAfter = Boolean(next) && tokenChar.test(next);
    if (gluedBefore || gluedAfter)
        return false;
    const rangeBefore = prev === "-" && digit.test(prevPrev);
    const rangeAfter = next === "-" && digit.test(nextNext);
    return !(rangeBefore || rangeAfter);
}
714
/**
 * Finds occurrences of a numeric form that pass numericSearchBoundary.
 *
 * Scanning is non-overlapping (the cursor jumps past each occurrence) and
 * stops as soon as more than 80 hits have been collected.
 *
 * @param {string} normalizedText
 * @param {string} form - numeric form to look for; an empty form yields no hits.
 * @returns {Array<{index: number, length: number}>}
 */
function findNumericFormHits(normalizedText, form) {
    const found = [];
    if (!form)
        return found;
    const stride = Math.max(1, form.length);
    for (let cursor = 0; cursor < normalizedText.length;) {
        const at = normalizedText.indexOf(form, cursor);
        if (at < 0)
            break;
        cursor = at + stride;
        if (numericSearchBoundary(normalizedText, at, form.length))
            found.push({ index: at, length: form.length });
        if (found.length > 80)
            break;
    }
    return found;
}
731
/**
 * Locates the places in the text where an anchor's condition occurs.
 *
 * An anchor with a `pattern` is matched through String.prototype.matchAll;
 * otherwise each entry of `anchor.phrases` is searched literally and kept
 * only when hasSearchBoundaries accepts it. Each scan stops once more than
 * 80 hits have accumulated.
 *
 * @param {string} normalizedText
 * @param {{pattern?: RegExp, phrases?: string[]}} anchor
 * @returns {Array<{index: number, length: number}>}
 */
function sourceConditionHits(normalizedText, anchor) {
    const found = [];
    const record = (index, length) => {
        found.push({ index, length });
    };
    if (anchor.pattern) {
        for (const occurrence of normalizedText.matchAll(anchor.pattern)) {
            record(occurrence.index ?? 0, occurrence[0].length);
            if (found.length > 80)
                break;
        }
        return found;
    }
    for (const phrase of anchor.phrases ?? []) {
        const stride = Math.max(1, phrase.length);
        for (let cursor = 0; cursor < normalizedText.length;) {
            const at = normalizedText.indexOf(phrase, cursor);
            if (at < 0)
                break;
            cursor = at + stride;
            if (hasSearchBoundaries(normalizedText, at, phrase.length))
                record(at, phrase.length);
            if (found.length > 80)
                break;
        }
    }
    return found;
}
757
+ function nextConditionHit(normalizedText, anchor, start) {
758
+ if (!anchor.nextPattern)
759
+ return -1;
760
+ anchor.nextPattern.lastIndex = start;
761
+ const match = anchor.nextPattern.exec(normalizedText);
762
+ anchor.nextPattern.lastIndex = 0;
763
+ return match?.index ?? -1;
764
+ }
765
/**
 * Counts the numbers that extractNumbers finds in the given text.
 *
 * @param {string} normalizedText
 * @returns {number}
 */
function interveningNumberCount(normalizedText) {
    const { length: count } = extractNumbers(normalizedText);
    return count;
}
768
/**
 * Validates the relative placement of a condition hit and an answer hit.
 *
 * For `anchor.direction === "before"` the answer must start at or after the
 * end of the condition, no further than `anchor.after` characters away,
 * before the next condition found by nextConditionHit, and with no other
 * number between the two. For any other direction the answer must end at or
 * before the condition start, no further than `anchor.before` characters
 * away, again with no number in between.
 *
 * @param {string} normalizedText
 * @param {{index: number, length: number}} conditionHit
 * @param {{index: number, length: number}} answerHit
 * @param {object} anchor - `direction`, `after`, `before` (and `nextPattern` via nextConditionHit) are read.
 * @returns {boolean}
 */
function numericConditionDirectionOk(normalizedText, conditionHit, answerHit, anchor) {
    const conditionStart = conditionHit.index;
    const conditionEnd = conditionStart + conditionHit.length;
    const answerStart = answerHit.index;
    const answerEnd = answerStart + answerHit.length;
    const hasNumberBetween = (from, to) => interveningNumberCount(normalizedText.slice(from, to)) > 0;
    if (anchor.direction !== "before") {
        if (answerEnd > conditionStart)
            return false;
        if (conditionStart - answerEnd > anchor.before)
            return false;
        return !hasNumberBetween(answerEnd, conditionStart);
    }
    if (answerStart < conditionEnd)
        return false;
    if (answerStart - conditionEnd > anchor.after)
        return false;
    const followingCondition = nextConditionHit(normalizedText, anchor, conditionEnd + 1);
    if (followingCondition >= 0 && answerStart >= followingCondition)
        return false;
    return !hasNumberBetween(conditionEnd, answerStart);
}
791
/**
 * Checks whether enough of the anchor's phrases occur in the local window.
 *
 * Anchors without phrases or without a (truthy) `minPhraseHits` are always
 * satisfied. Otherwise the number of phrases contained in `local` as plain
 * substrings must reach `minPhraseHits`.
 *
 * @param {string} local
 * @param {{phrases?: string[], minPhraseHits?: number}} anchor
 * @returns {boolean}
 */
function numericConditionAnchorSatisfied(local, anchor) {
    const { phrases, minPhraseHits } = anchor;
    if (!phrases?.length || !minPhraseHits)
        return true;
    let matched = 0;
    for (const phrase of phrases)
        matched += local.includes(phrase) ? 1 : 0;
    return matched >= minPhraseHits;
}
801
/**
 * Builds the numeric-condition anchors implied by the question text.
 *
 * Three kinds of anchor can be produced:
 * - "week_number": the question contains a one- or two-digit number
 *   (optionally with a short letter suffix) followed by the week cue.
 * - "weight_range": an extracted number containing "-" that is followed by
 *   the kg cue within 48 characters, or that appears in a question
 *   mentioning one of the two mass/weight fragments.
 * - "phase_abbreviation": the question mentions the phase fragment together
 *   with the chronic, acceleration or blast fragments; these anchors carry
 *   literal abbreviation phrases instead of a regular expression.
 *
 * Every anchor has `kind`, `direction`, `after`, `before` and `base`.
 * Pattern anchors add `pattern` / `nextPattern` (global, case-insensitive,
 * unicode); phrase anchors add `phrases` and `minPhraseHits`.
 *
 * @param {string} question
 * @returns {Array<object>} anchors in the order week, weight ranges, phases.
 */
function questionNumericConditionAnchors(question) {
    // The previous version also computed `normalizeText(question)` into an
    // unused local; that dead computation has been dropped.
    const normalized = normalizeForSearch(question);
    const anchors = [];
    const weekCue = normalizeForSearch("\u043d\u0435\u0434\u0435\u043b");
    const kgCue = normalizeForSearch("\u043a\u0433");
    const weekMatch = normalized.match(new RegExp(`(?:^|\\s)(\\d{1,2})(?:\\s*-?\\s*[a-zа-я]{1,2})?\\s+${escapeRegExp(weekCue)}`, "iu"));
    if (weekMatch?.[1]) {
        const number = weekMatch[1];
        anchors.push({
            kind: "week_number",
            direction: "before",
            after: 170,
            before: 10,
            base: 58,
            // `pattern` pins the exact week number from the question;
            // `nextPattern` matches any week mention.
            pattern: new RegExp(`(?:^|\\s)${escapeRegExp(number)}(?:\\s*-?\\s*[a-zа-я]{1,2})?\\s+${escapeRegExp(weekCue)}`, "giu"),
            nextPattern: new RegExp(`(?:^|\\s)\\d{1,2}(?:\\s*-?\\s*[a-zа-я]{1,2})?\\s+${escapeRegExp(weekCue)}`, "giu"),
        });
    }
    for (const number of extractNumbers(question)) {
        const normalizedNumber = normalizeForSearch(number);
        // Only ranges (numbers containing "-") can become weight anchors.
        if (!normalizedNumber.includes("-"))
            continue;
        const hits = findNumericFormHits(normalized, normalizedNumber);
        const hasKg = hits.some((hit) => normalized.slice(hit.index, Math.min(normalized.length, hit.index + 48)).includes(kgCue));
        if (!hasKg && !containsNormalizedPhrase(normalized, "\u043c\u0430\u0441\u0441\u0430") && !containsNormalizedPhrase(normalized, "\u0432\u0435\u0441"))
            continue;
        anchors.push({
            kind: "weight_range",
            direction: "before",
            after: 90,
            before: 8,
            base: 60,
            pattern: new RegExp(`(?:^|\\s)${escapeRegExp(normalizedNumber)}\\s*${escapeRegExp(kgCue)}`, "giu"),
            nextPattern: new RegExp(`(?:^|\\s)\\d+(?:-\\d+)?\\s*${escapeRegExp(kgCue)}`, "giu"),
        });
    }
    if (containsNormalizedPhrase(normalized, "\u0444\u0430\u0437")) {
        if (containsNormalizedPhrase(normalized, "\u0445\u0440\u043e\u043d\u0438\u0447")) {
            anchors.push({
                kind: "phase_abbreviation",
                direction: "after",
                after: 18,
                before: 95,
                base: 59,
                phrases: [normalizeForSearch("\u0445\u0444")],
                minPhraseHits: 1,
            });
        }
        const phasePhrases = [];
        if (containsNormalizedPhrase(normalized, "\u0430\u043a\u0441\u0435\u043b\u0435\u0440\u0430\u0446"))
            phasePhrases.push(normalizeForSearch("\u0444\u0430"));
        if (containsNormalizedPhrase(normalized, "\u0431\u043b\u0430\u0441\u0442"))
            phasePhrases.push(normalizeForSearch("\u0431\u043a"));
        if (phasePhrases.length) {
            anchors.push({
                kind: "phase_abbreviation",
                direction: "after",
                after: 24,
                before: 105,
                base: 59,
                phrases: phasePhrases,
                minPhraseHits: 1,
            });
        }
    }
    return anchors;
}
869
/**
 * Flattens pages into line-window segments to be searched.
 *
 * When `topQuestionPages` has a truthy `size`, only pages that are in it or
 * directly adjacent (page number ±1) to one of its members are used;
 * otherwise every page is used.
 *
 * @param {Array} pages - page records; `page` is read and the record is passed to cachedLineWindowSegments.
 * @param {Set<number>} [topQuestionPages]
 * @returns {Array<{page: number, text: string, normalized: string}>}
 */
function numericConditionSources(pages, topQuestionPages) {
    const collected = [];
    for (const page of pages) {
        const pageNumber = page.page;
        const isTop = topQuestionPages?.has(pageNumber);
        const neighboursTop = topQuestionPages?.has(pageNumber - 1) || topQuestionPages?.has(pageNumber + 1);
        const restricted = topQuestionPages?.size;
        if (restricted && !(isTop || neighboursTop))
            continue;
        for (const { text, normalized } of cachedLineWindowSegments(page)) {
            collected.push({ page: pageNumber, text, normalized });
        }
    }
    return collected;
}
882
/**
 * Finds the best evidence that pairs a question-derived numeric condition
 * (see questionNumericConditionAnchors) with an exact numeric form of the
 * answer inside one line-window segment.
 *
 * Returns null when the mode is not "single", when the answer has no exact
 * numeric forms, or when the question yields no anchors.
 *
 * @param {object} args
 * @param {string} args.mode - only "single" is processed.
 * @param {Array} args.pages
 * @param {Set<number>} [args.topQuestionPages]
 * @param {string} args.question
 * @param {object} args.answer - `text` and `id` are read.
 * @param {Array} args.answerTokens
 * @param {Array} [args.focusTokens]
 * @returns {object|null} evidence of kind `numeric_condition_<anchor.kind>`, or null.
 */
export function bestNumericConditionSupport({ mode, pages, topQuestionPages, question, answer, answerTokens, focusTokens }) {
    if (mode !== "single")
        return null;
    const answerForms = exactNumericForms(answer.text);
    if (!answerForms.length)
        return null;
    const anchors = questionNumericConditionAnchors(question);
    if (!anchors.length)
        return null;
    const specificTokens = specificConditionNumberFocusTokens(focusTokens);
    let winner = null;
    for (const source of numericConditionSources(pages, topQuestionPages)) {
        const segment = source.normalized;
        const segmentTokens = tokenizeNormalized(segment);
        const focusHits = tokenHitCount(specificTokens, segmentTokens);
        // Scores one (condition, answer) pairing and folds it into `winner`.
        const considerPair = (anchor, conditionHit, answerHit) => {
            if (!numericConditionDirectionOk(segment, conditionHit, answerHit, anchor))
                return;
            const conditionEnd = conditionHit.index + conditionHit.length;
            const answerEnd = answerHit.index + answerHit.length;
            const earliest = Math.min(conditionHit.index, answerHit.index);
            const latest = Math.max(conditionEnd, answerEnd);
            const local = segment.slice(Math.max(0, earliest - 32), Math.min(segment.length, latest + 56));
            if (!numericConditionAnchorSatisfied(local, anchor))
                return;
            const score = anchor.base +
                numberCoverage(answer.text, local) * 5.4 +
                strictSoftCoverage(answerTokens, tokenizeNormalized(local)) * 1.6 +
                Math.min(3, focusHits) * 0.55;
            winner = betterEvidence(winner, {
                answerId: answer.id,
                page: source.page,
                text: source.text,
                score,
                kind: `numeric_condition_${anchor.kind}`,
            });
        };
        for (const anchor of anchors) {
            const conditionHits = sourceConditionHits(segment, anchor);
            if (!conditionHits.length)
                continue;
            for (const answerForm of answerForms) {
                const answerHits = findNumericFormHits(segment, answerForm);
                for (const conditionHit of conditionHits) {
                    for (const answerHit of answerHits)
                        considerPair(anchor, conditionHit, answerHit);
                }
            }
        }
    }
    return winner;
}
929
/**
 * Finds the best evidence for a numeric answer whose surrounding text also
 * satisfies the marker conditions derived from the question
 * (see questionMarkerConditions / markerConditionsMatch).
 *
 * Returns null when the mode is not "single", when the answer has neither
 * numbers nor a frequency answer, when the question yields no marker
 * conditions, or when no search phrases can be built.
 *
 * @param {object} args
 * @param {string} args.mode - only "single" is processed.
 * @param {Array} args.pages - `normalized`, `text` and `page` are read.
 * @param {Set<number>} [args.topQuestionPages] - when non-empty, other pages are skipped.
 * @param {string} args.question
 * @param {object} args.answer - `text` and `id` are read.
 * @param {Array} args.answerTokens
 * @param {Array} [args.focusTokens]
 * @returns {object|null} evidence of kind "conditioned_number_segment", or null.
 */
export function bestConditionedNumberSupport({ mode, pages, topQuestionPages, question, answer, answerTokens, focusTokens }) {
    if (mode !== "single")
        return null;
    if (!extractNumbers(answer.text).length && !frequencyAnswer(answer.text))
        return null;
    const conditions = questionMarkerConditions(question);
    if (!conditions.length)
        return null;
    const phrases = conditionedNumberPhrases(answer.text);
    if (!phrases.length)
        return null;
    const specificTokens = specificConditionNumberFocusTokens(focusTokens);
    let winner = null;
    for (const page of pages) {
        if (topQuestionPages?.size && !topQuestionPages.has(page.page))
            continue;
        const pageText = page.normalized;
        for (const phrase of phrases) {
            const stride = Math.max(1, phrase.length);
            const looksLikePlainNumber = /^\d+(?:\.\d+)?%?$/.test(phrase);
            let cursor = 0;
            while (cursor < pageText.length) {
                const hit = pageText.indexOf(phrase, cursor);
                if (hit < 0)
                    break;
                // Every path below resumes scanning right after this hit.
                cursor = hit + stride;
                const hitEnd = hit + phrase.length;
                // A plain number directly followed by "-" is accepted as the
                // start of a range even without a clean search boundary.
                const numericRangeStart = looksLikePlainNumber && pageText[hitEnd] === "-";
                if (phrase.length > 1 && !hasSearchBoundaries(pageText, hit, phrase.length) && !numericRangeStart)
                    continue;
                // Reject hits that have a "-" within the three preceding characters.
                if (pageText.slice(Math.max(0, hit - 3), hit).includes("-"))
                    continue;
                const local = pageText.slice(Math.max(0, hit - 180), Math.min(pageText.length, hitEnd + 190));
                if (!markerConditionsMatch(local, conditions))
                    continue;
                const localTokens = tokenizeNormalized(local);
                const focusHits = tokenHitCount(specificTokens, localTokens);
                const score = 15.0 +
                    strictSoftCoverage(answerTokens, localTokens) * 2.6 +
                    numberCoverage(answer.text, local) * 3.2 +
                    Math.min(3, focusHits) * 0.9;
                const candidate = {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, phrase, question),
                    score,
                    kind: "conditioned_number_segment",
                };
                winner = betterEvidence(winner, candidate);
            }
        }
    }
    return winner;
}
984
// Number words (stems) for the counts 1-12, keyed by the digit string and
// normalized with normalizeForSearch.
const COUNT_NUMBER_WORDS = new Map([
    ["1", ["\u043e\u0434\u0438\u043d", "\u043e\u0434\u043d"]],
    ["2", ["\u0434\u0432\u0430", "\u0434\u0432\u0435", "\u0434\u0432\u0443"]],
    ["3", ["\u0442\u0440\u0438", "\u0442\u0440\u0435"]],
    ["4", ["\u0447\u0435\u0442\u044b\u0440"]],
    ["5", ["\u043f\u044f\u0442"]],
    ["6", ["\u0448\u0435\u0441\u0442"]],
    ["7", ["\u0441\u0435\u043c"]],
    ["8", ["\u0432\u043e\u0441\u0435\u043c"]],
    ["9", ["\u0434\u0435\u0432\u044f\u0442"]],
    ["10", ["\u0434\u0435\u0441\u044f\u0442"]],
    ["11", ["\u043e\u0434\u0438\u043d\u043d\u0430\u0434\u0446\u0430\u0442"]],
    ["12", ["\u0434\u0432\u0435\u043d\u0430\u0434\u0446\u0430\u0442"]],
].map(([number, words]) => [number, words.map((word) => normalizeForSearch(word))]));
// Fragments whose presence in a question marks it as a "how many" question.
const COUNT_QUESTION_CUES = [
    "\u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432",
    "\u0447\u0438\u0441\u043b\u043e",
    "\u0441\u043a\u043e\u043b\u044c\u043a",
].map((item) => normalizeForSearch(item));
// Fragments of which at least one must appear near a count hit (see countCueHit).
const COUNT_LOCAL_CUES = [
    "\u0441\u043e\u0441\u0442\u0430\u0432\u043b",
    "\u0432\u044b\u0434\u0435\u043b\u044f",
    "\u0432\u044b\u0437\u0432\u0430\u043d",
    "\u043a\u043e\u0434\u0438\u0440",
    "\u0432\u043a\u043b\u044e\u0447",
    "\u0431\u043e\u043b\u044c\u0448\u0438\u043d\u0441\u0442\u0432",
    "\u0441\u0440\u0435\u0434\u0438 \u043a\u043e\u0442\u043e\u0440",
    "\u0440\u0430\u0437\u043b\u0438\u0447\u043d",
    "\u0440\u0430\u0437\u043b\u0438\u0447\u0430",
    "\u043f\u043e\u0434\u0440\u0430\u0437\u0434\u0435\u043b",
].map((item) => normalizeForSearch(item));
// Tokens excluded from the focus tokens of a count question (see countFocusTokens).
const COUNT_GENERIC_TOKENS = new Set([
    "\u043a\u043e\u043b\u0438\u0447\u0435\u0441\u0442\u0432\u043e",
    "\u0441\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442",
    "\u0447\u0438\u0441\u043b\u043e",
    "\u0441\u043a\u043e\u043b\u044c\u043a\u043e",
    "\u0432\u044b\u0434\u0435\u043b\u044f\u044e\u0442",
    "\u043d\u0430\u0441\u0442\u043e\u044f\u0449\u0435\u0435",
    "\u0432\u0440\u0435\u043c\u044f",
].flatMap((item) => uniqueTokens(item)));
1020
/**
 * Tells whether the normalized question contains any of COUNT_QUESTION_CUES.
 *
 * @param {string} question
 * @returns {boolean}
 */
function countQuestion(question) {
    const haystack = normalizeForSearch(question);
    for (const cue of COUNT_QUESTION_CUES) {
        if (haystack.includes(cue))
            return true;
    }
    return false;
}
1024
/**
 * Extracts the focus tokens of a count question.
 *
 * Tokens come from uniqueTokens and are kept only when they are at least
 * three characters long, are in neither FOCUS_STOPWORDS nor
 * COUNT_GENERIC_TOKENS, and do not start with a digit.
 *
 * @param {string} question
 * @returns {string[]}
 */
function countFocusTokens(question) {
    const isFocus = (token) => {
        if (token.length < 3)
            return false;
        if (FOCUS_STOPWORDS.has(token))
            return false;
        if (COUNT_GENERIC_TOKENS.has(token))
            return false;
        return !/^\d/.test(token);
    };
    return uniqueTokens(question).filter(isFocus);
}
1027
/**
 * Builds the search phrases for a count answer.
 *
 * Each expanded form of each extracted number is stripped of its first "%";
 * forms made of digits only are kept, together with any number words that
 * COUNT_NUMBER_WORDS lists for them.
 *
 * @param {string} answerText
 * @returns {string[]} distinct, non-empty phrases in discovery order.
 */
function countNumberSearchPhrases(answerText) {
    const collected = new Set();
    for (const number of extractNumbers(answerText)) {
        for (const variant of expandNumberToken(number)) {
            const digits = String(variant).replace("%", "");
            const isWholeNumber = Boolean(digits) && /^\d+$/.test(digits);
            if (!isWholeNumber)
                continue;
            collected.add(digits);
            const words = COUNT_NUMBER_WORDS.get(digits) ?? [];
            for (const word of words)
                collected.add(word);
        }
    }
    return Array.from(collected).filter(Boolean);
}
1041
/**
 * Decides whether an answer option is essentially "a count".
 *
 * The option qualifies when it contains at least one number, has at most
 * four tokens, is at most 36 normalized characters long, and has at most one
 * token that is neither numeric nor a number word for one of its numbers.
 *
 * @param {string} answerText
 * @returns {boolean}
 */
function countRelationAnswerOption(answerText) {
    const normalized = normalizeForSearch(answerText);
    const tokens = phraseTokens(answerText).filter((token) => token.length > 0);
    const numbers = extractNumbers(answerText);
    const tooLong = tokens.length > 4 || normalized.length > 36;
    if (!numbers.length || tooLong)
        return false;
    // Everything that counts as "the number itself": digit forms plus the
    // matching number words.
    const numberLike = new Set(numbers.flatMap(expandNumberToken).map((item) => String(item).replace("%", "")));
    for (const [number, words] of COUNT_NUMBER_WORDS.entries()) {
        if (!numberLike.has(number))
            continue;
        for (const word of words)
            numberLike.add(word);
    }
    let wordyTokens = 0;
    for (const token of tokens) {
        const stripped = token.replace(/[%.,+-]/g, "");
        if (!stripped || /^\d+$/u.test(stripped) || numberLike.has(stripped))
            continue;
        wordyTokens += 1;
    }
    return wordyTokens <= 1;
}
1064
/**
 * Tells whether the local window contains any of COUNT_LOCAL_CUES.
 *
 * @param {string} local
 * @returns {boolean}
 */
function countCueHit(local) {
    for (const cue of COUNT_LOCAL_CUES) {
        if (local.includes(cue))
            return true;
    }
    return false;
}
1067
/**
 * Looks for an occurrence of the "structur" cue that is not negated.
 *
 * An occurrence counts as negated when the negation cue appears within the
 * four characters immediately preceding it.
 *
 * @param {string} local
 * @returns {boolean} true when at least one non-negated occurrence exists.
 */
function positiveStructuralHit(local) {
    const cue = normalizeForSearch("\u0441\u0442\u0440\u0443\u043a\u0442\u0443\u0440");
    let at = local.indexOf(cue);
    while (at >= 0) {
        const prefix = local.slice(Math.max(0, at - 4), at);
        const negated = prefix.includes(normalizeForSearch("\u043d\u0435"));
        if (!negated)
            return true;
        at = local.indexOf(cue, at + cue.length);
    }
    return false;
}
1076
/**
 * Checks that the thing being counted in the question is mentioned shortly
 * after a number hit on the page.
 *
 * Only the 58 characters following the hit are inspected. The first target
 * fragment found in the question (genotype, non-structural, structural +
 * protein, serogroup, serotype — in that order) decides which fragment must
 * appear in that window; for the structural case positiveStructuralHit is
 * used. Questions without any of these targets always pass.
 *
 * @param {string} normalizedPage - normalized page text.
 * @param {number} hit - start index of the number hit.
 * @param {number} phraseLength - length of the matched phrase.
 * @param {string} question
 * @returns {boolean}
 */
function countTargetNear(normalizedPage, hit, phraseLength, question) {
    // The previous version also sliced a wider `local` window that was never
    // read; that dead computation has been dropped.
    const questionNorm = normalizeForSearch(question);
    const after = normalizedPage.slice(hit + phraseLength, Math.min(normalizedPage.length, hit + phraseLength + 58));
    if (containsNormalizedPhrase(questionNorm, "\u0433\u0435\u043d\u043e\u0442\u0438\u043f")) {
        return containsNormalizedPhrase(after, "\u0433\u0435\u043d\u043e\u0442\u0438\u043f");
    }
    // "non-structural" must be tested before "structural": the latter is a
    // substring of the former.
    if (containsNormalizedPhrase(questionNorm, "\u043d\u0435\u0441\u0442\u0440\u0443\u043a\u0442\u0443\u0440")) {
        return containsNormalizedPhrase(after, "\u043d\u0435\u0441\u0442\u0440\u0443\u043a\u0442\u0443\u0440");
    }
    if (containsNormalizedPhrase(questionNorm, "\u0441\u0442\u0440\u0443\u043a\u0442\u0443\u0440") && containsNormalizedPhrase(questionNorm, "\u0431\u0435\u043b\u043a")) {
        return positiveStructuralHit(after);
    }
    if (containsNormalizedPhrase(questionNorm, "\u0441\u0435\u0440\u043e\u0433\u0440\u0443\u043f")) {
        return containsNormalizedPhrase(after, "\u0441\u0435\u0440\u043e\u0433\u0440\u0443\u043f");
    }
    if (containsNormalizedPhrase(questionNorm, "\u0441\u0435\u0440\u043e\u0442\u0438\u043f")) {
        return containsNormalizedPhrase(after, "\u0441\u0435\u0440\u043e\u0442\u0438\u043f");
    }
    return true;
}
1097
/**
 * Finds the best evidence for a "how many" question whose answer option is
 * essentially a count.
 *
 * Returns null when the mode is not "single", the question is not a count
 * question, the answer has no numbers or is not a count-like option, no
 * search phrases can be built, or the question has fewer than two focus
 * tokens.
 *
 * @param {object} args
 * @param {string} args.mode - only "single" is processed.
 * @param {Array} args.pages - `normalized`, `text` and `page` are read.
 * @param {Set<number>} [args.topQuestionPages] - when non-empty, other pages are skipped.
 * @param {string} args.question
 * @param {object} args.answer - `text` and `id` are read.
 * @param {Array} args.answerTokens
 * @returns {object|null} evidence of kind "count_relation_segment", or null.
 */
export function bestCountRelationSupport({ mode, pages, topQuestionPages, question, answer, answerTokens }) {
    if (mode !== "single" || !countQuestion(question))
        return null;
    if (!extractNumbers(answer.text).length)
        return null;
    if (!countRelationAnswerOption(answer.text))
        return null;
    const phrases = countNumberSearchPhrases(answer.text);
    if (!phrases.length)
        return null;
    const focusTokens = countFocusTokens(question);
    if (focusTokens.length < 2)
        return null;
    let winner = null;
    for (const page of pages) {
        if (topQuestionPages?.size && !topQuestionPages.has(page.page))
            continue;
        const pageText = page.normalized;
        for (const phrase of phrases) {
            const stride = Math.max(1, phrase.length);
            const digitsOnly = /^\d+$/.test(phrase);
            let cursor = 0;
            while (cursor < pageText.length) {
                const hit = pageText.indexOf(phrase, cursor);
                if (hit < 0)
                    break;
                // Every path below resumes scanning right after this hit.
                cursor = hit + stride;
                const hitEnd = hit + phrase.length;
                if (digitsOnly) {
                    // Skip digits that are part of a longer number.
                    const prev = hit > 0 ? pageText[hit - 1] : "";
                    const next = pageText[hitEnd] ?? "";
                    if (/[0-9]/.test(prev) || /[0-9]/.test(next))
                        continue;
                    // Skip numbers with "[" within three characters before or
                    // "]" within three characters after the hit.
                    const lead = pageText.slice(Math.max(0, hit - 3), hit);
                    const trail = pageText.slice(hitEnd, hitEnd + 3);
                    if (lead.includes("[") || trail.includes("]"))
                        continue;
                }
                if (!countTargetNear(pageText, hit, phrase.length, question))
                    continue;
                const local = pageText.slice(Math.max(0, hit - 210), Math.min(pageText.length, hitEnd + 230));
                const localTokens = tokenizeNormalized(local);
                const focusCoverage = strictSoftCoverage(focusTokens, localTokens);
                if (focusCoverage < 0.34 || !countCueHit(local))
                    continue;
                const score = 14.2 +
                    focusCoverage * 6.2 +
                    strictSoftCoverage(answerTokens, localTokens) * 1.2 +
                    numberCoverage(answer.text, local) * 2.6;
                const candidate = {
                    answerId: answer.id,
                    page: page.page,
                    text: evidenceSnippet(page.text, phrase, question),
                    score,
                    kind: "count_relation_segment",
                };
                winner = betterEvidence(winner, candidate);
            }
        }
    }
    return winner;
}