truthguard-ai 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of truthguard-ai might be problematic. Click here for more details.

Files changed (53) hide show
  1. package/dist-npm/Claims/index.d.ts +73 -0
  2. package/dist-npm/Claims/index.d.ts.map +1 -0
  3. package/dist-npm/Claims/index.js +1669 -0
  4. package/dist-npm/Claims/index.js.map +1 -0
  5. package/dist-npm/Config/index.d.ts +41 -0
  6. package/dist-npm/Config/index.d.ts.map +1 -0
  7. package/dist-npm/Config/index.js +129 -0
  8. package/dist-npm/Config/index.js.map +1 -0
  9. package/dist-npm/Grounding/index.d.ts +40 -0
  10. package/dist-npm/Grounding/index.d.ts.map +1 -0
  11. package/dist-npm/Grounding/index.js +1433 -0
  12. package/dist-npm/Grounding/index.js.map +1 -0
  13. package/dist-npm/L2/index.d.ts +93 -0
  14. package/dist-npm/L2/index.d.ts.map +1 -0
  15. package/dist-npm/L2/index.js +1773 -0
  16. package/dist-npm/L2/index.js.map +1 -0
  17. package/dist-npm/Matchers/index.d.ts +101 -0
  18. package/dist-npm/Matchers/index.d.ts.map +1 -0
  19. package/dist-npm/Matchers/index.js +690 -0
  20. package/dist-npm/Matchers/index.js.map +1 -0
  21. package/dist-npm/Mode/index.d.ts +87 -0
  22. package/dist-npm/Mode/index.d.ts.map +1 -0
  23. package/dist-npm/Mode/index.js +117 -0
  24. package/dist-npm/Mode/index.js.map +1 -0
  25. package/dist-npm/Policy/index.d.ts +89 -0
  26. package/dist-npm/Policy/index.d.ts.map +1 -0
  27. package/dist-npm/Policy/index.js +143 -0
  28. package/dist-npm/Policy/index.js.map +1 -0
  29. package/dist-npm/Registry/index.d.ts +93 -0
  30. package/dist-npm/Registry/index.d.ts.map +1 -0
  31. package/dist-npm/Registry/index.js +818 -0
  32. package/dist-npm/Registry/index.js.map +1 -0
  33. package/dist-npm/Rules/index.d.ts +587 -0
  34. package/dist-npm/Rules/index.d.ts.map +1 -0
  35. package/dist-npm/Rules/index.js +6236 -0
  36. package/dist-npm/Rules/index.js.map +1 -0
  37. package/dist-npm/Rules/intents.d.ts +22 -0
  38. package/dist-npm/Rules/intents.d.ts.map +1 -0
  39. package/dist-npm/Rules/intents.js +242 -0
  40. package/dist-npm/Rules/intents.js.map +1 -0
  41. package/dist-npm/TraceReadiness/index.d.ts +42 -0
  42. package/dist-npm/TraceReadiness/index.d.ts.map +1 -0
  43. package/dist-npm/TraceReadiness/index.js +169 -0
  44. package/dist-npm/TraceReadiness/index.js.map +1 -0
  45. package/dist-npm/i18n/index.d.ts +44 -0
  46. package/dist-npm/i18n/index.d.ts.map +1 -0
  47. package/dist-npm/i18n/index.js +124 -0
  48. package/dist-npm/i18n/index.js.map +1 -0
  49. package/package.json +5 -17
  50. package/dist/cli/index.d.ts +0 -15
  51. package/dist/cli/index.d.ts.map +0 -1
  52. package/dist/cli/index.js +0 -807
  53. package/dist/cli/index.js.map +0 -1
@@ -0,0 +1,1669 @@
1
+ "use strict";
2
+ /**
3
+ * Claim Extractor
4
+ *
5
+ * Extracts factual claims from free-form LLM text.
6
+ *
7
+ * V1 supports:
8
+ * - numbers (integer and decimal)
9
+ * - dates (common formats → ISO-8601)
10
+ * - names (capitalised multi-word proper nouns)
11
+ * - counts (explicit "N [items]" patterns)
12
+ *
13
+ * V1 explicitly skips vague qualitative claims ("most", "significant", "many").
14
+ */
15
+ Object.defineProperty(exports, "__esModule", { value: true });
16
+ exports.tryParseDate = tryParseDate;
17
+ exports.parsePeriodBounds = parsePeriodBounds;
18
+ exports.normalizeEntityName = normalizeEntityName;
19
+ exports.entitiesMatch = entitiesMatch;
20
+ exports.normalizeDiacritics = normalizeDiacritics;
21
+ exports.extractClaims = extractClaims;
22
+ const crypto_1 = require("crypto");
23
+ const i18n_1 = require("../i18n");
24
+ // ---------------------------------------------------------------------------
25
+ // Vague qualifier guard
26
+ // ---------------------------------------------------------------------------
27
/**
 * Lower-cased words and phrases that mark a statement as vague/approximate.
 * Claims whose surrounding text contains one of these are skipped by the extractor.
 */
const VAGUE_QUALIFIERS = [
    // ── English ──
    'most', 'many', 'significant', 'several', 'few', 'some',
    'various', 'numerous', 'majority', 'minority',
    'large number', 'small number', 'a lot', 'lots', 'plenty', 'much',
    // ── Serbian (sr) ──
    'oko', 'otprilike', 'negde', 'većina', 'vecina', 'mnogo', 'malo',
    'nekoliko', 'razni', 'razne', 'dosta', 'puno',
    // ── Spanish (es) ──
    'mucho', 'muchos', 'muchas', 'poco', 'pocos', 'algunos', 'algunas',
    'varios', 'varias', 'mayoría', 'minoría', 'bastante', 'bastantes',
    // ── French (fr) ──
    'beaucoup', 'peu', 'plusieurs', 'quelques', 'divers', 'diverses',
    'majorité', 'minorité', 'environ', 'à peu près',
    // ── Portuguese (pt) ──
    'muito', 'muitos', 'muitas', 'pouco', 'poucos', 'alguns', 'algumas',
    'maioria', 'minoria', 'bastante', 'diversos', 'diversas',
    // ── Russian (ru) ──
    'много', 'мало', 'несколько', 'некоторые', 'различные',
    'большинство', 'меньшинство', 'примерно', 'около', 'достаточно',
    // ── Hindi (hi) ──
    'बहुत', 'कुछ', 'कई', 'थोड़ा', 'ज़्यादातर', 'अधिकतर',
    'लगभग', 'अनेक',
    // ── Arabic (ar) ──
    'كثير', 'قليل', 'بعض', 'عدة', 'معظم', 'أغلب',
    'تقريبا', 'تقريباً', 'حوالي', 'نحو',
    // ── Bengali (bn) ──
    'অনেক', 'কিছু', 'কয়েক', 'সামান্য', 'বেশিরভাগ', 'প্রায়',
    // ── Chinese (zh) ──
    '很多', '一些', '少数', '大多数', '大部分', '少量', '大约', '大概',
    // ── Japanese (ja) ──
    'たくさん', 'いくつか', '少し', 'ほとんど', '多くの', '約', 'およそ',
];
/** Characters that count as "inside a word" in any script (letters, combining marks, digits, underscore). */
const VAGUE_WORD_CHAR = '[\\p{L}\\p{M}\\p{N}_]';
/** Detects qualifiers written in scripts that do not separate words with spaces. */
const VAGUE_UNSPACED_SCRIPT_RE = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/u;
/**
 * One pre-compiled matcher per qualifier.
 *
 * The ASCII-only `\b` assertion cannot be used here: without the `u` flag every
 * non-ASCII letter is a "non-word" character, so `\bмного\b` or `\bminorité\b`
 * never match in ordinary text. Instead:
 *   - space-delimited scripts use Unicode-aware lookarounds as word boundaries;
 *   - Han/Kana qualifiers are matched as substrings (those scripts have no spaces);
 *   - a single-character Han qualifier (e.g. "約") must not touch another Han
 *     character, so compounds such as "契約" are not mistaken for "approximately".
 */
const VAGUE_QUALIFIER_MATCHERS = VAGUE_QUALIFIERS.map((qualifier) => {
    const escaped = qualifier.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    if (VAGUE_UNSPACED_SCRIPT_RE.test(qualifier)) {
        if ([...qualifier].length === 1) {
            return new RegExp(`(?<!\\p{Script=Han})${escaped}(?!\\p{Script=Han})`, 'u');
        }
        return new RegExp(escaped, 'u');
    }
    return new RegExp(`(?<!${VAGUE_WORD_CHAR})${escaped}(?!${VAGUE_WORD_CHAR})`, 'u');
});
/**
 * Returns true if `text` contains any vague qualifier as a whole word/phrase.
 *
 * @param {string} text - Text surrounding a candidate claim.
 * @returns {boolean} True when at least one entry of VAGUE_QUALIFIERS occurs in the text.
 */
function containsVagueQualifier(text) {
    const lower = text.toLowerCase();
    return VAGUE_QUALIFIER_MATCHERS.some((matcher) => matcher.test(lower));
}
68
+ // ---------------------------------------------------------------------------
69
+ // Date parsing helpers
70
+ // ---------------------------------------------------------------------------
71
/**
 * Month name → zero-padded month number (English + Serbian).
 * Keys are lower-case; look-ups must lower-case the input first.
 */
const MONTH_NAMES = {
    // English, full and abbreviated
    january: '01', jan: '01',
    february: '02', feb: '02',
    march: '03', mar: '03',
    april: '04', apr: '04',
    may: '05',
    june: '06', jun: '06',
    july: '07', jul: '07',
    august: '08', aug: '08',
    september: '09', sep: '09', sept: '09',
    october: '10', oct: '10',
    november: '11', nov: '11',
    december: '12', dec: '12',
    // Serbian nominative ("april", "jun", "jul" coincide with the English keys above)
    januar: '01', februar: '02', mart: '03', maj: '05', juni: '06', juli: '07',
    avgust: '08', septembar: '09', oktobar: '10', novembar: '11', decembar: '12',
    // Serbian genitive
    januara: '01', februara: '02', marta: '03', aprila: '04', maja: '05',
    juna: '06', jula: '07',
    avgusta: '08', septembra: '09', oktobra: '10', novembra: '11', decembra: '12',
};
/** Quarter number (1–4) → first month of that quarter (1-indexed). */
const QUARTER_START_MONTH = { 1: 1, 2: 4, 3: 7, 4: 10 };
/**
 * Regex alternation of every known month name, longest first.
 * Derived from MONTH_NAMES so the patterns and the lookup table cannot drift apart.
 */
const MONTH_NAME_PATTERN = Object.keys(MONTH_NAMES)
    .sort((a, b) => b.length - a.length)
    .join('|');
/**
 * Build a validated `YYYY-MM-DD` string from numeric parts.
 *
 * `new Date('2024-02-30T00:00:00Z')` is NOT rejected by V8 (it rolls over to
 * March), so validity is checked by round-tripping the components instead.
 *
 * @param {string|number} year
 * @param {string|number} month - 1-indexed month.
 * @param {string|number} day
 * @returns {string|null} ISO date, or null when the parts do not form a real calendar date.
 */
function isoFromParts(year, month, day) {
    const y = Number(year);
    const m = Number(month);
    const d = Number(day);
    if (![y, m, d].every(Number.isInteger) || m < 1 || m > 12 || d < 1 || d > 31) {
        return null;
    }
    // setUTCFullYear (unlike Date.UTC) does not remap years 0–99 to 1900–1999.
    const probe = new Date(0);
    probe.setUTCFullYear(y, m - 1, d);
    if (probe.getUTCMonth() !== m - 1 || probe.getUTCDate() !== d) {
        return null;
    }
    return `${String(y).padStart(4, '0')}-${String(m).padStart(2, '0')}-${String(d).padStart(2, '0')}`;
}
/**
 * Attempt to parse a date-like string into an ISO-8601 date (YYYY-MM-DD).
 *
 * Recognised forms: ISO, `DD.MM.YYYY`, `DD.MM.` (year-less, returned with a
 * `0000` year placeholder), `MM/DD/YYYY`, "15 March 2024", "March 15, 2024",
 * "March 2024", quarters, half-years, fiscal years, and English/Serbian
 * relative expressions (yesterday, N days ago, last week, last month, …).
 *
 * @param {string} raw - Candidate date text.
 * @param {Date} [_referenceDate] - "Now" for relative expressions; defaults to the current time.
 * @returns {string|null} ISO date, or null if the text is not a recognised, valid date.
 */
function tryParseDate(raw, _referenceDate) {
    const s = raw.trim();
    // Already ISO-8601 (YYYY-MM-DD)
    const isoMatch = s.match(/^(\d{4})-(\d{2})-(\d{2})$/);
    if (isoMatch) {
        const iso = isoFromParts(isoMatch[1], isoMatch[2], isoMatch[3]);
        if (iso)
            return iso;
    }
    // European: DD.MM.YYYY
    const dotFull = s.match(/^(\d{1,2})\.(\d{1,2})\.(\d{4})$/);
    if (dotFull) {
        const iso = isoFromParts(dotFull[3], dotFull[2], dotFull[1]);
        if (iso)
            return iso;
    }
    // European short: DD.MM. or DD.MM (trailing dot optional).
    // No year is available, so Feb 29 is allowed and the year is a 0000 placeholder.
    const dotShort = s.match(/^(\d{1,2})\.(\d{1,2})\.?$/);
    if (dotShort) {
        const day = parseInt(dotShort[1], 10);
        const mon = parseInt(dotShort[2], 10);
        const maxDay = [0, 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31][mon] ?? 31;
        if (day >= 1 && day <= maxDay && mon >= 1 && mon <= 12) {
            return `0000-${String(mon).padStart(2, '0')}-${String(day).padStart(2, '0')}`;
        }
    }
    // MM/DD/YYYY or DD/MM/YYYY — ambiguous, treated as MM/DD/YYYY
    const slashMatch = s.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/);
    if (slashMatch) {
        const iso = isoFromParts(slashMatch[3], slashMatch[1], slashMatch[2]);
        if (iso)
            return iso;
    }
    // "15 March 2024" (EN + SR)
    const dayMonthYear = s.match(new RegExp(`^(\\d{1,2})\\s+(${MONTH_NAME_PATTERN})\\s+(\\d{4})$`, 'i'));
    if (dayMonthYear) {
        const iso = isoFromParts(dayMonthYear[3], MONTH_NAMES[dayMonthYear[2].toLowerCase()], dayMonthYear[1]);
        if (iso)
            return iso;
    }
    // "March 15, 2024"
    const monthDayYear = s.match(new RegExp(`^(${MONTH_NAME_PATTERN})\\s+(\\d{1,2}),?\\s+(\\d{4})$`, 'i'));
    if (monthDayYear) {
        const iso = isoFromParts(monthDayYear[3], MONTH_NAMES[monthDayYear[1].toLowerCase()], monthDayYear[2]);
        if (iso)
            return iso;
    }
    // "March 2024" (no day — treated as YYYY-MM-01 for matching purposes)
    const monthYear = s.match(new RegExp(`^(${MONTH_NAME_PATTERN})\\s+(\\d{4})$`, 'i'));
    if (monthYear) {
        return `${monthYear[2]}-${MONTH_NAMES[monthYear[1].toLowerCase()]}-01`;
    }
    // Quarter: "Q1 2024" or "Q3/2024" → first day of the quarter
    const quarterMatch = s.match(/^Q([1-4])\s*[\/\s]\s*(\d{4})$/i);
    if (quarterMatch) {
        const startMonth = QUARTER_START_MONTH[parseInt(quarterMatch[1], 10)];
        return `${quarterMatch[2]}-${String(startMonth).padStart(2, '0')}-01`;
    }
    // Half-year: "H1 2024" or "H2/2024" → first day of the half
    const halfMatch = s.match(/^H([12])\s*[\/\s]\s*(\d{4})$/i);
    if (halfMatch) {
        return `${halfMatch[2]}-${halfMatch[1] === '1' ? '01' : '07'}-01`;
    }
    // Fiscal year: "FY2024", "FY 2024", "FY24". Exactly 2 or 4 digits — a
    // 3-digit year would otherwise produce a malformed "123-01-01".
    const fyMatch = s.match(/^FY\s*(\d{4}|\d{2})$/i);
    if (fyMatch) {
        const yr = fyMatch[1].length === 2 ? `20${fyMatch[1]}` : fyMatch[1];
        // Calendar-aligned fiscal year by default (Jan 1); real fiscal calendars vary by org.
        return `${yr}-01-01`;
    }
    // Relative dates (EN + SR), resolved against the reference date in local time.
    const lower = s.toLowerCase();
    const ref = _referenceDate ?? new Date();
    const shiftDays = (delta) => {
        const shifted = new Date(ref);
        shifted.setDate(shifted.getDate() + delta);
        return toISO(shifted);
    };
    if (lower === 'yesterday' || lower === 'juče' || lower === 'juce') {
        return shiftDays(-1);
    }
    if (lower === 'today' || lower === 'danas') {
        return toISO(ref);
    }
    if (lower === 'tomorrow' || lower === 'sutra') {
        return shiftDays(1);
    }
    // "N days ago" / "pre N dana"
    const daysAgo = s.match(/^(\d+)\s+days?\s+ago$/i) ?? s.match(/^pre\s+(\d+)\s+dan(?:a)?$/i);
    if (daysAgo) {
        return shiftDays(-parseInt(daysAgo[1], 10));
    }
    // "last week" / "prošle nedelje" → Monday of the previous week
    if (/^last\s+week$/i.test(s) || /^pro[sš]l[ea]\s+nedelj[ea]$/i.test(s)) {
        const d = new Date(ref);
        d.setDate(d.getDate() - 7);
        const weekday = d.getDay(); // 0 = Sunday
        d.setDate(d.getDate() - (weekday === 0 ? 6 : weekday - 1));
        return toISO(d);
    }
    // "last month" / "prošlog meseca" → first day of the previous month
    if (/^last\s+month$/i.test(s) || /^pro[sš]lo?g?\s+meseca$/i.test(s)) {
        const d = new Date(ref);
        // Pin the day BEFORE changing the month: on e.g. March 31,
        // setMonth(February) would overflow back into March.
        d.setDate(1);
        d.setMonth(d.getMonth() - 1);
        return toISO(d);
    }
    return null;
}
/**
 * Format a Date as YYYY-MM-DD using its local-time components.
 *
 * @param {Date} d
 * @returns {string}
 */
function toISO(d) {
    const yr = String(d.getFullYear()).padStart(4, '0');
    const mo = String(d.getMonth() + 1).padStart(2, '0');
    const dy = String(d.getDate()).padStart(2, '0');
    return `${yr}-${mo}-${dy}`;
}
275
/**
 * Last calendar day of a month, as YYYY-MM-DD.
 *
 * The date is built AND read in UTC. Building it in UTC but formatting it with
 * local-time getters returns the previous day in any time zone west of UTC
 * (UTC midnight is still "yesterday" there).
 *
 * @param {number} year - Full year.
 * @param {number} month - 1-indexed month (1 = January).
 * @returns {string} ISO date of the month's final day.
 */
function lastDayOfMonth(year, month) {
    const d = new Date(0);
    // `month` is 1-indexed, so as a 0-indexed value it names the NEXT month;
    // day 0 of that month is the last day of the requested one.
    d.setUTCFullYear(year, month, 0);
    const yr = String(d.getUTCFullYear()).padStart(4, '0');
    const mo = String(d.getUTCMonth() + 1).padStart(2, '0');
    const dy = String(d.getUTCDate()).padStart(2, '0');
    return `${yr}-${mo}-${dy}`;
}
281
/** Quarter number (1–4) → last month of that quarter (1-indexed). */
const QUARTER_END_MONTH = { 1: 3, 2: 6, 3: 9, 4: 12 };
/**
 * Parse a period expression into inclusive start/end bounds.
 *
 * Supports: Q1–Q4, H1/H2, FY, YTD, MTD, this/last quarter, this/last month,
 * this/last year, rolling "last N months" / "last N days", "MonthName YYYY"
 * and a bare four-digit year, in English and Serbian.
 *
 * @param {string} raw - Period text, e.g. "Q3 2024" or "poslednjih 3 meseci".
 * @param {Date} [referenceDate] - "Now" for relative periods; defaults to the current time.
 * @returns {{start: string, end: string, label: string}|null}
 *   Bounds as YYYY-MM-DD strings plus the trimmed input as `label`,
 *   or null if the expression is not a recognised period.
 */
function parsePeriodBounds(raw, referenceDate) {
    const s = raw.trim();
    const lower = s.toLowerCase();
    const ref = referenceDate ?? new Date();
    const refYear = ref.getFullYear();
    const refMonth = ref.getMonth() + 1; // 1-indexed
    const pad2 = (n) => String(n).padStart(2, '0');
    // Bounds covering whole months: first day of `startMonth` … last day of `endMonth`.
    const monthSpan = (yr, startMonth, endMonth) => ({
        start: `${yr}-${pad2(startMonth)}-01`,
        end: lastDayOfMonth(yr, endMonth),
        label: s,
    });
    const quarterSpan = (yr, q) => monthSpan(yr, QUARTER_START_MONTH[q], QUARTER_END_MONTH[q]);
    const yearSpan = (yr) => ({ start: `${yr}-01-01`, end: `${yr}-12-31`, label: s });
    // Quarter: "Q1 2024", "Q3/2024", "Q1-2024"
    const qm = s.match(/^Q([1-4])\s*[\/\s-]\s*(\d{4})$/i);
    if (qm) {
        return quarterSpan(parseInt(qm[2], 10), parseInt(qm[1], 10));
    }
    // Half-year: "H1 2024", "H2/2024"
    const hm = s.match(/^H([12])\s*[\/\s-]\s*(\d{4})$/i);
    if (hm) {
        const firstHalf = hm[1] === '1';
        return monthSpan(parseInt(hm[2], 10), firstHalf ? 1 : 7, firstHalf ? 6 : 12);
    }
    // Fiscal year: "FY2024", "FY 2024", "FY24" (calendar-aligned).
    // Exactly 2 or 4 digits; a 3-digit year would yield a malformed date.
    const fym = s.match(/^FY\s*(\d{4}|\d{2})$/i);
    if (fym) {
        return yearSpan(parseInt(fym[1].length === 2 ? `20${fym[1]}` : fym[1], 10));
    }
    // "YTD" / "year to date" / "od pocetka godine"
    if (/^(ytd|year[\s-]to[\s-]date|od po[cč]etka godine)$/i.test(lower)) {
        return { start: `${refYear}-01-01`, end: toISO(ref), label: s };
    }
    // "MTD" / "month to date" / "od pocetka meseca"
    if (/^(mtd|month[\s-]to[\s-]date|od po[cč]etka meseca)$/i.test(lower)) {
        return { start: `${refYear}-${pad2(refMonth)}-01`, end: toISO(ref), label: s };
    }
    // "this quarter" / "ovaj kvartal"
    if (/^(this\s+quarter|current\s+quarter|ovaj\s+kvartal|teku[cć]i\s+kvartal)$/i.test(lower)) {
        return quarterSpan(refYear, Math.ceil(refMonth / 3));
    }
    // "last quarter" / "prethodni kvartal" / "prosli kvartal"
    if (/^(last\s+quarter|previous\s+quarter|prethodn[io]\s+kvartal|pro[sš]l[io]\s+kvartal)$/i.test(lower)) {
        const q = Math.ceil(refMonth / 3) - 1;
        // Q1 rolls back to Q4 of the previous year.
        return q <= 0 ? quarterSpan(refYear - 1, 4) : quarterSpan(refYear, q);
    }
    // "this month" / "ovaj mesec"
    if (/^(this\s+month|current\s+month|ovaj\s+mesec|teku[cć]i\s+mesec)$/i.test(lower)) {
        return monthSpan(refYear, refMonth, refMonth);
    }
    // "last month" / "proslog meseca" / "prethodni mesec"
    if (/^(last\s+month|previous\s+month|pro[sš]lo?g?\s+meseca?|prethodn[io]\s+mesec)$/i.test(lower)) {
        // January rolls back to December of the previous year.
        return refMonth === 1 ? monthSpan(refYear - 1, 12, 12) : monthSpan(refYear, refMonth - 1, refMonth - 1);
    }
    // "this year" / "ove godine"
    if (/^(this\s+year|current\s+year|ov[ea]\s+godin[ea])$/i.test(lower)) {
        return yearSpan(refYear);
    }
    // "last year" / "prosla godina"
    if (/^(last\s+year|previous\s+year|pro[sš]l[ea]\s+godin[ea])$/i.test(lower)) {
        return yearSpan(refYear - 1);
    }
    // Rolling months: "last N months" / "trailing N months" / "poslednjih N meseci".
    // The unit must accept the Serbian stem "mesec", not only English "month".
    const rollingMonths = lower.match(/^(?:last|poslednjih|trailing)\s+(\d+)\s+(?:months?|mesec[ia]?)$/);
    if (rollingMonths) {
        const d = new Date(ref);
        // Pin to the 1st BEFORE moving the month: on the 29th–31st, setMonth()
        // into a shorter month overflows forward and lands in the wrong month.
        d.setDate(1);
        d.setMonth(d.getMonth() - parseInt(rollingMonths[1], 10));
        return { start: toISO(d), end: toISO(ref), label: s };
    }
    // Rolling days: "last N days" / "trailing N days" / "poslednjih N dana"
    const rollingDays = lower.match(/^(?:last|poslednjih|trailing)\s+(\d+)\s+da(?:ys?|na)$/);
    if (rollingDays) {
        const d = new Date(ref);
        d.setDate(d.getDate() - parseInt(rollingDays[1], 10));
        return { start: toISO(d), end: toISO(ref), label: s };
    }
    // "MonthName YYYY" → full month bounds. The alternation is derived from
    // MONTH_NAMES so every name that matches is guaranteed to have a mapping.
    const monthAlternation = Object.keys(MONTH_NAMES)
        .sort((a, b) => b.length - a.length)
        .join('|');
    const monthYearMatch = s.match(new RegExp(`^(${monthAlternation})\\s+(\\d{4})$`, 'i'));
    if (monthYearMatch) {
        const mon = MONTH_NAMES[monthYearMatch[1].toLowerCase()];
        const yr = parseInt(monthYearMatch[2], 10);
        return { start: `${yr}-${mon}-01`, end: lastDayOfMonth(yr, parseInt(mon, 10)), label: s };
    }
    // Standalone year: "2024" (only plausible calendar years)
    const yearOnly = s.match(/^(\d{4})$/);
    if (yearOnly) {
        const yr = parseInt(yearOnly[1], 10);
        if (yr >= 1900 && yr <= 2100) {
            return yearSpan(yr);
        }
    }
    return null;
}
467
+ // ---------------------------------------------------------------------------
468
+ /**
469
+ * Count patterns — must appear before generic numbers to avoid double-counting.
470
+ *
471
+ * Examples:
472
+ * "3 employees" → count 3
473
+ * "two records" → count 2 (word-form)
474
+ */
475
/**
 * Word-form numerals → numeric value, for the supported languages.
 *
 * Keys are lower-case. Several spellings are shared between languages and
 * therefore appear only once (noted per section). Spanish and Portuguese/French
 * spell 11–15 differently ("once" vs "onze", "catorce" vs "catorze", …), so
 * each spelling needs its own entry.
 */
const WORD_NUMBERS = {
    // ── English ──
    zero: 0, one: 1, two: 2, three: 3, four: 4, five: 5,
    six: 6, seven: 7, eight: 8, nine: 9, ten: 10,
    eleven: 11, twelve: 12, thirteen: 13, fourteen: 14, fifteen: 15,
    sixteen: 16, seventeen: 17, eighteen: 18, nineteen: 19, twenty: 20,
    // ── Serbian (sr) ──
    nula: 0, jedan: 1, jedna: 1, jedno: 1, dva: 2, dve: 2, tri: 3,
    četiri: 4, cetiri: 4, pet: 5, šest: 6, sest: 6, sedam: 7, osam: 8,
    devet: 9, deset: 10, jedanaest: 11, dvanaest: 12, trinaest: 13,
    četrnaest: 14, cetrnaest: 14, petnaest: 15, šesnaest: 16, sesnaest: 16,
    sedamnaest: 17, osamnaest: 18, devetnaest: 19, dvadeset: 20,
    // ── Spanish (es) ──
    cero: 0, uno: 1, una: 1, dos: 2, tres: 3, cuatro: 4, cinco: 5,
    seis: 6, siete: 7, ocho: 8, nueve: 9, diez: 10,
    once: 11, doce: 12, trece: 13, catorce: 14, quince: 15,
    // ── French (fr) ── ("six" shared with EN)
    zéro: 0, un: 1, une: 1, deux: 2, trois: 3, quatre: 4, cinq: 5,
    sept: 7, huit: 8, neuf: 9, dix: 10,
    onze: 11, douze: 12, treize: 13, quatorze: 14, quinze: 15,
    // ── Portuguese (pt) ── ("cinco"/"seis" shared with ES, "onze"/"quinze" shared with FR)
    um: 1, uma: 1, dois: 2, duas: 2, três: 3, quatro: 4,
    sete: 7, oito: 8, nove: 9, dez: 10,
    doze: 12, treze: 13, catorze: 14,
    // ── Russian (ru) ──
    'ноль': 0, 'один': 1, 'одна': 1, 'одно': 1, 'два': 2, 'две': 2,
    'три': 3, 'четыре': 4, 'пять': 5, 'шесть': 6, 'семь': 7,
    'восемь': 8, 'девять': 9, 'десять': 10,
    'одиннадцать': 11, 'двенадцать': 12, 'тринадцать': 13,
    'четырнадцать': 14, 'пятнадцать': 15, 'шестнадцать': 16,
    'семнадцать': 17, 'восемнадцать': 18, 'девятнадцать': 19, 'двадцать': 20,
    // ── Hindi (hi) ──
    'शून्य': 0, 'एक': 1, 'दो': 2, 'तीन': 3, 'चार': 4, 'पाँच': 5, 'पांच': 5,
    'छह': 6, 'सात': 7, 'आठ': 8, 'नौ': 9, 'दस': 10,
    // ── Arabic (ar) ──
    'صفر': 0, 'واحد': 1, 'اثنان': 2, 'ثلاثة': 3, 'أربعة': 4, 'خمسة': 5,
    'ستة': 6, 'سبعة': 7, 'ثمانية': 8, 'تسعة': 9, 'عشرة': 10,
    // ── Bengali (bn) ──
    'শূন্য': 0, 'এক': 1, 'দুই': 2, 'তিন': 3, 'চার': 4, 'পাঁচ': 5,
    'ছয়': 6, 'সাত': 7, 'আট': 8, 'নয়': 9, 'দশ': 10,
    // ── Chinese (zh) ──
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4, '五': 5,
    '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
    // ── Japanese (ja) — shares kanji above, add kana ──
    'ゼロ': 0, 'いち': 1, 'に': 2, 'さん': 3, 'よん': 4, 'ご': 5,
    'ろく': 6, 'なな': 7, 'はち': 8, 'きゅう': 9, 'じゅう': 10,
};
521
+ // Month names (ALL_MONTHS) imported from ../i18n
522
/**
 * Date-like phrases that should be extracted as date claims.
 *
 * Built as one global, case-insensitive alternation with exactly one capture
 * group per alternative. Because the regex carries the `g` flag, `lastIndex`
 * is stateful across `exec`/`test` calls and must be reset before reuse.
 *
 * Latin-script alternatives are delimited with `\b`. That assertion is
 * ASCII-only here (no `u` flag), so it only works when the alternative starts
 * and ends with an ASCII word character; alternatives in other scripts
 * deliberately carry no boundary assertion at all.
 *
 * NOTE(review): the "DD.MM" short form also matches plain decimals followed by
 * whitespace ("3.5 kg"); whether that is filtered out depends on the caller.
 */
const DATE_REGEX = new RegExp([
    // ISO: 2024-03-15
    '\\b(\\d{4}-\\d{2}-\\d{2})\\b',
    // European: DD.MM.YYYY (must precede generic numeric to avoid overlap)
    '\\b(\\d{1,2}\\.\\d{1,2}\\.\\d{4})\\b',
    // European short: DD.MM. or DD.MM (trailing dot optional)
    '\\b(\\d{1,2}\\.\\d{1,2}\\.?)(?=\\s|$|[,;—–\\-\\)])',
    // MM/DD/YYYY
    '\\b(\\d{1,2}\\/\\d{1,2}\\/\\d{4})\\b',
    // 15 March 2024 / March 15, 2024
    '\\b(\\d{1,2}\\s+(?:' + i18n_1.ALL_MONTHS + ')\\s+\\d{4})\\b',
    '\\b((?:' + i18n_1.ALL_MONTHS + ')\\s+\\d{1,2},?\\s+\\d{4})\\b',
    // March 2024 / Mart 2024 / mars 2024 / …
    '\\b((?:' + i18n_1.ALL_MONTHS + ')\\s+\\d{4})\\b',
    // Quarter: Q1 2024 / Q3/2024
    '\\b(Q[1-4]\\s*[/\\s]\\s*\\d{4})\\b',
    // Relative dates (EN)
    '\\b(yesterday|today|tomorrow)\\b',
    '\\b(\\d+\\s+days?\\s+ago)\\b',
    '\\b(last\\s+(?:week|month))\\b',
    // Relative dates (SR)
    '\\b(ju[cč]e|danas|sutra)\\b',
    '\\b(pre\\s+\\d+\\s+dana?)\\b',
    '\\b(pro[sš]l[ea]\\s+nedelj[ea]|pro[sš]lo?g?\\s+meseca)\\b',
    // Relative dates (ES)
    '\\b(ayer|hoy|ma[ñn]ana)\\b',
    '\\b(hace\\s+\\d+\\s+d[ií]as?)\\b',
    '\\b(la\\s+semana\\s+pasada|el\\s+mes\\s+pasado)\\b',
    // Relative dates (FR)
    '\\b(hier|aujourd\'hui|demain)\\b',
    '\\b(il\\s+y\\s+a\\s+\\d+\\s+jours?)\\b',
    '\\b(la\\s+semaine\\s+derni[eè]re|le\\s+mois\\s+dernier)\\b',
    // Relative dates (PT)
    // "amanhã" ends in a non-ASCII letter, after which `\b` can never hold
    // before whitespace/punctuation; use an explicit "no letter follows" check.
    '\\b(ontem|hoje|amanh[aã])(?![\\wÀ-ÿ])',
    '\\b(h[aá]\\s+\\d+\\s+dias?)\\b',
    '\\b(semana\\s+passada|m[eê]s\\s+passado)\\b',
    // Relative dates (RU)
    '(вчера|сегодня|завтра)',
    '(\\d+\\s+дн(?:ей|я)\\s+назад)',
    '(на\\s+прошлой\\s+неделе|в\\s+прошлом\\s+месяце)',
    // Relative dates (HI) — "कल" means both "yesterday" and "tomorrow"
    '(कल|आज)',
    '(\\d+\\s+दिन\\s+पहले)',
    '(पिछले\\s+हफ्ते|पिछले\\s+महीने)',
    // Relative dates (AR)
    '(أمس|اليوم|غدا|غداً)',
    '(منذ\\s+\\d+\\s+(?:يوم|أيام))',
    '(الأسبوع\\s+الماضي|الشهر\\s+الماضي)',
    // Relative dates (BN)
    '(গতকাল|আজ|আগামীকাল)',
    '(\\d+\\s+দিন\\s+আগে)',
    '(গত\\s+সপ্তাহে|গত\\s+মাসে)',
    // Relative dates (ZH)
    '(昨天|今天|明天)',
    '(\\d+\\s*天前)',
    '(上周|上个月)',
    // Relative dates (JA)
    '(昨日|今日|明日)',
    '(\\d+\\s*日前)',
    '(先週|先月)',
].join('|'), 'gi');
586
/**
 * Words that denote a unit of measurement (time, distance, percentage,
 * currency, …) rather than a countable entity. A digit followed by one of
 * these is a measured number, not a count.
 *
 * Grouped by language below; the Set removes spellings that several languages
 * share. International abbreviations (km, kg, cm, …) are listed once.
 */
const UNIT_WORDS_BY_GROUP = {
    english: [
        'percent', 'percentage', 'degree', 'degrees',
        'second', 'seconds', 'minute', 'minutes', 'hour', 'hours',
        'day', 'days', 'week', 'weeks', 'month', 'months', 'year', 'years',
        'mile', 'miles', 'meter', 'meters', 'metre', 'metres',
        'km', 'kg', 'lb', 'lbs', 'cm', 'mm',
    ],
    spanish: [
        'segundo', 'segundos', 'minuto', 'minutos', 'hora', 'horas',
        'día', 'dias', 'días', 'dia', 'semana', 'semanas',
        'mes', 'meses', 'año', 'años', 'porcentaje', 'grado', 'grados',
    ],
    french: [
        'seconde', 'secondes', 'heure', 'heures',
        'jour', 'jours', 'semaine', 'semaines', 'mois',
        'an', 'ans', 'année', 'années', 'pourcent', 'pourcentage',
        'degré', 'degrés',
    ],
    portuguese: [
        'hora', 'horas', 'dia', 'dias', 'semana', 'semanas',
        'mês', 'ano', 'anos', 'porcentagem', 'porcento', 'grau', 'graus',
    ],
    serbian: [
        'sekunda', 'sekundi', 'sekunde',
        'minut', 'minuta', 'minuti',
        'sat', 'sata', 'sati',
        'dan', 'dana', 'dani',
        'nedelja', 'nedelje', 'sedmica', 'sedmice',
        'mesec', 'meseci', 'meseca',
        'godina', 'godine',
        'procenat', 'procenata', 'posto',
        'stepen', 'stepeni', 'stepena',
    ],
    russian: [
        'секунда', 'секунды', 'секунд',
        'минута', 'минуты', 'минут',
        'час', 'часа', 'часов',
        'день', 'дня', 'дней',
        'неделя', 'недели', 'недель',
        'месяц', 'месяца', 'месяцев',
        'год', 'года', 'лет',
        'процент', 'процентов', 'процента',
        'градус', 'градуса', 'градусов',
    ],
    hindi: [
        'सेकंड', 'मिनट', 'घंटा', 'घंटे', 'घंटों',
        'दिन', 'दिनों',
        'हफ्ता', 'हफ्ते', 'हफ्तों', 'सप्ताह',
        'महीना', 'महीने', 'महीनों',
        'साल', 'वर्ष', 'प्रतिशत', 'डिग्री',
    ],
    arabic: [
        'ثانية', 'ثوان', 'ثواني',
        'دقيقة', 'دقائق',
        'ساعة', 'ساعات',
        'يوم', 'أيام',
        'أسبوع', 'أسابيع',
        'شهر', 'أشهر', 'شهور',
        'سنة', 'سنوات',
        'بالمئة', 'نسبة', 'درجة', 'درجات',
    ],
    bengali: [
        'সেকেন্ড', 'মিনিট', 'ঘণ্টা', 'দিন',
        'সপ্তাহ', 'মাস', 'বছর', 'শতাংশ', 'ডিগ্রি',
    ],
    chinese: [
        '秒', '分钟', '小时', '天', '日',
        '周', '星期', '个月', '月', '年',
        '百分比', '度',
    ],
    japanese: [
        '時間', '分', '週間', '週', 'ヶ月', 'か月',
        'パーセント',
    ],
    currency: [
        'usd', 'eur', 'gbp', 'rsd', 'chf', 'jpy', 'cny', 'rub', 'inr', 'brl',
        'dollar', 'dollars', 'euro', 'euros', 'evro', 'evra',
        'funta', 'funti',
        'dinar', 'dinara', 'dinari', 'din',
        'franc', 'francs', 'franak', 'franaka',
        'yen', 'yuan', 'ruble', 'rubles',
        'rupee', 'rupees',
        'real', 'reais',
    ],
};
const UNIT_WORDS = new Set(Object.values(UNIT_WORDS_BY_GROUP).flat());
668
/**
 * Count pattern: digit or word-number followed by a noun-like word.
 * Capture group 1 is the quantity (digits or an English number word),
 * capture group 2 is the following word (two or more Unicode letters).
 *
 * Uses a negative lookbehind `(?<![.\d-])` to prevent:
 *   - Matching decimal fractional parts: "5" in "94.5 employees" → no match
 *   - Matching negative number digits: "5" in "-5 degrees" → no match
 * Note: "minus 3 employees" is still matched because the space between
 * "minus" and "3" means the character immediately before "3" is a space,
 * not a "-", so the lookbehind does not block it.
 *
 * The trailing lookahead requires whitespace, closing punctuation, or end of
 * input after the noun, so the noun is never cut mid-word.
 *
 * Examples: "3 employees", "two records", "5 items"
 *
 * NOTE(review): only English number words are spelled out in the pattern.
 * The regex carries the `g` flag, so `lastIndex` is stateful between
 * `exec`/`test` calls and must be reset before reuse.
 */
const COUNT_REGEX = /(?<![.\d-])\b(\d+|zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty)\s+([\p{L}]{2,})(?=[\s.,;:!?)\]>"']|$)/giu;
679
/**
 * Returns true if the number at [matchStart, matchEnd) is an ordered-list
 * marker such as "1. " or "2.\t" rather than a quantity.
 *
 * A list marker must (a) be followed by "." and then whitespace or end of
 * text, and (b) be the first thing on its line, allowing leading indentation
 * and one markdown prefix character (`*`, `-` or `>`).
 *
 * @param {string} text - Full text being scanned.
 * @param {number} matchStart - Index of the first digit.
 * @param {number} matchEnd - Index just past the last digit.
 * @returns {boolean}
 */
function isListMarker(text, matchStart, matchEnd) {
    // The number must be followed by '.' and then whitespace (or nothing at all).
    const afterMatch = text.substring(matchEnd, matchEnd + 2);
    if (!/^\.(?:\s|$)/.test(afterMatch))
        return false;
    // Inspect everything between the real start of the line and the number.
    // A fixed look-back window is not enough: `^` would match the window's own
    // start, so "we have   3. Then" would wrongly look like a line start.
    const lineStart = text.lastIndexOf('\n', matchStart - 1) + 1;
    const linePrefix = text.substring(lineStart, matchStart);
    return /^\s*(?:[*\->]\s*)?$/.test(linePrefix);
}
692
/**
 * Generic numeric pattern — matches integers, decimals, and negative numbers.
 * Uses a negative lookbehind `(?<![.\w])` to avoid re-matching parts of decimal
 * numbers (e.g., "5" in "94.5") or digits glued to a preceding word character.
 *
 * Alternatives (in order):
 *   1. European dot-thousands: 1.234.567 or 4.496 (X.XXX pattern)
 *   2. Comma/space thousands:  1,234,567 or 1 234 567 + optional .decimal
 *   3. Plain numbers:          12345 or 12345.67
 *
 * NOTE(review): alternative 2 accepts any whitespace (`\s`) as a thousands
 * separator, so two adjacent numbers such as "12 345" are captured as one match.
 * The `g` flag makes `lastIndex` stateful; reset it before reusing the regex.
 */
const NUMBER_REGEX = /(?<![.\w])(-?\d{1,3}(?:\.\d{3})+|-?\d{1,3}(?:[,\s]\d{3})*(?:\.\d+)?|-?\d+(?:\.\d+)?)\b/g;
/**
 * Detect European-style thousand separator: every dot is followed by exactly
 * 3 digits, through to the end of the number string.
 * E.g. "4.496" → 4496, "1.845" → 1845.
 * Does NOT match real decimals like "94.5" or "3.14".
 */
const DOT_THOUSANDS_RE = /^-?\d{1,3}(?:\.\d{3})+$/;
/**
 * Fraction pattern: "3/4", "1/2", "2/3", etc.
 * Capture group 1 is the numerator, capture group 2 the denominator.
 * The lookbehind/lookahead reject a "/" or digit on either side, which avoids
 * date-like "03/12/2024" and multi-segment numeric paths.
 * Matches: "1/2", "3/4", "7/8" but not "12/03/2024".
 */
const FRACTION_REGEX = /(?<![/\d])(\d+)\s*\/\s*(\d+)(?![\d/])/g;
/**
 * Proper name pattern: two or more consecutive Title-Cased words (an uppercase
 * letter followed by at least one lowercase letter, Unicode-aware).
 * Requiring at least two words avoids capturing a single title-cased word at
 * the start of a sentence. Capture group 1 is the name itself; the leading
 * delimiter, when present, is part of the overall match but not of the group.
 */
const NAME_REGEX = /(?:^|[\s,;.!?()\[\]*])([\p{Lu}][\p{Ll}]{1,}(?:\s+[\p{Lu}][\p{Ll}]{1,})+)(?=[\s,;.!?()\[\]*]|$)/gu;
720
+ // HEADING_STOP_WORDS imported from ../i18n
721
/**
 * Tells whether a candidate name consists solely of common heading words.
 *
 * @param {string} name - Candidate proper-name phrase.
 * @returns {boolean} True when every whitespace-separated word, lower-cased,
 *   is present in HEADING_STOP_WORDS.
 */
function isHeadingPhrase(name) {
    for (const token of name.trim().split(/\s+/)) {
        if (!i18n_1.HEADING_STOP_WORDS.has(token.toLowerCase())) {
            return false;
        }
    }
    return true;
}
726
/**
 * Lowercase discourse markers and section words that show up Title-Cased at
 * the start of a sentence or heading without being part of a person's name.
 * Intended for stripping false name prefixes, e.g. "Dakle Marina" should
 * leave just "Marina" (which is then no longer a two-word name).
 */
const SENTENCE_STARTERS = new Set([
    // Serbian
    ...[
        'dakle', 'znači', 'naime', 'stoga', 'tada', 'zato', 'upravo', 'ovde',
        'inače', 'zapravo', 'takodje', 'takođe', 'medjutim', 'međutim',
        'ipak', 'svakako', 'naravno', 'ustvari', 'nažalost', 'konkretno',
        'ukratko', 'posebno', 'slobodno', 'jednostavno', 'pretpostavljam',
    ],
    // English
    ...[
        'therefore', 'however', 'moreover', 'furthermore', 'meanwhile',
        'basically', 'essentially', 'actually', 'unfortunately', 'specifically',
        'obviously', 'clearly', 'indeed', 'certainly', 'additionally',
    ],
    // Status / section words
    ...[
        'status', 'poređenje', 'poredjenje', 'analiza', 'izveštaj', 'izvestaj',
        'pregled', 'detalji', 'objašnjenje', 'objasnjenje', 'razlog',
    ],
]);
745
// ---------------------------------------------------------------------------
// Entity proximity helper
// ---------------------------------------------------------------------------
/**
 * A single Title-Cased word (proper noun candidate): one uppercase letter
 * followed by one or more lowercase letters. Group 1 holds the word.
 *
 * Word edges are expressed with Unicode-aware lookarounds rather than `\b`.
 * In JavaScript `\b` only treats [A-Za-z0-9_] as word characters, so it
 * truncated names ending in a diacritic ("Jović" came out as "Jovi"), and
 * `[A-Z]` could not start a name such as "Živković".
 */
const PROPER_NOUN_RE = /(?<![\p{L}\p{N}_])(\p{Lu}\p{Ll}+)(?![\p{L}\p{N}_])/gu;
/**
 * Find the nearest proper noun before `position` within the same
 * sentence-like segment.
 *
 * The segment starts after the last '.', newline or ':' that precedes
 * `position` (or at the start of the text) and ends at `position`.
 *
 * @param {string} text Full text being analysed.
 * @param {number} position Index in `text` where the claim starts.
 * @returns {string|undefined} The last Title-Cased word in the segment, or
 *   undefined when the segment contains none.
 */
function findNearestEntity(text, position) {
    // Nothing can precede the very start of the text.
    if (position <= 0) {
        return undefined;
    }
    const boundaries = ['.', '\n', ':'].map((sep) => text.lastIndexOf(sep, position - 1) + 1);
    const sentenceStart = Math.max(0, ...boundaries);
    const segment = text.substring(sentenceStart, position);
    // matchAll works on a private copy of the regex, so the shared global
    // PROPER_NOUN_RE never carries lastIndex state between calls.
    let nearest;
    for (const match of segment.matchAll(PROPER_NOUN_RE)) {
        nearest = match[1];
    }
    return nearest;
}
767
/**
 * Detect the unit word that follows a number.
 *
 * Looks at up to 30 characters after `numberEnd` (leading whitespace
 * removed, lowercased) and returns the matching entry of `UNIT_WORDS`.
 *
 * Resolution order:
 *   1. The leading run of letters, if that whole run is a known unit.
 *   2. Otherwise the longest known unit that the text starts with. This
 *      serves scripts written without spaces between words (CJK, Thai, …)
 *      and symbol units. A prefix hit is rejected when it would cut a
 *      space-delimited word in half, so "5 more" is not read as "m" and
 *      "3 hundred" is not read as "h".
 *
 * @param {string} text Full text being analysed.
 * @param {number} numberEnd Index just past the number.
 * @returns {string|undefined} The unit as stored in `UNIT_WORDS`, or undefined.
 */
function detectUnitAfterNumber(text, numberEnd) {
    const after = text.substring(numberEnd, numberEnd + 30).trimStart().toLowerCase();
    // Word-segmented scripts (Latin, Cyrillic, Devanagari, Arabic, Bengali)
    const leadingWord = after.match(/^\p{L}+/u)?.[0];
    if (leadingWord !== undefined && UNIT_WORDS.has(leadingWord)) {
        return leadingWord;
    }
    const isLetter = (ch) => ch !== undefined && /^\p{L}$/u.test(ch);
    // Scripts that do not separate words with spaces.
    const isUnspacedScript = (ch) => ch !== undefined
        && /^[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}\p{Script=Thai}\p{Script=Lao}\p{Script=Khmer}\p{Script=Myanmar}]$/u.test(ch);
    let best;
    for (const word of UNIT_WORDS) {
        if (word.length === 0 || !after.startsWith(word)) {
            continue;
        }
        const lastOfUnit = Array.from(word).pop();
        const nextInText = Array.from(after.slice(word.length))[0];
        // A letter directly continuing the unit means we matched only the
        // beginning of a longer word — unless the script has no word spacing.
        const cutsWordInHalf = isLetter(lastOfUnit) && isLetter(nextInText)
            && !isUnspacedScript(lastOfUnit) && !isUnspacedScript(nextInText);
        if (cutsWordInHalf) {
            continue;
        }
        // Prefer the longest unit so the result does not depend on Set order.
        if (best === undefined || word.length > best.length) {
            best = word;
        }
    }
    return best;
}
786
+ /** Currency symbols that precede numbers: "$100", "€50", "¥1000". */
787
+ const CURRENCY_PREFIX = {
788
+ '$': 'usd', '€': 'eur', '£': 'gbp', '¥': 'jpy', '₹': 'inr', 'R$': 'brl',
789
+ };
790
+ /**
791
+ * Detect a currency symbol immediately before a number.
792
+ * E.g. "$100" → 'usd', "€50" → 'eur'.
793
+ */
794
+ function detectCurrencyBeforeNumber(text, numberStart) {
795
+ for (const [sym, unit] of Object.entries(CURRENCY_PREFIX)) {
796
+ const start = numberStart - sym.length;
797
+ if (start >= 0 && text.substring(start, numberStart) === sym) {
798
+ return unit;
799
+ }
800
+ }
801
+ return undefined;
802
+ }
803
// ---------------------------------------------------------------------------
// Entity normalisation
// ---------------------------------------------------------------------------
/**
 * Produce a comparison key for an entity name: diacritics removed
 * (č→c, ć→c, š→s, ž→z, đ→d, …), lowercased and trimmed, so names can be
 * matched across diacritic and case variants.
 *
 * @param {string} name Entity name as it appears in the text.
 * @returns {string} The normalised key.
 */
function normalizeEntityName(name) {
    // NFD splits accented letters into base letter + combining mark ...
    const decomposed = name.normalize('NFD');
    // ... so the marks (U+0300–U+036F) can simply be dropped.
    const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, '');
    // đ / Đ are handled explicitly after the mark-stripping step.
    const withoutStroke = withoutMarks
        .replace(/\u0111/g, 'd') // đ → d
        .replace(/\u0110/g, 'D'); // Đ → D
    return withoutStroke.toLowerCase().trim();
}
819
/**
 * Decide whether two entity names refer to the same person/thing.
 *
 * Both names are first passed through `normalizeEntityName`. They match when
 *   - the normalised strings are identical, or
 *   - they have a different number of words and every word of the shorter
 *     name occurs in the longer one (e.g. "Ana" vs "Ana Jović").
 * Names with the same number of words only match when identical.
 *
 * @param {string} a First name.
 * @param {string} b Second name.
 * @returns {boolean} true when the names are considered the same entity.
 */
function entitiesMatch(a, b) {
    const left = normalizeEntityName(a);
    const right = normalizeEntityName(b);
    if (left === right) {
        return true;
    }
    const leftWords = left.split(/\s+/);
    const rightWords = right.split(/\s+/);
    // Equal word counts but different text: no subset relation to test.
    if (leftWords.length === rightWords.length) {
        return false;
    }
    const [fewer, more] = leftWords.length < rightWords.length
        ? [leftWords, rightWords]
        : [rightWords, leftWords];
    return fewer.every((part) => more.includes(part));
}
842
/**
 * Aggregation qualifiers per operation, as `{ op, patterns }` entries that
 * are checked in array order. All patterns are case-insensitive.
 *
 * Word edges use Unicode-aware lookarounds instead of `\b`: JavaScript's
 * `\b` only knows [A-Za-z0-9_] as word characters, so a pattern such as
 * /\bитого\b/ can never match inside Cyrillic text.
 *   word`…`  — whole word (edge required on both sides)
 *   stem`…`  — word beginning only, for inflected forms ("средн…", "минимальн…")
 */
const AGGREGATION_QUALIFIERS = (() => {
    const word = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})(?![\\p{L}\\p{N}_])`, 'iu');
    const stem = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})`, 'iu');
    return [
        {
            op: 'sum',
            patterns: [
                word`ukupno`, word`ukupn[aei]`, word`zbir`, word`suma`,
                word`total(?:ly|e?)?`, word`sum`, word`in total`,
                word`сума`, word`итого`, word`всего`,
                word`en total`, word`au total`, word`no total`,
            ],
        },
        {
            op: 'avg',
            patterns: [
                word`u proseku`, word`prose[čc]n[oaie]`, word`prosek`,
                word`average`, word`avg\.?`, word`mean`,
                // stem: covers средний / средняя / среднее / средние / среднем
                stem`средн`, word`в среднем`,
                word`promedio`, word`moyenne?`, word`média`,
            ],
        },
        {
            op: 'count',
            patterns: [
                word`broj`, word`iznos`,
                word`count`, word`number of`,
                word`количество`, word`число`,
            ],
        },
        {
            op: 'min',
            patterns: [
                word`minimaln[oaie]`, word`najmanj[eai]`, word`najni[žz][eai]`,
                word`minimum`, word`min\.?`, word`lowest`, word`at least`,
                stem`минимальн`, word`минимум`,
            ],
        },
        {
            op: 'max',
            patterns: [
                word`maksimaln[oaie]`, word`najve[ćc][eai]`, word`najvi[šs][eai]`,
                word`maximum`, word`max\.?`, word`highest`, word`at most`,
                stem`максимальн`, word`максимум`,
            ],
        },
        {
            op: 'pct_of_total',
            patterns: [
                word`udeo`, word`u[čc]e[šs][ćc]e`,
                word`share`, word`proportion`,
                word`доля`, stem`удельн`,
            ],
        },
    ];
})();
/**
 * Scan the text segment *before* a number for aggregation qualifiers.
 *
 * @param {string} text Full text being analysed.
 * @param {number} numberStart Index of the first character of the number.
 * @returns {string|undefined} The `op` of the first entry in
 *   AGGREGATION_QUALIFIERS with a pattern matching the 40 characters before
 *   the number, or undefined when none matches.
 */
function detectAggregationQualifier(text, numberStart) {
    // Look at up to 40 chars before the number
    const before = text.substring(Math.max(0, numberStart - 40), numberStart);
    const hit = AGGREGATION_QUALIFIERS.find(({ patterns }) => patterns.some((p) => p.test(before)));
    return hit?.op;
}
908
// ---------------------------------------------------------------------------
// Negation detection (L1)
// ---------------------------------------------------------------------------
/**
 * Multilingual negation patterns (case-insensitive).
 * When a claim is preceded by a negation particle, it implies value = 0 or
 * absence. The claim gets `negated: true`.
 *
 * Examples: "nema kašnjenja" → negated, "no tardiness" → negated,
 *           "nijedan zaposleni" → negated, "без опозданий" → negated.
 *
 * Word edges use Unicode-aware lookarounds instead of `\b`: JavaScript's
 * `\b` only knows [A-Za-z0-9_] as word characters, so /\bнет\b/ can never
 * match inside Cyrillic text.
 *   word`…` — whole word;  stem`…` — word beginning only.
 */
const NEGATION_PATTERNS = (() => {
    const word = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})(?![\\p{L}\\p{N}_])`, 'iu');
    const stem = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})`, 'iu');
    return [
        // Serbian
        word`nema`, word`nije`, word`nijedan[ae]?`, word`niti\s+jedan[ae]?`,
        word`bez`, word`niko`, word`ništa`, word`nikakv[oaie]`,
        // English
        word`no`, word`none`, word`not\s+any`, word`zero`, word`without`,
        word`neither`, word`nor`, word`not\s+a\s+single`, word`hasn'?t`,
        word`isn'?t`, word`wasn'?t`, word`doesn'?t`, word`didn'?t`,
        // Russian
        word`нет`, word`не\s+было`, word`ни\s+одного`, word`без`,
        // stem: covers никакой / никаких / никакая / никакое …
        stem`никак`, word`никто`, word`ничего`,
    ];
})();
/**
 * Check whether a negation marker appears shortly before a claim.
 *
 * @param {string} text Full text being analysed.
 * @param {number} claimStart Index where the claim starts.
 * @returns {boolean} true when any negation pattern matches the 40
 *   characters preceding the claim.
 */
function detectNegation(text, claimStart) {
    const before = text.substring(Math.max(0, claimStart - 40), claimStart);
    return NEGATION_PATTERNS.some((p) => p.test(before));
}
935
/**
 * Comparison qualifiers per operator, as `{ op, patterns }` entries that are
 * checked in array order. All patterns are case-insensitive.
 *
 * Ordering matters: the inclusive operators come first because their
 * phrases contain the strict ones ("no more than" contains "more than",
 * "не менее" contains "менее"). Checking '>' / '<' first would report the
 * opposite bound.
 *
 * Word edges use Unicode-aware lookarounds instead of `\b`: JavaScript's
 * `\b` only knows [A-Za-z0-9_] as word characters, so Cyrillic patterns
 * written with `\b` can never match.
 *
 * "bar" and "do" only count when a number follows. The scanned window ends
 * where the claim's number starts, so "followed by the claim number" shows
 * up as "followed by the end of the window".
 */
const COMPARISON_PATTERNS = (() => {
    const word = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})(?![\\p{L}\\p{N}_])`, 'iu');
    return [
        {
            op: '>=',
            patterns: [
                word`najmanje`, /(?<![\p{L}\p{N}_])bar\s+(?:\d|$)/iu, word`minimum`,
                word`at\s+least`, word`no\s+less\s+than`, word`minimum\s+of`,
                word`как\s+минимум`, word`не\s+менее`,
            ],
        },
        {
            op: '<=',
            patterns: [
                word`najviše`, word`maksimum`, /(?<![\p{L}\p{N}_])do(?![\p{L}\p{N}_])(?=\s+\d|\s*$)/iu,
                word`at\s+most`, word`no\s+more\s+than`, word`maximum\s+of`, word`up\s+to`,
                word`не\s+более`, word`максимум`,
            ],
        },
        {
            op: '>',
            patterns: [
                word`više\s+od`, word`više\s+nego`, word`preko`, word`iznad`,
                word`more\s+than`, word`over`, word`above`, word`exceeds?`, word`greater\s+than`,
                word`больше`, word`свыше`, word`более`,
            ],
        },
        {
            op: '<',
            patterns: [
                word`manje\s+od`, word`manje\s+nego`, word`ispod`,
                word`less\s+than`, word`under`, word`below`, word`fewer\s+than`,
                word`меньше`, word`ниже`, word`менее`,
            ],
        },
        {
            op: '==',
            patterns: [
                word`tačno`, word`upravo`,
                word`exactly`, word`precisely`,
                word`ровно`, word`точно`,
            ],
        },
    ];
})();
/**
 * Scan the text segment before a number for a comparison qualifier.
 *
 * @param {string} text Full text being analysed.
 * @param {number} numberStart Index of the first character of the number.
 * @returns {string|undefined} One of '>=', '<=', '>', '<', '==' — the `op`
 *   of the first entry in COMPARISON_PATTERNS with a pattern matching the
 *   40 characters before the number — or undefined when none matches.
 */
function detectComparisonOp(text, numberStart) {
    const before = text.substring(Math.max(0, numberStart - 40), numberStart);
    const hit = COMPARISON_PATTERNS.find(({ patterns }) => patterns.some((p) => p.test(before)));
    return hit?.op;
}
986
/**
 * Change-direction qualifiers, as `{ dir, patterns }` entries checked in
 * array order ('increase' before 'decrease'). Case-insensitive.
 *
 * Word edges use Unicode-aware lookarounds instead of `\b`: JavaScript's
 * `\b` only knows [A-Za-z0-9_] as word characters, so Cyrillic patterns
 * written with `\b` (e.g. /\bрост\b/) can never match.
 */
const CHANGE_DIRECTION_PATTERNS = (() => {
    const word = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})(?![\\p{L}\\p{N}_])`, 'iu');
    return [
        {
            dir: 'increase',
            patterns: [
                // Serbian
                word`rast`, word`porast(?:ao|la|lo)?`, word`pove[ćc]anj[eaou]`,
                word`pove[ćc]a[nlot]`, word`skok`, word`sko[čc]i[lot]`,
                word`ve[ćc][eai]\s+(?:za|od|nego)`,
                // English
                word`increase[ds]?`, word`grew`, word`growth`, word`rise[ns]?`,
                word`rose`, word`jump(?:ed|s)?`, word`gain(?:ed|s)?`, word`up\s+by`,
                // Russian
                word`рост`, word`увеличени[еяю]`, word`вырос(?:ла|ло)?`,
                word`повышени[еяю]`,
            ],
        },
        {
            dir: 'decrease',
            patterns: [
                // Serbian
                word`pad`, word`pa[dl](?:ao|la|lo)?`, word`smanjivanj[eaou]`,
                word`smanjeno?`, word`smanji[lot]`, word`sni[žz]enj[eaou]`,
                word`manj[eai]\s+(?:za|od|nego)`,
                // English
                word`decrease[ds]?`, word`fell`, word`decline[ds]?`, word`drop(?:ped|s)?`,
                word`reduction`, word`reduced?`, word`down\s+by`, word`loss(?:es)?`,
                // Russian
                word`падени[еяю]`, word`снижени[еяю]`, word`упал[оа]?`,
                word`уменьшени[еяю]`,
            ],
        },
    ];
})();
/**
 * Scan the text segment before a number for a change-direction word.
 *
 * @param {string} text Full text being analysed.
 * @param {number} numberStart Index of the first character of the number.
 * @returns {string|undefined} 'increase' or 'decrease' — the `dir` of the
 *   first entry with a pattern matching the 50 characters before the
 *   number — or undefined when none matches.
 */
function detectChangeDirection(text, numberStart) {
    const before = text.substring(Math.max(0, numberStart - 50), numberStart);
    const hit = CHANGE_DIRECTION_PATTERNS.find(({ patterns }) => patterns.some((p) => p.test(before)));
    return hit?.dir;
}
1027
// ---------------------------------------------------------------------------
// Superlative / ranking detection (L1)
// ---------------------------------------------------------------------------
/**
 * Superlative markers that imply rank 1 ("najkritičniji", "best",
 * "самый …"). Case-insensitive.
 *
 * Word edges use Unicode-aware lookarounds instead of `\b`: JavaScript's
 * `\b` only knows [A-Za-z0-9_] as word characters, so Cyrillic patterns
 * written with `\b` can never match.
 *   word`…` — whole word;  stem`…` — word beginning only.
 * "top" / "bottom" are excluded when a number follows; "top N" is handled
 * by TOP_N_PATTERN.
 */
const SUPERLATIVE_RANK1 = (() => {
    const word = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})(?![\\p{L}\\p{N}_])`, 'iu');
    const stem = ({ raw: [body] }) => new RegExp(`(?<![\\p{L}\\p{N}_])(?:${body})`, 'iu');
    return [
        // Serbian superlatives (prefix "naj-")
        word`naj[a-zčćžšđ]+[aeiou]`,
        word`prv[oaie]`, word`poslednji?`,
        // English
        word`best`, word`worst`, word`highest`, word`lowest`,
        word`most`, word`least`, word`greatest`, word`smallest`,
        word`top(?!\s*\d)`, word`bottom(?!\s*\d)`,
        // Russian
        stem`наибол[еь]`, stem`наименьш`, stem`лучш`, stem`худш`,
        word`самый`, word`самая`, word`самое`,
    ];
})();
/** "top N" — group 1 is N. */
const TOP_N_PATTERN = /\btop\s+(\d+)\b/i;
/**
 * Detect a ranking implied by the text before a name/entity claim.
 *
 * @param {string} text Full text being analysed.
 * @param {number} claimStart Index where the claim starts.
 * @returns {number|undefined} N for "top N", 1 for a superlative, otherwise
 *   undefined. Only the 50 characters before the claim are inspected, and
 *   "top N" takes precedence over superlatives.
 */
function detectRank(text, claimStart) {
    const before = text.substring(Math.max(0, claimStart - 50), claimStart);
    const topN = TOP_N_PATTERN.exec(before);
    if (topN) {
        return parseInt(topN[1], 10);
    }
    return SUPERLATIVE_RANK1.some((p) => p.test(before)) ? 1 : undefined;
}
1056
// ---------------------------------------------------------------------------
// Status transition detection (L1)
// ---------------------------------------------------------------------------
/**
 * Status-transition phrases such as "promenjen sa pending na odobren".
 * In every pattern group 1 is the source status and group 2 the target
 * status; either may be wrapped in single or double quotes.
 *
 * Status words are captured with `[\p{L}\p{N}_]+` rather than `\w+`:
 * `\w` is ASCII-only, so statuses containing diacritics ("čekanje") or
 * written in Cyrillic could not be captured at all. The Russian verbs
 * accept both "ё" and the common "е" spelling.
 */
const TRANSITION_PATTERNS = [
    // Serbian: promenjen/prebačen sa X na Y
    /(?:promenjen[oaie]?|preba[čc]en[oaie]?|preš(?:ao|la|lo))\s+(?:sa|iz)\s+["']?([\p{L}\p{N}_]+)["']?\s+(?:na|u)\s+["']?([\p{L}\p{N}_]+)["']?/iu,
    // English: changed/updated from X to Y
    /(?:changed?|updated?|moved?|transitioned?|switched?)\s+(?:from)\s+["']?([\p{L}\p{N}_]+)["']?\s+(?:to)\s+["']?([\p{L}\p{N}_]+)["']?/iu,
    // Russian: изменён с X на Y
    /(?:измен[её]н[аоы]?|перевед[её]н[аоы]?)\s+(?:с|из)\s+["']?([\p{L}\p{N}_]+)["']?\s+(?:на|в)\s+["']?([\p{L}\p{N}_]+)["']?/iu,
];
/**
 * Look for a status transition around a claim.
 *
 * @param {string} text Full text being analysed.
 * @param {number} claimStart Index where the claim starts.
 * @param {number} claimEnd Index just past the claim.
 * @returns {{from: string, to: string}|undefined} Lowercased source and
 *   target status from the first pattern matching the window that extends
 *   80 characters to either side of the claim, or undefined.
 */
function detectTransition(text, claimStart, claimEnd) {
    const ctx = text.substring(Math.max(0, claimStart - 80), Math.min(text.length, claimEnd + 80));
    for (const pattern of TRANSITION_PATTERNS) {
        const hit = pattern.exec(ctx);
        if (hit?.[1] && hit[2]) {
            return { from: hit[1].toLowerCase(), to: hit[2].toLowerCase() };
        }
    }
    return undefined;
}
1081
// ---------------------------------------------------------------------------
// Diacritic normalization
// ---------------------------------------------------------------------------
/**
 * Remove diacritics for matching purposes while keeping letter case:
 * "Tasić" → "Tasic", "Čačak" → "Cacak", "Đorđe" → "Dorde".
 *
 * Works by NFD-decomposing the string and dropping combining marks in
 * U+0300–U+036F; đ/Đ are mapped to d/D explicitly afterwards. Letters
 * without a decomposition are left untouched ("Новосибирск" is unchanged),
 * but any letter that decomposes into base + mark in that range loses the
 * mark, in any script.
 *
 * @param {string} str Input text.
 * @returns {string} Text with diacritics removed.
 */
function normalizeDiacritics(str) {
    const decomposed = str.normalize('NFD');
    const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, '');
    const lowerStrokeMapped = withoutMarks.replace(/đ/g, 'd');
    return lowerStrokeMapped.replace(/Đ/g, 'D');
}
1092
// ---------------------------------------------------------------------------
// Domain nouns — semantically rich count context
// ---------------------------------------------------------------------------
/**
 * Lookup from a lowercase domain noun (in any of its listed inflections) to
 * a canonical metric name. When a count such as "23 zaposlenih" is
 * extracted, the metric becomes "employee", which lets downstream rules
 * tell employee counts apart from generic numbers.
 *
 * The table is written as [metric, forms] groups and flattened into
 * form → metric entries in the order listed.
 */
const DOMAIN_NOUNS = new Map([
    // Serbian — employee/person/member/worker
    ['employee', ['zaposleni', 'zaposlena', 'zaposlenih', 'zaposlene', 'zaposlen', 'zaposlenog']],
    // NOTE(review): the third form below looks identical to 'osoba' — possibly
    // a duplicate or a look-alike (non-Latin) character; confirm what was intended.
    ['person', ['osoba', 'osobe', 'osobа']],
    ['worker', ['radnik', 'radnika', 'radnici', 'radnice']],
    ['member', ['član', 'člana', 'članovi', 'članova']],
    ['shift', ['smena', 'smene', 'smenu']],
    ['tardiness', ['kašnjenje', 'kašnjenja', 'kasnjenje', 'kasnjenja']],
    ['absence', ['izostanak', 'izostanka', 'izostanaka', 'izostanci']],
    // English
    ['employee', ['employee', 'employees']],
    ['person', ['person', 'people', 'persons']],
    ['worker', ['worker', 'workers']],
    ['member', ['member', 'members']],
    ['shift', ['shift', 'shifts']],
    ['absence', ['absence', 'absences']],
    ['record', ['record', 'records']],
    ['entry', ['entry', 'entries']],
    ['item', ['item', 'items']],
    ['incident', ['incident', 'incidents']],
    // Russian
    ['employee', ['сотрудник', 'сотрудника', 'сотрудников']],
    ['person', ['человек', 'людей']],
].flatMap(([metric, forms]) => forms.map((form) => [form, metric])));
1128
+ /**
1129
+ * Extracts factual claims from a text string.
1130
+ *
1131
+ * Extraction order:
1132
+ * 1. Dates — before numbers so date digits are not re-extracted as numbers
1133
+ * 2. Counts — before numbers so count numerals are not re-extracted as plain numbers
1134
+ * 3. Numbers
1135
+ * 4. Names
1136
+ *
1137
+ * Vague qualitative claims are skipped.
1138
+ */
1139
+ function extractClaims(text, options = {}) {
1140
+ const claims = [];
1141
+ const sourceStepId = options.sourceStepId ?? 'unknown';
1142
+ const sourceRole = options.sourceRole ?? 'final_response';
1143
+ const makeSource = (rawText) => ({
1144
+ stepId: sourceStepId,
1145
+ role: sourceRole,
1146
+ rawText,
1147
+ });
1148
+ // Track character ranges already consumed so we don't double-extract.
1149
+ const consumed = [];
1150
+ const isConsumed = (start, end) => consumed.some(([s, e]) => start < e && end > s);
1151
+ const consume = (start, end) => {
1152
+ consumed.push([start, end]);
1153
+ };
1154
+ // 1. Dates
1155
+ DATE_REGEX.lastIndex = 0;
1156
+ let m;
1157
+ while ((m = DATE_REGEX.exec(text)) !== null) {
1158
+ const raw = m[0];
1159
+ const start = m.index;
1160
+ const end = start + raw.length;
1161
+ const parsed = tryParseDate(raw, options.referenceDate);
1162
+ if (parsed && !isConsumed(start, end)) {
1163
+ // Disambiguation: DD.MM without trailing dot (e.g. "31.6") is ambiguous —
1164
+ // it could be a decimal number. Reject when followed by a word (likely a
1165
+ // unit like "sati", "hours"), accept when followed by a range separator
1166
+ // (—, –, -), punctuation, or end of text.
1167
+ if (/^\d{1,2}\.\d{1,2}$/.test(raw)) {
1168
+ const afterText = text.substring(end);
1169
+ // If followed by whitespace then a letter → likely decimal + unit, skip
1170
+ if (/^\s+[a-zA-Z\u00C0-\u024F]/.test(afterText)) {
1171
+ continue;
1172
+ }
1173
+ }
1174
+ consume(start, end);
1175
+ const claim = {
1176
+ claimId: (0, crypto_1.randomUUID)(),
1177
+ type: 'date',
1178
+ value: parsed,
1179
+ rawText: raw,
1180
+ source: makeSource(raw),
1181
+ };
1182
+ // -----------------------------------------------------------------------
1183
+ // Date range context: consume adjacent range numbers so they aren't
1184
+ // extracted as separate number claims.
1185
+ //
1186
+ // Patterns (before a month-year match like "mart 2026"):
1187
+ // "1-21. mart 2026" → range 2026-03-01 to 2026-03-21
1188
+ // "10-20. marta 2026" → range 2026-03-10 to 2026-03-20
1189
+ // "1. - 21. mart 2026" → range
1190
+ // "od 1. do 21. maart" → range
1191
+ // -----------------------------------------------------------------------
1192
+ const beforeCtx = text.substring(Math.max(0, start - 40), start);
1193
+ const rangePrefix = beforeCtx.match(/(\d{1,2})\s*[\.\s]*[-–—]\s*(\d{1,2})\s*[\.\s]*$/);
1194
+ if (rangePrefix && parsed.length >= 7) {
1195
+ // Consume the range prefix chars so numbers aren't re-extracted
1196
+ const prefixLen = rangePrefix[0].length;
1197
+ const prefixStart = start - prefixLen;
1198
+ consume(prefixStart, start);
1199
+ const yearMonth = parsed.substring(0, 8); // "YYYY-MM-"
1200
+ const rangeStartDay = rangePrefix[1];
1201
+ const rangeEndDay = rangePrefix[2];
1202
+ // Range start date
1203
+ const startDateISO = yearMonth + rangeStartDay.padStart(2, '0');
1204
+ const startDateObj = new Date(startDateISO + 'T00:00:00Z');
1205
+ if (!isNaN(startDateObj.getTime())) {
1206
+ claims.push({
1207
+ claimId: (0, crypto_1.randomUUID)(),
1208
+ type: 'date',
1209
+ value: startDateISO,
1210
+ rawText: rangePrefix[0].trim() + ' ' + raw,
1211
+ source: makeSource(rangePrefix[0].trim() + ' ' + raw),
1212
+ });
1213
+ }
1214
+ // Range end date (overrides the original month-only claim)
1215
+ const endDateISO = yearMonth + rangeEndDay.padStart(2, '0');
1216
+ const endDateObj = new Date(endDateISO + 'T00:00:00Z');
1217
+ if (!isNaN(endDateObj.getTime())) {
1218
+ // Replace the original claim value (which was month-start) with range-end
1219
+ claim.value = endDateISO;
1220
+ claim.rawText = rangePrefix[0].trim() + ' ' + raw;
1221
+ }
1222
+ }
1223
+ else {
1224
+ // Also check for single-number prefix: "15. mart 2026" or "od 15. mart 2026"
1225
+ // Only applies to month-name dates (e.g. "mart 2026") where the prefix provides
1226
+ // the missing day. Skip when the raw match already contains a numeric day
1227
+ // (e.g. "02.12.2024") to avoid "1. 02.12.2024" → "2024-12-01".
1228
+ const rawAlreadyHasDay = /\d{1,2}[.\/\-]\d{1,2}[.\/\-]?\d{0,4}/.test(raw);
1229
+ const singlePrefix = beforeCtx.match(/(\d{1,2})\s*\.\s*$/);
1230
+ if (singlePrefix && parsed.length >= 7 && !rawAlreadyHasDay) {
1231
+ const prefixLen = singlePrefix[0].length;
1232
+ const prefixStart = start - prefixLen;
1233
+ consume(prefixStart, start);
1234
+ const yearMonth = parsed.substring(0, 8);
1235
+ const dayISO = yearMonth + singlePrefix[1].padStart(2, '0');
1236
+ const dayObj = new Date(dayISO + 'T00:00:00Z');
1237
+ if (!isNaN(dayObj.getTime())) {
1238
+ claim.value = dayISO;
1239
+ claim.rawText = singlePrefix[0].trim() + ' ' + raw;
1240
+ }
1241
+ // Check for "od X. do Y." pattern before the single prefix:
1242
+ // e.g. "od 10. do 20. marta" → also consume "10."
1243
+ const beforeSingle = text.substring(Math.max(0, prefixStart - 30), prefixStart);
1244
+ const odPrefix = beforeSingle.match(/(?:od|from|von|de|с)\s+(\d{1,2})\s*\.?\s*(?:do|to|bis|à|до)\s*$/i);
1245
+ if (odPrefix) {
1246
+ const odLen = odPrefix[0].length;
1247
+ const odStart = prefixStart - odLen;
1248
+ consume(odStart, prefixStart);
1249
+ // Extract range start date
1250
+ const rangeStartISO = yearMonth + odPrefix[1].padStart(2, '0');
1251
+ const rangeObj = new Date(rangeStartISO + 'T00:00:00Z');
1252
+ if (!isNaN(rangeObj.getTime())) {
1253
+ claims.push({
1254
+ claimId: (0, crypto_1.randomUUID)(),
1255
+ type: 'date',
1256
+ value: rangeStartISO,
1257
+ rawText: odPrefix[0].trim() + ' ' + singlePrefix[0].trim() + ' ' + raw,
1258
+ source: makeSource(odPrefix[0].trim() + ' ' + singlePrefix[0].trim() + ' ' + raw),
1259
+ });
1260
+ }
1261
+ }
1262
+ }
1263
+ }
1264
+ claims.push(claim);
1265
+ }
1266
+ }
1267
+ // 1b. Fractions (e.g. "3/4", "1/2") — must run before counts and generic numbers
1268
+ FRACTION_REGEX.lastIndex = 0;
1269
+ while ((m = FRACTION_REGEX.exec(text)) !== null) {
1270
+ const raw = m[0];
1271
+ const start = m.index;
1272
+ const end = start + raw.length;
1273
+ if (isConsumed(start, end))
1274
+ continue;
1275
+ const numerator = parseInt(m[1], 10);
1276
+ const denominator = parseInt(m[2], 10);
1277
+ if (denominator === 0 || numerator > 1000 || denominator > 1000)
1278
+ continue;
1279
+ // Skip date-like patterns (month/day or day/month): both parts ≤ 31
1280
+ if (numerator >= 1 && numerator <= 12 && denominator >= 1 && denominator <= 31) {
1281
+ // Check for surrounding date context (year nearby)
1282
+ const ctx = text.substring(Math.max(0, start - 15), Math.min(text.length, end + 15));
1283
+ if (/\b(19|20)\d{2}\b/.test(ctx))
1284
+ continue;
1285
+ }
1286
+ const decimalValue = numerator / denominator;
1287
+ consume(start, end);
1288
+ claims.push({
1289
+ claimId: (0, crypto_1.randomUUID)(),
1290
+ type: 'number',
1291
+ value: decimalValue,
1292
+ rawText: raw,
1293
+ source: makeSource(raw),
1294
+ unit: detectUnitAfterNumber(text, end),
1295
+ entity: findNearestEntity(text, start),
1296
+ });
1297
+ }
1298
+ // 1c. Numeric ranges: "od 10 do 20 sati", "between 5 and 10 hours",
1299
+ // "от 10 до 20 часов", "de 5 a 10 horas"
1300
+ // Extracts two endpoint claims + marks both as part of a range.
1301
+ {
1302
+ const RANGE_PATTERN = /\b(?:od|between|от|de|tra|entre)\s+(-?\d+(?:[.,]\d+)?)\s+(?:do|and|to|до|a|e)\s+(-?\d+(?:[.,]\d+)?)\s+([\p{L}]+)/giu;
1303
+ let rm;
1304
+ while ((rm = RANGE_PATTERN.exec(text)) !== null) {
1305
+ const fullStart = rm.index;
1306
+ const fullEnd = fullStart + rm[0].length;
1307
+ if (isConsumed(fullStart, fullEnd))
1308
+ continue;
1309
+ const lo = parseFloat(rm[1].replace(',', '.'));
1310
+ const hi = parseFloat(rm[2].replace(',', '.'));
1311
+ const unitWord = rm[3].toLowerCase();
1312
+ if (isNaN(lo) || isNaN(hi))
1313
+ continue;
1314
+ consume(fullStart, fullEnd);
1315
+ const unit = UNIT_WORDS.has(unitWord) ? unitWord : undefined;
1316
+ const entity = findNearestEntity(text, fullStart);
1317
+ claims.push({
1318
+ claimId: (0, crypto_1.randomUUID)(),
1319
+ type: 'number',
1320
+ value: lo,
1321
+ rawText: rm[0],
1322
+ source: makeSource(rm[0]),
1323
+ unit,
1324
+ entity,
1325
+ });
1326
+ claims.push({
1327
+ claimId: (0, crypto_1.randomUUID)(),
1328
+ type: 'number',
1329
+ value: hi,
1330
+ rawText: rm[0],
1331
+ source: makeSource(rm[0]),
1332
+ unit,
1333
+ entity,
1334
+ });
1335
+ }
1336
+ }
1337
+ // 1d. Percentage-of patterns: "12.5% od ukupnog", "30% of total", "15% от общего"
1338
+ // Extracted as aggregation claims with pct_of_total op.
1339
+ {
1340
+ const PCT_OF_PATTERN = /(-?\d+(?:[.,]\d+)?)\s*%\s*(?:od|of|от|de|du|des|do)\s+([\p{L}]+(?:\s+[\p{L}]+)?)/giu;
1341
+ let pm;
1342
+ while ((pm = PCT_OF_PATTERN.exec(text)) !== null) {
1343
+ const fullStart = pm.index;
1344
+ const fullEnd = fullStart + pm[0].length;
1345
+ if (isConsumed(fullStart, fullEnd))
1346
+ continue;
1347
+ const pctValue = parseFloat(pm[1].replace(',', '.'));
1348
+ if (isNaN(pctValue))
1349
+ continue;
1350
+ consume(fullStart, fullEnd);
1351
+ claims.push({
1352
+ claimId: (0, crypto_1.randomUUID)(),
1353
+ type: 'number',
1354
+ value: pctValue,
1355
+ rawText: pm[0],
1356
+ source: makeSource(pm[0]),
1357
+ unit: 'percent',
1358
+ entity: findNearestEntity(text, fullStart),
1359
+ aggregationOp: 'pct_of_total',
1360
+ });
1361
+ }
1362
+ }
1363
+ // 2. Counts (digit or word number + noun)
1364
+ // Compound time expressions: "7 sati 22 minuta" / "7 hours and 24 minutes"
1365
+ // Convert to a single claim in minutes (the smallest meaningful unit).
1366
+ // E.g. "7 sati 22 minuta" → 442 minutes, "2h 30min" → 150 minutes.
1367
+ // Also handles optional seconds component.
1368
+ {
1369
+ const TIME_EXPANSION = /\b(\d{1,2})\s+(?:sat[ia]?|hours?|heure[s]?|hora[s]?|час[а-я]*)[\s,]+(?:i|and|et|y|и)?\s*(\d{1,2})\s+(?:minut[aei]?|minutes?|мин[а-я]*)(?:[\s,]+(?:i|and|et|y|и)?\s*(\d{1,2})\s+(?:sekund[aei]?|seconds?|сек[а-я]*))?\b/gi;
1370
+ let te;
1371
+ while ((te = TIME_EXPANSION.exec(text)) !== null) {
1372
+ consume(te.index, te.index + te[0].length);
1373
+ const hours = parseInt(te[1], 10);
1374
+ const mins = parseInt(te[2], 10);
1375
+ const totalMinutes = hours * 60 + mins;
1376
+ claims.push({
1377
+ claimId: (0, crypto_1.randomUUID)(),
1378
+ type: 'number',
1379
+ value: totalMinutes,
1380
+ rawText: te[0],
1381
+ source: makeSource(te[0]),
1382
+ unit: 'minutes',
1383
+ entity: findNearestEntity(text, te.index),
1384
+ });
1385
+ }
1386
+ }
1387
+ // Also match abbreviated compound time: "7h 22min", "7h22m", "3h 15min"
1388
+ {
1389
+ const COMPOUND_TIME_ABBREV = /\b(\d{1,3})\s*h(?:rs?)?\s*(?:i\s+)?(\d{1,2})\s*min(?:ut[aei]?)?\b/gi;
1390
+ let ta;
1391
+ while ((ta = COMPOUND_TIME_ABBREV.exec(text)) !== null) {
1392
+ if (isConsumed(ta.index, ta.index + ta[0].length))
1393
+ continue;
1394
+ consume(ta.index, ta.index + ta[0].length);
1395
+ const hours = parseInt(ta[1], 10);
1396
+ const mins = parseInt(ta[2], 10);
1397
+ const totalMinutes = hours * 60 + mins;
1398
+ claims.push({
1399
+ claimId: (0, crypto_1.randomUUID)(),
1400
+ type: 'number',
1401
+ value: totalMinutes,
1402
+ rawText: ta[0],
1403
+ source: makeSource(ta[0]),
1404
+ unit: 'minutes',
1405
+ entity: findNearestEntity(text, ta.index),
1406
+ });
1407
+ }
1408
+ }
1409
+ COUNT_REGEX.lastIndex = 0;
1410
+ while ((m = COUNT_REGEX.exec(text)) !== null) {
1411
+ const raw = m[0];
1412
+ const numPart = m[1];
1413
+ const nounPart = m[2];
1414
+ const start = m.index;
1415
+ const end = start + raw.length;
1416
+ if (isConsumed(start, end))
1417
+ continue;
1418
+ if (containsVagueQualifier(text.substring(Math.max(0, start - 30), end + 30)))
1419
+ continue;
1420
+ // Skip time components: "24 ujutru" from "7:24 ujutru" is a time, not a count
1421
+ {
1422
+ const timeCtxStart = Math.max(0, start - 10);
1423
+ const timeCtx = text.substring(timeCtxStart, Math.min(text.length, end + 10));
1424
+ const localOffset = start - timeCtxStart;
1425
+ const TIME_PAT = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?\b/g;
1426
+ let tm;
1427
+ let insideTime = false;
1428
+ while ((tm = TIME_PAT.exec(timeCtx)) !== null) {
1429
+ const tStart = tm.index;
1430
+ const tEnd = tStart + tm[0].length;
1431
+ // Check if the digit portion of the count sits inside a time pattern
1432
+ if (localOffset >= tStart && localOffset + numPart.length <= tEnd) {
1433
+ consume(start - (localOffset - tStart), start - (localOffset - tStart) + tm[0].length);
1434
+ insideTime = true;
1435
+ break;
1436
+ }
1437
+ }
1438
+ if (insideTime)
1439
+ continue;
1440
+ }
1441
+ // Unit words — measurements like "36 hours" or "5 degrees"
1442
+ // Extract as number claim WITH unit metadata (not count)
1443
+ if (UNIT_WORDS.has(nounPart.toLowerCase())) {
1444
+ const numericValue = WORD_NUMBERS[numPart.toLowerCase()] !== undefined
1445
+ ? WORD_NUMBERS[numPart.toLowerCase()]
1446
+ : parseFloat(numPart.replace(/[,\s]/g, ''));
1447
+ if (isNaN(numericValue))
1448
+ continue;
1449
+ consume(start, end);
1450
+ const claim = {
1451
+ claimId: (0, crypto_1.randomUUID)(),
1452
+ type: 'number',
1453
+ value: numericValue,
1454
+ rawText: raw,
1455
+ source: makeSource(raw),
1456
+ unit: nounPart.toLowerCase(),
1457
+ entity: findNearestEntity(text, start),
1458
+ aggregationOp: detectAggregationQualifier(text, start),
1459
+ ...(detectNegation(text, start) ? { negated: true } : {}),
1460
+ ...(detectComparisonOp(text, start) ? { comparisonOp: detectComparisonOp(text, start) } : {}),
1461
+ ...(detectChangeDirection(text, start) ? { changeDirection: detectChangeDirection(text, start) } : {}),
1462
+ };
1463
+ claims.push(claim);
1464
+ continue;
1465
+ }
1466
+ // Skip ordered list markers: "1. " at line start (numPart ends before ".")
1467
+ const numPartEnd = start + numPart.length;
1468
+ if (isListMarker(text, start, numPartEnd))
1469
+ continue;
1470
+ const numericValue = WORD_NUMBERS[numPart.toLowerCase()] !== undefined
1471
+ ? WORD_NUMBERS[numPart.toLowerCase()]
1472
+ : parseFloat(numPart.replace(/[,\s]/g, ''));
1473
+ if (isNaN(numericValue))
1474
+ continue;
1475
+ // Skip 4-digit years (1900–2099) when adjacent to a month name.
1476
+ // E.g. "februar 2026\n\nSlavica" → COUNT_REGEX matches "2026 Slavica"
1477
+ // across the newline, but 2026 is a year, not a count of "Slavica".
1478
+ if (/^\d{4}$/.test(numPart) && numericValue >= 1900 && numericValue <= 2099) {
1479
+ const MONTH_PAT = /(?:january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|august|aug|september|sep|sept|october|oct|november|nov|december|dec|januar|januara|februar|februara|mart|marta|maj|maja|jun[ia]|jul[ia]|avgust|avgusta|septembar|septembra|oktobar|oktobra|novembar|novembra|decembar|decembra)\b/i;
1480
+ const beforeCtx = text.substring(Math.max(0, start - 25), start);
1481
+ if (MONTH_PAT.test(beforeCtx)) {
1482
+ consume(start, end);
1483
+ continue;
1484
+ }
1485
+ }
1486
+ consume(start, end);
1487
+ const domainMetric = DOMAIN_NOUNS.get(nounPart.toLowerCase());
1488
+ const claim = {
1489
+ claimId: (0, crypto_1.randomUUID)(),
1490
+ type: 'count',
1491
+ value: numericValue,
1492
+ rawText: raw,
1493
+ source: makeSource(raw),
1494
+ entity: findNearestEntity(text, start),
1495
+ aggregationOp: detectAggregationQualifier(text, start),
1496
+ ...(domainMetric ? { metric: domainMetric } : {}),
1497
+ ...(detectNegation(text, start) ? { negated: true } : {}),
1498
+ ...(detectComparisonOp(text, start) ? { comparisonOp: detectComparisonOp(text, start) } : {}),
1499
+ ...(detectChangeDirection(text, start) ? { changeDirection: detectChangeDirection(text, start) } : {}),
1500
+ };
1501
+ claims.push(claim);
1502
+ }
1503
+ // 3b. Generic numbers (not already consumed as date, count, or fraction)
1504
+ NUMBER_REGEX.lastIndex = 0;
1505
+ while ((m = NUMBER_REGEX.exec(text)) !== null) {
1506
+ const raw = m[0];
1507
+ const start = m.index;
1508
+ const end = start + raw.length;
1509
+ if (isConsumed(start, end))
1510
+ continue;
1511
+ if (containsVagueQualifier(text.substring(Math.max(0, start - 30), end + 30)))
1512
+ continue;
1513
+ // Skip numbers that are part of hyphenated compound words:
1514
+ // e.g. "24-satni" (24-hour), "3-mesečni" (3-month)
1515
+ if (text[end] === '-' && /^[a-zA-Z\u00C0-\u024F]/.test(text[end + 1] || '')) {
1516
+ consume(start, end);
1517
+ continue;
1518
+ }
1519
+ // Skip ordered list markers: "1. ", "2. " at line start
1520
+ if (isListMarker(text, start, end)) {
1521
+ consume(start, end);
1522
+ continue;
1523
+ }
1524
+ const numericValue = DOT_THOUSANDS_RE.test(raw)
1525
+ ? parseFloat(raw.replace(/\./g, '')) // European thousands: "4.496" → 4496
1526
+ : parseFloat(raw.replace(/[,\s]/g, '')); // US/standard thousands: "4,496" → 4496
1527
+ if (isNaN(numericValue))
1528
+ continue;
1529
+ // Skip standalone 4-digit values in the year range (1900-2099).
1530
+ // These are almost always calendar years, not meaningful data claims.
1531
+ // Also catch years adjacent to month names (e.g. "februar 2026") that
1532
+ // the count extractor might otherwise pick up.
1533
+ if (/^\d{4}$/.test(raw)) {
1534
+ const yr = numericValue;
1535
+ if (yr >= 1900 && yr <= 2099) {
1536
+ consume(start, end);
1537
+ continue;
1538
+ }
1539
+ }
1540
+ // Skip European date fragments: DD.MM or DD.MM.YYYY
1541
+ // E.g. "03.12" (3rd December), "10.12.2024" — not numeric claims.
1542
+ // But NOT when followed by a unit word (e.g. "23.1 minuta" is a decimal, not a date).
1543
+ if (/^\d{1,2}\.\d{1,2}(\.\d{2,4})?$/.test(raw)) {
1544
+ const parts = raw.split('.');
1545
+ const day = parseInt(parts[0], 10);
1546
+ const month = parseInt(parts[1], 10);
1547
+ const afterNum = text.substring(end);
1548
+ const followedByUnit = /^\s+[a-zA-Z\u00C0-\u024F]/.test(afterNum);
1549
+ if (day >= 1 && day <= 31 && month >= 1 && month <= 12 && !followedByUnit) {
1550
+ consume(start, end);
1551
+ continue;
1552
+ }
1553
+ }
1554
+ // Skip day-of-month numbers (1-31) adjacent to month names.
1555
+ // E.g. "Subota, 15. mart" or "March 15" — date component, not a data claim.
1556
+ if (numericValue >= 1 && numericValue <= 31 && /^\d{1,2}$/.test(raw)) {
1557
+ const monthPat = /(?:january|jan|february|feb|march|mar|april|apr|may|june|jun|july|jul|august|aug|september|sep|sept|october|oct|november|nov|december|dec|januar|januara|februar|februara|mart|marta|maj|maja|avgust|avgusta|septembar|septembra|oktobar|oktobra|novembar|novembra|decembar|decembra)\b/i;
1558
+ const afterCtx = text.substring(end, Math.min(text.length, end + 30));
1559
+ const beforeCtx = text.substring(Math.max(0, start - 30), start);
1560
+ if (new RegExp('^[\\s.*_]*\\.?\\s*' + monthPat.source, 'i').test(afterCtx) ||
1561
+ new RegExp(monthPat.source + '\\s*$', 'i').test(beforeCtx)) {
1562
+ consume(start, end);
1563
+ continue;
1564
+ }
1565
+ }
1566
+ // Skip time components: numbers inside HH:MM:SS or HH:MM patterns.
1567
+ // E.g. "07:24:02" should not produce claims for 7, 24, 2.
1568
+ // Check surrounding context for colon-separated digit groups.
1569
+ {
1570
+ const timeCtxStart = Math.max(0, start - 10);
1571
+ const timeCtx = text.substring(timeCtxStart, Math.min(text.length, end + 10));
1572
+ const localOffset = start - timeCtxStart;
1573
+ const TIME_PATTERN = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?\b/g;
1574
+ let tm;
1575
+ let insideTime = false;
1576
+ while ((tm = TIME_PATTERN.exec(timeCtx)) !== null) {
1577
+ const tStart = tm.index;
1578
+ const tEnd = tStart + tm[0].length;
1579
+ if (localOffset >= tStart && localOffset + raw.length <= tEnd) {
1580
+ // Consume the entire timestamp so other components are skipped too
1581
+ consume(start - (localOffset - tStart), start - (localOffset - tStart) + tm[0].length);
1582
+ insideTime = true;
1583
+ break;
1584
+ }
1585
+ }
1586
+ if (insideTime)
1587
+ continue;
1588
+ }
1589
+ consume(start, end);
1590
+ const unitAfter = detectUnitAfterNumber(text, end);
1591
+ const claim = {
1592
+ claimId: (0, crypto_1.randomUUID)(),
1593
+ type: 'number',
1594
+ value: numericValue,
1595
+ rawText: raw,
1596
+ source: makeSource(raw),
1597
+ unit: unitAfter ?? detectCurrencyBeforeNumber(text, start),
1598
+ entity: findNearestEntity(text, start),
1599
+ aggregationOp: detectAggregationQualifier(text, start),
1600
+ ...(detectNegation(text, start) ? { negated: true } : {}),
1601
+ ...(detectComparisonOp(text, start) ? { comparisonOp: detectComparisonOp(text, start) } : {}),
1602
+ ...(detectChangeDirection(text, start) ? { changeDirection: detectChangeDirection(text, start) } : {}),
1603
+ };
1604
+ claims.push(claim);
1605
+ }
1606
+ // 3c. Status transitions: "sa X na Y", "from X to Y", "из X в Y"
1607
+ // Creates a synthetic name claim with transitionFrom / transitionTo metadata.
1608
+ {
1609
+ const TRANS_REGEX = /\b(?:sa|iz|od|from|из|von)\s+([\p{L}\p{N}_]+)\s+(?:na|u|do|to|in|в|auf|nach)\s+([\p{L}\p{N}_]+)\b/giu;
1610
+ let tr;
1611
+ while ((tr = TRANS_REGEX.exec(text)) !== null) {
1612
+ const fullStart = tr.index;
1613
+ const fullEnd = fullStart + tr[0].length;
1614
+ if (isConsumed(fullStart, fullEnd))
1615
+ continue;
1616
+ consume(fullStart, fullEnd);
1617
+ claims.push({
1618
+ claimId: (0, crypto_1.randomUUID)(),
1619
+ type: 'name',
1620
+ value: tr[1] + ' → ' + tr[2],
1621
+ rawText: tr[0],
1622
+ source: makeSource(tr[0]),
1623
+ transitionFrom: tr[1],
1624
+ transitionTo: tr[2],
1625
+ });
1626
+ }
1627
+ }
1628
+ // 4. Names (proper nouns — 2+ Title-Cased words)
1629
+ NAME_REGEX.lastIndex = 0;
1630
+ while ((m = NAME_REGEX.exec(text)) !== null) {
1631
+ const raw = m[0];
1632
+ const namePart = m[1].trim();
1633
+ const start = m.index;
1634
+ const end = start + raw.length;
1635
+ if (isConsumed(start, end))
1636
+ continue;
1637
+ // Skip matches that span line boundaries (cosmetic title-casing across lines)
1638
+ if (raw.includes('\n') || raw.includes('\r'))
1639
+ continue;
1640
+ // Skip section headers / label phrases (all words are common English words)
1641
+ if (isHeadingPhrase(namePart))
1642
+ continue;
1643
+ // Strip sentence-starting discourse markers: "Dakle Marina" → "Marina"
1644
+ const nameWords = namePart.split(/\s+/);
1645
+ let cleanedName = namePart;
1646
+ if (nameWords.length >= 2 && SENTENCE_STARTERS.has(nameWords[0].toLowerCase())) {
1647
+ const remainder = nameWords.slice(1).join(' ');
1648
+ // Only keep if the remainder still has at least two words (a single-word remainder doesn't qualify as a name here)
1649
+ if (nameWords.length === 2)
1650
+ continue; // "Dakle Marina" → single word "Marina" — not a name claim
1651
+ cleanedName = remainder;
1652
+ }
1653
+ consume(start, end);
1654
+ const nameRank = detectRank(text, start);
1655
+ const transition = detectTransition(text, start, end);
1656
+ const claim = {
1657
+ claimId: (0, crypto_1.randomUUID)(),
1658
+ type: 'name',
1659
+ value: cleanedName,
1660
+ rawText: raw,
1661
+ source: makeSource(raw),
1662
+ ...(nameRank !== undefined ? { rank: nameRank } : {}),
1663
+ ...(transition ? { transitionFrom: transition.from, transitionTo: transition.to } : {}),
1664
+ };
1665
+ claims.push(claim);
1666
+ }
1667
+ return claims;
1668
+ }
1669
+ //# sourceMappingURL=index.js.map