elementary-assertions 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/CHANGELOG.md +353 -0
  2. package/LICENSE +21 -0
  3. package/README.md +211 -0
  4. package/bin/elementary-assertions.js +8 -0
  5. package/docs/DEV_TOOLING.md +98 -0
  6. package/docs/NPM_RELEASE.md +177 -0
  7. package/docs/OPERATIONAL.md +159 -0
  8. package/docs/RELEASE_NOTES_TEMPLATE.md +37 -0
  9. package/docs/REPO_WORKFLOWS.md +48 -0
  10. package/package.json +46 -0
  11. package/src/core/accepted-annotations.js +44 -0
  12. package/src/core/assertions.js +2304 -0
  13. package/src/core/determinism.js +95 -0
  14. package/src/core/diagnostics.js +496 -0
  15. package/src/core/ids.js +9 -0
  16. package/src/core/mention-builder.js +272 -0
  17. package/src/core/mention-evidence.js +52 -0
  18. package/src/core/mention-head-resolution.js +108 -0
  19. package/src/core/mention-materialization.js +31 -0
  20. package/src/core/mentions.js +149 -0
  21. package/src/core/output.js +296 -0
  22. package/src/core/projection.js +192 -0
  23. package/src/core/roles.js +164 -0
  24. package/src/core/strings.js +7 -0
  25. package/src/core/tokens.js +53 -0
  26. package/src/core/upstream.js +31 -0
  27. package/src/index.js +6 -0
  28. package/src/render/index.js +5 -0
  29. package/src/render/layouts/compact.js +10 -0
  30. package/src/render/layouts/meaning.js +7 -0
  31. package/src/render/layouts/readable.js +7 -0
  32. package/src/render/layouts/table.js +7 -0
  33. package/src/render/render.js +931 -0
  34. package/src/run.js +278 -0
  35. package/src/schema/seed.elementary-assertions.schema.json +1751 -0
  36. package/src/tools/cli.js +158 -0
  37. package/src/tools/index.js +6 -0
  38. package/src/tools/io.js +55 -0
  39. package/src/validate/ajv.js +20 -0
  40. package/src/validate/coverage.js +215 -0
  41. package/src/validate/determinism.js +115 -0
  42. package/src/validate/diagnostics-strict.js +392 -0
  43. package/src/validate/errors.js +19 -0
  44. package/src/validate/index.js +20 -0
  45. package/src/validate/integrity.js +41 -0
  46. package/src/validate/invariants.js +157 -0
  47. package/src/validate/references.js +110 -0
  48. package/src/validate/schema.js +50 -0
package/src/core/mention-builder.js
@@ -0,0 +1,272 @@
+ const { findSelector, normalizeSpanKey, normalizeIds } = require("./determinism");
+ const { getMweHeadEvidence, getMweLexiconEvidence } = require("./mention-materialization");
+ const {
+   buildChunkHeadMaps,
+   buildDependencyObservationMaps,
+   resolveMentionHead,
+ } = require("./mention-head-resolution");
+ const { buildMentionLexiconEvidence } = require("./mention-evidence");
+
+ function mentionSortKey(mention) {
+   return `${mention.segment_id}|${String(mention.span.start).padStart(8, "0")}|${String(mention.span.end).padStart(8, "0")}|${mention.kind}|${mention.id}`;
+ }
+
+ function buildMentions({ relationsSeed, mweSeed, headsSeed, tokenById, tokenWikiById }) {
+   const annotations = Array.isArray(mweSeed?.annotations) ? mweSeed.annotations : [];
+   const mweCandidates = [];
+   for (const annotation of annotations) {
+     if (!annotation || annotation.kind !== "mwe" || annotation.status !== "accepted") continue;
+     const tokenSelector = findSelector(annotation, "TokenSelector");
+     const textPos = findSelector(annotation, "TextPositionSelector");
+     if (!tokenSelector || !Array.isArray(tokenSelector.token_ids) || tokenSelector.token_ids.length === 0) continue;
+     if (!textPos || !textPos.span || typeof textPos.span.start !== "number" || typeof textPos.span.end !== "number") continue;
+     const ids = normalizeIds(tokenSelector.token_ids);
+     const missing = ids.some((id) => !tokenById.has(id));
+     if (missing) continue;
+     const segmentId = tokenById.get(ids[0]).segment_id;
+     if (!ids.every((id) => tokenById.get(id).segment_id === segmentId)) continue;
+     mweCandidates.push({
+       annotation_id: typeof annotation.id === "string" ? annotation.id : "",
+       token_ids: ids,
+       span: { start: textPos.span.start, end: textPos.span.end },
+       segment_id: segmentId,
+       explicit_head: getMweHeadEvidence(annotation),
+       lexicon_evidence: getMweLexiconEvidence(annotation),
+     });
+   }
+
+   mweCandidates.sort((a, b) => {
+     if (b.token_ids.length !== a.token_ids.length) return b.token_ids.length - a.token_ids.length;
+     const al = a.span.end - a.span.start;
+     const bl = b.span.end - b.span.start;
+     if (bl !== al) return bl - al;
+     if (a.span.start !== b.span.start) return a.span.start - b.span.start;
+     return a.annotation_id.localeCompare(b.annotation_id);
+   });
+
+   const claimed = new Set();
+   const winners = [];
+   const alternatives = [];
+   for (const candidate of mweCandidates) {
+     if (candidate.token_ids.some((id) => claimed.has(id))) {
+       alternatives.push(candidate);
+       continue;
+     }
+     winners.push(candidate);
+     for (const id of candidate.token_ids) claimed.add(id);
+   }
+
+   const { chunkById, headByChunkId } = buildChunkHeadMaps(headsSeed);
+   const { incomingInside } = buildDependencyObservationMaps(relationsSeed, tokenById);
+
+   const mentions = [];
+   const unresolvedHeadMap = new Map();
+
+   for (const winner of winners) {
+     const head = resolveMentionHead({
+       tokenIds: winner.token_ids,
+       explicitHead: winner.explicit_head,
+       chunkById,
+       headByChunkId,
+       incomingInsideMap: incomingInside,
+       tokenById,
+       findSelector,
+     });
+     const mentionLexiconEvidence = buildMentionLexiconEvidence({
+       tokenIds: winner.token_ids,
+       tokenWikiById,
+       mweLexiconEvidence: winner.lexicon_evidence,
+     });
+     const idBase = `m:${winner.segment_id}:${normalizeSpanKey(winner.span)}:mwe`;
+     mentions.push({
+       id: idBase,
+       kind: "mwe",
+       priority: 0,
+       token_ids: winner.token_ids,
+       head_token_id: head.head,
+       span: winner.span,
+       segment_id: winner.segment_id,
+       is_primary: true,
+       provenance: {
+         source_annotation_id: winner.annotation_id || undefined,
+         source_kind: "mwe_materialized",
+         head_strategy: head.strategy,
+         lexicon_source: mentionLexiconEvidence ? "wikipedia-title-index" : undefined,
+         lexicon_evidence: mentionLexiconEvidence || undefined,
+       },
+     });
+     if (head.unresolved) unresolvedHeadMap.set(idBase, head.unresolved);
+   }
+
+   for (const winner of alternatives) {
+     const head = resolveMentionHead({
+       tokenIds: winner.token_ids,
+       explicitHead: winner.explicit_head,
+       chunkById,
+       headByChunkId,
+       incomingInsideMap: incomingInside,
+       tokenById,
+       findSelector,
+     });
+     const mentionLexiconEvidence = buildMentionLexiconEvidence({
+       tokenIds: winner.token_ids,
+       tokenWikiById,
+       mweLexiconEvidence: winner.lexicon_evidence,
+     });
+     const idBase = `m:${winner.segment_id}:${normalizeSpanKey(winner.span)}:mwe_alt`;
+     mentions.push({
+       id: idBase,
+       kind: "mwe",
+       priority: 2,
+       token_ids: winner.token_ids,
+       head_token_id: head.head,
+       span: winner.span,
+       segment_id: winner.segment_id,
+       is_primary: false,
+       provenance: {
+         source_annotation_id: winner.annotation_id || undefined,
+         source_kind: "mwe_alternative",
+         head_strategy: head.strategy,
+         lexicon_source: mentionLexiconEvidence ? "wikipedia-title-index" : undefined,
+         lexicon_evidence: mentionLexiconEvidence || undefined,
+       },
+     });
+     if (head.unresolved) unresolvedHeadMap.set(idBase, head.unresolved);
+   }
+
+   const allTokensSorted = Array.from(tokenById.values()).sort((a, b) => a.i - b.i);
+   for (const token of allTokensSorted) {
+     if (claimed.has(token.id)) continue;
+     const mentionLexiconEvidence = buildMentionLexiconEvidence({
+       tokenIds: [token.id],
+       tokenWikiById,
+       mweLexiconEvidence: null,
+     });
+     const idBase = `m:${token.segment_id}:${token.span.start}-${token.span.end}:token`;
+     mentions.push({
+       id: idBase,
+       kind: "token",
+       priority: 1,
+       token_ids: [token.id],
+       head_token_id: token.id,
+       span: { start: token.span.start, end: token.span.end },
+       segment_id: token.segment_id,
+       is_primary: true,
+       provenance: {
+         source_kind: "token_fallback",
+         head_strategy: "explicit",
+         lexicon_source: mentionLexiconEvidence ? "wikipedia-title-index" : undefined,
+         lexicon_evidence: mentionLexiconEvidence || undefined,
+       },
+     });
+   }
+
+   for (const token of allTokensSorted) {
+     if (!claimed.has(token.id)) continue;
+     const mentionLexiconEvidence = buildMentionLexiconEvidence({
+       tokenIds: [token.id],
+       tokenWikiById,
+       mweLexiconEvidence: null,
+     });
+     const idBase = `m:${token.segment_id}:${token.span.start}-${token.span.end}:token_shadow`;
+     mentions.push({
+       id: idBase,
+       kind: "token",
+       priority: 4,
+       token_ids: [token.id],
+       head_token_id: token.id,
+       span: { start: token.span.start, end: token.span.end },
+       segment_id: token.segment_id,
+       is_primary: false,
+       provenance: {
+         source_kind: "token_shadow",
+         head_strategy: "explicit",
+         lexicon_source: mentionLexiconEvidence ? "wikipedia-title-index" : undefined,
+         lexicon_evidence: mentionLexiconEvidence || undefined,
+       },
+     });
+   }
+
+   for (const [chunkId, chunk] of chunkById.entries()) {
+     const tokenSelector = findSelector(chunk, "TokenSelector");
+     const textPos = findSelector(chunk, "TextPositionSelector");
+     if (!tokenSelector || !Array.isArray(tokenSelector.token_ids) || tokenSelector.token_ids.length === 0) continue;
+     if (!textPos || !textPos.span || typeof textPos.span.start !== "number" || typeof textPos.span.end !== "number") continue;
+     const tokenIds = normalizeIds(tokenSelector.token_ids);
+     if (tokenIds.some((id) => !tokenById.has(id))) continue;
+     const segmentId = tokenById.get(tokenIds[0]).segment_id;
+     if (!tokenIds.every((id) => tokenById.get(id).segment_id === segmentId)) continue;
+     const explicitHead = headByChunkId.get(chunkId) || null;
+     const head = resolveMentionHead({
+       tokenIds,
+       explicitHead,
+       chunkById,
+       headByChunkId,
+       incomingInsideMap: incomingInside,
+       tokenById,
+       findSelector,
+     });
+     const mentionLexiconEvidence = buildMentionLexiconEvidence({
+       tokenIds,
+       tokenWikiById,
+       mweLexiconEvidence: null,
+     });
+     const idBase = `m:${segmentId}:${normalizeSpanKey(textPos.span)}:chunk`;
+     mentions.push({
+       id: idBase,
+       kind: "chunk",
+       priority: 3,
+       token_ids: tokenIds,
+       head_token_id: head.head,
+       span: { start: textPos.span.start, end: textPos.span.end },
+       segment_id: segmentId,
+       is_primary: false,
+       provenance: {
+         source_annotation_id: chunkId,
+         source_kind: "chunk_accepted",
+         head_strategy: head.strategy,
+         lexicon_source: mentionLexiconEvidence ? "wikipedia-title-index" : undefined,
+         lexicon_evidence: mentionLexiconEvidence || undefined,
+       },
+     });
+     if (head.unresolved) unresolvedHeadMap.set(idBase, head.unresolved);
+   }
+
+   mentions.sort((a, b) => mentionSortKey(a).localeCompare(mentionSortKey(b)));
+
+   const baseIdCounts = new Map();
+   const assignedIds = new Set();
+   for (const mention of mentions) {
+     const baseId = mention.id;
+     let n = (baseIdCounts.get(baseId) || 0) + 1;
+     let candidate = n === 1 ? baseId : `${baseId}:${n}`;
+     while (assignedIds.has(candidate)) {
+       n += 1;
+       candidate = `${baseId}:${n}`;
+     }
+     baseIdCounts.set(baseId, n);
+     assignedIds.add(candidate);
+     mention.id = candidate;
+   }
+
+   const tokenToPrimaryMention = new Map();
+   const tokenToAllMentions = new Map();
+   for (const mention of mentions) {
+     if (!mention.is_primary) continue;
+     for (const tokenId of mention.token_ids) tokenToPrimaryMention.set(tokenId, mention.id);
+   }
+   for (const mention of mentions) {
+     for (const tokenId of mention.token_ids) {
+       if (!tokenToAllMentions.has(tokenId)) tokenToAllMentions.set(tokenId, []);
+       tokenToAllMentions.get(tokenId).push(mention.id);
+     }
+   }
+   for (const mentionIds of tokenToAllMentions.values()) mentionIds.sort((a, b) => a.localeCompare(b));
+
+   return { mentions, tokenToPrimaryMention, tokenToAllMentions, unresolvedHeadMap };
+ }
+
+ module.exports = {
+   mentionSortKey,
+   buildMentions,
+ };
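Note: `buildMentions` consumes pre-built indexes rather than raw documents. A minimal sketch of an invocation, with hypothetical data shaped the way the code above reads it (token objects carrying `id`, `i`, `segment_id`, `span`, and `pos.tag`); the seed inputs tolerate being absent:

const { buildMentions } = require("./src/core/mention-builder");

// Hypothetical single-token index; tokenWikiById may be empty.
const tokenById = new Map([
  ["t1", { id: "t1", i: 0, segment_id: "s1", span: { start: 0, end: 5 }, pos: { tag: "NNP" } }],
]);

const { mentions, tokenToPrimaryMention } = buildMentions({
  relationsSeed: null,
  mweSeed: null,
  headsSeed: null,
  tokenById,
  tokenWikiById: new Map(),
});
// With no accepted MWEs or chunks, each token falls through to a primary
// "token_fallback" mention, here with id "m:s1:0-5:token".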
package/src/core/mention-evidence.js
@@ -0,0 +1,52 @@
+ const { deepCloneJson, normalizeIds } = require("./determinism");
+
+ function buildMentionLexiconEvidence({ tokenIds, tokenWikiById, mweLexiconEvidence }) {
+   const tokenEvidence = normalizeIds(tokenIds || [])
+     .filter((tokenId) => tokenWikiById.has(tokenId))
+     .map((tokenId) => ({
+       token_id: tokenId,
+       evidence: deepCloneJson(tokenWikiById.get(tokenId)),
+     }));
+
+   if (tokenEvidence.length === 0 && !mweLexiconEvidence) return null;
+
+   const out = {};
+   if (mweLexiconEvidence) out.mwe = deepCloneJson(mweLexiconEvidence);
+   if (tokenEvidence.length > 0) out.tokens = tokenEvidence;
+   return out;
+ }
+
+ function buildAssertionWikiSignals({ predicateMentionId, relations, mentionById }) {
+   const mentionIds = new Set([predicateMentionId]);
+   for (const relation of relations || []) {
+     if (relation && typeof relation.dep_mention_id === "string") mentionIds.add(relation.dep_mention_id);
+   }
+
+   const mentionEvidence = Array.from(mentionIds)
+     .sort((a, b) => a.localeCompare(b))
+     .map((mentionId) => {
+       const mention = mentionById.get(mentionId);
+       const lexiconEvidence =
+         mention &&
+         mention.provenance &&
+         mention.provenance.lexicon_evidence &&
+         typeof mention.provenance.lexicon_evidence === "object"
+           ? deepCloneJson(mention.provenance.lexicon_evidence)
+           : null;
+       if (!mention || !lexiconEvidence) return null;
+       return {
+         mention_id: mentionId,
+         token_ids: normalizeIds(mention.token_ids || []),
+         evidence: lexiconEvidence,
+       };
+     })
+     .filter(Boolean);
+
+   if (mentionEvidence.length === 0) return null;
+   return { mention_evidence: mentionEvidence };
+ }
+
+ module.exports = {
+   buildMentionLexiconEvidence,
+   buildAssertionWikiSignals,
+ };
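For orientation: `buildAssertionWikiSignals` only collects evidence that the mention builder already attached under `provenance.lexicon_evidence`; mentions without it are silently dropped. A hedged sketch with hypothetical ids and a deliberately minimal evidence payload:

const { buildAssertionWikiSignals } = require("./src/core/mention-evidence");

// Hypothetical mention index: only m1 carries lexicon evidence.
const mentionById = new Map([
  ["m1", { token_ids: ["t1"], provenance: { lexicon_evidence: { tokens: [] } } }],
  ["m2", { token_ids: ["t2"], provenance: {} }],
]);

const signals = buildAssertionWikiSignals({
  predicateMentionId: "m1",
  relations: [{ dep_mention_id: "m2" }],
  mentionById,
});
// => { mention_evidence: [{ mention_id: "m1", token_ids: ["t1"], evidence: { tokens: [] } }] }
// m2 contributes nothing because it has no lexicon_evidence.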
package/src/core/mention-head-resolution.js
@@ -0,0 +1,108 @@
+ function buildChunkHeadMaps(headsSeed) {
+   const chunkById = new Map();
+   const headByChunkId = new Map();
+   if (!headsSeed || !Array.isArray(headsSeed.annotations)) return { chunkById, headByChunkId };
+   for (const annotation of headsSeed.annotations) {
+     if (!annotation || annotation.status !== "accepted") continue;
+     if (annotation.kind === "chunk" && typeof annotation.id === "string") {
+       chunkById.set(annotation.id, annotation);
+     }
+     if (
+       annotation.kind === "chunk_head" &&
+       typeof annotation.chunk_id === "string" &&
+       annotation.head &&
+       typeof annotation.head.id === "string"
+     ) {
+       headByChunkId.set(annotation.chunk_id, annotation.head.id);
+     }
+   }
+   return { chunkById, headByChunkId };
+ }
+
+ function buildDependencyObservationMaps(relationsSeed, tokenById) {
+   const incomingInside = new Map();
+   const outgoingInside = new Map();
+   const annotations = Array.isArray(relationsSeed && relationsSeed.annotations) ? relationsSeed.annotations : [];
+   for (const annotation of annotations) {
+     if (!annotation || annotation.kind !== "dependency" || annotation.status !== "observation") continue;
+     if (!annotation.dep || typeof annotation.dep.id !== "string" || !tokenById.has(annotation.dep.id)) continue;
+     if (annotation.is_root || !annotation.head || typeof annotation.head.id !== "string" || !tokenById.has(annotation.head.id)) {
+       continue;
+     }
+     const dep = annotation.dep.id;
+     const head = annotation.head.id;
+     if (!incomingInside.has(dep)) incomingInside.set(dep, []);
+     if (!outgoingInside.has(head)) outgoingInside.set(head, []);
+     incomingInside.get(dep).push(head);
+     outgoingInside.get(head).push(dep);
+   }
+   return { incomingInside, outgoingInside };
+ }
+
+ function posFallbackHead(tokenIds, tokenById) {
+   const tokens = (tokenIds || []).map((id) => tokenById.get(id)).filter(Boolean).sort((a, b) => a.i - b.i);
+   const nouns = tokens.filter((token) => /^(NN|NNS|NNP|NNPS|PRP|CD)/.test(token.pos.tag));
+   if (nouns.length > 0) return nouns[nouns.length - 1].id;
+   const verbs = tokens.filter((token) => /^VB/.test(token.pos.tag));
+   if (verbs.length > 0) return verbs[0].id;
+   return tokens.length > 0 ? tokens[tokens.length - 1].id : null;
+ }
+
+ function resolveMentionHead({
+   tokenIds,
+   explicitHead,
+   chunkById,
+   headByChunkId,
+   incomingInsideMap,
+   tokenById,
+   findSelector,
+ }) {
+   const tokenSet = new Set(tokenIds);
+   if (explicitHead && tokenSet.has(explicitHead)) {
+     return { head: explicitHead, strategy: "explicit", unresolved: null };
+   }
+
+   for (const [chunkId, chunk] of chunkById.entries()) {
+     const tokenSelector = findSelector(chunk, "TokenSelector");
+     if (!tokenSelector || !Array.isArray(tokenSelector.token_ids)) continue;
+     const ids = tokenSelector.token_ids;
+     if (ids.length !== tokenIds.length) continue;
+     let same = true;
+     for (const id of ids) {
+       if (!tokenSet.has(id)) {
+         same = false;
+         break;
+       }
+     }
+     if (!same) continue;
+     const head = headByChunkId.get(chunkId);
+     if (head && tokenSet.has(head)) return { head, strategy: "chunk_head", unresolved: null };
+   }
+
+   const rootCandidates = tokenIds.filter((id) => {
+     const incoming = incomingInsideMap.get(id) || [];
+     const insideIncoming = incoming.filter((headId) => tokenSet.has(headId));
+     return insideIncoming.length === 0;
+   });
+   if (rootCandidates.length === 1) return { head: rootCandidates[0], strategy: "dependency_head", unresolved: null };
+
+   const fallback = posFallbackHead(tokenIds, tokenById);
+   if (fallback) {
+     return {
+       head: fallback,
+       strategy: "pos_fallback",
+       unresolved:
+         rootCandidates.length === 0
+           ? "no_dependency_head_in_mention"
+           : "multiple_dependency_head_candidates",
+     };
+   }
+   return { head: tokenIds[0], strategy: "unresolved", unresolved: "empty_mention_tokens" };
+ }
+
+ module.exports = {
+   buildChunkHeadMaps,
+   buildDependencyObservationMaps,
+   posFallbackHead,
+   resolveMentionHead,
+ };
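The resolver above tries strategies in a fixed order: an explicit head inside the span, then a chunk whose token set matches exactly, then a unique dependency root within the span, then the POS fallback. A small check of the fallback alone, using hypothetical token ids and Penn-style tags:

const { posFallbackHead } = require("./src/core/mention-head-resolution");

const tokenById = new Map([
  ["t1", { id: "t1", i: 0, pos: { tag: "JJ" } }],  // e.g. "red"
  ["t2", { id: "t2", i: 1, pos: { tag: "NN" } }],  // e.g. "wine"
  ["t3", { id: "t3", i: 2, pos: { tag: "NNS" } }], // e.g. "glasses"
]);

// The rightmost nominal wins; verbs are only consulted when no nominal exists.
posFallbackHead(["t1", "t2", "t3"], tokenById); // => "t3"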
package/src/core/mention-materialization.js
@@ -0,0 +1,31 @@
+ const { deepCloneJson } = require("./determinism");
+
+ function getMweHeadEvidence(mwe) {
+   if (!Array.isArray(mwe && mwe.sources)) return null;
+   const src = mwe.sources.find(
+     (entry) =>
+       entry &&
+       entry.name === "mwe-materialization" &&
+       entry.evidence &&
+       typeof entry.evidence.head_token_id === "string"
+   );
+   return src ? src.evidence.head_token_id : null;
+ }
+
+ function getMweLexiconEvidence(mwe) {
+   if (!Array.isArray(mwe && mwe.sources)) return null;
+   const src = mwe.sources.find(
+     (entry) =>
+       entry &&
+       entry.name === "wikipedia-title-index" &&
+       entry.evidence &&
+       typeof entry.evidence === "object"
+   );
+   if (!src) return null;
+   return deepCloneJson(src.evidence);
+ }
+
+ module.exports = {
+   getMweHeadEvidence,
+   getMweLexiconEvidence,
+ };
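Both getters scan an annotation's `sources` array for a specific producer name and return only well-typed evidence. A sketch with a hypothetical annotation:

const { getMweHeadEvidence } = require("./src/core/mention-materialization");

const mwe = {
  sources: [
    { name: "mwe-materialization", evidence: { head_token_id: "t2" } },
  ],
};
getMweHeadEvidence(mwe); // => "t2"; any malformed entry would yield null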
package/src/core/mentions.js
@@ -0,0 +1,149 @@
+ const { findSelector, normalizeSpanKey, normalizeIds, deepCloneJson } = require('./determinism');
+ const { annotationHasSource, collectStep11Relations } = require('./upstream');
+ const { buildTokenIndex, getTokenWikipediaEvidence, buildTokenWikiById, getTokenMetadataProjection } = require('./tokens');
+ const { getMweHeadEvidence, getMweLexiconEvidence } = require('./mention-materialization');
+ const { toAnnotationSummary, buildAcceptedAnnotationsInventory } = require('./accepted-annotations');
+ const { buildChunkHeadMaps, buildDependencyObservationMaps, posFallbackHead, resolveMentionHead } = require('./mention-head-resolution');
+ const { buildMentionLexiconEvidence, buildAssertionWikiSignals } = require('./mention-evidence');
+ const { mentionSortKey, buildMentions } = require('./mention-builder');
+
+ function normalizeWikiSurface(surface) {
+   if (typeof surface !== 'string') return '';
+   return surface
+     .normalize('NFKC')
+     .replace(/[\u2018\u2019\u02bc]/g, "'")
+     .replace(/[\u2010-\u2015]/g, '-')
+     .replace(/\s+/g, ' ')
+     .trim()
+     .toLowerCase();
+ }
+
+ function hasPositiveWikiSignal(evidence) {
+   if (!evidence || typeof evidence !== 'object') return false;
+   if (evidence.wiki_exact_match === true) return true;
+   if (typeof evidence.wiki_prefix_count === 'number' && evidence.wiki_prefix_count > 0) return true;
+   if (
+     typeof evidence.wiki_parenthetical_variant_count === 'number' &&
+     evidence.wiki_parenthetical_variant_count > 0
+   ) {
+     return true;
+   }
+   if (evidence.wiki_hyphen_space_variant_match === true) return true;
+   if (evidence.wiki_apostrophe_variant_match === true) return true;
+   if (evidence.wiki_singular_plural_variant_match === true) return true;
+   if (evidence.wiki_any_signal === true) return true;
+   return false;
+ }
+
+ const SUBJECT_ROLE_LABELS = new Set([
+   'actor',
+   'agent',
+   'subject',
+   'subj',
+   'nsubj',
+   'nsubjpass',
+   'csubj',
+   'csubjpass',
+   'agent_passive',
+ ]);
+
+ function isSubjectRoleLabel(role) {
+   return SUBJECT_ROLE_LABELS.has(String(role || ''));
+ }
+
+ function roleToSlot(role) {
+   if (isSubjectRoleLabel(role)) return { slot: 'actor', role: null };
+   if (role === 'theme') return { slot: 'theme', role: null };
+   if (role === 'patient') return { slot: 'theme', role: null };
+   if (role === 'attribute') return { slot: 'attr', role: null };
+   if (role === 'topic') return { slot: 'topic', role: null };
+   if (role === 'location') return { slot: 'location', role: null };
+   if (role === 'recipient') return { slot: 'other', role: 'recipient' };
+   return { slot: 'other', role };
+ }
+
+ function isCompareLabel(label) {
+   const l = String(label || '');
+   return l === 'compare' || l === 'compare_gt' || l === 'compare_lt';
+ }
+
+ function isQuantifierLabel(label) {
+   const l = String(label || '');
+   return l === 'quantifier' || l === 'quantifier_scope' || l === 'scope_quantifier';
+ }
+
+ function mentionHasLexiconEvidence(mention) {
+   return !!(
+     mention &&
+     mention.provenance &&
+     mention.provenance.lexicon_evidence &&
+     typeof mention.provenance.lexicon_evidence === 'object'
+   );
+ }
+
+ function compareMentionProjectionPriority(a, b) {
+   const aSpanLen = Array.isArray(a && a.token_ids) ? a.token_ids.length : 0;
+   const bSpanLen = Array.isArray(b && b.token_ids) ? b.token_ids.length : 0;
+   if (aSpanLen !== bSpanLen) return bSpanLen - aSpanLen;
+   const aPriority = Number.isFinite(a && a.priority) ? a.priority : Number.MAX_SAFE_INTEGER;
+   const bPriority = Number.isFinite(b && b.priority) ? b.priority : Number.MAX_SAFE_INTEGER;
+   if (aPriority !== bPriority) return aPriority - bPriority;
+   const aLex = mentionHasLexiconEvidence(a) ? 0 : 1;
+   const bLex = mentionHasLexiconEvidence(b) ? 0 : 1;
+   if (aLex !== bLex) return aLex - bLex;
+   return String(a && a.id ? a.id : '').localeCompare(String(b && b.id ? b.id : ''));
+ }
+
+ function chooseBestMentionForToken({ tokenId, segmentId, mentionById, candidateMentionIds, excludeMentionId }) {
+   const sourceIds = (candidateMentionIds || [])
+     .filter((id) => typeof id === 'string')
+     .slice();
+   const filteredIds = sourceIds.filter((id) => {
+     if (excludeMentionId && id === excludeMentionId) return false;
+     const mention = mentionById.get(id);
+     if (!mention || mention.segment_id !== segmentId) return false;
+     if (mention.suppressed === true) return false;
+     return Array.isArray(mention.token_ids) && mention.token_ids.includes(tokenId);
+   });
+   if (filteredIds.length === 0) {
+     return { mention_id: null, candidate_count: 0, chosen_was_first: true };
+   }
+   const chosenSorted = filteredIds.slice().sort((a, b) => {
+     const ma = mentionById.get(a);
+     const mb = mentionById.get(b);
+     return compareMentionProjectionPriority(ma, mb);
+   });
+   const chosenId = chosenSorted[0];
+   return {
+     mention_id: chosenId,
+     candidate_count: filteredIds.length,
+     chosen_was_first: sourceIds.length > 0 ? sourceIds[0] === chosenId : true,
+   };
+ }
+
+ module.exports = {
+   normalizeWikiSurface,
+   hasPositiveWikiSignal,
+   mentionSortKey,
+   SUBJECT_ROLE_LABELS,
+   isSubjectRoleLabel,
+   roleToSlot,
+   isCompareLabel,
+   isQuantifierLabel,
+   mentionHasLexiconEvidence,
+   compareMentionProjectionPriority,
+   chooseBestMentionForToken,
+   getMweHeadEvidence,
+   getMweLexiconEvidence,
+   toAnnotationSummary,
+   buildAcceptedAnnotationsInventory,
+   buildMentionLexiconEvidence,
+   buildAssertionWikiSignals,
+   buildChunkHeadMaps,
+   buildDependencyObservationMaps,
+   posFallbackHead,
+   resolveMentionHead,
+   buildMentions,
+ };
+
+
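As a quick orientation to the slot mapping in `roleToSlot`: subject-like labels collapse into `actor`, a few named roles get dedicated slots, and anything unrecognized is preserved under `other`. A sketch, where "purpose" stands in for an arbitrary hypothetical label:

const { roleToSlot } = require("./src/core/mentions");

roleToSlot("nsubj");     // => { slot: 'actor', role: null }
roleToSlot("patient");   // => { slot: 'theme', role: null }
roleToSlot("recipient"); // => { slot: 'other', role: 'recipient' }
roleToSlot("purpose");   // => { slot: 'other', role: 'purpose' }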