elementary-assertions 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +353 -0
- package/LICENSE +21 -0
- package/README.md +211 -0
- package/bin/elementary-assertions.js +8 -0
- package/docs/DEV_TOOLING.md +98 -0
- package/docs/NPM_RELEASE.md +177 -0
- package/docs/OPERATIONAL.md +159 -0
- package/docs/RELEASE_NOTES_TEMPLATE.md +37 -0
- package/docs/REPO_WORKFLOWS.md +48 -0
- package/package.json +46 -0
- package/src/core/accepted-annotations.js +44 -0
- package/src/core/assertions.js +2304 -0
- package/src/core/determinism.js +95 -0
- package/src/core/diagnostics.js +496 -0
- package/src/core/ids.js +9 -0
- package/src/core/mention-builder.js +272 -0
- package/src/core/mention-evidence.js +52 -0
- package/src/core/mention-head-resolution.js +108 -0
- package/src/core/mention-materialization.js +31 -0
- package/src/core/mentions.js +149 -0
- package/src/core/output.js +296 -0
- package/src/core/projection.js +192 -0
- package/src/core/roles.js +164 -0
- package/src/core/strings.js +7 -0
- package/src/core/tokens.js +53 -0
- package/src/core/upstream.js +31 -0
- package/src/index.js +6 -0
- package/src/render/index.js +5 -0
- package/src/render/layouts/compact.js +10 -0
- package/src/render/layouts/meaning.js +7 -0
- package/src/render/layouts/readable.js +7 -0
- package/src/render/layouts/table.js +7 -0
- package/src/render/render.js +931 -0
- package/src/run.js +278 -0
- package/src/schema/seed.elementary-assertions.schema.json +1751 -0
- package/src/tools/cli.js +158 -0
- package/src/tools/index.js +6 -0
- package/src/tools/io.js +55 -0
- package/src/validate/ajv.js +20 -0
- package/src/validate/coverage.js +215 -0
- package/src/validate/determinism.js +115 -0
- package/src/validate/diagnostics-strict.js +392 -0
- package/src/validate/errors.js +19 -0
- package/src/validate/index.js +20 -0
- package/src/validate/integrity.js +41 -0
- package/src/validate/invariants.js +157 -0
- package/src/validate/references.js +110 -0
- package/src/validate/schema.js +50 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
const { normalizeIds } = require("./determinism");
|
|
2
|
+
const { normalizeWikiSurface } = require("./mentions");
|
|
3
|
+
const { getTokenWikipediaEvidence, getTokenMetadataProjection } = require("./tokens");
|
|
4
|
+
|
|
5
|
+
/**
 * Returns the surface text of a mention.
 *
 * When the mention carries a numeric `span`, the text is sliced out of
 * `canonicalText`. Otherwise the surfaces of the mention's known tokens are
 * joined with single spaces, ordered by each token's `i` field.
 *
 * @param {object|null|undefined} mention - Mention with an optional `span` and/or `token_ids`.
 * @param {Map<string, object>} tokenById - Token lookup used for the token-based fallback.
 * @param {string|null|undefined} canonicalText - Text that span offsets refer to.
 * @returns {string} The surface text, or "" when `mention` is falsy.
 */
function mentionSurfaceText(mention, tokenById, canonicalText) {
  if (!mention) return "";

  const hasNumericSpan =
    Boolean(mention.span) &&
    typeof mention.span.start === "number" &&
    typeof mention.span.end === "number";
  if (hasNumericSpan) {
    const text = String(canonicalText || "");
    return text.slice(mention.span.start, mention.span.end);
  }

  // Fallback: rebuild the surface from whichever tokens can be resolved.
  const resolved = (mention.token_ids || [])
    .map((tokenId) => tokenById.get(tokenId))
    .filter(Boolean);
  resolved.sort((left, right) => left.i - right.i);

  const surfaces = resolved.map((token) => token.surface);
  return surfaces.join(" ");
}
|
|
13
|
+
|
|
14
|
+
/**
 * Merges Wikipedia title evidence into an aggregate, keeping first-seen order
 * and skipping titles that were already collected.
 *
 * `target` is mutated in place. When it carries `_exactSeen` / `_prefixSeen`
 * Sets they are used (and kept up to date) as dedupe indexes. When it does
 * not, a temporary index is seeded from the titles already on the target, so
 * plain `{ exact_titles, prefix_titles }` objects can be merged into as well.
 *
 * @param {object} target - Aggregate holding `exact_titles` and `prefix_titles` arrays.
 * @param {object} evidence - Evidence that may carry `exact_titles` / `prefix_titles` arrays.
 * @returns {void} Nothing; a falsy target or non-object evidence is ignored.
 */
function mergeWikiTitlesInto(target, evidence) {
  if (!target || !evidence || typeof evidence !== "object") return;

  const mergeField = (field, seenKey) => {
    const incoming = Array.isArray(evidence[field]) ? evidence[field] : [];
    const existing = Array.isArray(target[field]) ? target[field] : [];
    // Prefer the caller's index; otherwise derive one without attaching new
    // bookkeeping properties to the caller's object.
    const seen = target[seenKey] instanceof Set ? target[seenKey] : new Set(existing);
    for (const title of incoming) {
      if (typeof title !== "string" || seen.has(title)) continue;
      seen.add(title);
      if (!Array.isArray(target[field])) target[field] = [];
      target[field].push(title);
    }
  };

  mergeField("exact_titles", "_exactSeen");
  mergeField("prefix_titles", "_prefixSeen");
}
|
|
33
|
+
|
|
34
|
+
/**
 * Collects Wikipedia title evidence for primary mentions and predicate mentions.
 *
 * Target mentions are every mention flagged `is_primary` plus every mention
 * referenced as `predicate.mention_id` by an assertion. For each target the
 * titles found under `provenance.lexicon_evidence` (`mwe` first, then each
 * `tokens[*].evidence`) are merged without duplicates.
 *
 * @param {object} params
 * @param {Array<object>} [params.mentions] - Mentions; each must be an object with an `id`.
 * @param {Array<object>} [params.assertions] - Assertions; entries without a string `id` are skipped.
 * @param {Map<string, object>} params.tokenById - Token lookup forwarded to `mentionSurfaceText`.
 * @param {string} params.canonicalText - Text forwarded to `mentionSurfaceText`.
 * @returns {{normalization: object, mention_matches: Array<object>, assertion_predicate_matches: Array<object>}}
 *   Per-mention and per-assertion title lists, each sorted by id with `localeCompare`.
 */
function buildWikiTitleEvidenceFromUpstream({ mentions, assertions, tokenById, canonicalText }) {
  const mentionList = mentions || [];
  const assertionList = assertions || [];

  const mentionById = new Map(mentionList.map((mention) => [mention.id, mention]));
  const primaryIds = mentionList
    .filter((mention) => mention && mention.is_primary)
    .map((mention) => mention.id);
  const predicateIds = assertionList
    .map((assertion) => assertion && assertion.predicate && assertion.predicate.mention_id)
    .filter(Boolean);
  const targetMentionIds = normalizeIds(primaryIds.concat(predicateIds));

  const mentionMatches = [];
  for (const mentionId of targetMentionIds) {
    const mention = mentionById.get(mentionId);
    if (!mention) continue;

    const aggregate = {
      exact_titles: [],
      prefix_titles: [],
      _exactSeen: new Set(),
      _prefixSeen: new Set(),
    };

    let lexiconEvidence = null;
    if (
      mention.provenance &&
      mention.provenance.lexicon_evidence &&
      typeof mention.provenance.lexicon_evidence === "object"
    ) {
      lexiconEvidence = mention.provenance.lexicon_evidence;
    }

    if (lexiconEvidence) {
      // MWE-level evidence is merged before token-level evidence, so its
      // titles come first in the resulting lists.
      if (lexiconEvidence.mwe && typeof lexiconEvidence.mwe === "object") {
        mergeWikiTitlesInto(aggregate, lexiconEvidence.mwe);
      }
      const tokenEntries = Array.isArray(lexiconEvidence.tokens) ? lexiconEvidence.tokens : [];
      for (const entry of tokenEntries) {
        if (entry && entry.evidence && typeof entry.evidence === "object") {
          mergeWikiTitlesInto(aggregate, entry.evidence);
        }
      }
    }

    const surface = mentionSurfaceText(mention, tokenById, canonicalText);
    mentionMatches.push({
      mention_id: mentionId,
      normalized_surface: normalizeWikiSurface(surface),
      exact_titles: aggregate.exact_titles,
      prefix_titles: aggregate.prefix_titles,
    });
  }
  mentionMatches.sort((left, right) => left.mention_id.localeCompare(right.mention_id));

  const matchByMentionId = new Map(mentionMatches.map((match) => [match.mention_id, match]));
  const predicateMatches = [];
  for (const assertion of assertionList) {
    if (!assertion || typeof assertion.id !== "string") continue;
    const match = matchByMentionId.get(assertion.predicate && assertion.predicate.mention_id);
    if (!match) continue;
    // The title arrays are shared with the mention-level entry, not copied.
    predicateMatches.push({
      assertion_id: assertion.id,
      predicate_mention_id: assertion.predicate.mention_id,
      exact_titles: Array.isArray(match.exact_titles) ? match.exact_titles : [],
      prefix_titles: Array.isArray(match.prefix_titles) ? match.prefix_titles : [],
    });
  }
  predicateMatches.sort((left, right) => left.assertion_id.localeCompare(right.assertion_id));

  const normalization = {
    unicode_form: "NFKC",
    punctuation_map: { apostrophes: "['\\u2018\\u2019\\u02bc]->'", dashes: "[\\u2010-\\u2015]->-" },
    whitespace: "collapse_spaces_trim",
    casefold: "toLowerCase",
  };

  return {
    normalization,
    mention_matches: mentionMatches,
    assertion_predicate_matches: predicateMatches,
  };
}
|
|
100
|
+
|
|
101
|
+
/**
 * Tells whether a part-of-speech tag is one of the tags this module treats as
 * "content" (noun, verb, adjective, adverb, number, pronoun, foreign word and
 * interjection tags, as listed below).
 *
 * @param {*} tag - Candidate tag; anything other than a non-empty string yields false.
 * @returns {boolean} True only for an exact, case-sensitive match of a listed tag.
 */
function isContentPosTag(tag) {
  if (typeof tag !== "string" || tag.length === 0) return false;
  switch (tag) {
    case "NN":
    case "NNS":
    case "NNP":
    case "NNPS":
    case "VB":
    case "VBD":
    case "VBG":
    case "VBN":
    case "VBP":
    case "VBZ":
    case "JJ":
    case "JJR":
    case "JJS":
    case "RB":
    case "RBR":
    case "RBS":
    case "CD":
    case "PRP":
    case "PRP$":
    case "FW":
    case "UH":
      return true;
    default:
      return false;
  }
}
|
|
105
|
+
|
|
106
|
+
/**
 * Tells whether a token surface consists solely of Unicode punctuation
 * (`\p{P}`) and/or symbol (`\p{S}`) code points.
 *
 * @param {*} surface - Candidate surface; non-strings and "" yield false.
 * @returns {boolean} True when every code point is punctuation or a symbol.
 */
function isPunctuationSurface(surface) {
  const isNonEmptyString = typeof surface === "string" && surface.length > 0;
  if (!isNonEmptyString) return false;
  // A single code point outside P/S disqualifies the whole surface.
  const hasOtherCodePoint = /[^\p{P}\p{S}]/u.test(surface);
  return !hasOtherCodePoint;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Selects the mentions that belong to the coverage domain.
 *
 * A mention qualifies when it is flagged `is_primary`, its head token can be
 * found in `tokenById`, the head token's POS tag passes `isContentPosTag`, and
 * the head token's surface is not pure punctuation per `isPunctuationSurface`.
 *
 * @param {Array<object>|null|undefined} mentions - Candidate mentions; null/undefined entries are skipped.
 * @param {Map<string, object>} tokenById - Token lookup keyed by token id.
 * @returns {Array<string>} Qualifying mention ids, passed through `normalizeIds`.
 */
function buildCoverageDomainMentionIds(mentions, tokenById) {
  const ids = [];
  for (const m of mentions || []) {
    // Skip holes in the list instead of crashing on `m.is_primary`.
    if (!m || !m.is_primary) continue;
    const headTok = tokenById.get(m.head_token_id);
    if (!headTok) continue;
    const tag = headTok.pos && typeof headTok.pos.tag === "string" ? headTok.pos.tag : "";
    if (!isContentPosTag(tag)) continue;
    if (isPunctuationSurface(headTok.surface)) continue;
    ids.push(m.id);
  }
  return normalizeIds(ids);
}
|
|
124
|
+
|
|
125
|
+
/**
 * Assembles the output document for the "elementary_assertions" stage.
 *
 * Segments and tokens are re-projected from `relationsSeed` into a fixed field
 * layout, coverage lists are derived from the coverage domain and
 * `coveredMentions`, and the remaining inputs are passed through under their
 * output keys. `schema_version` is added only for a non-empty string.
 *
 * @param {object} params
 * @param {string} [params.schemaVersion] - Written as `schema_version` when non-empty.
 * @param {object} params.relationsSeed - Source of `seed_id`, `canonical_text`, `segments` and `tokens`.
 * @param {Array<object>} params.mentions - Mentions, emitted as-is and used to build the coverage domain.
 * @param {Array<object>} params.assertions - Assertions, emitted as-is.
 * @param {Iterable<string>} [params.coveredMentions] - Covered mention ids; a Set or an array is accepted.
 * @param {Array<object>} params.unresolved - Emitted as `coverage.unresolved`.
 * @param {*} params.sourceInputs - Emitted as `sources.inputs`.
 * @param {*} params.pipelineTrace - Emitted as `sources.pipeline`.
 * @param {*} params.acceptedAnnotations - Emitted as `accepted_annotations`.
 * @param {*} params.diagnostics - Emitted as `diagnostics`.
 * @param {{all?: Array, projected?: Array, dropped?: Array}} params.projectedBuild - Relation projection result.
 * @param {*} params.wikiTitleEvidence - Emitted as `wiki_title_evidence`.
 * @returns {object} The assembled output document.
 */
function buildOutput({
  schemaVersion,
  relationsSeed,
  mentions,
  assertions,
  coveredMentions,
  unresolved,
  sourceInputs,
  pipelineTrace,
  acceptedAnnotations,
  diagnostics,
  projectedBuild,
  wikiTitleEvidence,
}) {
  const tokenById = new Map((relationsSeed.tokens || []).map((t) => [t.id, t]));
  const coverageDomain = new Set(buildCoverageDomainMentionIds(mentions, tokenById));
  const primary = normalizeIds(Array.from(coverageDomain));
  const covered = normalizeIds(Array.from(coveredMentions || []).filter((id) => coverageDomain.has(id)));
  // Build the membership index once, from any iterable. The previous code
  // called `.has` on `coveredMentions` directly, which only worked for a Set
  // even though the line above accepts arrays too.
  const coveredLookup = new Set(coveredMentions || []);
  const uncovered = primary.filter((id) => !coveredLookup.has(id));

  const normalizedSegments = (relationsSeed.segments || []).map((s) => ({
    id: s.id,
    span: { start: s.span.start, end: s.span.end },
    token_range: {
      // Missing or non-numeric token ranges are emitted as 0.
      start: s.token_range && typeof s.token_range.start === "number" ? s.token_range.start : 0,
      end: s.token_range && typeof s.token_range.end === "number" ? s.token_range.end : 0,
    },
  }));

  const normalizedTokens = (relationsSeed.tokens || []).map((t) => {
    const wikiEvidence = getTokenWikipediaEvidence(t);
    const tokenMeta = getTokenMetadataProjection(t);
    return {
      id: t.id,
      i: t.i,
      segment_id: t.segment_id,
      span: { start: t.span.start, end: t.span.end },
      surface: t.surface,
      // `pos` is emitted only when a string tag exists; `coarse` only when it is a string.
      ...(t.pos && typeof t.pos.tag === "string"
        ? { pos: { tag: t.pos.tag, ...(typeof t.pos.coarse === "string" ? { coarse: t.pos.coarse } : {}) } }
        : {}),
      ...tokenMeta,
      ...(wikiEvidence ? { lexicon: { wikipedia_title_index: wikiEvidence } } : {}),
    };
  });

  const out = {
    seed_id: relationsSeed.seed_id,
    stage: "elementary_assertions",
    index_basis: { text_field: "canonical_text", span_unit: "utf16_code_units" },
    canonical_text: relationsSeed.canonical_text,
    segments: normalizedSegments,
    tokens: normalizedTokens,
    mentions,
    assertions,
    relation_projection: {
      all_relations: projectedBuild.all || [],
      // Projected relations are reduced to these fields; anything else on the record is not emitted.
      projected_relations: (projectedBuild.projected || []).map((r) => ({
        relation_id: r.relation_id,
        label: r.label,
        segment_id: r.segment_id,
        head_token_id: r.head_token_id,
        dep_token_id: r.dep_token_id,
        head_mention_id: r.head_mention_id,
        dep_mention_id: r.dep_mention_id,
      })),
      dropped_relations: projectedBuild.dropped || [],
    },
    accepted_annotations: acceptedAnnotations,
    wiki_title_evidence: wikiTitleEvidence,
    diagnostics,
    coverage: {
      primary_mention_ids: primary,
      covered_primary_mention_ids: covered,
      uncovered_primary_mention_ids: uncovered,
      unresolved,
    },
    sources: {
      inputs: sourceInputs,
      pipeline: pipelineTrace,
    },
  };
  if (typeof schemaVersion === "string" && schemaVersion.length > 0) {
    out.schema_version = schemaVersion;
  }
  return out;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Produces one audit row per primary mention describing whether it is covered
 * and, if so, by which mechanisms.
 *
 * Mechanisms recorded per mention id:
 *  - "slot":     the id appears in an assertion's `arguments` / `modifiers` entry;
 *  - "operator": a primary mention contains the `token_id` of an assertion operator;
 *  - "evidence": a primary mention shares a token with an assertion's `evidence.token_ids`;
 *  - "transfer": the id appears in a suppressed assertion's `transferred_mention_ids`.
 *
 * @param {object|null|undefined} output - Output document; missing parts are treated as empty.
 * @returns {Array<{mention_id: string, covered: boolean, covered_by: Array<string>, uncovered_reason: (string|null)}>}
 *   Rows sorted by mention id with `localeCompare`. `covered_by` is empty for
 *   uncovered mentions; `uncovered_reason` falls back to "other".
 * @throws {Error} When an assertion still carries the legacy `slots` property.
 */
function buildCoverageAudit(output) {
  const mentions = Array.isArray(output && output.mentions) ? output.mentions : [];
  const coverage = ((output || {}).coverage) || {};
  const primaryIds = normalizeIds(Array.isArray(coverage.primary_mention_ids) ? coverage.primary_mention_ids : []);
  const coveredIds = new Set(
    normalizeIds(Array.isArray(coverage.covered_primary_mention_ids) ? coverage.covered_primary_mention_ids : [])
  );
  const assertions = Array.isArray(output && output.assertions) ? output.assertions : [];
  const unresolved = Array.isArray(coverage.unresolved) ? coverage.unresolved : [];
  const suppressed = Array.isArray((((output || {}).diagnostics) || {}).suppressed_assertions)
    ? output.diagnostics.suppressed_assertions
    : [];

  // When a mention is listed more than once, the last reason wins.
  const unresolvedByMention = new Map();
  for (const u of unresolved) {
    if (!u || typeof u.mention_id !== "string") continue;
    unresolvedByMention.set(u.mention_id, String(u.reason || "other"));
  }

  // Index the primary mentions once so the per-assertion work below is a
  // lookup rather than a scan over every mention.
  const primaryIdSet = new Set(primaryIds);
  const primaryInInputOrder = mentions.filter((m) => m && primaryIdSet.has(m.id));
  const primaryMentionIdsByToken = new Map();
  for (const m of primaryInInputOrder) {
    for (const tid of Array.isArray(m.token_ids) ? m.token_ids : []) {
      if (!primaryMentionIdsByToken.has(tid)) primaryMentionIdsByToken.set(tid, []);
      primaryMentionIdsByToken.get(tid).push(m.id);
    }
  }

  const coveredBy = new Map();
  function addMechanism(mid, mechanism) {
    if (!mid || !mechanism) return;
    if (!coveredBy.has(mid)) coveredBy.set(mid, new Set());
    coveredBy.get(mid).add(mechanism);
  }

  for (const a of assertions) {
    if (!a || typeof a !== "object") continue;
    if (Object.prototype.hasOwnProperty.call(a, "slots")) {
      throw new Error("Invalid input: legacy assertions[*].slots is not supported.");
    }
    const argEntries = Array.isArray(a.arguments) ? a.arguments : [];
    const modEntries = Array.isArray(a.modifiers) ? a.modifiers : [];
    // Null/undefined role entries are tolerated rather than dereferenced.
    for (const entry of argEntries) for (const mid of (entry && entry.mention_ids) || []) addMechanism(mid, "slot");
    for (const entry of modEntries) for (const mid of (entry && entry.mention_ids) || []) addMechanism(mid, "slot");

    for (const op of a.operators || []) {
      const tid = String((op && op.token_id) || "");
      if (!tid) continue;
      for (const mid of primaryMentionIdsByToken.get(tid) || []) addMechanism(mid, "operator");
    }

    const evTokenIds = normalizeIds((a.evidence || {}).token_ids || []);
    for (const tid of evTokenIds) {
      for (const mid of primaryMentionIdsByToken.get(tid) || []) addMechanism(mid, "evidence");
    }
  }

  for (const s of suppressed) {
    for (const mid of (s && s.transferred_mention_ids) || []) addMechanism(mid, "transfer");
  }

  const primaryMentions = primaryInInputOrder
    .slice()
    .sort((a, b) => String(a.id || "").localeCompare(String(b.id || "")));

  return primaryMentions.map((m) => {
    const mechanisms = normalizeIds(Array.from(coveredBy.get(m.id) || []));
    const covered = coveredIds.has(m.id);
    return {
      mention_id: m.id,
      covered,
      // Mechanisms are reported only for mentions the coverage section marks as covered.
      covered_by: covered ? mechanisms : [],
      uncovered_reason: covered ? null : (unresolvedByMention.get(m.id) || "other"),
    };
  });
}
|
|
286
|
+
|
|
287
|
+
module.exports = {
|
|
288
|
+
mentionSurfaceText,
|
|
289
|
+
mergeWikiTitlesInto,
|
|
290
|
+
buildWikiTitleEvidenceFromUpstream,
|
|
291
|
+
isContentPosTag,
|
|
292
|
+
isPunctuationSurface,
|
|
293
|
+
buildCoverageDomainMentionIds,
|
|
294
|
+
buildOutput,
|
|
295
|
+
buildCoverageAudit,
|
|
296
|
+
};
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
const { deepCloneJson, sha256Hex, normalizeIds } = require('./determinism');
|
|
2
|
+
const { chooseBestMentionForToken, compareMentionProjectionPriority, mentionHasLexiconEvidence } = require('./mentions');
|
|
3
|
+
const { annotationHasSource, collectStep11Relations } = require('./upstream');
|
|
4
|
+
const { getTokenWikipediaEvidence, buildTokenWikiById, getTokenMetadataProjection } = require('./tokens');
|
|
5
|
+
const { getMweHeadEvidence, getMweLexiconEvidence } = require('./mention-materialization');
|
|
6
|
+
const { toAnnotationSummary, buildAcceptedAnnotationsInventory } = require('./accepted-annotations');
|
|
7
|
+
const { buildChunkHeadMaps, buildDependencyObservationMaps, posFallbackHead, resolveMentionHead } = require('./mention-head-resolution');
|
|
8
|
+
const { buildMentionLexiconEvidence, buildAssertionWikiSignals } = require('./mention-evidence');
|
|
9
|
+
const { buildMentions } = require('./mention-builder');
|
|
10
|
+
|
|
11
|
+
/**
 * Projects token-level relations onto mentions.
 *
 * For each relation the head and dependent tokens are mapped to a mention:
 * the entry in `tokenToMention` is used when present, otherwise the best
 * candidate from `tokenToAllMentions`. When both ends land on the same
 * mention, an alternative is tried for the dependent first and then for the
 * head. Every relation is recorded in `all`. Relations whose dependent token
 * is missing or sits in another segment than the head token are recorded
 * there only. The rest end up in `projected` or, when an end has no mention
 * or the pair is still a self loop, in `dropped`.
 *
 * @param {Iterable<object>} relations - Relations with `id`, `label`, `head_token_id`, `dep_token_id`, optional `evidence`.
 * @param {Map<string, string>} tokenToMention - Preferred mention id per token id.
 * @param {Map<string, Array<string>>} tokenToAllMentions - All candidate mention ids per token id.
 * @param {Map<string, object>} mentionById - Mention lookup (`is_primary`, `kind`, `segment_id`, `span`).
 * @param {Map<string, object>} tokenById - Token lookup (`segment_id`, `span`); the head token must exist.
 * @returns {{projected: Array<object>, unresolved: Array<object>, dropped: Array<object>, all: Array<object>}}
 *   `projected`, `dropped` and `all` are sorted; `unresolved` keeps input order.
 */
function buildProjectedRelations(relations, tokenToMention, tokenToAllMentions, mentionById, tokenById) {
  const KIND_ORDER = ['token', 'mwe', 'chunk'];

  // Lower rank is preferred; unknown kinds sort last.
  const mentionKindRank = (kind) => {
    const position = KIND_ORDER.indexOf(kind);
    return position === -1 ? 9 : position;
  };

  // Picks the best-ranked candidate id, or null when none is eligible.
  function chooseMentionId(candidates, preferPrimary, excludeId) {
    const eligible = (candidates || []).filter(
      (id) => typeof id === 'string' && mentionById.has(id) && id !== excludeId
    );

    const compare = (idA, idB) => {
      const mentionA = mentionById.get(idA);
      const mentionB = mentionById.get(idB);

      const nonPrimaryA = mentionA && mentionA.is_primary ? 0 : 1;
      const nonPrimaryB = mentionB && mentionB.is_primary ? 0 : 1;
      if (nonPrimaryA !== nonPrimaryB) {
        return preferPrimary ? nonPrimaryA - nonPrimaryB : nonPrimaryB - nonPrimaryA;
      }

      const rankA = mentionKindRank(mentionA ? mentionA.kind : '');
      const rankB = mentionKindRank(mentionB ? mentionB.kind : '');
      if (rankA !== rankB) return rankA - rankB;

      if (mentionA && mentionB) {
        if (mentionA.segment_id !== mentionB.segment_id) {
          return mentionA.segment_id.localeCompare(mentionB.segment_id);
        }
        if (mentionA.span.start !== mentionB.span.start) return mentionA.span.start - mentionB.span.start;
        if (mentionA.span.end !== mentionB.span.end) return mentionA.span.end - mentionB.span.end;
      }
      return idA.localeCompare(idB);
    };

    eligible.sort(compare);
    return eligible.length > 0 ? eligible[0] : null;
  }

  // Record shape shared by both kinds of dropped relation.
  const droppedRecord = (relation, segmentId, reason, headMentionId, depMentionId) => ({
    relation_id: relation.id,
    label: relation.label,
    segment_id: segmentId,
    reason,
    head_token_id: relation.head_token_id,
    dep_token_id: relation.dep_token_id,
    head_primary_mention_id: headMentionId || null,
    dep_primary_mention_id: depMentionId || null,
  });

  const projected = [];
  const unresolved = [];
  const dropped = [];
  const all = [];

  for (const relation of relations) {
    const headCandidates = tokenToAllMentions.get(relation.head_token_id) || [];
    const depCandidates = tokenToAllMentions.get(relation.dep_token_id) || [];
    const headToken = tokenById.get(relation.head_token_id);
    const depToken = tokenById.get(relation.dep_token_id);
    const segmentId = headToken.segment_id;

    let headMentionId =
      tokenToMention.get(relation.head_token_id) || chooseMentionId(headCandidates, true, null);
    let depMentionId =
      tokenToMention.get(relation.dep_token_id) || chooseMentionId(depCandidates, true, null);

    // Both ends collapsed onto one mention: look for a different mention,
    // dependent side first, this time ranking non-primary candidates ahead.
    if (headMentionId && depMentionId && headMentionId === depMentionId) {
      const depAlternative = chooseMentionId(depCandidates, false, headMentionId);
      if (depAlternative) {
        depMentionId = depAlternative;
      } else {
        const headAlternative = chooseMentionId(headCandidates, false, depMentionId);
        if (headAlternative) headMentionId = headAlternative;
      }
    }

    all.push({
      relation_id: relation.id,
      label: relation.label,
      segment_id: segmentId,
      head_token_id: relation.head_token_id,
      dep_token_id: relation.dep_token_id,
      head_primary_mention_id: headMentionId || null,
      dep_primary_mention_id: depMentionId || null,
      head_mention_ids: headCandidates,
      dep_mention_ids: depCandidates,
    });

    const depInSameSegment = Boolean(depToken) && depToken.segment_id === segmentId;
    if (!depInSameSegment) continue;

    if (!headMentionId || !depMentionId) {
      if (depMentionId) {
        unresolved.push({
          kind: 'unresolved_attachment',
          segment_id: segmentId,
          mention_id: depMentionId,
          reason: 'missing_primary_projection',
          relation,
        });
      }
      dropped.push(droppedRecord(relation, segmentId, 'missing_primary_projection', headMentionId, depMentionId));
      continue;
    }

    if (headMentionId === depMentionId) {
      dropped.push(
        droppedRecord(relation, segmentId, 'self_loop_after_primary_projection', headMentionId, depMentionId)
      );
      continue;
    }

    const headMention = mentionById.get(headMentionId);
    const depMention = mentionById.get(depMentionId);
    if (!headMention || !depMention) continue;
    if (headMention.segment_id !== depMention.segment_id) continue;

    projected.push({
      relation_id: relation.id,
      label: relation.label,
      head_token_id: relation.head_token_id,
      dep_token_id: relation.dep_token_id,
      head_mention_id: headMentionId,
      dep_mention_id: depMentionId,
      segment_id: headMention.segment_id,
      evidence: relation.evidence || {},
    });
  }

  // Projected relations: segment, head token offset, dependent token offset, label, id.
  const byTokenPosition = (left, right) => {
    if (left.segment_id !== right.segment_id) return left.segment_id.localeCompare(right.segment_id);
    const leftHead = tokenById.get(left.head_token_id);
    const rightHead = tokenById.get(right.head_token_id);
    if (leftHead.span.start !== rightHead.span.start) return leftHead.span.start - rightHead.span.start;
    const leftDep = tokenById.get(left.dep_token_id);
    const rightDep = tokenById.get(right.dep_token_id);
    if (leftDep.span.start !== rightDep.span.start) return leftDep.span.start - rightDep.span.start;
    if (left.label !== right.label) return left.label.localeCompare(right.label);
    return left.relation_id.localeCompare(right.relation_id);
  };

  // Dropped and "all" records: segment, head token id, dependent token id, label, id.
  const byTokenId = (left, right) => {
    if (left.segment_id !== right.segment_id) return left.segment_id.localeCompare(right.segment_id);
    if (left.head_token_id !== right.head_token_id) return left.head_token_id.localeCompare(right.head_token_id);
    if (left.dep_token_id !== right.dep_token_id) return left.dep_token_id.localeCompare(right.dep_token_id);
    if (left.label !== right.label) return left.label.localeCompare(right.label);
    return (left.relation_id || '').localeCompare(right.relation_id || '');
  };

  projected.sort(byTokenPosition);
  dropped.sort(byTokenId);
  all.sort(byTokenId);

  return { projected, unresolved, dropped, all };
}
|
|
151
|
+
|
|
152
|
+
/**
 * Groups mentions that are connected through `coordination` relations.
 *
 * Coordination edges are treated as undirected. Each connected component gets
 * one group id of the form `cg:<first 12 characters of sha256Hex(sorted member
 * ids joined by "|")>`.
 *
 * @param {Array<object>} projected - Projected relations (`label`, `head_mention_id`, `dep_mention_id`).
 * @returns {Map<string, string>} Mention id -> coordination group id, for mentions on a coordination edge.
 */
function buildCoordinationGroups(projected) {
  const adjacency = new Map();
  const neighboursOf = (mentionId) => {
    if (!adjacency.has(mentionId)) adjacency.set(mentionId, new Set());
    return adjacency.get(mentionId);
  };

  const coordinationEdges = projected.filter((relation) => relation.label === 'coordination');
  for (const { head_mention_id: headId, dep_mention_id: depId } of coordinationEdges) {
    const headNeighbours = neighboursOf(headId);
    const depNeighbours = neighboursOf(depId);
    headNeighbours.add(depId);
    depNeighbours.add(headId);
  }

  const visited = new Set();
  const groups = new Map();
  for (const start of adjacency.keys()) {
    if (visited.has(start)) continue;
    visited.add(start);

    // Breadth-first walk; `component` doubles as the work queue, read through `cursor`.
    const component = [start];
    for (let cursor = 0; cursor < component.length; cursor += 1) {
      for (const neighbour of adjacency.get(component[cursor]) || []) {
        if (visited.has(neighbour)) continue;
        visited.add(neighbour);
        component.push(neighbour);
      }
    }

    component.sort((left, right) => left.localeCompare(right));
    const groupId = `cg:${sha256Hex(component.join('|')).slice(0, 12)}`;
    for (const member of component) groups.set(member, groupId);
  }
  return groups;
}
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
module.exports = {
|
|
186
|
+
annotationHasSource,
|
|
187
|
+
collectStep11Relations,
|
|
188
|
+
buildProjectedRelations,
|
|
189
|
+
buildCoordinationGroups,
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
const { normalizeIds } = require('./determinism');
|
|
2
|
+
const { isSubjectRoleLabel } = require('./mentions');
|
|
3
|
+
|
|
4
|
+
/**
 * Sort priority of an argument role; lower values sort first.
 *
 * @param {*} role - Role label; falsy values are treated as the empty string.
 * @returns {number} 0-5 for actor, patient, location, theme, attribute, topic (in that order), 10 otherwise.
 */
function argumentRolePriority(role) {
  const rankedRoles = ['actor', 'patient', 'location', 'theme', 'attribute', 'topic'];
  const position = rankedRoles.indexOf(String(role || ''));
  return position === -1 ? 10 : position;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Sort priority of a modifier role; lower values sort first.
 *
 * @param {*} role - Role label; falsy values are treated as the empty string.
 * @returns {number} 0 for "recipient", 1 for "modifier", 10 otherwise.
 */
function modifierRolePriority(role) {
  switch (String(role || '')) {
    case 'recipient':
      return 0;
    case 'modifier':
      return 1;
    default:
      return 10;
  }
}
|
|
21
|
+
|
|
22
|
+
/**
 * Brings role entries into canonical shape and order.
 *
 * Each entry is rebuilt as `{ role, mention_ids, evidence: { relation_ids,
 * token_ids } }` with every id list passed through `normalizeIds`. Entries
 * with an empty role or no mention ids are removed. The rest are sorted by
 * `priorityFn(role)`, then role, then the JSON text of `mention_ids`, then
 * the JSON text of `evidence`.
 *
 * @param {Array<object>|null|undefined} entries - Raw role entries.
 * @param {function(string): number} priorityFn - Maps a role to its sort priority (lower first).
 * @returns {Array<object>} New, canonically ordered entries.
 */
function canonicalizeRoleEntries(entries, priorityFn) {
  const idsOrEmpty = (value) => normalizeIds(Array.isArray(value) ? value : []);

  const toCanonical = (entry) => {
    const evidence = entry && entry.evidence;
    return {
      role: String((entry && entry.role) || ''),
      mention_ids: idsOrEmpty(entry && entry.mention_ids),
      evidence: {
        relation_ids: idsOrEmpty(evidence && evidence.relation_ids),
        token_ids: idsOrEmpty(evidence && evidence.token_ids),
      },
    };
  };

  const isUsable = (entry) => entry.role.length > 0 && entry.mention_ids.length > 0;

  const compareEntries = (left, right) => {
    const leftPriority = priorityFn(left.role);
    const rightPriority = priorityFn(right.role);
    if (leftPriority !== rightPriority) return leftPriority - rightPriority;

    if (left.role !== right.role) return left.role.localeCompare(right.role);

    const leftMentions = JSON.stringify(left.mention_ids);
    const rightMentions = JSON.stringify(right.mention_ids);
    if (leftMentions !== rightMentions) return leftMentions.localeCompare(rightMentions);

    return JSON.stringify(left.evidence).localeCompare(JSON.stringify(right.evidence));
  };

  const canonical = (entries || []).map(toCanonical).filter(isUsable);
  canonical.sort(compareEntries);
  return canonical;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Gathers the token ids of all given mentions.
 *
 * @param {Iterable<string>|null|undefined} mentionIds - Mention ids to look up; unknown ids are skipped.
 * @param {Map<string, object>} mentionById - Mention lookup; `token_ids` is read from each hit.
 * @returns {Array<string>} The collected token ids, passed through `normalizeIds`.
 */
function collectEntryTokenIds(mentionIds, mentionById) {
  const collected = [];
  for (const mentionId of mentionIds || []) {
    const mention = mentionById.get(mentionId);
    const ownTokenIds = (mention && mention.token_ids) || [];
    for (const tokenId of ownTokenIds) collected.push(tokenId);
  }
  return normalizeIds(collected);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Converts a slot object into canonical `arguments` / `modifiers` role entries.
 *
 * The fixed slots `actor`, `theme`, `attr`, `topic` and `location` become
 * argument entries (`attr` is emitted under the role "attribute"). Each item
 * of `slots.other` with a non-blank role and at least one mention id becomes a
 * modifier entry. Every entry gets empty `relation_ids` and the token ids of
 * its mentions as evidence.
 *
 * @param {object|null|undefined} slots - Slot object; a missing value is treated as `{}`.
 * @param {Map<string, object>} mentionById - Mention lookup used to collect evidence token ids.
 * @returns {{arguments: Array<object>, modifiers: Array<object>}} Canonicalized role entries.
 */
function slotToRoleEntries(slots, mentionById) {
  const source = slots || {};

  const makeEntry = (role, mentionIds) => ({
    role,
    mention_ids: mentionIds,
    evidence: {
      relation_ids: [],
      token_ids: collectEntryTokenIds(mentionIds, mentionById),
    },
  });

  // [slot key, emitted role] pairs, in the order the slots are read.
  const slotRolePairs = [
    ['actor', 'actor'],
    ['theme', 'theme'],
    ['attr', 'attribute'],
    ['topic', 'topic'],
    ['location', 'location'],
  ];

  const argumentsOut = [];
  for (const [slot, role] of slotRolePairs) {
    const slotValue = source[slot];
    const mentionIds = normalizeIds(Array.isArray(slotValue) ? slotValue : []);
    if (mentionIds.length > 0) argumentsOut.push(makeEntry(role, mentionIds));
  }

  const modifiersOut = [];
  const otherEntries = Array.isArray(source.other) ? source.other : [];
  for (const entry of otherEntries) {
    const role = String((entry && entry.role) || '').trim();
    const mentionIds = normalizeIds(Array.isArray(entry && entry.mention_ids) ? entry.mention_ids : []);
    if (role && mentionIds.length > 0) modifiersOut.push(makeEntry(role, mentionIds));
  }

  return {
    arguments: canonicalizeRoleEntries(argumentsOut, argumentRolePriority),
    modifiers: canonicalizeRoleEntries(modifiersOut, modifierRolePriority),
  };
}
|
|
110
|
+
|
|
111
|
+
/**
 * Collects every mention id referenced by an assertion's role entries.
 *
 * Both `assertion.arguments` and `assertion.modifiers` are read; a list that
 * is not an array, a null/undefined entry, or an entry without `mention_ids`
 * contributes nothing.
 *
 * @param {object|null|undefined} assertion - Assertion with optional `arguments` / `modifiers` arrays.
 * @returns {Set<string>} Referenced mention ids in first-seen order (arguments before modifiers).
 */
function collectAssertionMentionRefs(assertion) {
  const out = new Set();
  for (const entry of assertion && Array.isArray(assertion.arguments) ? assertion.arguments : []) {
    // Guard the entry itself, matching how projectRolesToSlots reads the same lists.
    for (const mentionId of (entry && entry.mention_ids) || []) out.add(mentionId);
  }
  for (const entry of assertion && Array.isArray(assertion.modifiers) ? assertion.modifiers : []) {
    for (const mentionId of (entry && entry.mention_ids) || []) out.add(mentionId);
  }
  return out;
}
|
|
121
|
+
|
|
122
|
+
/**
 * Projects an assertion's role entries back onto the slot layout
 * `{ actor, theme, attr, topic, location, other }`.
 *
 * Argument entries whose role is "actor" or is accepted by
 * `isSubjectRoleLabel` go to `actor`; "theme", "attribute", "topic" and
 * "location" go to their slot ("attribute" -> `attr`); every other argument
 * role, including the empty role, goes to `other`. Modifier entries with a
 * non-empty role always go to `other`. Entries without mention ids are ignored.
 *
 * @param {object|null|undefined} assertion - Assertion with optional `arguments` / `modifiers` arrays.
 * @returns {object} Slot object; `other` is sorted by role, then by the JSON text of its mention ids.
 */
function projectRolesToSlots(assertion) {
  const slots = { actor: [], theme: [], attr: [], topic: [], location: [], other: [] };

  const entriesOf = (field) => (assertion && Array.isArray(assertion[field]) ? assertion[field] : []);
  const roleOf = (entry) => String((entry && entry.role) || '');
  const mentionIdsOf = (entry) =>
    normalizeIds(Array.isArray(entry && entry.mention_ids) ? entry.mention_ids : []);

  // Argument roles (other than actor/subject) that own a dedicated slot.
  const slotKeyByRole = new Map([
    ['theme', 'theme'],
    ['attribute', 'attr'],
    ['topic', 'topic'],
    ['location', 'location'],
  ]);

  for (const entry of entriesOf('arguments')) {
    const role = roleOf(entry);
    const mentionIds = mentionIdsOf(entry);
    if (mentionIds.length === 0) continue;

    // The actor/subject test runs first, so it wins over the dedicated slots.
    const slotKey = role === 'actor' || isSubjectRoleLabel(role) ? 'actor' : slotKeyByRole.get(role);
    if (slotKey) {
      slots[slotKey] = normalizeIds(slots[slotKey].concat(mentionIds));
    } else {
      slots.other.push({ role, mention_ids: mentionIds });
    }
  }

  for (const entry of entriesOf('modifiers')) {
    const role = roleOf(entry);
    const mentionIds = mentionIdsOf(entry);
    if (role && mentionIds.length > 0) slots.other.push({ role, mention_ids: mentionIds });
  }

  const normalizedOther = slots.other
    .map((entry) => ({ role: entry.role, mention_ids: normalizeIds(entry.mention_ids || []) }))
    .filter((entry) => entry.mention_ids.length > 0);
  normalizedOther.sort((left, right) => {
    if (left.role !== right.role) return left.role.localeCompare(right.role);
    return JSON.stringify(left.mention_ids).localeCompare(JSON.stringify(right.mention_ids));
  });
  slots.other = normalizedOther;

  return slots;
}
|
|
150
|
+
|
|
151
|
+
/**
 * Lists the mention ids referenced by an assertion's arguments and modifiers.
 *
 * @param {object|null|undefined} assertion - Assertion forwarded to `collectAssertionMentionRefs`.
 * @returns {Array<string>} The referenced mention ids, passed through `normalizeIds`.
 */
function collectMentionIdsFromRoles(assertion) {
  const referenced = collectAssertionMentionRefs(assertion);
  return normalizeIds([...referenced]);
}
|
|
154
|
+
|
|
155
|
+
module.exports = {
|
|
156
|
+
argumentRolePriority,
|
|
157
|
+
modifierRolePriority,
|
|
158
|
+
canonicalizeRoleEntries,
|
|
159
|
+
collectEntryTokenIds,
|
|
160
|
+
slotToRoleEntries,
|
|
161
|
+
collectAssertionMentionRefs,
|
|
162
|
+
projectRolesToSlots,
|
|
163
|
+
collectMentionIdsFromRoles,
|
|
164
|
+
};
|