elementary-assertions 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +353 -0
- package/LICENSE +21 -0
- package/README.md +211 -0
- package/bin/elementary-assertions.js +8 -0
- package/docs/DEV_TOOLING.md +98 -0
- package/docs/NPM_RELEASE.md +177 -0
- package/docs/OPERATIONAL.md +159 -0
- package/docs/RELEASE_NOTES_TEMPLATE.md +37 -0
- package/docs/REPO_WORKFLOWS.md +48 -0
- package/package.json +46 -0
- package/src/core/accepted-annotations.js +44 -0
- package/src/core/assertions.js +2304 -0
- package/src/core/determinism.js +95 -0
- package/src/core/diagnostics.js +496 -0
- package/src/core/ids.js +9 -0
- package/src/core/mention-builder.js +272 -0
- package/src/core/mention-evidence.js +52 -0
- package/src/core/mention-head-resolution.js +108 -0
- package/src/core/mention-materialization.js +31 -0
- package/src/core/mentions.js +149 -0
- package/src/core/output.js +296 -0
- package/src/core/projection.js +192 -0
- package/src/core/roles.js +164 -0
- package/src/core/strings.js +7 -0
- package/src/core/tokens.js +53 -0
- package/src/core/upstream.js +31 -0
- package/src/index.js +6 -0
- package/src/render/index.js +5 -0
- package/src/render/layouts/compact.js +10 -0
- package/src/render/layouts/meaning.js +7 -0
- package/src/render/layouts/readable.js +7 -0
- package/src/render/layouts/table.js +7 -0
- package/src/render/render.js +931 -0
- package/src/run.js +278 -0
- package/src/schema/seed.elementary-assertions.schema.json +1751 -0
- package/src/tools/cli.js +158 -0
- package/src/tools/index.js +6 -0
- package/src/tools/io.js +55 -0
- package/src/validate/ajv.js +20 -0
- package/src/validate/coverage.js +215 -0
- package/src/validate/determinism.js +115 -0
- package/src/validate/diagnostics-strict.js +392 -0
- package/src/validate/errors.js +19 -0
- package/src/validate/index.js +20 -0
- package/src/validate/integrity.js +41 -0
- package/src/validate/invariants.js +157 -0
- package/src/validate/references.js +110 -0
- package/src/validate/schema.js +50 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
const crypto = require("crypto");
|
|
2
|
+
const { normalizeIds } = require("./ids");
|
|
3
|
+
|
|
4
|
+
/**
 * Compute the lowercase hexadecimal SHA-256 digest of a value's text form.
 *
 * Falsy inputs (null, undefined, "", 0, false) are hashed as the empty string;
 * every other input is converted with String() and encoded as UTF-8.
 *
 * @param {*} text - Value to hash.
 * @returns {string} 64-character hex digest.
 */
function sha256Hex(text) {
  const payload = text ? String(text) : "";
  const bytes = Buffer.from(payload, "utf8");
  const hasher = crypto.createHash("sha256");
  hasher.update(bytes);
  return hasher.digest("hex");
}
|
|
7
|
+
|
|
8
|
+
/**
 * Return the first selector of the given type from an annotation's anchor.
 *
 * @param {object} annotation - Annotation expected to carry `anchor.selectors` (an array).
 * @param {string} type - Selector `type` value to look for.
 * @returns {object|null} The first truthy selector whose `type` strictly equals
 *   `type`, or null when the annotation, its anchor, or the selector list is
 *   missing or nothing matches.
 */
function findSelector(annotation, type) {
  const selectors = annotation && annotation.anchor ? annotation.anchor.selectors : undefined;
  if (!Array.isArray(selectors)) return null;
  for (const candidate of selectors) {
    if (candidate && candidate.type === type) return candidate;
  }
  return null;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Build a "start-end" string key for a span.
 *
 * @param {{start: *, end: *}} span - Span object; must not be null/undefined.
 * @returns {string} `start` and `end` joined by a hyphen.
 */
function normalizeSpanKey(span) {
  const { start, end } = span;
  return `${start}-${end}`;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Produce a canonical copy of an assertion slot object.
 *
 * The result always has the keys actor, theme, attr, topic, location and
 * other, in that order. The five named slots are passed through
 * normalizeIds (missing slots become an empty list first). `other` is kept
 * only when it is an array; each entry is reduced to `{ role, mention_ids }`
 * and the entries are ordered by role, then by the JSON text of their
 * mention id list.
 *
 * @param {object|null|undefined} slots - Raw slot object.
 * @returns {{actor: Array, theme: Array, attr: Array, topic: Array, location: Array, other: Array}}
 */
function canonicalizeSlotObject(slots) {
  const source = slots || {};
  const canonical = {};
  // Insertion order here fixes the key order of the returned object.
  for (const slotName of ["actor", "theme", "attr", "topic", "location"]) {
    canonical[slotName] = normalizeIds(source[slotName] || []);
  }

  const toCanonicalEntry = (entry) => ({
    role: String((entry && entry.role) || ""),
    mention_ids: normalizeIds((entry && entry.mention_ids) || []),
  });
  const compareEntries = (left, right) => {
    if (left.role !== right.role) return left.role.localeCompare(right.role);
    return JSON.stringify(left.mention_ids).localeCompare(JSON.stringify(right.mention_ids));
  };

  const rawOther = source.other;
  canonical.other = Array.isArray(rawOther) ? rawOther.map(toCanonicalEntry).sort(compareEntries) : [];
  return canonical;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Serialize an object's own enumerable properties into a key-order-independent string.
 *
 * Keys are sorted, each is rendered as `key:<JSON of value>`, and the parts
 * are joined with "|". A falsy `obj` yields the empty string.
 *
 * @param {object|null|undefined} obj - Object to fingerprint.
 * @returns {string} Deterministic string for the object's top-level content.
 */
function stableObjectKey(obj) {
  const parts = [];
  for (const name of Object.keys(obj || {}).sort()) {
    parts.push(name + ":" + JSON.stringify(obj[name]));
  }
  return parts.join("|");
}
|
|
39
|
+
|
|
40
|
+
/**
 * Build the string used to order evidence items.
 *
 * The key is `from_token_id|to_token_id|label|id`, where `id` is
 * `relation_id` when truthy and otherwise `annotation_id`; any falsy field
 * contributes an empty segment.
 *
 * @param {object} e - Evidence item (must not be null/undefined).
 * @returns {string} Pipe-delimited sort key.
 */
function evidenceSortKey(e) {
  const fields = [e.from_token_id, e.to_token_id, e.label, e.relation_id || e.annotation_id];
  return fields.map((field) => `${field || ""}`).join("|");
}
|
|
43
|
+
|
|
44
|
+
/**
 * Remove duplicate evidence items and return them in a stable order.
 *
 * Two items are duplicates when stableObjectKey gives them the same string;
 * the first occurrence is kept. Survivors are ordered by comparing their
 * evidenceSortKey strings with localeCompare.
 *
 * @param {Array<object>|null|undefined} items - Evidence items; nullish is treated as empty.
 * @returns {Array<object>} New array holding the original item objects.
 */
function dedupeAndSortEvidence(items) {
  const seenKeys = new Set();
  const unique = [];
  for (const item of items || []) {
    const identity = stableObjectKey(item);
    if (seenKeys.has(identity)) continue;
    seenKeys.add(identity);
    unique.push(item);
  }
  unique.sort((left, right) => evidenceSortKey(left).localeCompare(evidenceSortKey(right)));
  return unique;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Reduce operators to a canonical, deterministically ordered list.
 *
 * Each operator is projected to `{ kind, value, token_id, group_id, evidence }`:
 * falsy value/token_id/group_id become undefined and evidence is passed
 * through dedupeAndSortEvidence. The projected list is ordered by kind,
 * value, token_id, group_id (each compared as a string, falsy as "") and
 * finally by the JSON text of the evidence list.
 *
 * @param {Array<object>|null|undefined} ops - Operators; nullish is treated as empty.
 * @returns {Array<object>} New array of projected operators.
 */
function canonicalizeOperatorsForHash(ops) {
  const orderedFields = ["kind", "value", "token_id", "group_id"];

  const project = (op) => ({
    kind: op.kind,
    value: op.value ? op.value : undefined,
    token_id: op.token_id ? op.token_id : undefined,
    group_id: op.group_id ? op.group_id : undefined,
    evidence: dedupeAndSortEvidence(op.evidence || []),
  });

  const compare = (left, right) => {
    for (const field of orderedFields) {
      const leftText = String(left[field] || "");
      const rightText = String(right[field] || "");
      if (leftText !== rightText) return leftText.localeCompare(rightText);
    }
    return JSON.stringify(left.evidence || []).localeCompare(JSON.stringify(right.evidence || []));
  };

  const projected = (ops || []).map(project);
  projected.sort(compare);
  return projected;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Deep-copy a value by round-tripping it through JSON text.
 *
 * Only what JSON can represent survives the copy (for example, properties
 * whose value is undefined are dropped).
 *
 * @param {*} value - JSON-serializable value.
 * @returns {*} Structurally equal copy sharing no object references with `value`.
 */
function deepCloneJson(value) {
  const serialized = JSON.stringify(value);
  return JSON.parse(serialized);
}
|
|
74
|
+
|
|
75
|
+
// Reason codes for unresolved items, listed from highest to lowest priority.
// pickReasonByPrecedence (diagnostics.js) walks this list in order and returns
// the first entry present among its candidates.
const UNRESOLVED_REASON_PRECEDENCE = ["predicate_invalid", "coord_type_missing", "operator_scope_open", "missing_relation", "projection_failed"];
|
|
82
|
+
|
|
83
|
+
module.exports = {
|
|
84
|
+
sha256Hex,
|
|
85
|
+
findSelector,
|
|
86
|
+
normalizeSpanKey,
|
|
87
|
+
normalizeIds,
|
|
88
|
+
canonicalizeSlotObject,
|
|
89
|
+
canonicalizeOperatorsForHash,
|
|
90
|
+
stableObjectKey,
|
|
91
|
+
deepCloneJson,
|
|
92
|
+
UNRESOLVED_REASON_PRECEDENCE,
|
|
93
|
+
evidenceSortKey,
|
|
94
|
+
dedupeAndSortEvidence,
|
|
95
|
+
};
|
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
const { UNRESOLVED_REASON_PRECEDENCE, normalizeIds, findSelector, dedupeAndSortEvidence } = require('./determinism');
|
|
2
|
+
const { hasPositiveWikiSignal, isSubjectRoleLabel, isCompareLabel, isQuantifierLabel } = require('./mentions');
|
|
3
|
+
const { normalizeOptionalString } = require('./strings');
|
|
4
|
+
const { annotationHasSource } = require('./upstream');
|
|
5
|
+
|
|
6
|
+
/**
 * Build the key under which operators are merged.
 *
 * The key is `kind|value|group_id|role`; falsy fields contribute an empty
 * segment.
 *
 * @param {object} op - Operator (must not be null/undefined).
 * @returns {string} Pipe-delimited identity key.
 */
function operatorIdentityKey(op) {
  const segments = ['kind', 'value', 'group_id', 'role'].map((field) => `${op[field] || ''}`);
  return segments.join('|');
}
|
|
9
|
+
|
|
10
|
+
/**
 * Add an operator to a map keyed by operator identity, merging evidence on collision.
 *
 * When no entry exists for the operator's identity key, a shallow copy of
 * `op` is stored with its evidence deduplicated and sorted. Otherwise only
 * the stored entry's evidence changes: it becomes the deduplicated, sorted
 * union of the stored and incoming evidence.
 *
 * @param {Map<string, object>} opMap - Accumulator, mutated in place.
 * @param {object} op - Operator to merge; `op` itself is not modified.
 * @returns {void}
 */
function mergeOperator(opMap, op) {
  const identity = operatorIdentityKey(op);
  const stored = opMap.get(identity);
  if (stored) {
    const combined = (stored.evidence || []).concat(op.evidence || []);
    stored.evidence = dedupeAndSortEvidence(combined);
    return;
  }
  const entry = { ...op };
  entry.evidence = dedupeAndSortEvidence(op.evidence || []);
  opMap.set(identity, entry);
}
|
|
22
|
+
|
|
23
|
+
/**
 * Choose the highest-priority reason among the candidates.
 *
 * @param {Array<string>|null|undefined} candidates - Candidate reason codes; falsy entries are ignored.
 * @returns {string} The first entry of UNRESOLVED_REASON_PRECEDENCE that
 *   appears among the candidates, or 'projection_failed' when none does.
 */
function pickReasonByPrecedence(candidates) {
  const present = (candidates || []).filter(Boolean);
  const winner = UNRESOLVED_REASON_PRECEDENCE.find((reason) => present.includes(reason));
  return winner === undefined ? 'projection_failed' : winner;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Decide why a mention is unresolved.
 *
 * Candidate reasons are collected from the lookup maps and the final reason
 * is chosen by pickReasonByPrecedence:
 *  - 'predicate_invalid'    predicate quality recorded for the mention is 'low'
 *  - 'coord_type_missing'   the mention has at least one entry in coordMissingTypeIdsByMention
 *  - 'operator_scope_open'  no role relations, but at least one operator relation
 *  - 'missing_relation'     no role relations
 *  - 'projection_failed'    at least one role relation
 *
 * @param {object} args
 * @param {string} args.mentionId - Mention being classified.
 * @param {Map<string, string>} args.predicateQualityByMentionId
 * @param {Map<string, Set<string>>} args.roleRelationIdsByMention
 * @param {Map<string, Set<string>>} args.operatorRelationIdsByMention
 * @param {Map<string, Set<string>>} args.coordMissingTypeIdsByMention
 * @returns {string} Selected reason code.
 */
function classifyUnresolvedReason({
  mentionId,
  predicateQualityByMentionId,
  roleRelationIdsByMention,
  operatorRelationIdsByMention,
  coordMissingTypeIdsByMention,
}) {
  const quality = predicateQualityByMentionId.get(mentionId);
  const roleIds = roleRelationIdsByMention.get(mentionId);
  const operatorIds = operatorRelationIdsByMention.get(mentionId);
  const coordTypeMissingIds = coordMissingTypeIdsByMention.get(mentionId);

  const lacksRoles = !roleIds || roleIds.size === 0;
  const hasRoles = Boolean(roleIds && roleIds.size > 0);

  const candidates = [];
  if (quality === 'low') candidates.push('predicate_invalid');
  if (coordTypeMissingIds && coordTypeMissingIds.size > 0) candidates.push('coord_type_missing');
  if (lacksRoles) {
    if (operatorIds && operatorIds.size > 0) candidates.push('operator_scope_open');
    candidates.push('missing_relation');
  }
  if (hasRoles) candidates.push('projection_failed');
  return pickReasonByPrecedence(candidates);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Build the sorted list of unresolved items (heads and attachments).
 *
 * Items come from three sources:
 *  1. mentions flagged in `unresolvedHeadMap`            -> kind 'unresolved_head'
 *  2. entries of `projectedUnresolved` whose mention is known to `mentionById`
 *                                                       -> kind 'unresolved_attachment'
 *  3. ids in `uncoveredPrimaryMentionIds` known to `mentionById`
 *                                                       -> kind 'unresolved_attachment'
 * Items sharing kind, segment, mention and reason are merged; their mention
 * ids, token ids and upstream relation ids are unioned. The reason for each
 * item is computed by classifyUnresolvedReason from lookup tables derived
 * from `assertions` and `projected`.
 *
 * Nullish `mentions` / `projectedUnresolved`, null entries inside them, and
 * mentions without `token_ids` are tolerated (treated as empty / skipped).
 *
 * @param {object} args
 * @param {Array<object>} [args.mentions] - Mentions with id, segment_id, token_ids, span.
 * @param {Map<string, *>} args.unresolvedHeadMap - Truthy value marks a mention id as an unresolved head.
 * @param {Array<object>} [args.projectedUnresolved] - Entries with mention_id, segment_id and optional relation.
 * @param {Map<string, object>} args.mentionById - Mention lookup by id.
 * @param {Array<object>} [args.assertions] - Assertions; read for predicate quality.
 * @param {Array<object>} [args.projected] - Projected relations.
 * @param {Iterable<string>} [args.uncoveredPrimaryMentionIds] - Mention ids without coverage.
 * @returns {Array<object>} Items ordered by segment_id, mention_id, kind, reason.
 */
function buildUnresolved({
  mentions,
  unresolvedHeadMap,
  projectedUnresolved,
  mentionById,
  assertions,
  projected,
  uncoveredPrimaryMentionIds,
}) {
  // Predicate mention id -> predicate_quality string reported on its assertion.
  const predicateQualityByMentionId = new Map();
  for (const a of assertions || []) {
    if (!a || !a.predicate || typeof a.predicate.mention_id !== 'string') continue;
    const q = a.diagnostics && typeof a.diagnostics.predicate_quality === 'string' ? a.diagnostics.predicate_quality : '';
    if (q) predicateQualityByMentionId.set(a.predicate.mention_id, q);
  }

  const roleLabels = new Set(['theme', 'patient', 'attribute', 'topic', 'location', 'recipient']);
  const operatorLabels = new Set(['modality', 'negation', 'coordination', 'complement_clause', 'purpose']);
  const roleRelationIdsByMention = new Map();
  const operatorRelationIdsByMention = new Map();
  const coordMissingTypeIdsByMention = new Map();

  // Record relationId under mentionId; silently ignores non-string or empty ids.
  function addRelId(map, mentionId, relationId) {
    if (typeof mentionId !== 'string' || mentionId.length === 0) return;
    if (typeof relationId !== 'string' || relationId.length === 0) return;
    if (!map.has(mentionId)) map.set(mentionId, new Set());
    map.get(mentionId).add(relationId);
  }

  for (const rel of projected || []) {
    if (!rel) continue;
    // Prefer relation_id, fall back to id.
    const relationId = typeof rel.relation_id === 'string' && rel.relation_id.length > 0
      ? rel.relation_id
      : (typeof rel.id === 'string' ? rel.id : '');
    const label = String(rel.label || '');
    if (roleLabels.has(label) || isSubjectRoleLabel(rel.label)) {
      addRelId(roleRelationIdsByMention, rel.head_mention_id, relationId);
    }
    if (
      operatorLabels.has(label) ||
      (rel.evidence && (rel.evidence.pattern === 'control_inherit_subject' || rel.evidence.pattern === 'control_propagation'))
    ) {
      addRelId(operatorRelationIdsByMention, rel.head_mention_id, relationId);
    }
    if (label === 'coordination') {
      const coordType = rel.evidence && (
        rel.evidence.coord_type ||
        rel.evidence.coordination_type ||
        rel.evidence.coordinator_type
      );
      if (!coordType) {
        // An untyped coordination is charged to both ends of the relation.
        addRelId(coordMissingTypeIdsByMention, rel.head_mention_id, relationId);
        addRelId(coordMissingTypeIdsByMention, rel.dep_mention_id, relationId);
      }
    }
  }

  const grouped = new Map();
  // Merge one observation into the group identified by kind|segment|mention|reason.
  const put = (kind, segment_id, mention_id, mention_ids, token_ids, span, upstreamRelationIds) => {
    const reason = classifyUnresolvedReason({
      mentionId: mention_id,
      predicateQualityByMentionId,
      roleRelationIdsByMention,
      operatorRelationIdsByMention,
      coordMissingTypeIdsByMention,
    });
    const k = `${kind}|${segment_id}|${mention_id}|${reason}`;
    if (!grouped.has(k)) {
      grouped.set(k, {
        kind,
        segment_id,
        mention_id,
        mention_ids: new Set(),
        reason,
        token_ids: new Set(),
        upstream_relation_ids: new Set(),
        span: span || null,
      });
    }
    const g = grouped.get(k);
    g.mention_ids.add(mention_id);
    for (const mid of mention_ids || []) {
      if (typeof mid === 'string' && mid.length > 0) g.mention_ids.add(mid);
    }
    // token_ids may be absent on a mention; treat that as "no tokens" rather than throwing.
    for (const t of token_ids || []) g.token_ids.add(t);
    for (const rid of upstreamRelationIds || []) {
      if (typeof rid === 'string' && rid.length > 0) g.upstream_relation_ids.add(rid);
    }
    if (!g.span && span) g.span = span;
  };

  for (const m of mentions || []) {
    if (!m || !unresolvedHeadMap.get(m.id)) continue;
    put('unresolved_head', m.segment_id, m.id, [m.id], m.token_ids, m.span, []);
  }

  for (const u of projectedUnresolved || []) {
    if (!u) continue;
    const m = mentionById.get(u.mention_id);
    if (!m) continue;
    const upstreamRelationIds = [];
    if (u.relation && typeof u.relation.relation_id === 'string' && u.relation.relation_id.length > 0) {
      upstreamRelationIds.push(u.relation.relation_id);
    } else if (u.relation && typeof u.relation.id === 'string' && u.relation.id.length > 0) {
      upstreamRelationIds.push(u.relation.id);
    }
    const mentionIds = [u.mention_id];
    if (u.relation && typeof u.relation.head_mention_id === 'string' && u.relation.head_mention_id.length > 0) {
      mentionIds.push(u.relation.head_mention_id);
    }
    if (u.relation && typeof u.relation.dep_mention_id === 'string' && u.relation.dep_mention_id.length > 0) {
      mentionIds.push(u.relation.dep_mention_id);
    }
    put('unresolved_attachment', u.segment_id, u.mention_id, mentionIds, m.token_ids, m.span, upstreamRelationIds);
  }

  for (const uncoveredId of uncoveredPrimaryMentionIds || []) {
    if (typeof uncoveredId !== 'string' || !uncoveredId) continue;
    const m = mentionById.get(uncoveredId);
    if (!m) continue;
    put('unresolved_attachment', m.segment_id, m.id, [m.id], m.token_ids, m.span, []);
  }

  const out = Array.from(grouped.values()).map((g) => ({
    kind: g.kind,
    segment_id: g.segment_id,
    mention_id: g.mention_id,
    mention_ids: normalizeIds(Array.from(g.mention_ids)),
    reason: g.reason,
    evidence: {
      token_ids: normalizeIds(Array.from(g.token_ids)),
      upstream_relation_ids: normalizeIds(Array.from(g.upstream_relation_ids)),
      span: g.span || undefined,
    },
  }));
  out.sort((a, b) => {
    if (a.segment_id !== b.segment_id) return a.segment_id.localeCompare(b.segment_id);
    if (a.mention_id !== b.mention_id) return a.mention_id.localeCompare(b.mention_id);
    if (a.kind !== b.kind) return a.kind.localeCompare(b.kind);
    return a.reason.localeCompare(b.reason);
  });
  return out;
}
|
|
189
|
+
|
|
190
|
+
/**
 * List lexical-verb assertions that have neither an actor argument nor a
 * projected subject-role relation headed by their predicate.
 *
 * @param {object} args
 * @param {Array<object>} [args.assertions] - Assertions to inspect.
 * @param {Array<object>} [args.projected] - Projected relations; only those whose
 *   label satisfies isSubjectRoleLabel are considered.
 * @returns {Array<object>} One 'missing_subject_role' record per gap, ordered by
 *   segment_id, assertion_id, predicate_mention_id.
 */
function buildSubjectRoleGaps({ assertions, projected }) {
  // Predicate mention id -> ids of subject-role relations headed by that mention.
  const subjectRelIdsByHead = new Map();
  (projected || []).forEach((rel) => {
    if (!rel || !isSubjectRoleLabel(rel.label)) return;
    const headId = String(rel.head_mention_id || '');
    if (!headId) return;
    let relIdSet = subjectRelIdsByHead.get(headId);
    if (!relIdSet) {
      relIdSet = new Set();
      subjectRelIdsByHead.set(headId, relIdSet);
    }
    const relationId = String(rel.relation_id || rel.id || '');
    if (relationId) relIdSet.add(relationId);
  });

  const gaps = [];
  for (const assertion of assertions || []) {
    const predicate = assertion && assertion.predicate;
    if (!predicate || typeof predicate.mention_id !== 'string') continue;

    const diagnostics = assertion.diagnostics || {};
    if (String(diagnostics.predicate_class || '') !== 'lexical_verb') continue;

    const actorEntries = (assertion.arguments || []).filter(
      (entry) => String((entry && entry.role) || '') === 'actor'
    );
    const actorIds = normalizeIds(actorEntries.flatMap((entry) => entry.mention_ids || []));
    if (actorIds.length > 0) continue;

    const predMentionId = predicate.mention_id;
    const subjectRelIds = normalizeIds(Array.from(subjectRelIdsByHead.get(predMentionId) || []));
    if (subjectRelIds.length > 0) continue;

    const headTokenId = String(predicate.head_token_id || '');
    gaps.push({
      segment_id: String(assertion.segment_id || ''),
      assertion_id: String(assertion.id || ''),
      predicate_mention_id: predMentionId,
      predicate_head_token_id: headTokenId,
      reason: 'missing_subject_role',
      evidence: {
        token_ids: normalizeIds([headTokenId].filter(Boolean)),
        upstream_relation_ids: [],
      },
    });
  }

  gaps.sort((left, right) => {
    for (const field of ['segment_id', 'assertion_id']) {
      if (left[field] !== right[field]) return left[field].localeCompare(right[field]);
    }
    return left.predicate_mention_id.localeCompare(right.predicate_mention_id);
  });
  return gaps;
}
|
|
230
|
+
|
|
231
|
+
/**
 * Assemble the run-level diagnostics summary.
 *
 * The summary covers: wiki/lexicon evidence counts, projected and dropped
 * relation counts, fragmentation statistics per segment, gap signals
 * (untyped coordination, comparative surface without a compare relation,
 * quantifier surface without a quantifier relation), coordination groups
 * gathered from assertion operators, subject-role gaps, suppressed
 * assertions, and a normalized warning list.
 *
 * Nullish `mentions`, `assertions` and `projectedBuild` are treated as
 * empty, and null entries inside `assertions` are skipped wherever an
 * assertion is dereferenced.
 *
 * @param {object} args
 * @param {{size: number}} args.tokenWikiById - Token wiki signal lookup; only `.size` is read.
 * @param {Array<object>} [args.mentions]
 * @param {Array<object>} [args.assertions]
 * @param {{projected?: Array<object>, dropped?: Array<object>}} [args.projectedBuild]
 * @param {{tokens?: Array<object>}} [args.relationsSeed] - Tokens supply segment_id and surface.
 * @param {*} [args.wtiEndpoint] - Passed through normalizeOptionalString to decide whether an endpoint is configured.
 * @param {Array<object>} [args.suppressedAssertions] - Returned as-is when it is an array.
 * @returns {object} Diagnostics summary.
 */
function buildDiagnostics({ tokenWikiById, mentions, assertions, projectedBuild, relationsSeed, wtiEndpoint, suppressedAssertions }) {
  const mentionList = mentions || [];
  const assertionList = assertions || [];
  const build = projectedBuild || {};
  const projected = build.projected || [];
  const dropped = build.dropped || [];
  const diagnosticsOf = (a) => ((a || {}).diagnostics) || {};

  const mentionWithLexicon = mentionList.filter(
    (m) => m && m.provenance && m.provenance.lexicon_evidence && typeof m.provenance.lexicon_evidence === 'object'
  ).length;
  const assertionsWithWiki = assertionList.filter(
    (a) => a && a.evidence && a.evidence.wiki_signals && typeof a.evidence.wiki_signals === 'object'
  ).length;

  const warnings = [];
  if (normalizeOptionalString(wtiEndpoint) && tokenWikiById.size === 0) {
    warnings.push('wti_configured_but_no_token_wiki_signals');
  }
  if (dropped.length > 0) {
    warnings.push('relation_projection_drops_present');
  }

  // A coordination relation counts as typed only when one of the three
  // evidence fields is a non-empty string.
  const isNonEmptyString = (v) => typeof v === 'string' && v.length > 0;
  const coordTypeMissing = projected.some((r) => {
    if (!r || String(r.label || '') !== 'coordination') return false;
    const ev = r.evidence || {};
    return !(isNonEmptyString(ev.coord_type) || isNonEmptyString(ev.coordination_type) || isNonEmptyString(ev.coordinator_type));
  });
  const compareRelPresent = projected.some((r) => isCompareLabel(r && r.label));
  const quantifierRelPresent = projected.some((r) => isQuantifierLabel(r && r.label));

  // Lower-cased token surfaces grouped by segment, used for the surface heuristics below.
  const segTokens = new Map();
  for (const t of (relationsSeed && relationsSeed.tokens) || []) {
    if (!t || typeof t.segment_id !== 'string') continue;
    if (!segTokens.has(t.segment_id)) segTokens.set(t.segment_id, []);
    segTokens.get(t.segment_id).push(String(t.surface || '').toLowerCase());
  }
  const surfacesBySegment = Array.from(segTokens.values());
  const comparativeSurfacePresent = surfacesBySegment.some((arr) =>
    arr.includes('than') && arr.some((s) => s === 'greater' || s === 'less' || s === 'more' || s === 'fewer')
  );
  const quantifierSurfaceSet = new Set(['each', 'every', 'all', 'some', 'no', 'only']);
  const quantifierSurfacePresent = surfacesBySegment.some((arr) => arr.some((s) => quantifierSurfaceSet.has(s)));

  const comparativeGap = comparativeSurfacePresent && !compareRelPresent;
  const quantifierScopeGap = quantifierSurfacePresent && !quantifierRelPresent;
  if (coordTypeMissing) warnings.push('coordination_type_missing');
  if (comparativeGap) warnings.push('comparative_gap');
  if (quantifierScopeGap) warnings.push('quantifier_scope_gap');

  // Coordination groups: group id -> type (first non-empty operator value seen) and member assertions.
  const coordGroups = new Map();
  for (const a of assertionList) {
    if (!a) continue;
    for (const op of (a.operators || [])) {
      if (!op || op.kind !== 'coordination_group' || typeof op.group_id !== 'string') continue;
      if (!coordGroups.has(op.group_id)) {
        coordGroups.set(op.group_id, {
          id: op.group_id,
          type: isNonEmptyString(op.value) ? op.value : null,
          member_assertion_ids: new Set(),
        });
      }
      const g = coordGroups.get(op.group_id);
      g.member_assertion_ids.add(a.id);
      if (!g.type && isNonEmptyString(op.value)) g.type = op.value;
    }
  }
  const coordinationGroups = Array.from(coordGroups.values())
    .map((g) => ({
      id: g.id,
      type: g.type,
      member_assertion_ids: normalizeIds(Array.from(g.member_assertion_ids)),
    }))
    .sort((a, b) => a.id.localeCompare(b.id));

  // Per-segment predicate statistics.
  const perSegmentMap = new Map();
  for (const a of assertionList) {
    if (!a || typeof a.segment_id !== 'string') continue;
    if (!perSegmentMap.has(a.segment_id)) {
      perSegmentMap.set(a.segment_id, {
        segment_id: a.segment_id,
        predicate_assertion_count: 0,
        lexical_verb_count: 0,
        tolerated_auxiliary_count: 0,
        structural_fragment_count: 0,
        clause_fragmentation_warning: false,
      });
    }
    const bucket = perSegmentMap.get(a.segment_id);
    bucket.predicate_assertion_count += 1;
    const predicateClass = String(diagnosticsOf(a).predicate_class || '');
    if (predicateClass === 'lexical_verb') bucket.lexical_verb_count += 1;
    if (predicateClass === 'auxiliary' || predicateClass === 'copula') bucket.tolerated_auxiliary_count += 1;
    if (diagnosticsOf(a).structural_fragment === true) bucket.structural_fragment_count += 1;
  }
  const perSegment = Array.from(perSegmentMap.values())
    .map((x) => ({
      ...x,
      // Warn when a segment has more predicate assertions than verbs plus tolerated auxiliaries/copulas.
      clause_fragmentation_warning:
        x.predicate_assertion_count > (x.lexical_verb_count + x.tolerated_auxiliary_count),
    }))
    .sort((a, b) => a.segment_id.localeCompare(b.segment_id));

  const structuralFragmentCount = perSegment.reduce((n, x) => n + x.structural_fragment_count, 0);
  const totalAssertions = assertionList.length;
  const noiseCount = assertionList.filter((a) => {
    const cls = String(diagnosticsOf(a).predicate_class || '');
    return cls === 'preposition' || cls === 'nominal_head';
  }).length;
  // Share of assertions whose predicate class is 'preposition' or 'nominal_head', rounded to 6 decimals.
  const predicateNoiseIndex = totalAssertions > 0 ? Number((noiseCount / totalAssertions).toFixed(6)) : 0;

  const subjectRoleGaps = buildSubjectRoleGaps({
    assertions: assertionList,
    projected,
  });

  return {
    token_wiki_signal_count: tokenWikiById.size,
    mentions_with_lexicon_evidence: mentionWithLexicon,
    assertions_with_wiki_signals: assertionsWithWiki,
    projected_relation_count: projected.length,
    dropped_relation_count: dropped.length,
    fragmentation: {
      structural_fragment_count: structuralFragmentCount,
      predicate_noise_index: predicateNoiseIndex,
      per_segment: perSegment,
    },
    gap_signals: {
      coordination_type_missing: coordTypeMissing,
      comparative_gap: comparativeGap,
      quantifier_scope_gap: quantifierScopeGap,
    },
    coordination_groups: coordinationGroups,
    subject_role_gaps: subjectRoleGaps,
    suppressed_assertions: Array.isArray(suppressedAssertions) ? suppressedAssertions : [],
    warnings: normalizeIds(warnings),
  };
}
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
/**
 * Walk a document and summarize every property whose name mentions wiki or lexicon data.
 *
 * A property matches when its lower-cased name contains 'wiki', 'wikipedia',
 * 'title_index' or 'lexicon'. Matches are grouped by their path with array
 * indices collapsed to `[]`; each group records how often the path occurred
 * and a JSON preview (at most 180 characters) of the first value seen.
 *
 * @param {*} inputDoc - Arbitrary JSON-like value; non-objects yield no results.
 * @returns {Array<{path: string, count: number, example: string}>} Groups ordered by path.
 */
function collectWikiFieldDiagnostics(inputDoc) {
  const keyFragments = ['wiki', 'wikipedia', 'title_index', 'lexicon'];
  const bucketsByPath = new Map();

  // Short printable preview of a value; falls back to String() when JSON cannot represent it.
  const preview = (value) => {
    let json;
    try {
      json = JSON.stringify(value);
    } catch (_) {
      return String(value);
    }
    if (typeof json !== 'string') return String(value);
    if (json.length > 180) return `${json.slice(0, 177)}...`;
    return json;
  };

  const record = (path, value) => {
    const genericPath = path.replace(/\[\d+\]/g, '[]');
    let bucket = bucketsByPath.get(genericPath);
    if (!bucket) {
      bucket = { path: genericPath, count: 0, example: '' };
    }
    bucket.count += 1;
    if (!bucket.example) bucket.example = preview(value);
    bucketsByPath.set(genericPath, bucket);
  };

  // Depth-first walk; a matching property is recorded before its own children are visited.
  const walk = (node, prefix) => {
    if (Array.isArray(node)) {
      for (const [index, item] of node.entries()) {
        walk(item, `${prefix}[${index}]`);
      }
      return;
    }
    if (!node || typeof node !== 'object') return;
    for (const key of Object.keys(node)) {
      const value = node[key];
      const path = prefix ? `${prefix}.${key}` : key;
      const loweredKey = String(key).toLowerCase();
      if (keyFragments.some((fragment) => loweredKey.includes(fragment))) {
        record(path, value);
      }
      walk(value, path);
    }
  };

  walk(inputDoc, '');
  const summaries = Array.from(bucketsByPath.values());
  summaries.sort((left, right) => left.path.localeCompare(right.path));
  return summaries;
}
|
|
400
|
+
|
|
401
|
+
/**
 * Report how much positive wiki evidence an upstream document carries.
 *
 * Mentions are synthesized as `token:<id>` for every token with a string id
 * and `mwe:<id>` for every accepted MWE annotation with a non-empty string id.
 * A token has evidence when hasPositiveWikiSignal accepts its
 * `lexicon.wikipedia_title_index`; an MWE has evidence when one of its
 * 'wikipedia-title-index' sources carries evidence that
 * hasPositiveWikiSignal accepts.
 *
 * Predicates are the head tokens of accepted dependency annotations sourced
 * from 'relation-extraction'. A head token covered by an MWE is attributed
 * to the first covering MWE, where MWEs are ranked by token count
 * (descending), then span start, then id.
 *
 * @param {object|null|undefined} inputDoc - Upstream document with `tokens` and `annotations`.
 * @returns {object} Counts, up to ten sample ids for each missing category, and
 *   the result of collectWikiFieldDiagnostics(inputDoc).
 */
function analyzeUpstreamWikiEvidence(inputDoc) {
  const doc = inputDoc || {};
  const tokenList = Array.isArray(doc.tokens) ? doc.tokens : [];
  const annotationList = Array.isArray(doc.annotations) ? doc.annotations : [];
  const isAcceptedOfKind = (ann, kind) => Boolean(ann) && ann.kind === kind && ann.status === 'accepted';
  const acceptedMwes = annotationList.filter((ann) => isAcceptedOfKind(ann, 'mwe'));

  // Mention id -> evidence flag, plus the order in which mentions were registered.
  const evidenceByMention = new Map();
  const mentionSequence = [];
  const registerMention = (mentionId, hasEvidence) => {
    mentionSequence.push(mentionId);
    evidenceByMention.set(mentionId, hasEvidence);
  };

  for (const token of tokenList) {
    if (!token || typeof token.id !== 'string') continue;
    const lexicon = token.lexicon;
    const titleIndex = lexicon && typeof lexicon === 'object' ? lexicon.wikipedia_title_index : null;
    registerMention(`token:${token.id}`, hasPositiveWikiSignal(titleIndex));
  }
  for (const mwe of acceptedMwes) {
    if (typeof mwe.id !== 'string' || !mwe.id) continue;
    const backedByWiki =
      Array.isArray(mwe.sources) &&
      mwe.sources.some((source) => source && source.name === 'wikipedia-title-index' && hasPositiveWikiSignal(source.evidence));
    registerMention(`mwe:${mwe.id}`, backedByWiki);
  }

  // Rank MWEs so that the preferred MWE for a token comes first in its list.
  const describeMwe = (mwe) => {
    const tokenSelector = findSelector(mwe, 'TokenSelector');
    const memberIds = tokenSelector && Array.isArray(tokenSelector.token_ids) ? normalizeIds(tokenSelector.token_ids) : [];
    const positionSelector = findSelector(mwe, 'TextPositionSelector');
    const hasStart = positionSelector && positionSelector.span && typeof positionSelector.span.start === 'number';
    return {
      id: typeof mwe.id === 'string' ? mwe.id : '',
      token_ids: memberIds,
      len: memberIds.length,
      spanStart: hasStart ? positionSelector.span.start : Number.MAX_SAFE_INTEGER,
    };
  };
  const rankMwes = (left, right) => {
    if (right.len !== left.len) return right.len - left.len;
    if (left.spanStart !== right.spanStart) return left.spanStart - right.spanStart;
    return left.id.localeCompare(right.id);
  };
  const rankedMwes = acceptedMwes
    .map(describeMwe)
    .filter((entry) => entry.id && entry.len > 0)
    .sort(rankMwes);

  const mweMentionsByToken = new Map();
  for (const { id, token_ids: memberIds } of rankedMwes) {
    for (const tokenId of memberIds) {
      const owners = mweMentionsByToken.get(tokenId) || [];
      owners.push(`mwe:${id}`);
      mweMentionsByToken.set(tokenId, owners);
    }
  }

  const predicateMentions = [];
  for (const dep of annotationList) {
    if (!isAcceptedOfKind(dep, 'dependency') || !annotationHasSource(dep, 'relation-extraction')) continue;
    const headTokenId = dep.head && typeof dep.head.id === 'string' ? dep.head.id : '';
    if (!headTokenId) continue;
    const owners = mweMentionsByToken.get(headTokenId) || [];
    predicateMentions.push(owners.length > 0 ? owners[0] : `token:${headTokenId}`);
  }
  const predicateIds = normalizeIds(predicateMentions);

  const hasEvidence = (id) => Boolean(evidenceByMention.get(id));
  const missingMentions = mentionSequence.filter((id) => !hasEvidence(id));
  const predicatesWith = predicateIds.filter((id) => hasEvidence(id));
  const predicatesWithout = predicateIds.filter((id) => !hasEvidence(id));

  return {
    evidence_definition: 'positive_signal_only',
    total_mentions: mentionSequence.length,
    mentions_with_wiki_evidence: mentionSequence.length - missingMentions.length,
    mentions_without_wiki_evidence: missingMentions.length,
    total_predicates: predicateIds.length,
    predicates_with_wiki_evidence: predicatesWith.length,
    predicates_without_wiki_evidence: predicatesWithout.length,
    sample_missing_mention_ids: missingMentions.slice(0, 10),
    sample_missing_predicate_ids: predicatesWithout.slice(0, 10),
    wiki_related_fields: collectWikiFieldDiagnostics(inputDoc),
  };
}
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
module.exports = {
|
|
487
|
+
operatorIdentityKey,
|
|
488
|
+
mergeOperator,
|
|
489
|
+
pickReasonByPrecedence,
|
|
490
|
+
classifyUnresolvedReason,
|
|
491
|
+
buildUnresolved,
|
|
492
|
+
buildSubjectRoleGaps,
|
|
493
|
+
buildDiagnostics,
|
|
494
|
+
collectWikiFieldDiagnostics,
|
|
495
|
+
analyzeUpstreamWikiEvidence,
|
|
496
|
+
};
|