memory-braid 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/entities.ts +90 -6
package/package.json
CHANGED
package/src/entities.ts
CHANGED
|
@@ -11,6 +11,8 @@ type NerRecord = {
|
|
|
11
11
|
entity_group?: unknown;
|
|
12
12
|
entity?: unknown;
|
|
13
13
|
score?: unknown;
|
|
14
|
+
start?: unknown;
|
|
15
|
+
end?: unknown;
|
|
14
16
|
};
|
|
15
17
|
|
|
16
18
|
export type ExtractedEntity = {
|
|
@@ -79,6 +81,76 @@ function normalizeEntityText(raw: unknown): string {
|
|
|
79
81
|
return normalizeWhitespace(raw.replace(/^##/, "").replace(/^▁/, ""));
|
|
80
82
|
}
|
|
81
83
|
|
|
84
|
+
/**
 * Intermediate shape for one recognized entity piece, collected before
 * adjacent pieces are collapsed and the results are de-duplicated.
 */
type NormalizedEntityToken = {
  /** Entity text for this piece (later concatenated with its neighbours when merged). */
  text: string;
  /** Entity category, using the same type as `ExtractedEntity["type"]`. */
  type: ExtractedEntity["type"];
  /** Score attached to this piece; a merged piece keeps the minimum of its parts. */
  score: number;
  /**
   * Start offset of the piece, when the source record carried a finite number.
   * NOTE(review): presumably a character offset into the analysed text — confirm.
   */
  start?: number;
  /**
   * End offset of the piece, when the source record carried a finite number.
   * NOTE(review): presumably exclusive, since a gap of 0 is treated as "touching" — confirm.
   */
  end?: number;
};
|
|
91
|
+
|
|
92
|
+
/**
 * Narrows an arbitrary value to a finite number.
 *
 * @param value - Any value, typically a field read from an untyped record.
 * @returns `value` unchanged when it is of type `number` and is neither `NaN`
 *   nor `±Infinity`; `undefined` for everything else (including numeric strings).
 */
function asFiniteNumber(value: unknown): number | undefined {
  return typeof value === "number" && Number.isFinite(value) ? value : undefined;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Concatenates the text of two entity pieces, choosing the separator from
 * their offsets.
 *
 * @param left - The earlier piece; its `end` offset is consulted.
 * @param right - The later piece; its `start` offset is consulted.
 * @returns `left.text` immediately followed by `right.text` when both offsets
 *   are numbers and `right.start - left.end` is zero or negative; otherwise
 *   the two texts separated by a single space (this includes the case where
 *   either offset is missing).
 */
function joinEntityText(left: NormalizedEntityToken, right: NormalizedEntityToken): string {
  const { end: leftEnd } = left;
  const { start: rightStart } = right;

  // Pieces that touch or overlap are glued together without a space.
  const touching =
    typeof leftEnd === "number" && typeof rightStart === "number" && rightStart - leftEnd <= 0;

  const separator = touching ? "" : " ";
  return left.text + separator + right.text;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Decides whether two consecutive entity pieces belong to the same entity
 * mention and should be collapsed into one.
 *
 * Rules, in order:
 * 1. Pieces of different `type`, or with empty `text` on either side, never merge.
 * 2. When `left.end` and `right.start` are both numbers, the pieces merge only
 *    if the gap `right.start - left.end` is 0 or 1 (overlaps and larger gaps
 *    are rejected).
 * 3. Without usable offsets, the pieces merge unless `left.text` ends with, or
 *    `right.text` starts with, one of `. , ! ? ; :`.
 *
 * @param left - The earlier piece.
 * @param right - The piece that directly follows `left` in the input.
 * @returns `true` when the two pieces should be merged.
 */
function shouldMergeEntityTokens(left: NormalizedEntityToken, right: NormalizedEntityToken): boolean {
  const sameType = left.type === right.type;
  const bothHaveText = Boolean(left.text) && Boolean(right.text);
  if (!(sameType && bothHaveText)) {
    return false;
  }

  const { end: leftEnd } = left;
  const { start: rightStart } = right;
  if (typeof leftEnd === "number" && typeof rightStart === "number") {
    // Offsets are authoritative when present: adjacent (0) or one apart (1).
    const gap = rightStart - leftEnd;
    return gap >= 0 && gap <= 1;
  }

  // Fallback heuristic: punctuation at the seam marks a boundary.
  const punctuationAtSeam = /[.,!?;:]$/.test(left.text) || /^[.,!?;:]/.test(right.text);
  return !punctuationAtSeam;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Collapses runs of consecutive entity pieces that `shouldMergeEntityTokens`
 * accepts into single entries.
 *
 * The input is never mutated and the result never aliases it: every entry in
 * the returned array is a fresh shallow copy, including for empty and
 * single-element inputs (previously those were returned by reference, so a
 * caller mutating the result would silently modify its input).
 *
 * For each merged run:
 * - `text` is the joined text of the parts, passed through `normalizeWhitespace`;
 * - `score` is the minimum score of the parts;
 * - `start` is the first numeric `start` seen in the run;
 * - `end` is the last numeric `end` seen in the run.
 *
 * @param tokens - Entity pieces in their original order.
 * @returns A new array with mergeable neighbours collapsed.
 */
function collapseAdjacentEntityTokens(tokens: NormalizedEntityToken[]): NormalizedEntityToken[] {
  const collapsed: NormalizedEntityToken[] = [];

  for (const token of tokens) {
    const previous = collapsed[collapsed.length - 1];
    if (!previous || !shouldMergeEntityTokens(previous, token)) {
      // Copy so that later in-place merging never touches the caller's objects.
      collapsed.push({ ...token });
      continue;
    }

    // Join the text first: joinEntityText reads previous.end, which is
    // overwritten below.
    previous.text = normalizeWhitespace(joinEntityText(previous, token));
    previous.score = Math.min(previous.score, token.score);
    previous.start = typeof previous.start === "number" ? previous.start : token.start;
    previous.end = typeof token.end === "number" ? token.end : previous.end;
  }

  return collapsed;
}
|
|
153
|
+
|
|
82
154
|
/** Optional settings accepted by the entity-extraction code in this module. */
type EntityExtractionOptions = {
  /** NOTE(review): presumably a directory path for persisted state — confirm against the consumer. */
  stateDir?: string;
};
|
|
@@ -319,7 +391,7 @@ export class EntityExtractionManager {
|
|
|
319
391
|
});
|
|
320
392
|
const rows = Array.isArray(raw) ? raw : [];
|
|
321
393
|
|
|
322
|
-
const
|
|
394
|
+
const normalized: NormalizedEntityToken[] = [];
|
|
323
395
|
for (const row of rows) {
|
|
324
396
|
if (!row || typeof row !== "object") {
|
|
325
397
|
continue;
|
|
@@ -335,13 +407,25 @@ export class EntityExtractionManager {
|
|
|
335
407
|
}
|
|
336
408
|
|
|
337
409
|
const type = normalizeEntityType(record.entity_group ?? record.entity);
|
|
338
|
-
|
|
410
|
+
normalized.push({
|
|
411
|
+
text: entityText,
|
|
412
|
+
type,
|
|
413
|
+
score,
|
|
414
|
+
start: asFiniteNumber(record.start),
|
|
415
|
+
end: asFiniteNumber(record.end),
|
|
416
|
+
});
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
const collapsed = collapseAdjacentEntityTokens(normalized);
|
|
420
|
+
const deduped = new Map<string, ExtractedEntity>();
|
|
421
|
+
for (const token of collapsed) {
|
|
422
|
+
const canonicalUri = buildCanonicalEntityUri(token.type, token.text);
|
|
339
423
|
const current = deduped.get(canonicalUri);
|
|
340
|
-
if (!current || score > current.score) {
|
|
424
|
+
if (!current || token.score > current.score) {
|
|
341
425
|
deduped.set(canonicalUri, {
|
|
342
|
-
text:
|
|
343
|
-
type,
|
|
344
|
-
score,
|
|
426
|
+
text: token.text,
|
|
427
|
+
type: token.type,
|
|
428
|
+
score: token.score,
|
|
345
429
|
canonicalUri,
|
|
346
430
|
});
|
|
347
431
|
}
|