@velvetmonkey/vault-core 2.0.143 → 2.0.145
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/data/google-10k-english.txt +9894 -0
- package/dist/common-words.d.ts +9 -0
- package/dist/common-words.js +14 -0
- package/dist/entities.js +16 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +4 -0
- package/dist/migrations.js +14 -0
- package/dist/schema.d.ts +2 -2
- package/dist/schema.js +3 -1
- package/dist/stemmer.d.ts +21 -0
- package/dist/stemmer.js +228 -0
- package/dist/wikilinks.js +125 -15
- package/package.json +2 -1
package/dist/wikilinks.js
CHANGED
|
@@ -9,28 +9,29 @@
|
|
|
9
9
|
* - Alias resolution for existing wikilinks (resolves [[alias]] to [[Entity|alias]])
|
|
10
10
|
*/
|
|
11
11
|
import { getProtectedZones, rangeOverlapsProtectedZone } from './protectedZones.js';
|
|
12
|
+
import { stem } from './stemmer.js';
|
|
12
13
|
/**
|
|
13
14
|
* Get all search terms for an entity (name + aliases)
|
|
14
15
|
* Returns an array of { term, entityName, isAlias } objects so each match links to its canonical entity
|
|
15
16
|
*/
|
|
16
17
|
function getSearchTerms(entity) {
|
|
17
18
|
if (typeof entity === 'string') {
|
|
18
|
-
return [{ term: entity, entityName: entity }];
|
|
19
|
+
return [{ term: entity, entityName: entity, isAlias: false }];
|
|
19
20
|
}
|
|
20
21
|
// Include the entity name and all aliases
|
|
21
22
|
const terms = [
|
|
22
|
-
{ term: entity.name, entityName: entity.name }
|
|
23
|
+
{ term: entity.name, entityName: entity.name, isAlias: false }
|
|
23
24
|
];
|
|
24
25
|
for (const alias of entity.aliases) {
|
|
25
|
-
terms.push({ term: alias, entityName: entity.name });
|
|
26
|
+
terms.push({ term: alias, entityName: entity.name, isAlias: true });
|
|
26
27
|
}
|
|
27
28
|
return terms;
|
|
28
29
|
}
|
|
29
30
|
/**
|
|
30
|
-
*
|
|
31
|
-
*
|
|
31
|
+
* Base set of common words to exclude from wikilink matching.
|
|
32
|
+
* Extended by IMPLICIT_EXCLUDE_WORDS to form the full EXCLUDE_WORDS set.
|
|
32
33
|
*/
|
|
33
|
-
const
|
|
34
|
+
const EXCLUDE_WORDS_BASE = new Set([
|
|
34
35
|
// Day names
|
|
35
36
|
'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
|
|
36
37
|
// Month names
|
|
@@ -227,7 +228,31 @@ const EXCLUDE_WORDS = new Set([
|
|
|
227
228
|
'view', 'village', 'voice', 'volume', 'wall', 'war', 'waste', 'water',
|
|
228
229
|
'wave', 'way', 'weather', 'weight', 'west', 'wind', 'window',
|
|
229
230
|
'winter', 'wood', 'word', 'worker', 'world', 'writing',
|
|
231
|
+
// Nationalities / demonyms
|
|
232
|
+
'american', 'british', 'french', 'german', 'chinese', 'japanese',
|
|
233
|
+
'indian', 'russian', 'australian', 'canadian', 'italian', 'spanish',
|
|
234
|
+
'dutch', 'swiss', 'irish', 'scottish', 'welsh', 'english',
|
|
235
|
+
'european', 'african', 'asian', 'brazilian', 'mexican', 'korean',
|
|
236
|
+
'turkish', 'polish', 'swedish', 'norwegian', 'danish', 'finnish',
|
|
237
|
+
// Multi-word production false positives
|
|
238
|
+
'front door', 'back door', 'side door',
|
|
230
239
|
]);
|
|
240
|
+
/**
|
|
241
|
+
* Unified EXCLUDE_WORDS: base set (300+) merged with IMPLICIT_EXCLUDE_WORDS (1100+).
|
|
242
|
+
* This ensures shouldExcludeEntity() checks all 1200+ common English words,
|
|
243
|
+
* not just the smaller base set. Fixes words like "phase", "tier", "recall"
|
|
244
|
+
* that were in IMPLICIT but not in the explicit matching path.
|
|
245
|
+
*
|
|
246
|
+
* Note: IMPLICIT_EXCLUDE_WORDS is defined later in this file.
|
|
247
|
+
* We use a lazy getter to avoid forward-reference issues.
|
|
248
|
+
*/
|
|
249
|
+
let _mergedExcludeWords = null;
|
|
250
|
+
function getMergedExcludeWords() {
|
|
251
|
+
if (!_mergedExcludeWords) {
|
|
252
|
+
_mergedExcludeWords = new Set([...EXCLUDE_WORDS_BASE, ...IMPLICIT_EXCLUDE_WORDS]);
|
|
253
|
+
}
|
|
254
|
+
return _mergedExcludeWords;
|
|
255
|
+
}
|
|
231
256
|
/**
|
|
232
257
|
* Escape special regex characters in a string
|
|
233
258
|
*/
|
|
@@ -237,15 +262,19 @@ function escapeRegex(str) {
|
|
|
237
262
|
/**
|
|
238
263
|
* Check if an entity should be excluded from wikilinking
|
|
239
264
|
*/
|
|
240
|
-
function shouldExcludeEntity(entity) {
|
|
265
|
+
function shouldExcludeEntity(entity, isAlias = false) {
|
|
241
266
|
// Skip single-char terms (e.g. alias "I" for Ben)
|
|
242
267
|
if (entity.length < 2)
|
|
243
268
|
return true;
|
|
244
|
-
if (
|
|
269
|
+
if (getMergedExcludeWords().has(entity.toLowerCase()))
|
|
245
270
|
return true;
|
|
246
271
|
// Skip all-lowercase hyphenated descriptors (e.g., self-improving, local-first); mixed-case forms such as Claude-native are not skipped by this check
|
|
247
272
|
if (entity.includes('-') && entity === entity.toLowerCase())
|
|
248
273
|
return true;
|
|
274
|
+
// Short aliases (≤3 chars) must be ALL-UPPERCASE to survive (e.g., "CI", "ML" ok, "api", "tF" blocked)
|
|
275
|
+
// Entity names like "Ben" (3 chars, mixed case) are unaffected since isAlias=false for names.
|
|
276
|
+
if (isAlias && entity.length <= 3 && entity !== entity.toUpperCase())
|
|
277
|
+
return true;
|
|
249
278
|
return false;
|
|
250
279
|
}
|
|
251
280
|
/**
|
|
@@ -290,13 +319,34 @@ export function applyWikilinks(content, entities, options = {}) {
|
|
|
290
319
|
linkedEntities: [],
|
|
291
320
|
};
|
|
292
321
|
}
|
|
322
|
+
// Detect ambiguous aliases — aliases claimed by multiple entities
|
|
323
|
+
// Skip these to avoid wrong entity resolution (same pattern as resolveAliasWikilinks)
|
|
324
|
+
const aliasCounts = new Map();
|
|
325
|
+
for (const entity of entities) {
|
|
326
|
+
if (typeof entity === 'string')
|
|
327
|
+
continue;
|
|
328
|
+
for (const alias of entity.aliases) {
|
|
329
|
+
const key = alias.toLowerCase();
|
|
330
|
+
const owners = aliasCounts.get(key) ?? new Set();
|
|
331
|
+
owners.add(entity.name);
|
|
332
|
+
aliasCounts.set(key, owners);
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
const ambiguousAliases = new Set();
|
|
336
|
+
for (const [key, owners] of aliasCounts) {
|
|
337
|
+
if (owners.size > 1)
|
|
338
|
+
ambiguousAliases.add(key);
|
|
339
|
+
}
|
|
293
340
|
// Build search terms from all entities (names + aliases)
|
|
294
341
|
// Each term maps back to its canonical entity name
|
|
295
342
|
const allSearchTerms = [];
|
|
296
343
|
for (const entity of entities) {
|
|
297
344
|
const terms = getSearchTerms(entity);
|
|
298
345
|
for (const t of terms) {
|
|
299
|
-
|
|
346
|
+
// Skip ambiguous aliases (shared by multiple entities)
|
|
347
|
+
if (t.isAlias && ambiguousAliases.has(t.term.toLowerCase()))
|
|
348
|
+
continue;
|
|
349
|
+
if (!shouldExcludeEntity(t.term, t.isAlias)) {
|
|
300
350
|
allSearchTerms.push(t);
|
|
301
351
|
}
|
|
302
352
|
}
|
|
@@ -314,10 +364,12 @@ export function applyWikilinks(content, entities, options = {}) {
|
|
|
314
364
|
// Also need to handle overlapping matches between different entities
|
|
315
365
|
// First, collect ALL valid matches for each entity (name + aliases combined)
|
|
316
366
|
const entityAllMatches = new Map();
|
|
317
|
-
for (const { term, entityName } of allSearchTerms) {
|
|
367
|
+
for (const { term, entityName, isAlias } of allSearchTerms) {
|
|
318
368
|
const entityKey = entityName.toLowerCase();
|
|
319
|
-
//
|
|
320
|
-
|
|
369
|
+
// Short uppercase aliases (≤4 chars, all-caps) match case-sensitively
|
|
370
|
+
// so "CI" matches "CI" but not "ci" or "Ci"
|
|
371
|
+
const useCaseInsensitive = !(isAlias && term.length <= 4 && term === term.toUpperCase());
|
|
372
|
+
const matches = findEntityMatches(result, term, useCaseInsensitive ? caseInsensitive : false);
|
|
321
373
|
// Filter out matches in protected zones
|
|
322
374
|
const validMatches = matches.filter(match => !rangeOverlapsProtectedZone(match.start, match.end, zones));
|
|
323
375
|
if (validMatches.length === 0) {
|
|
@@ -406,6 +458,63 @@ export function applyWikilinks(content, entities, options = {}) {
|
|
|
406
458
|
linkedEntities.push(entityName);
|
|
407
459
|
}
|
|
408
460
|
}
|
|
461
|
+
// Stemmed matching pass: for single-word entities (≥4 chars) that didn't match
|
|
462
|
+
// exactly, find content words with the same Porter stem and link them.
|
|
463
|
+
// This eliminates the need for explicit morphological aliases
|
|
464
|
+
// (e.g., Pipelines matches "Pipeline", Sprint matches "Sprinting").
|
|
465
|
+
for (const entity of entities) {
|
|
466
|
+
if (typeof entity === 'string')
|
|
467
|
+
continue;
|
|
468
|
+
const entityName = entity.name;
|
|
469
|
+
if (selectedEntityNames.has(entityName.toLowerCase()))
|
|
470
|
+
continue;
|
|
471
|
+
// Only single-word entities ≥4 chars — multi-word needs exact matching
|
|
472
|
+
if (entityName.includes(' ') || entityName.length < 4)
|
|
473
|
+
continue;
|
|
474
|
+
if (shouldExcludeEntity(entityName))
|
|
475
|
+
continue;
|
|
476
|
+
const entityStem = stem(entityName);
|
|
477
|
+
// Find word-boundary matches in content for words with same stem
|
|
478
|
+
const wordPattern = /\b[A-Za-z]{4,}\b/g;
|
|
479
|
+
let wordMatch;
|
|
480
|
+
let bestStemMatch = null;
|
|
481
|
+
while ((wordMatch = wordPattern.exec(result)) !== null) {
|
|
482
|
+
const word = wordMatch[0];
|
|
483
|
+
if (stem(word) !== entityStem)
|
|
484
|
+
continue;
|
|
485
|
+
// Skip if same as entity name (already tried in exact pass)
|
|
486
|
+
if (word.toLowerCase() === entityName.toLowerCase())
|
|
487
|
+
continue;
|
|
488
|
+
const start = wordMatch.index;
|
|
489
|
+
const end = start + word.length;
|
|
490
|
+
// Must not be in a protected zone
|
|
491
|
+
if (rangeOverlapsProtectedZone(start, end, zones))
|
|
492
|
+
continue;
|
|
493
|
+
// Skip words directly adjacent to a bracket character: ( ) [ ] { }
|
|
494
|
+
const charBefore = start > 0 ? result[start - 1] : '';
|
|
495
|
+
const charAfter = end < result.length ? result[end] : '';
|
|
496
|
+
if ('()[]{}'.includes(charBefore) || '()[]{}'.includes(charAfter))
|
|
497
|
+
continue;
|
|
498
|
+
bestStemMatch = { start, end, matched: word };
|
|
499
|
+
break; // First occurrence only
|
|
500
|
+
}
|
|
501
|
+
if (bestStemMatch) {
|
|
502
|
+
const wikilink = `[[${entityName}|${bestStemMatch.matched}]]`;
|
|
503
|
+
result = result.slice(0, bestStemMatch.start) + wikilink + result.slice(bestStemMatch.end);
|
|
504
|
+
const shift = wikilink.length - bestStemMatch.matched.length;
|
|
505
|
+
zones = zones.map(zone => ({
|
|
506
|
+
...zone,
|
|
507
|
+
start: zone.start <= bestStemMatch.start ? zone.start : zone.start + shift,
|
|
508
|
+
end: zone.end <= bestStemMatch.start ? zone.end : zone.end + shift,
|
|
509
|
+
}));
|
|
510
|
+
zones.push({ start: bestStemMatch.start, end: bestStemMatch.start + wikilink.length, type: 'wikilink' });
|
|
511
|
+
zones.sort((a, b) => a.start - b.start);
|
|
512
|
+
linksAdded++;
|
|
513
|
+
if (!linkedEntities.includes(entityName)) {
|
|
514
|
+
linkedEntities.push(entityName);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
}
|
|
409
518
|
}
|
|
410
519
|
else {
|
|
411
520
|
// For all occurrences mode, process each term
|
|
@@ -473,7 +582,7 @@ export function suggestWikilinks(content, entities, options = {}) {
|
|
|
473
582
|
for (const entity of entities) {
|
|
474
583
|
const terms = getSearchTerms(entity);
|
|
475
584
|
for (const t of terms) {
|
|
476
|
-
if (!shouldExcludeEntity(t.term)) {
|
|
585
|
+
if (!shouldExcludeEntity(t.term, t.isAlias)) {
|
|
477
586
|
allSearchTerms.push(t);
|
|
478
587
|
}
|
|
479
588
|
}
|
|
@@ -486,9 +595,10 @@ export function suggestWikilinks(content, entities, options = {}) {
|
|
|
486
595
|
// For firstOccurrenceOnly mode, find the earliest match across all terms
|
|
487
596
|
// for each entity, similar to applyWikilinks behavior
|
|
488
597
|
const entityAllMatches = new Map();
|
|
489
|
-
for (const { term, entityName } of allSearchTerms) {
|
|
598
|
+
for (const { term, entityName, isAlias } of allSearchTerms) {
|
|
490
599
|
const entityKey = entityName.toLowerCase();
|
|
491
|
-
const
|
|
600
|
+
const useCaseInsensitive = !(isAlias && term.length <= 4 && term === term.toUpperCase());
|
|
601
|
+
const matches = findEntityMatches(content, term, useCaseInsensitive ? caseInsensitive : false);
|
|
492
602
|
// Filter out matches in protected zones
|
|
493
603
|
const validMatches = matches.filter(match => !rangeOverlapsProtectedZone(match.start, match.end, zones));
|
|
494
604
|
if (validMatches.length === 0)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@velvetmonkey/vault-core",
|
|
3
|
-
"version": "2.0.
|
|
3
|
+
"version": "2.0.145",
|
|
4
4
|
"description": "Shared vault utilities for Flywheel ecosystem (entity scanning, wikilinks, protected zones)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -59,6 +59,7 @@
|
|
|
59
59
|
"license": "AGPL-3.0-only",
|
|
60
60
|
"files": [
|
|
61
61
|
"dist",
|
|
62
|
+
"data",
|
|
62
63
|
"!dist/**/*.js.map",
|
|
63
64
|
"!dist/**/*.d.ts.map",
|
|
64
65
|
"README.md"
|