@velvetmonkey/vault-core 2.0.142 → 2.0.144

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/wikilinks.js CHANGED
@@ -9,28 +9,29 @@
9
9
  * - Alias resolution for existing wikilinks (resolves [[alias]] to [[Entity|alias]])
10
10
  */
11
11
  import { getProtectedZones, rangeOverlapsProtectedZone } from './protectedZones.js';
12
+ import { stem } from './stemmer.js';
12
13
  /**
13
14
  * Get all search terms for an entity (name + aliases)
14
15
  * Returns tuples of [searchTerm, entityName] for proper linking
15
16
  */
16
17
  function getSearchTerms(entity) {
17
18
  if (typeof entity === 'string') {
18
- return [{ term: entity, entityName: entity }];
19
+ return [{ term: entity, entityName: entity, isAlias: false }];
19
20
  }
20
21
  // Include the entity name and all aliases
21
22
  const terms = [
22
- { term: entity.name, entityName: entity.name }
23
+ { term: entity.name, entityName: entity.name, isAlias: false }
23
24
  ];
24
25
  for (const alias of entity.aliases) {
25
- terms.push({ term: alias, entityName: entity.name });
26
+ terms.push({ term: alias, entityName: entity.name, isAlias: true });
26
27
  }
27
28
  return terms;
28
29
  }
29
30
  /**
30
- * Common words to exclude from wikilink matching.
31
- * These words are never wikified even when they match entity names or aliases.
31
+ * Base set of common words to exclude from wikilink matching.
32
+ * Extended by IMPLICIT_EXCLUDE_WORDS to form the full EXCLUDE_WORDS set.
32
33
  */
33
- const EXCLUDE_WORDS = new Set([
34
+ const EXCLUDE_WORDS_BASE = new Set([
34
35
  // Day names
35
36
  'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
36
37
  // Month names
@@ -227,7 +228,31 @@ const EXCLUDE_WORDS = new Set([
227
228
  'view', 'village', 'voice', 'volume', 'wall', 'war', 'waste', 'water',
228
229
  'wave', 'way', 'weather', 'weight', 'west', 'wind', 'window',
229
230
  'winter', 'wood', 'word', 'worker', 'world', 'writing',
231
+ // Nationalities / demonyms
232
+ 'american', 'british', 'french', 'german', 'chinese', 'japanese',
233
+ 'indian', 'russian', 'australian', 'canadian', 'italian', 'spanish',
234
+ 'dutch', 'swiss', 'irish', 'scottish', 'welsh', 'english',
235
+ 'european', 'african', 'asian', 'brazilian', 'mexican', 'korean',
236
+ 'turkish', 'polish', 'swedish', 'norwegian', 'danish', 'finnish',
237
+ // Multi-word production false positives
238
+ 'front door', 'back door', 'side door',
230
239
  ]);
240
+ /**
241
+ * Unified EXCLUDE_WORDS: base set (300+) merged with IMPLICIT_EXCLUDE_WORDS (1100+).
242
+ * This ensures shouldExcludeEntity() checks all 1200+ common English words,
243
+ * not just the smaller base set. Fixes words like "phase", "tier", "recall"
244
+ * that were in IMPLICIT but not in the explicit matching path.
245
+ *
246
+ * Note: IMPLICIT_EXCLUDE_WORDS is defined later in this file.
247
+ * We use a lazy getter to avoid forward-reference issues.
248
+ */
249
+ let _mergedExcludeWords = null;
250
+ function getMergedExcludeWords() {
251
+ if (!_mergedExcludeWords) {
252
+ _mergedExcludeWords = new Set([...EXCLUDE_WORDS_BASE, ...IMPLICIT_EXCLUDE_WORDS]);
253
+ }
254
+ return _mergedExcludeWords;
255
+ }
231
256
  /**
232
257
  * Escape special regex characters in a string
233
258
  */
@@ -237,15 +262,19 @@ function escapeRegex(str) {
237
262
  /**
238
263
  * Check if an entity should be excluded from wikilinking
239
264
  */
240
- function shouldExcludeEntity(entity) {
265
+ function shouldExcludeEntity(entity, isAlias = false) {
241
266
  // Skip single-char terms (e.g. alias "I" for Ben)
242
267
  if (entity.length < 2)
243
268
  return true;
244
- if (EXCLUDE_WORDS.has(entity.toLowerCase()))
269
+ if (getMergedExcludeWords().has(entity.toLowerCase()))
245
270
  return true;
246
271
  // Skip lowercase hyphenated descriptors (e.g., self-improving, local-first, Claude-native)
247
272
  if (entity.includes('-') && entity === entity.toLowerCase())
248
273
  return true;
274
+ // Short aliases (≤3 chars) must be ALL-UPPERCASE to survive (e.g., "CI", "ML" ok, "api", "tF" blocked)
275
+ // Entity names like "Ben" (3 chars, mixed case) are unaffected since isAlias=false for names.
276
+ if (isAlias && entity.length <= 3 && entity !== entity.toUpperCase())
277
+ return true;
249
278
  return false;
250
279
  }
251
280
  /**
@@ -290,13 +319,34 @@ export function applyWikilinks(content, entities, options = {}) {
290
319
  linkedEntities: [],
291
320
  };
292
321
  }
322
+ // Detect ambiguous aliases — aliases claimed by multiple entities
323
+ // Skip these to avoid wrong entity resolution (same pattern as resolveAliasWikilinks)
324
+ const aliasCounts = new Map();
325
+ for (const entity of entities) {
326
+ if (typeof entity === 'string')
327
+ continue;
328
+ for (const alias of entity.aliases) {
329
+ const key = alias.toLowerCase();
330
+ const owners = aliasCounts.get(key) ?? new Set();
331
+ owners.add(entity.name);
332
+ aliasCounts.set(key, owners);
333
+ }
334
+ }
335
+ const ambiguousAliases = new Set();
336
+ for (const [key, owners] of aliasCounts) {
337
+ if (owners.size > 1)
338
+ ambiguousAliases.add(key);
339
+ }
293
340
  // Build search terms from all entities (names + aliases)
294
341
  // Each term maps back to its canonical entity name
295
342
  const allSearchTerms = [];
296
343
  for (const entity of entities) {
297
344
  const terms = getSearchTerms(entity);
298
345
  for (const t of terms) {
299
- if (!shouldExcludeEntity(t.term)) {
346
+ // Skip ambiguous aliases (shared by multiple entities)
347
+ if (t.isAlias && ambiguousAliases.has(t.term.toLowerCase()))
348
+ continue;
349
+ if (!shouldExcludeEntity(t.term, t.isAlias)) {
300
350
  allSearchTerms.push(t);
301
351
  }
302
352
  }
@@ -314,10 +364,12 @@ export function applyWikilinks(content, entities, options = {}) {
314
364
  // Also need to handle overlapping matches between different entities
315
365
  // First, collect ALL valid matches for each entity (name + aliases combined)
316
366
  const entityAllMatches = new Map();
317
- for (const { term, entityName } of allSearchTerms) {
367
+ for (const { term, entityName, isAlias } of allSearchTerms) {
318
368
  const entityKey = entityName.toLowerCase();
319
- // Find all matches of the search term
320
- const matches = findEntityMatches(result, term, caseInsensitive);
369
+ // Short uppercase aliases (≤4 chars, all-caps) match case-sensitively
370
+ // so "CI" matches "CI" but not "ci" or "Ci"
371
+ const useCaseInsensitive = !(isAlias && term.length <= 4 && term === term.toUpperCase());
372
+ const matches = findEntityMatches(result, term, useCaseInsensitive ? caseInsensitive : false);
321
373
  // Filter out matches in protected zones
322
374
  const validMatches = matches.filter(match => !rangeOverlapsProtectedZone(match.start, match.end, zones));
323
375
  if (validMatches.length === 0) {
@@ -406,6 +458,63 @@ export function applyWikilinks(content, entities, options = {}) {
406
458
  linkedEntities.push(entityName);
407
459
  }
408
460
  }
461
+ // Stemmed matching pass: for single-word entities (≥4 chars) that didn't match
462
+ // exactly, find content words with the same Porter stem and link them.
463
+ // This eliminates the need for explicit morphological aliases
464
+ // (e.g., Pipelines matches "Pipeline", Sprint matches "Sprinting").
465
+ for (const entity of entities) {
466
+ if (typeof entity === 'string')
467
+ continue;
468
+ const entityName = entity.name;
469
+ if (selectedEntityNames.has(entityName.toLowerCase()))
470
+ continue;
471
+ // Only single-word entities ≥4 chars — multi-word needs exact matching
472
+ if (entityName.includes(' ') || entityName.length < 4)
473
+ continue;
474
+ if (shouldExcludeEntity(entityName))
475
+ continue;
476
+ const entityStem = stem(entityName);
477
+ // Find word-boundary matches in content for words with same stem
478
+ const wordPattern = /\b[A-Za-z]{4,}\b/g;
479
+ let wordMatch;
480
+ let bestStemMatch = null;
481
+ while ((wordMatch = wordPattern.exec(result)) !== null) {
482
+ const word = wordMatch[0];
483
+ if (stem(word) !== entityStem)
484
+ continue;
485
+ // Skip if same as entity name (already tried in exact pass)
486
+ if (word.toLowerCase() === entityName.toLowerCase())
487
+ continue;
488
+ const start = wordMatch.index;
489
+ const end = start + word.length;
490
+ // Must not be in a protected zone
491
+ if (rangeOverlapsProtectedZone(start, end, zones))
492
+ continue;
493
+ // Check bracket chars
494
+ const charBefore = start > 0 ? result[start - 1] : '';
495
+ const charAfter = end < result.length ? result[end] : '';
496
+ if ('()[]{}'.includes(charBefore) || '()[]{}'.includes(charAfter))
497
+ continue;
498
+ bestStemMatch = { start, end, matched: word };
499
+ break; // First occurrence only
500
+ }
501
+ if (bestStemMatch) {
502
+ const wikilink = `[[${entityName}|${bestStemMatch.matched}]]`;
503
+ result = result.slice(0, bestStemMatch.start) + wikilink + result.slice(bestStemMatch.end);
504
+ const shift = wikilink.length - bestStemMatch.matched.length;
505
+ zones = zones.map(zone => ({
506
+ ...zone,
507
+ start: zone.start <= bestStemMatch.start ? zone.start : zone.start + shift,
508
+ end: zone.end <= bestStemMatch.start ? zone.end : zone.end + shift,
509
+ }));
510
+ zones.push({ start: bestStemMatch.start, end: bestStemMatch.start + wikilink.length, type: 'wikilink' });
511
+ zones.sort((a, b) => a.start - b.start);
512
+ linksAdded++;
513
+ if (!linkedEntities.includes(entityName)) {
514
+ linkedEntities.push(entityName);
515
+ }
516
+ }
517
+ }
409
518
  }
410
519
  else {
411
520
  // For all occurrences mode, process each term
@@ -473,7 +582,7 @@ export function suggestWikilinks(content, entities, options = {}) {
473
582
  for (const entity of entities) {
474
583
  const terms = getSearchTerms(entity);
475
584
  for (const t of terms) {
476
- if (!shouldExcludeEntity(t.term)) {
585
+ if (!shouldExcludeEntity(t.term, t.isAlias)) {
477
586
  allSearchTerms.push(t);
478
587
  }
479
588
  }
@@ -486,9 +595,10 @@ export function suggestWikilinks(content, entities, options = {}) {
486
595
  // For firstOccurrenceOnly mode, find the earliest match across all terms
487
596
  // for each entity, similar to applyWikilinks behavior
488
597
  const entityAllMatches = new Map();
489
- for (const { term, entityName } of allSearchTerms) {
598
+ for (const { term, entityName, isAlias } of allSearchTerms) {
490
599
  const entityKey = entityName.toLowerCase();
491
- const matches = findEntityMatches(content, term, caseInsensitive);
600
+ const useCaseInsensitive = !(isAlias && term.length <= 4 && term === term.toUpperCase());
601
+ const matches = findEntityMatches(content, term, useCaseInsensitive ? caseInsensitive : false);
492
602
  // Filter out matches in protected zones
493
603
  const validMatches = matches.filter(match => !rangeOverlapsProtectedZone(match.start, match.end, zones));
494
604
  if (validMatches.length === 0)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@velvetmonkey/vault-core",
3
- "version": "2.0.142",
3
+ "version": "2.0.144",
4
4
  "description": "Shared vault utilities for Flywheel ecosystem (entity scanning, wikilinks, protected zones)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -59,6 +59,7 @@
59
59
  "license": "AGPL-3.0-only",
60
60
  "files": [
61
61
  "dist",
62
+ "data",
62
63
  "!dist/**/*.js.map",
63
64
  "!dist/**/*.d.ts.map",
64
65
  "README.md"