@199-bio/engram 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +19 -0
- package/LICENSE +21 -0
- package/LIVING_PLAN.md +180 -0
- package/PLAN.md +514 -0
- package/README.md +304 -0
- package/dist/graph/extractor.d.ts.map +1 -0
- package/dist/graph/index.d.ts.map +1 -0
- package/dist/graph/knowledge-graph.d.ts.map +1 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +473 -0
- package/dist/retrieval/colbert.d.ts.map +1 -0
- package/dist/retrieval/hybrid.d.ts.map +1 -0
- package/dist/retrieval/index.d.ts.map +1 -0
- package/dist/storage/database.d.ts.map +1 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/package.json +62 -0
- package/src/graph/extractor.ts +441 -0
- package/src/graph/index.ts +2 -0
- package/src/graph/knowledge-graph.ts +263 -0
- package/src/index.ts +558 -0
- package/src/retrieval/colbert-bridge.py +222 -0
- package/src/retrieval/colbert.ts +317 -0
- package/src/retrieval/hybrid.ts +218 -0
- package/src/retrieval/index.ts +2 -0
- package/src/storage/database.ts +527 -0
- package/src/storage/index.ts +1 -0
- package/tests/test-interactive.js +218 -0
- package/tests/test-mcp.sh +81 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,441 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Entity extraction from text using heuristics
|
|
3
|
+
* No external APIs - pure local processing
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/**
 * A single entity mention found in a piece of text.
 */
export interface ExtractedEntity {
  /** Entity text as reported by the extractor (may be normalised, e.g. stopwords removed). */
  name: string;
  /** Entity category. This module only ever emits "person" and "organization". */
  type: "person" | "place" | "concept" | "event" | "organization";
  /** Heuristic score assigned by the pattern that produced the entity. */
  confidence: number;
  /** Character offsets into the analysed text; `end` is exclusive. */
  span: { start: number; end: number };
}

// Common words that look like names but aren't
const STOPWORDS = new Set([
  "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
  "of", "with", "by", "from", "as", "is", "was", "are", "were", "been",
  "be", "have", "has", "had", "do", "does", "did", "will", "would",
  "could", "should", "may", "might", "must", "shall", "can", "need",
  "this", "that", "these", "those", "i", "you", "he", "she", "it",
  "we", "they", "what", "which", "who", "whom", "whose", "where",
  "when", "why", "how", "all", "each", "every", "both", "few", "more",
  "most", "other", "some", "such", "no", "not", "only", "same", "so",
  "than", "too", "very", "just", "also", "now", "here", "there", "then",
  "if", "because", "while", "although", "though", "after", "before",
  "since", "until", "unless", "however", "therefore", "thus", "hence",
  "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
  "january", "february", "march", "april", "may", "june", "july",
  "august", "september", "october", "november", "december",
  "today", "tomorrow", "yesterday", "morning", "afternoon", "evening", "night",
  "said", "says", "told", "asked", "replied", "answered", "mentioned",
  "think", "know", "believe", "feel", "want", "need", "like", "love",
]);

// Common titles that precede names
// NOTE(review): not referenced anywhere in this module as shown — confirm whether it is still needed.
const TITLES = ["mr", "mrs", "ms", "miss", "dr", "prof", "sir", "lady", "lord"];

// Organization suffixes and keywords
const ORG_SUFFIXES = [
  "inc", "inc.", "corp", "corp.", "corporation", "llc", "llp", "ltd", "ltd.",
  "limited", "co", "co.", "company", "companies", "group", "holdings",
  "partners", "partnership", "associates", "foundation", "institute",
  "university", "college", "school", "hospital", "clinic", "bank",
  "capital", "ventures", "labs", "laboratory", "laboratories",
  "technologies", "tech", "software", "systems", "solutions", "services",
  "industries", "international", "global", "worldwide", "enterprises",
];

// Well-known organizations (case-insensitive matching)
// Note: Avoid short words that could match common English words (e.g., "WHO")
const KNOWN_ORGANIZATIONS = new Set([
  "goldman sachs", "morgan stanley", "jp morgan", "jpmorgan", "citibank",
  "bank of america", "wells fargo", "barclays", "deutsche bank", "hsbc",
  "credit suisse", "ubs", "blackrock", "blackstone", "kkr", "carlyle",
  "apollo global", "bridgewater", "citadel", "two sigma", "renaissance technologies",
  "google", "alphabet", "microsoft", "apple", "amazon", "meta", "facebook",
  "netflix", "tesla", "nvidia", "intel", "amd", "ibm", "oracle", "salesforce",
  "adobe", "spotify", "uber", "lyft", "airbnb", "stripe", "square", "paypal",
  "twitter", "x corp", "linkedin", "snapchat", "tiktok", "bytedance",
  "openai", "anthropic", "deepmind", "cohere", "stability ai", "midjourney",
  "199 biotechnologies", "199 bio",
  "harvard university", "stanford university", "yale university", "princeton university",
  "columbia university", "oxford university", "cambridge university",
  "mit", "caltech", "nyu", "ucla", "usc", "berkeley",
  "fbi", "cia", "nsa", "nasa", "fda", "sec", "fcc", "epa", "doj",
  "united nations", "world bank", "imf", "nato", "european union",
  "red cross", "unicef", "greenpeace", "amnesty international",
  "new york times", "washington post", "wall street journal", "bbc", "cnn",
  "nbc", "abc news", "cbs news", "fox news", "reuters", "associated press", "bloomberg",
]);

// Words that look like names but aren't (nationalities, religions, etc.)
const NOT_PERSON_NAMES = new Set([
  "russian", "american", "british", "chinese", "japanese", "german", "french",
  "italian", "spanish", "indian", "brazilian", "mexican", "canadian", "australian",
  "muslim", "christian", "jewish", "hindu", "buddhist", "atheist", "catholic",
  "protestant", "orthodox", "sunni", "shia", "sikh", "jain",
  "asian", "european", "african", "latin", "caucasian", "middle eastern",
]);

// Common places (US states, major cities, countries)
const KNOWN_PLACES = new Set([
  "california", "new york", "texas", "florida", "washington", "massachusetts",
  "colorado", "illinois", "pennsylvania", "ohio", "georgia", "michigan",
  "san francisco", "los angeles", "seattle", "boston", "chicago", "miami",
  "london", "paris", "tokyo", "singapore", "hong kong", "dubai", "berlin",
  "sydney", "toronto", "vancouver", "amsterdam", "zurich", "geneva",
  "usa", "uk", "china", "japan", "germany", "france", "india", "canada",
  "australia", "brazil", "mexico", "russia", "spain", "italy", "switzerland",
]);

// Relationship words that often precede person mentions
const RELATION_WORDS = [
  "brother", "sister", "mother", "father", "mom", "dad", "mum",
  "son", "daughter", "wife", "husband", "partner", "boyfriend", "girlfriend",
  "uncle", "aunt", "cousin", "nephew", "niece", "grandmother", "grandfather",
  "grandma", "grandpa", "friend", "colleague", "boss", "ex", "fiancé", "fiancée",
];

// Possessive pronouns that introduce a relation mention ("my brother", "her boss")
const POSSESSIVE_PRONOUNS = ["my", "his", "her", "their", "our"];

// Entities whose last / first word is one of these are discarded by extractAll
const BAD_SUFFIXES = new Set(["managing", "as", "last", "and", "or", "the", "a", "an", "for", "with"]);
const BAD_PREFIXES = new Set(["he", "she", "they", "my", "his", "her", "the", "a", "an", "joined"]);

/** Escape every regex metacharacter in `str` so it can be embedded as a literal. */
function escapeRegexLiteral(str: string): string {
  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

// One case-insensitive, non-global pattern per known organization, compiled once
// instead of on every extractOrganizations() call. Non-global on purpose: only the
// first occurrence is reported and the shared objects then carry no lastIndex state.
const KNOWN_ORG_PATTERNS: readonly RegExp[] = Array.from(
  KNOWN_ORGANIZATIONS,
  (org) => new RegExp(`\\b${escapeRegexLiteral(org)}\\b`, "i"),
);

// "Corp|corp|Corp\.|corp\.|..." — each suffix in Capitalized and lower-case form.
// Suffixes are escaped so the "." in "inc." / "co." is a literal dot, not a wildcard.
const ORG_SUFFIX_ALTERNATION = ORG_SUFFIXES.map((suffix) => {
  const capitalized = `${suffix.charAt(0).toUpperCase()}${suffix.slice(1)}`;
  return `${escapeRegexLiteral(capitalized)}|${escapeRegexLiteral(suffix.toLowerCase())}`;
}).join("|");

// Capitalized word(s) + org suffix. The leading lookbehind (instead of a consumed
// character) keeps match.index pointing at the first letter of the name.
const ORG_SUFFIX_PATTERN_SOURCE =
  `(?<![A-Za-z])([A-Z][a-z]+(?:\\s+[A-Z][a-z]+)*)\\s+(${ORG_SUFFIX_ALTERNATION})(?=\\s|,|\\.|\\)|$)`;

// Longest relation words first so "fiancée" is tried before "fiancé"; the trailing
// lookahead rejects matches that stop in the middle of a longer word.
const RELATION_ALTERNATION = [...RELATION_WORDS]
  .sort((a, b) => b.length - a.length)
  .map(escapeRegexLiteral)
  .join("|");
const RELATION_PATTERN_SOURCE =
  `(?<![A-Za-z])(?:${POSSESSIVE_PRONOUNS.join("|")})\\s+(?:${RELATION_ALTERNATION})(?![A-Za-z\\u00C0-\\u017F])`;

/** Fresh global regex for "Name's word" / "First Last's word". */
function newPossessivePattern(): RegExp {
  return /([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)'s\s+(\w+)/g;
}

/**
 * Drop leading STOPWORDS from a whitespace-separated phrase.
 *
 * @returns the remaining text (possibly empty) and how many characters were removed
 *          from the front, so callers can shift their span start accordingly.
 */
function stripLeadingStopwords(phrase: string): { text: string; offset: number } {
  let rest = phrase;
  let offset = 0;
  for (;;) {
    const lead = /^(\S+)\s*/.exec(rest);
    if (lead === null || !STOPWORDS.has(lead[1].toLowerCase())) break;
    offset += lead[0].length;
    rest = rest.slice(lead[0].length);
  }
  return { text: rest, offset };
}

/**
 * True when `higher` exceeds `lower` by more than 0.1.
 * Compared in whole hundredths because `0.7 + 0.1 === 0.7999999999999999` in IEEE-754,
 * which made the plain floating-point comparison inconsistent between score pairs.
 */
function exceedsConfidenceMargin(higher: number, lower: number): boolean {
  return Math.round(higher * 100) - Math.round(lower * 100) > 10;
}

/**
 * Heuristic, regex-based extractor for organizations, persons and simple
 * "X's <relation>" relationships. Pure local processing; holds no instance state.
 */
export class EntityExtractor {
  /**
   * Extract all entities from text.
   *
   * Organizations are extracted first and take priority: a person candidate whose
   * name equals an organization name (case-insensitively) is dropped. Results are
   * then filtered by first/last word, de-duplicated by lower-cased name, and a
   * shorter name is removed when a longer same-type name starting with it scores
   * more than 0.1 higher.
   *
   * @param text - raw input text
   * @returns de-duplicated entities, organizations before persons
   */
  extractAll(text: string): ExtractedEntity[] {
    const orgs = this.extractOrganizations(text);

    // Track organization names to avoid re-extracting them as persons
    const orgNames = new Set(orgs.map((o) => o.name.toLowerCase()));
    const persons = this.extractPersons(text).filter(
      (p) => !orgNames.has(p.name.toLowerCase()),
    );

    // Discard entities that begin or end with a word that signals a bad capture
    const cleanEntities = [...orgs, ...persons].filter((entity) => {
      const words = entity.name.toLowerCase().split(/\s+/);
      return !BAD_SUFFIXES.has(words[words.length - 1]) && !BAD_PREFIXES.has(words[0]);
    });

    // Deduplicate by name, preferring orgs over persons and higher confidence within a type
    const seen = new Map<string, ExtractedEntity>();
    for (const entity of cleanEntities) {
      const key = entity.name.toLowerCase();
      const existing = seen.get(key);
      if (!existing) {
        seen.set(key, entity);
      } else if (entity.type === "organization" && existing.type === "person") {
        seen.set(key, entity);
      } else if (entity.type === existing.type && entity.confidence > existing.confidence) {
        seen.set(key, entity);
      }
    }

    // Keep a shorter name unless a longer same-type name that starts with it
    // ("sarah" vs "sarah smith") has much higher confidence.
    const result = Array.from(seen.values());
    return result.filter((entity) => {
      const prefix = `${entity.name.toLowerCase()} `;
      return !result.some(
        (other) =>
          other.type === entity.type &&
          other.name.toLowerCase().startsWith(prefix) &&
          exceedsConfidenceMargin(other.confidence, entity.confidence),
      );
    });
  }

  /**
   * Extract organizations from text using three patterns, in priority order:
   * known organization names (0.95), capitalized words followed by an organization
   * suffix (0.85), and multi-word capitalized names after an employment phrase such
   * as "works at" / "joined" (0.7). Each lower-cased name is reported once; spans
   * cover exactly the organization text of the first occurrence.
   *
   * @param text - raw input text
   * @returns organization entities in discovery order
   */
  extractOrganizations(text: string): ExtractedEntity[] {
    const results: ExtractedEntity[] = [];
    const foundNames = new Set<string>();
    const addOnce = (entity: ExtractedEntity): void => {
      const key = entity.name.toLowerCase();
      if (foundNames.has(key)) return;
      foundNames.add(key);
      results.push(entity);
    };

    // Pattern 1: known organizations (first occurrence only)
    for (const pattern of KNOWN_ORG_PATTERNS) {
      const hit = pattern.exec(text);
      if (hit === null) continue;
      addOnce({
        name: hit[0],
        type: "organization",
        confidence: 0.95,
        span: { start: hit.index, end: hit.index + hit[0].length },
      });
    }

    let match: RegExpExecArray | null;

    // Pattern 2: capitalized word(s) followed by an org suffix (e.g. "Acme Corporation").
    // Case-sensitive so only proper nouns qualify.
    const suffixPattern = new RegExp(ORG_SUFFIX_PATTERN_SOURCE, "g");
    while ((match = suffixPattern.exec(text)) !== null) {
      // "The Acme Foundation" -> "Acme Foundation" rather than discarding the match
      const { text: baseName, offset } = stripLeadingStopwords(match[1]);
      if (baseName === "") continue;

      // Skip adjective+suffix combos such as "American Software"
      const firstWord = baseName.split(/\s+/)[0].toLowerCase();
      if (NOT_PERSON_NAMES.has(firstWord)) continue;

      addOnce({
        name: `${baseName} ${match[2]}`,
        type: "organization",
        confidence: 0.85,
        span: { start: match.index + offset, end: match.index + match[0].length },
      });
    }

    // Pattern 3: "works at/for X", "joined X", ... — multi-word names only;
    // single-word organizations are expected to be in KNOWN_ORGANIZATIONS.
    const workPattern = /(?:works?\s+(?:at|for)|joined|employed\s+(?:at|by)|hired\s+by)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})(?=\s+[a-z]|\s*[,.]|\s*$)/g;
    while ((match = workPattern.exec(text)) !== null) {
      const { text: name, offset } = stripLeadingStopwords(match[1]);
      const words = name === "" ? [] : name.split(/\s+/);
      if (words.length < 2) continue;
      if (NOT_PERSON_NAMES.has(words[0].toLowerCase())) continue;

      // The captured name is the tail of the match; the verb phrase is not part of the span.
      const end = match.index + match[0].length;
      addOnce({
        name,
        type: "organization",
        confidence: 0.7,
        span: { start: end - match[1].length + offset, end },
      });
    }

    return results;
  }

  /**
   * Escape special regex characters
   */
  private escapeRegex(str: string): string {
    return escapeRegexLiteral(str);
  }

  /**
   * Extract person names from text using heuristics. Results of the three
   * underlying patterns are concatenated without de-duplication.
   *
   * @param text - raw input text
   * @returns person candidates (may contain the same name more than once)
   */
  extractPersons(text: string): ExtractedEntity[] {
    return [
      // Pattern 1: capitalized words (potential names)
      ...this.extractCapitalizedNames(text),
      // Pattern 2: possessive patterns ("X's brother")
      ...this.extractFromPossessives(text),
      // Pattern 3: relation patterns ("my friend X")
      ...this.extractFromRelations(text),
    ];
  }

  /**
   * Extract runs of capitalized words that look like names. A run is considered
   * at the start of the text, after sentence-ending punctuation, or after a
   * lower-case word. Stopwords, nationality/religion words and known places are
   * removed; the span still covers the whole capitalized run as it appears in the text.
   */
  private extractCapitalizedNames(text: string): ExtractedEntity[] {
    const results: ExtractedEntity[] = [];
    const pattern = /(?<=[.!?]\s+|^)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)|(?<=[a-z]\s)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)/g;

    let match: RegExpExecArray | null;
    while ((match = pattern.exec(text)) !== null) {
      const rawName = (match[1] || match[2]).trim();

      const cleanWords = rawName.split(/\s+/).filter((word) => {
        const lower = word.toLowerCase();
        return (
          word.length > 1 &&
          !STOPWORDS.has(lower) &&
          !NOT_PERSON_NAMES.has(lower) &&
          !KNOWN_PLACES.has(lower)
        );
      });
      if (cleanWords.length === 0) continue;

      // Skip if it's just a short common word
      if (cleanWords.length === 1 && cleanWords[0].length < 4) continue;

      // Multi-word table entries ("new york", "hong kong", "middle eastern") can only
      // be recognised on the whole phrase, never word by word.
      const cleanName = cleanWords.join(" ");
      const cleanKey = cleanName.toLowerCase();
      if (KNOWN_PLACES.has(cleanKey) || NOT_PERSON_NAMES.has(cleanKey)) continue;

      results.push({
        name: cleanName,
        type: "person",
        // Higher confidence for multi-word names
        confidence: cleanWords.length >= 2 ? 0.8 : 0.5,
        span: { start: match.index, end: match.index + match[0].length },
      });
    }

    return results;
  }

  /**
   * Extract names from possessive patterns like "Sarah's brother". The owner is
   * reported as a person (0.95 when followed by a relation word, otherwise 0.7),
   * and a relation word additionally yields a derived placeholder person such as
   * "Sarah's brother" (0.6). Owners that are stopwords ("That's dad") yield nothing.
   */
  private extractFromPossessives(text: string): ExtractedEntity[] {
    const results: ExtractedEntity[] = [];
    const pattern = newPossessivePattern();

    let match: RegExpExecArray | null;
    while ((match = pattern.exec(text)) !== null) {
      const name = match[1].trim();
      // "That's", "It's", "Who's" are contractions, not owners
      if (STOPWORDS.has(name.toLowerCase())) continue;

      const following = match[2].toLowerCase();
      const isRelation = RELATION_WORDS.includes(following);

      results.push({
        name,
        type: "person",
        confidence: isRelation ? 0.95 : 0.7,
        span: { start: match.index, end: match.index + name.length },
      });

      // "Sarah's brother" refers to a second, unnamed person: emit a derived entity
      if (isRelation) {
        results.push({
          name: `${name}'s ${following}`,
          type: "person",
          confidence: 0.6,
          span: { start: match.index, end: match.index + match[0].length },
        });
      }
    }

    return results;
  }

  /**
   * Extract explicitly named persons from relationship patterns like
   * "my friend John". The pronoun and relation word are matched case-insensitively,
   * the name itself must be a capitalized word. Mentions without a name
   * ("her brother") produce no entity.
   */
  private extractFromRelations(text: string): ExtractedEntity[] {
    const results: ExtractedEntity[] = [];
    const relationPattern = new RegExp(RELATION_PATTERN_SOURCE, "gi");
    // Sticky and case-sensitive: the name must directly follow the relation word and
    // start with a capital, which the "i" flag on relationPattern could not enforce.
    const namePattern = /\s+([A-Z][a-z]+)/y;

    let match: RegExpExecArray | null;
    while ((match = relationPattern.exec(text)) !== null) {
      namePattern.lastIndex = match.index + match[0].length;
      const nameMatch = namePattern.exec(text);
      if (nameMatch === null) continue;

      const name = nameMatch[1];
      if (STOPWORDS.has(name.toLowerCase())) continue;

      const end = nameMatch.index + nameMatch[0].length;
      results.push({
        name,
        type: "person",
        confidence: 0.9,
        span: { start: end - name.length, end },
      });
    }

    return results;
  }

  /**
   * Extract relationship mentions (not entities, but useful for graph).
   * Only the "X's <relation word>" form is recognised; the object is a placeholder
   * name built from the subject and the relation word.
   *
   * @param text - raw input text
   * @returns one relationship per matching possessive, confidence fixed at 0.7
   */
  extractRelationships(text: string): Array<{
    subject: string;
    relation: string;
    object: string;
    confidence: number;
  }> {
    const relationships: Array<{
      subject: string;
      relation: string;
      object: string;
      confidence: number;
    }> = [];

    const possessivePattern = newPossessivePattern();

    let match: RegExpExecArray | null;
    while ((match = possessivePattern.exec(text)) !== null) {
      const subject = match[1].trim();
      const relWord = match[2].toLowerCase();

      // Contractions such as "That's dad" have no real subject
      if (STOPWORDS.has(subject.toLowerCase())) continue;
      if (!RELATION_WORDS.includes(relWord)) continue;

      relationships.push({
        subject,
        relation: relWord,
        object: `${subject}'s ${relWord}`, // placeholder name
        confidence: 0.7,
      });
    }

    return relationships;
  }
}

// Singleton instance
export const entityExtractor = new EntityExtractor();
|