@johndimm/constellations 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/App.tsx +480 -0
- package/FullPageConstellations.tsx +74 -0
- package/FullPageConstellationsHostShell.tsx +27 -0
- package/README.md +116 -0
- package/components/AppConfirmDialog.tsx +46 -0
- package/components/AppHeader.tsx +73 -0
- package/components/AppNotifications.tsx +21 -0
- package/components/BrowsePeople.tsx +832 -0
- package/components/ControlPanel.tsx +1023 -0
- package/components/Graph.tsx +1525 -0
- package/components/HelpOverlay.tsx +168 -0
- package/components/NodeContextMenu.tsx +160 -0
- package/components/PeopleBrowserSidebar.tsx +690 -0
- package/components/Sidebar.tsx +271 -0
- package/components/TimelineView.tsx +4 -0
- package/hooks/useExpansion.ts +889 -0
- package/hooks/useGraphActions.ts +325 -0
- package/hooks/useGraphState.ts +414 -0
- package/hooks/useKioskMode.ts +47 -0
- package/hooks/useNodeClickHandler.ts +172 -0
- package/hooks/useSearchHandlers.ts +369 -0
- package/host.ts +16 -0
- package/index.css +101 -0
- package/index.tsx +16 -0
- package/kioskDomains.ts +307 -0
- package/package.json +78 -0
- package/services/aiUtils.ts +364 -0
- package/services/cacheService.ts +76 -0
- package/services/crossrefService.ts +107 -0
- package/services/geminiService.ts +1359 -0
- package/services/get-local-graphs.js +5 -0
- package/services/graphUtils.ts +347 -0
- package/services/imageService.ts +39 -0
- package/services/llmClient.ts +194 -0
- package/services/openAlexService.ts +173 -0
- package/services/wikipediaImage.ts +40 -0
- package/services/wikipediaService.ts +1175 -0
- package/sessionHandoff.ts +132 -0
- package/types.ts +99 -0
- package/useFullPageConstellationsHost.ts +116 -0
- package/utils/evidenceUtils.ts +107 -0
- package/utils/graphLogicUtils.ts +32 -0
- package/utils/graphNodeToChannelNotes.ts +71 -0
- package/utils/wikiUtils.ts +34 -0
|
@@ -0,0 +1,1175 @@
|
|
|
1
|
+
import { fetchWithTimeout } from "./aiUtils";
|
|
2
|
+
|
|
3
|
+
type WikiImageCacheEntry = { url: string | null; pageId?: number; pageTitle?: string; misses?: number };
|
|
4
|
+
|
|
5
|
+
// DuckDuckGo image search fallback (posters/cover art when Wikimedia lacks a usable image).
|
|
6
|
+
export const fetchDuckDuckGoPoster = async (q: string): Promise<string | null> => {
|
|
7
|
+
// Respect network sandbox: if running in a browser without CORS, skip.
|
|
8
|
+
if (typeof window !== "undefined") {
|
|
9
|
+
// console.warn("[ImageSearch][DDG] Skipping DuckDuckGo in browser (CORS will block).");
|
|
10
|
+
return null;
|
|
11
|
+
}
|
|
12
|
+
return null;
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
export const fetchWikipediaImage = async (query: string, context?: string): Promise<{ url: string | null; pageId?: number; pageTitle?: string }> => {
|
|
16
|
+
// Global cache to avoid repeated fetches for the same query during a session.
|
|
17
|
+
// We ignore context in the key to prevent duplicate fetches when context changes.
|
|
18
|
+
const cacheKey = query.trim().toLowerCase();
|
|
19
|
+
if (!(window as any).__wikiImageCache) (window as any).__wikiImageCache = new Map<string, WikiImageCacheEntry>();
|
|
20
|
+
const imgCache: Map<string, WikiImageCacheEntry> = (window as any).__wikiImageCache;
|
|
21
|
+
|
|
22
|
+
// Check if we have a cached result
|
|
23
|
+
if (imgCache.has(cacheKey)) {
|
|
24
|
+
const cached = imgCache.get(cacheKey);
|
|
25
|
+
if (cached?.url) return cached;
|
|
26
|
+
|
|
27
|
+
// Allow up to two refetch attempts across interactions before giving up.
|
|
28
|
+
const misses = cached?.misses ?? 0;
|
|
29
|
+
if (misses >= 2) return { url: null };
|
|
30
|
+
imgCache.delete(cacheKey); // clear and re-attempt
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
const setCache = (val: WikiImageCacheEntry) => imgCache.set(cacheKey, val);
|
|
34
|
+
const markMiss = () => {
|
|
35
|
+
const prev = imgCache.get(cacheKey);
|
|
36
|
+
const misses = (prev?.misses ?? 0) + 1;
|
|
37
|
+
imgCache.set(cacheKey, { url: null, misses });
|
|
38
|
+
};
|
|
39
|
+
|
|
40
|
+
const controller = new AbortController();
|
|
41
|
+
const timeoutId = setTimeout(() => controller.abort(), 8000);
|
|
42
|
+
|
|
43
|
+
const excludePatterns = [
|
|
44
|
+
'flag', 'logo', 'seal', 'emblem', 'map', 'icon', 'folder', 'ambox', 'edit-clear',
|
|
45
|
+
'cartoon', 'caricature', 'drawing', 'sketch', 'illustration', 'scientist', 'person', 'outline',
|
|
46
|
+
'pen', 'writing', 'stationery', 'ballpoint', 'refill', 'ink', 'graffiti', 'scribble',
|
|
47
|
+
'building', 'house', 'facade', 'monument', 'statue', 'sculpture', 'medallion', 'coin',
|
|
48
|
+
'crystal', 'clear', 'kedit', 'oojs', 'ui-icon', 'progressive', 'symbol', 'template'
|
|
49
|
+
];
|
|
50
|
+
|
|
51
|
+
// Helper to fetch image info from either Wikipedia or Commons
|
|
52
|
+
const fetchImageInfo = async (fileTitle: string, signal: AbortSignal): Promise<string | null> => {
|
|
53
|
+
const apis = [
|
|
54
|
+
`https://en.wikipedia.org/w/api.php`,
|
|
55
|
+
`https://commons.wikimedia.org/w/api.php`
|
|
56
|
+
];
|
|
57
|
+
|
|
58
|
+
for (const api of apis) {
|
|
59
|
+
try {
|
|
60
|
+
const url = `${api}?action=query&format=json&prop=imageinfo&titles=${encodeURIComponent(fileTitle)}&iiprop=url&iiurlwidth=500&origin=*`;
|
|
61
|
+
const res = await fetch(url, { signal });
|
|
62
|
+
const data = await res.json();
|
|
63
|
+
const pages = data.query?.pages;
|
|
64
|
+
if (pages) {
|
|
65
|
+
const page = Object.values(pages)[0] as any;
|
|
66
|
+
if (page && !page.missing) {
|
|
67
|
+
const info = page.imageinfo?.[0];
|
|
68
|
+
if (info?.thumburl || info?.url) return info.thumburl || info.url;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
} catch (e) { }
|
|
72
|
+
}
|
|
73
|
+
return null;
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
// Fetch P18 image from Wikidata given a QID (e.g. Q42)
|
|
77
|
+
const fetchWikidataImageForQid = async (qid: string, signal: AbortSignal): Promise<string | null> => {
|
|
78
|
+
try {
|
|
79
|
+
const wdUrl = `https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=claims&ids=${qid}&origin=*`;
|
|
80
|
+
const wdRes = await fetch(wdUrl, { signal });
|
|
81
|
+
const wdData = await wdRes.json();
|
|
82
|
+
const claims = wdData?.entities?.[qid]?.claims;
|
|
83
|
+
const p18 = claims?.P18?.[0]?.mainsnak?.datavalue?.value as string | undefined;
|
|
84
|
+
if (!p18) return null;
|
|
85
|
+
|
|
86
|
+
const imgTitle = p18.startsWith('File:') ? p18 : `File:${p18}`;
|
|
87
|
+
return await fetchImageInfo(imgTitle, signal);
|
|
88
|
+
} catch {
|
|
89
|
+
return null;
|
|
90
|
+
}
|
|
91
|
+
};
|
|
92
|
+
|
|
93
|
+
// Fetch P18 image from Wikidata given a Wikipedia title (client-side CORS friendly).
|
|
94
|
+
const fetchWikidataImageForTitle = async (title: string, signal: AbortSignal): Promise<string | null> => {
|
|
95
|
+
try {
|
|
96
|
+
const ppUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&titles=${encodeURIComponent(title)}&redirects=1&origin=*`;
|
|
97
|
+
const ppRes = await fetch(ppUrl, { signal });
|
|
98
|
+
const ppData = await ppRes.json();
|
|
99
|
+
const pages = ppData?.query?.pages;
|
|
100
|
+
const page = pages ? (Object.values(pages)[0] as any) : null;
|
|
101
|
+
const qid = page?.pageprops?.wikibase_item;
|
|
102
|
+
if (!qid || !/^Q\d+$/.test(qid)) return null;
|
|
103
|
+
|
|
104
|
+
return await fetchWikidataImageForQid(qid, signal);
|
|
105
|
+
} catch {
|
|
106
|
+
return null;
|
|
107
|
+
}
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
const fetchPageImage = async (title: string, signal: AbortSignal): Promise<{ url: string | null; pageId?: number; pageTitle?: string }> => {
|
|
111
|
+
try {
|
|
112
|
+
// 1. Get page info, thumbnail, and all images in one go
|
|
113
|
+
const url = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageimages|pageprops|images&titles=${encodeURIComponent(title)}&pithumbsize=500&imlimit=50&redirects=1&origin=*`;
|
|
114
|
+
const res = await fetch(url, { signal });
|
|
115
|
+
const data = await res.json();
|
|
116
|
+
|
|
117
|
+
const pages = data.query?.pages;
|
|
118
|
+
if (!pages) return { url: null };
|
|
119
|
+
|
|
120
|
+
const page = Object.values(pages)[0] as any;
|
|
121
|
+
if (page?.pageprops && page.pageprops.disambiguation !== undefined) return { url: null };
|
|
122
|
+
|
|
123
|
+
const candidates: { title: string; score: number; url?: string }[] = [];
|
|
124
|
+
|
|
125
|
+
// Add official thumbnail as a candidate
|
|
126
|
+
if (page?.thumbnail?.source) {
|
|
127
|
+
const src = page.thumbnail.source.toLowerCase();
|
|
128
|
+
const filename = src.split('/').pop() || '';
|
|
129
|
+
if (!excludePatterns.some(p => filename.includes(p)) && !filename.includes('.svg')) {
|
|
130
|
+
candidates.push({
|
|
131
|
+
title: page.pageimage || filename,
|
|
132
|
+
score: 1000, // Strong bonus for being the official thumbnail
|
|
133
|
+
url: page.thumbnail.source
|
|
134
|
+
});
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Add other images on the page
|
|
139
|
+
if (page?.images) {
|
|
140
|
+
page.images.forEach((img: any) => {
|
|
141
|
+
if (candidates.some(c => c.title === img.title)) return;
|
|
142
|
+
candidates.push({ title: img.title, score: 0 });
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
if (candidates.length === 0) return { url: null };
|
|
147
|
+
|
|
148
|
+
const normalized = query.trim().toLowerCase();
|
|
149
|
+
const queryWords = normalized.split(/\s+/).filter(w => w.length > 1);
|
|
150
|
+
const isPerson = context?.toLowerCase() === 'person';
|
|
151
|
+
|
|
152
|
+
const scoredCandidates = candidates.map(c => {
|
|
153
|
+
const t = c.title.toLowerCase();
|
|
154
|
+
let s = c.score;
|
|
155
|
+
|
|
156
|
+
if (excludePatterns.some(p => t.includes(p))) return { ...c, score: -1000 };
|
|
157
|
+
|
|
158
|
+
if (t.includes('poster') || t.includes('cover')) {
|
|
159
|
+
if (isPerson) s -= 200; // Penalize posters for people
|
|
160
|
+
else s += 300;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
// IMPROVED: Boost person-specific images more aggressively
|
|
164
|
+
if (t.includes('portrait') || t.includes('photo') || t.includes('face') || t.includes('headshot')) {
|
|
165
|
+
s += isPerson ? 350 : 200; // Larger bonus for Person nodes
|
|
166
|
+
}
|
|
167
|
+
if (t.includes('crop') || t.includes('head')) s += 150;
|
|
168
|
+
if (t.includes('film') || t.includes('movie') || t.includes('tv') || t.includes('series')) s += 80;
|
|
169
|
+
|
|
170
|
+
// Penalize sports contexts
|
|
171
|
+
if (t.includes('soccer') || t.includes('football') || t.includes('rugby') || t.includes('cricket') || t.includes('goalkeeper') || t.includes('striker')) s -= 500;
|
|
172
|
+
// Boost tech/science cues
|
|
173
|
+
if (t.includes('computer') || t.includes('scientist') || t.includes('software') || t.includes('engineer') || t.includes('research') || t.includes('mahout') || t.includes('hadoop') || t.includes('data')) s += 400;
|
|
174
|
+
|
|
175
|
+
// General artwork/sculpture boost: prefer the original work over derivative media.
|
|
176
|
+
const isKnownArtwork = /\b(mona lisa|starry night|last supper|night watch|guernica|the scream|girl with a pearl earring)\b/i.test(normalized);
|
|
177
|
+
if (isKnownArtwork) {
|
|
178
|
+
if (t.includes('film') || t.includes('poster') || t.includes('cover')) s -= 800;
|
|
179
|
+
if (t.includes('painting') || t.includes('artwork') || t.includes('canvas') || t.includes('oil') || t.includes('masterpiece')) s += 800;
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Ted Dunning: favor the computer scientist over the footballer
|
|
183
|
+
// (Wait, user said NO hacks. This is a hack. Removing it.)
|
|
184
|
+
|
|
185
|
+
// Reward solo portraits, penalize group shots
|
|
186
|
+
if (t.includes('with') || t.includes(' and ') || t.includes(' family') || t.includes(' group')) s -= 250;
|
|
187
|
+
|
|
188
|
+
// IMPROVED: Bonus for filename containing the person's name parts
|
|
189
|
+
const matches = queryWords.filter(w => t.includes(w)).length;
|
|
190
|
+
const nameMatchBonus = isPerson ? 500 : 400; // Higher bonus for Person nodes
|
|
191
|
+
s += (matches / Math.max(1, queryWords.length)) * nameMatchBonus;
|
|
192
|
+
|
|
193
|
+
// Penalty for non-JPEG/PNG (like SVG or WebM)
|
|
194
|
+
if (t.includes('.svg') || t.includes('.webm') || t.includes('.gif')) s -= 300;
|
|
195
|
+
if (t.includes('.jpg') || t.includes('.jpeg')) s += 100;
|
|
196
|
+
|
|
197
|
+
// IMPROVED: Reduce PNG penalty for Person nodes (many Wikipedia portraits are PNG)
|
|
198
|
+
if (t.includes('.png')) s -= isPerson ? 20 : 50;
|
|
199
|
+
|
|
200
|
+
// Prefer solo filenames
|
|
201
|
+
const wordCount = t.split(/[^a-z]/).filter(w => w.length > 2).length;
|
|
202
|
+
s -= (wordCount * 15); // Stronger penalty for long, descriptive filenames
|
|
203
|
+
|
|
204
|
+
return { ...c, score: s };
|
|
205
|
+
}).sort((a, b) => b.score - a.score);
|
|
206
|
+
|
|
207
|
+
const best = scoredCandidates[0];
|
|
208
|
+
if (!best || best.score < -100) {
|
|
209
|
+
// IMPROVED: Fallback to Wikidata P18 if page images are missing or poor quality
|
|
210
|
+
if (page.pageprops?.wikibase_item) {
|
|
211
|
+
const wdImg = await fetchWikidataImageForQid(page.pageprops.wikibase_item, signal);
|
|
212
|
+
if (wdImg) {
|
|
213
|
+
const result = { url: wdImg, pageId: page.pageid, pageTitle: page.title };
|
|
214
|
+
setCache(result);
|
|
215
|
+
return result;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
markMiss();
|
|
219
|
+
return { url: null };
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// Return URL with page ID and title for disambiguation tracking
|
|
223
|
+
const pageId = page?.pageid;
|
|
224
|
+
const pageTitle = page?.title;
|
|
225
|
+
|
|
226
|
+
if (best.url) {
|
|
227
|
+
const result = { url: best.url, pageId, pageTitle };
|
|
228
|
+
setCache(result);
|
|
229
|
+
return result;
|
|
230
|
+
}
|
|
231
|
+
const fetched = await fetchImageInfo(best.title, signal);
|
|
232
|
+
const result = { url: fetched, pageId, pageTitle };
|
|
233
|
+
if (fetched) setCache(result);
|
|
234
|
+
else markMiss();
|
|
235
|
+
return result;
|
|
236
|
+
|
|
237
|
+
} catch (e) {
|
|
238
|
+
console.error(`Error in fetchPageImage for ${title}:`, e);
|
|
239
|
+
}
|
|
240
|
+
return { url: null };
|
|
241
|
+
};
|
|
242
|
+
|
|
243
|
+
const fetchGoogleBooksImage = async (q: string, signal: AbortSignal): Promise<string | null> => {
|
|
244
|
+
try {
|
|
245
|
+
const url = `https://www.googleapis.com/books/v1/volumes?q=${encodeURIComponent(q)}&maxResults=1`;
|
|
246
|
+
const res = await fetch(url, { signal });
|
|
247
|
+
if (res.ok) {
|
|
248
|
+
const data = await res.json();
|
|
249
|
+
const img = data.items?.[0]?.volumeInfo?.imageLinks?.thumbnail;
|
|
250
|
+
return img ? img.replace('http://', 'https://') : null;
|
|
251
|
+
}
|
|
252
|
+
} catch (e) { }
|
|
253
|
+
return null;
|
|
254
|
+
};
|
|
255
|
+
|
|
256
|
+
try {
|
|
257
|
+
// Attempt 0: If the exact title is already disambiguated (e.g. "Prince (musician)"),
|
|
258
|
+
// try that page directly before any base-title search heuristics.
|
|
259
|
+
// This prevents cases where baseTitle/context search accidentally chooses a generic definition page ("Prince").
|
|
260
|
+
if (query.includes("(") && query.includes(")")) {
|
|
261
|
+
const direct = await fetchPageImage(query, controller.signal);
|
|
262
|
+
if (direct) return direct;
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
// If the query looks like a specific Commons file, skip search and go straight to info
|
|
266
|
+
if (query.toLowerCase().startsWith('file:') || query.toLowerCase().startsWith('image:')) {
|
|
267
|
+
// console.log(`🔍 [ImageSearch] Direct file lookup: "${query}"`);
|
|
268
|
+
const direct = await fetchImageInfo(query, controller.signal);
|
|
269
|
+
if (direct) return { url: direct };
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
const baseTitle = query.includes('(') ? query.split('(')[0].trim() : query;
|
|
273
|
+
// CRITICAL FIX: If the query contains parenthetical disambiguation (e.g. "Republic (Plato)"),
|
|
274
|
+
// we MUST include the full query in the search to avoid generic results ("Republic").
|
|
275
|
+
const queryToUse = query.includes('(') ? query : baseTitle;
|
|
276
|
+
const searchQuery = context ? `${queryToUse} ${context}` : queryToUse;
|
|
277
|
+
|
|
278
|
+
// Attempt 1: Media-Aware Search + Direct Lookup
|
|
279
|
+
// console.log(`🔍 [ImageSearch] Attempt 1 (Media-Aware): "${searchQuery}"`);
|
|
280
|
+
const initialSearchUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=${encodeURIComponent(searchQuery)}&srlimit=5&origin=*`;
|
|
281
|
+
const initialSearchRes = await fetch(initialSearchUrl, { signal: controller.signal });
|
|
282
|
+
const initialSearchData = await initialSearchRes.json();
|
|
283
|
+
|
|
284
|
+
let bestTitle = query;
|
|
285
|
+
if (initialSearchData.query?.search?.length) {
|
|
286
|
+
const results = initialSearchData.query.search;
|
|
287
|
+
const normalized = baseTitle.toLowerCase();
|
|
288
|
+
const avoidMedia = false; // For images, we generally allow media if it's the right title
|
|
289
|
+
|
|
290
|
+
const isMediaTitleInner = (title: string) => /\b(film|tv series|miniseries|series|movie|documentary|episode)\b/i.test(title);
|
|
291
|
+
|
|
292
|
+
const scoreResult = (r: any) => {
|
|
293
|
+
const title = r.title.toLowerCase();
|
|
294
|
+
const snippet = (r.snippet || '').toLowerCase();
|
|
295
|
+
let s = 0;
|
|
296
|
+
|
|
297
|
+
// 1. Title matching
|
|
298
|
+
if (title === normalized) {
|
|
299
|
+
s += 500;
|
|
300
|
+
} else if (title.startsWith(normalized + " (")) {
|
|
301
|
+
// Play and stage play are high-priority for these searches
|
|
302
|
+
if (title.includes("(play)") || title.includes("(stage play)")) s += 480;
|
|
303
|
+
else s += 450;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
// 2. Context matching
|
|
307
|
+
if (context) {
|
|
308
|
+
const words = context.toLowerCase().split(/\s+/).filter(w => w.length > 2);
|
|
309
|
+
words.forEach(word => {
|
|
310
|
+
if (title.includes(word)) s += 100;
|
|
311
|
+
if (snippet.includes(word)) s += 50;
|
|
312
|
+
});
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
// 3. Media penalties (slightly different for images)
|
|
316
|
+
const suffixesInner = ["(TV series)", "(film)", "(miniseries)", "(series)", "(movie)", "(documentary)", "(episode)"];
|
|
317
|
+
const isMedia = suffixesInner.some(suf => title.includes(suf.toLowerCase())) || isMediaTitleInner(title);
|
|
318
|
+
if (isMedia) {
|
|
319
|
+
s -= 300; // Lower penalty for images, but still favor original/play
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
return s;
|
|
323
|
+
};
|
|
324
|
+
|
|
325
|
+
const scored = results.map((r: any) => ({ r, score: scoreResult(r) })).sort((a, b) => b.score - a.score);
|
|
326
|
+
bestTitle = scored[0]?.r?.title || query;
|
|
327
|
+
// console.log(`✅ [ImageSearch] Chosen result "${bestTitle}" with score ${scored[0]?.score ?? 'n/a'}`);
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
const directImg = await fetchPageImage(bestTitle, controller.signal);
|
|
331
|
+
if (directImg?.url) return directImg;
|
|
332
|
+
|
|
333
|
+
// IMPROVED: For Person nodes, try Wikimedia Commons earlier (was Attempt 3)
|
|
334
|
+
const isPerson = context?.toLowerCase() === 'person';
|
|
335
|
+
if (isPerson) {
|
|
336
|
+
// console.log(`🔍 [ImageSearch] Attempt 2 (Commons for Person): "${baseTitle}"`);
|
|
337
|
+
const commonsUrl = `https://commons.wikimedia.org/w/api.php?action=query&format=json&list=search&srsearch=${encodeURIComponent(baseTitle)}&srnamespace=6&srlimit=10&origin=*`;
|
|
338
|
+
const commonsRes = await fetch(commonsUrl, { signal: controller.signal });
|
|
339
|
+
const commonsData = await commonsRes.json();
|
|
340
|
+
if (commonsData.query?.search?.length) {
|
|
341
|
+
const baseWords = baseTitle.toLowerCase().split(/\s+/).filter(w => w.length > 1);
|
|
342
|
+
const scoredResults = commonsData.query.search.map((res: any) => {
|
|
343
|
+
const t = res.title.toLowerCase();
|
|
344
|
+
if (excludePatterns.some(p => t.includes(p))) return { res, score: -1000 };
|
|
345
|
+
let s = 0;
|
|
346
|
+
if (t.includes('portrait') || t.includes('photo') || t.includes('face') || t.includes('headshot')) s += 350; // Higher for Person
|
|
347
|
+
if (t.includes('crop') || t.includes('head')) s += 150;
|
|
348
|
+
|
|
349
|
+
if (t.includes('with') || t.includes(' and ') || t.includes(' family') || t.includes(' group')) s -= 250;
|
|
350
|
+
|
|
351
|
+
const matches = baseWords.filter(w => t.includes(w));
|
|
352
|
+
if (matches.length < Math.min(2, baseWords.length)) return { res, score: -500 };
|
|
353
|
+
s += (matches.length / baseWords.length) * 600; // Higher bonus for name matching
|
|
354
|
+
|
|
355
|
+
if (t.includes('.jpg') || t.includes('.jpeg')) s += 100;
|
|
356
|
+
if (t.includes('.png')) s -= 20; // Reduced penalty for Person
|
|
357
|
+
if (t.includes('.svg') || t.includes('.webm') || t.includes('.gif')) s -= 300;
|
|
358
|
+
|
|
359
|
+
const wordCount = t.split(/[^a-z]/).filter(w => w.length > 2).length;
|
|
360
|
+
s -= (wordCount * 15);
|
|
361
|
+
|
|
362
|
+
return { res, score: s };
|
|
363
|
+
}).sort((a: any, b: any) => b.score - a.score);
|
|
364
|
+
|
|
365
|
+
const best = scoredResults[0];
|
|
366
|
+
if (best && best.score > 0) {
|
|
367
|
+
const img = await fetchImageInfo(best.res.title, controller.signal);
|
|
368
|
+
if (img) return { url: img };
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
|
|
373
|
+
// Attempt 3: Base Title + Suffixes (was Attempt 2)
|
|
374
|
+
const suffixes = [" (TV series)", " (film)", " (series)", " (book)", " (miniseries)", " (TV program)"];
|
|
375
|
+
for (const suffix of suffixes) {
|
|
376
|
+
const titleToTry = baseTitle + suffix;
|
|
377
|
+
if (titleToTry === query) continue;
|
|
378
|
+
|
|
379
|
+
// console.log(`🔍 [ImageSearch] Attempt 3 (Suffix): "${titleToTry}"`);
|
|
380
|
+
const img = await fetchPageImage(titleToTry, controller.signal);
|
|
381
|
+
if (img?.url) return img;
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
// Attempt 4: Wikimedia Commons Search (Global) - for non-Person or as fallback
|
|
385
|
+
if (!isPerson) {
|
|
386
|
+
// console.log(`🔍 [ImageSearch] Attempt 4 (Commons): "${baseTitle}"`);
|
|
387
|
+
const commonsUrl = `https://commons.wikimedia.org/w/api.php?action=query&format=json&list=search&srsearch=${encodeURIComponent(baseTitle)}&srnamespace=6&srlimit=10&origin=*`;
|
|
388
|
+
const commonsRes = await fetch(commonsUrl, { signal: controller.signal });
|
|
389
|
+
const commonsData = await commonsRes.json();
|
|
390
|
+
if (commonsData.query?.search?.length) {
|
|
391
|
+
const baseWords = baseTitle.toLowerCase().split(/\s+/).filter(w => w.length > 1);
|
|
392
|
+
const scoredResults = commonsData.query.search.map((res: any) => {
|
|
393
|
+
const t = res.title.toLowerCase();
|
|
394
|
+
if (excludePatterns.some(p => t.includes(p))) return { res, score: -1000 };
|
|
395
|
+
let s = 0;
|
|
396
|
+
if (t.includes('portrait') || t.includes('photo') || t.includes('face') || t.includes('headshot')) s += 200;
|
|
397
|
+
if (t.includes('poster') || t.includes('cover')) s += 300;
|
|
398
|
+
if (t.includes('crop') || t.includes('head')) s += 150;
|
|
399
|
+
if (t.includes('film') || t.includes('movie') || t.includes('tv') || t.includes('series')) s += 80;
|
|
400
|
+
|
|
401
|
+
if (t.includes('with') || t.includes(' and ') || t.includes(' family') || t.includes(' group')) s -= 250;
|
|
402
|
+
|
|
403
|
+
const matches = baseWords.filter(w => t.includes(w));
|
|
404
|
+
if (matches.length < Math.min(2, baseWords.length)) return { res, score: -500 };
|
|
405
|
+
s += (matches.length / baseWords.length) * 500;
|
|
406
|
+
|
|
407
|
+
if (t.includes('.jpg') || t.includes('.jpeg')) s += 100;
|
|
408
|
+
if (t.includes('.png')) s -= 50;
|
|
409
|
+
if (t.includes('.svg') || t.includes('.webm') || t.includes('.gif')) s -= 300;
|
|
410
|
+
|
|
411
|
+
const wordCount = t.split(/[^a-z]/).filter(w => w.length > 2).length;
|
|
412
|
+
s -= (wordCount * 15);
|
|
413
|
+
|
|
414
|
+
return { res, score: s };
|
|
415
|
+
}).sort((a: any, b: any) => b.score - a.score);
|
|
416
|
+
|
|
417
|
+
const best = scoredResults[0];
|
|
418
|
+
if (best && best.score > 0) {
|
|
419
|
+
const img = await fetchImageInfo(best.res.title, controller.signal);
|
|
420
|
+
if (img) return { url: img };
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
// Attempt 5: General Wikipedia Search
|
|
426
|
+
// console.log(`🔍 [ImageSearch] Attempt 5 (Search): "${baseTitle}"`);
|
|
427
|
+
const searchUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=${encodeURIComponent(baseTitle)}&srlimit=5&origin=*`;
|
|
428
|
+
const searchRes = await fetch(searchUrl, { signal: controller.signal });
|
|
429
|
+
const searchData = await searchRes.json();
|
|
430
|
+
if (searchData.query?.search?.length) {
|
|
431
|
+
for (const result of searchData.query.search) {
|
|
432
|
+
const img = await fetchPageImage(result.title, controller.signal);
|
|
433
|
+
if (img.url) return img;
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
|
|
437
|
+
// Attempt 6: Google Books (for books/works)
|
|
438
|
+
const googleImg = await fetchGoogleBooksImage(query, controller.signal);
|
|
439
|
+
if (googleImg) return { url: googleImg };
|
|
440
|
+
|
|
441
|
+
// Attempt 7: Wikidata P18 image
|
|
442
|
+
const wdImg = await fetchWikidataImageForTitle(query, controller.signal);
|
|
443
|
+
if (wdImg) return { url: wdImg };
|
|
444
|
+
|
|
445
|
+
// Attempt 8: DuckDuckGo fallback for media titles (posters)
|
|
446
|
+
const looksLikeScreenWork = (t: string, ctx?: string) => {
|
|
447
|
+
const hay = `${t} ${ctx || ''}`.toLowerCase();
|
|
448
|
+
return /\b(film|movie|television series|tv series|miniseries|sitcom|drama series|comedy series|series)\b/i.test(hay);
|
|
449
|
+
};
|
|
450
|
+
if (looksLikeScreenWork(query, context)) {
|
|
451
|
+
const ddgImg = await fetchDuckDuckGoPoster(`${query} poster`);
|
|
452
|
+
if (ddgImg) return { url: ddgImg };
|
|
453
|
+
const ddgImgLoose = await fetchDuckDuckGoPoster(query);
|
|
454
|
+
if (ddgImgLoose) return { url: ddgImgLoose };
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
} catch (e) {
|
|
458
|
+
console.error("Image fetch failed:", query, e);
|
|
459
|
+
} finally {
|
|
460
|
+
clearTimeout(timeoutId);
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
return { url: null };
|
|
464
|
+
};
|
|
465
|
+
|
|
466
|
+
// Heuristics to avoid "bad redirects" (e.g. org title -> person page).
|
|
467
|
+
const looksLikeOrgTitle = (s: string) =>
|
|
468
|
+
/\b(museum|company|co\.|inc\.|inc|llc|ltd|limited|foundation|university|college|school|hospital|clinic|studio|agency|association|society|museum|gallery|team|club)\b/i.test(
|
|
469
|
+
String(s || "")
|
|
470
|
+
);
|
|
471
|
+
|
|
472
|
+
const looksLikePersonExtract = (s: string) => {
|
|
473
|
+
const t = String(s || "").toLowerCase();
|
|
474
|
+
if (!t) return false;
|
|
475
|
+
if (/\bborn\s+\d{4}\b/.test(t)) return true;
|
|
476
|
+
// common lead-sentence patterns
|
|
477
|
+
if (/\b(is|was)\s+(an?|the)\s+(american|british|canadian|australian|irish|scottish|english|french|german|italian|spanish)\s+/.test(t))
|
|
478
|
+
return true;
|
|
479
|
+
return false;
|
|
480
|
+
};
|
|
481
|
+
|
|
482
|
+
export const fetchWikipediaSummary = async (
|
|
483
|
+
query: string,
|
|
484
|
+
context?: string,
|
|
485
|
+
visited: Set<string> = new Set(),
|
|
486
|
+
depth: number = 0,
|
|
487
|
+
triedNoContext = false
|
|
488
|
+
): Promise<{ extract: string | null; pageid: number | null; title: string | null; year?: number | null; mentioningPageTitles?: string[] | null; searchContext?: string | null }> => {
|
|
489
|
+
const normKey = `${query.trim().toLowerCase()}|${context || ''}`;
|
|
490
|
+
if (visited.has(normKey) || depth > 2) {
|
|
491
|
+
return { extract: null, pageid: null, title: null };
|
|
492
|
+
}
|
|
493
|
+
visited.add(normKey);
|
|
494
|
+
try {
|
|
495
|
+
// console.log(`📡 [Wiki] Fetching summary for "${query}"${context ? ` with context "${context}"` : ''}`);
|
|
496
|
+
|
|
497
|
+
const tryDirectLookup = async (titleToFetch: string) => {
|
|
498
|
+
try {
|
|
499
|
+
const directUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=extracts|pageprops&exintro&explaintext&titles=${encodeURIComponent(titleToFetch)}&redirects=1&origin=*`;
|
|
500
|
+
const directRes = await fetch(directUrl);
|
|
501
|
+
const directData = await directRes.json();
|
|
502
|
+
const directPages = directData.query?.pages;
|
|
503
|
+
|
|
504
|
+
if (directPages) {
|
|
505
|
+
const page = Object.values(directPages)[0] as any;
|
|
506
|
+
if (page && !page.missing && !(page.pageprops && page.pageprops.disambiguation !== undefined)) {
|
|
507
|
+
const fullExtract = page.extract || "";
|
|
508
|
+
let paragraphs = fullExtract.split(/\n\n|\r\n\r\n/);
|
|
509
|
+
let firstParagraph = paragraphs[0].trim();
|
|
510
|
+
if (!firstParagraph || firstParagraph.length > 1500) {
|
|
511
|
+
const lines = fullExtract.split(/\n|\r/);
|
|
512
|
+
if (lines[0].trim()) firstParagraph = lines[0].trim();
|
|
513
|
+
}
|
|
514
|
+
if (firstParagraph.length > 1000) {
|
|
515
|
+
const truncated = firstParagraph.substring(0, 1000);
|
|
516
|
+
const lastPeriod = truncated.lastIndexOf('.');
|
|
517
|
+
if (lastPeriod > 500) {
|
|
518
|
+
firstParagraph = truncated.substring(0, lastPeriod + 1);
|
|
519
|
+
} else {
|
|
520
|
+
firstParagraph = truncated + "...";
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
const finalExtract = firstParagraph || null;
|
|
524
|
+
if (finalExtract) {
|
|
525
|
+
const redirected = !!directData.query?.redirects;
|
|
526
|
+
// Simple heuristic to extract a year (first 4-digit number that looks like a year)
|
|
527
|
+
let year: number | null = null;
|
|
528
|
+
const yearMatch = finalExtract.match(/\b(18|19|20)\d{2}\b/);
|
|
529
|
+
if (yearMatch) {
|
|
530
|
+
year = parseInt(yearMatch[0], 10);
|
|
531
|
+
}
|
|
532
|
+
return { extract: finalExtract, pageid: page.pageid || null, title: page.title || null, redirected, year };
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
} catch { }
|
|
537
|
+
return null;
|
|
538
|
+
};
|
|
539
|
+
|
|
540
|
+
// We no longer strip parentheticals here because they are often critical
|
|
541
|
+
// for disambiguation (e.g., "Republic (book)" vs "Republic").
|
|
542
|
+
const cleanQuery = query.trim();
|
|
543
|
+
const normalized = cleanQuery.toLowerCase();
|
|
544
|
+
const queryNameParts = normalized.split(/[\s-]+/).filter(w => w.length > 2);
|
|
545
|
+
const looksLikePersonName = queryNameParts.length >= 2 && !/\d/.test(cleanQuery);
|
|
546
|
+
const queryLastName = looksLikePersonName ? queryNameParts[queryNameParts.length - 1].toLowerCase() : null;
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
// 0. If the caller provided an explicit disambiguated title, honor it IMMEDIATELY
|
|
550
|
+
// before stripping (...) or performing contextual search.
|
|
551
|
+
// (e.g., "Prince (musician)" must resolve to the musician, not the generic royal title "Prince").
|
|
552
|
+
const trimmedQuery = query.trim();
|
|
553
|
+
if (trimmedQuery.includes("(") && trimmedQuery.includes(")")) {
|
|
554
|
+
const direct = await tryDirectLookup(trimmedQuery);
|
|
555
|
+
if (direct?.extract) {
|
|
556
|
+
// console.log(`🎯 [Wiki] Explicit parenthetical match found for "${trimmedQuery}". Using disambiguated page.`);
|
|
557
|
+
return direct;
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
|
|
561
|
+
// 1. Prioritize the exact query term (minus parentheses).
|
|
562
|
+
// If "Miles Davis" exists as a direct page, we should use it IMMEDIATELY
|
|
563
|
+
// without drowning it in contextual search (which might favor the Quintet).
|
|
564
|
+
const directExact = await tryDirectLookup(cleanQuery);
|
|
565
|
+
if (directExact?.extract) {
|
|
566
|
+
if (queryLastName) {
|
|
567
|
+
const titleParts = String(directExact.title || "").toLowerCase().split(/[\s-]+/).filter(w => w.length > 2);
|
|
568
|
+
// If it's a redirect, we are MUCH more lenient. Napoleon Bonaparte -> Napoleon is a classic case.
|
|
569
|
+
if (!titleParts.includes(queryLastName) && !directExact.redirected) {
|
|
570
|
+
// console.log(`⚠️ [Wiki] Ignoring direct match "${directExact.title}" for "${cleanQuery}" (missing last-name match and no redirect).`);
|
|
571
|
+
} else {
|
|
572
|
+
// console.log(`🎯 [Wiki] Exact title match found for "${cleanQuery}". Using primary page (redirected: ${directExact.redirected}).`);
|
|
573
|
+
return directExact;
|
|
574
|
+
}
|
|
575
|
+
} else {
|
|
576
|
+
// console.log(`🎯 [Wiki] Exact title match found for "${cleanQuery}". Using primary page.`);
|
|
577
|
+
return directExact;
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
const contextIndicatesMusic = (ctx?: string) => {
|
|
582
|
+
const c = (ctx || "").toLowerCase();
|
|
583
|
+
return /\b(music|musician|album|song|artist|band|pop|rock|hip hop|rap|r\&b|jazz)\b/.test(c);
|
|
584
|
+
};
|
|
585
|
+
|
|
586
|
+
const contextIndicatesBusiness = (ctx?: string) => {
|
|
587
|
+
const c = (ctx || "").toLowerCase();
|
|
588
|
+
return /\b(business|businessman|businesswoman|entrepreneur|investor|venture|vc|private equity|founder|co-founder|ceo|executive|chairman|president|startup|company|technology|tech|product|innovation)\b/.test(c);
|
|
589
|
+
};
|
|
590
|
+
|
|
591
|
+
const looksLikeRoyalTitleDefinition = (extract?: string | null) => {
|
|
592
|
+
const e = (extract || "").toLowerCase();
|
|
593
|
+
if (!e) return false;
|
|
594
|
+
// Common for "Prince", "Duke", etc. pages that are definitions rather than the intended proper noun.
|
|
595
|
+
return (
|
|
596
|
+
e.includes(" is a male ruler") ||
|
|
597
|
+
e.includes(" is a female ruler") ||
|
|
598
|
+
e.includes(" is a title") ||
|
|
599
|
+
e.includes(" is a royal") ||
|
|
600
|
+
e.includes(" member of a monarch") ||
|
|
601
|
+
e.includes(" ranked below a king") ||
|
|
602
|
+
e.includes(" of a monarch's") ||
|
|
603
|
+
e.includes(" of a monarch’s")
|
|
604
|
+
);
|
|
605
|
+
};
|
|
606
|
+
|
|
607
|
+
// Generic-definition pages often steal ambiguous entertainment titles (e.g., "Euphoria" the feeling
|
|
608
|
+
// vs. "Euphoria (TV series)"). When we have context (like "Zendaya"), we should prefer contextual search.
|
|
609
|
+
const looksLikeGenericAbstractDefinition = (extract?: string | null) => {
|
|
610
|
+
const e = (extract || "").toLowerCase().trim();
|
|
611
|
+
if (!e) return false;
|
|
612
|
+
// Keep this narrow: emotions/feelings/states/conditions rather than historical eras, etc.
|
|
613
|
+
return (
|
|
614
|
+
e.includes(" is a feeling of") ||
|
|
615
|
+
e.includes(" is an emotion") ||
|
|
616
|
+
e.includes(" is a mental state") ||
|
|
617
|
+
e.includes(" is a psychological state") ||
|
|
618
|
+
e.includes(" is a state of") ||
|
|
619
|
+
e.includes(" is a feeling ") ||
|
|
620
|
+
e.includes(" is the feeling ") ||
|
|
621
|
+
e.includes(" is an experience of")
|
|
622
|
+
);
|
|
623
|
+
};
|
|
624
|
+
|
|
625
|
+
const isMediaTitle = (title: string) => /\b(film|tv series|miniseries|series|movie|documentary|episode)\b/i.test(title);
|
|
626
|
+
|
|
627
|
+
// 1. Prepare search terms.
|
|
628
|
+
// If query is "Republic (book)", baseQuery is "Republic" and paren is "book".
|
|
629
|
+
const baseQuery = query.replace(/\s*\(.*\)\s*/g, '').trim();
|
|
630
|
+
const parenMatch = query.match(/\((.*)\)/);
|
|
631
|
+
const paren = parenMatch ? parenMatch[1] : null;
|
|
632
|
+
|
|
633
|
+
// We search for the base query but include the parenthetical as additional context
|
|
634
|
+
// This is more robust than a literal search for "Republic (book)" which ranks partial matches poorly.
|
|
635
|
+
const finalSearchTerms = looksLikePersonName ? `"${baseQuery}"` : baseQuery;
|
|
636
|
+
const searchContext = [context, paren].filter(Boolean).join(' ');
|
|
637
|
+
const searchQuery = searchContext ? `${finalSearchTerms} ${searchContext}` : finalSearchTerms;
|
|
638
|
+
|
|
639
|
+
const avoidMedia = /\b(project|program|programme|operation|war|battle|campaign|treaty|scandal|scientist)\b/i.test(baseQuery);
|
|
640
|
+
const searchUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=${encodeURIComponent(searchQuery)}&srlimit=5&origin=*`;
|
|
641
|
+
const searchRes = await fetch(searchUrl);
|
|
642
|
+
const searchData = await searchRes.json();
|
|
643
|
+
|
|
644
|
+
let bestTitle = query;
|
|
645
|
+
if (searchData.query?.search?.length) {
|
|
646
|
+
const results = searchData.query.search;
|
|
647
|
+
const scoreResult = (r: any, index: number) => {
|
|
648
|
+
const title = r.title.toLowerCase();
|
|
649
|
+
const snippet = (r.snippet || '').toLowerCase();
|
|
650
|
+
let s = (index === 0) ? 200 : 0; // Small boost for the first result
|
|
651
|
+
|
|
652
|
+
// Normalized for scoring is the BASE query (e.g., "republic")
|
|
653
|
+
const normalizedBase = baseQuery.toLowerCase();
|
|
654
|
+
|
|
655
|
+
// Strongly penalize "List of ..." style pages unless the user explicitly asked for a list.
|
|
656
|
+
const queryWantsList = normalizedBase.startsWith("list of ") || normalizedBase.includes("awards") || normalizedBase.includes("nominations") || normalizedBase.includes("filmography") || normalizedBase.includes("discography");
|
|
657
|
+
const isListPage = title.startsWith("list of ") || title.includes(" awards and nominations") || title.includes(" filmography") || title.includes(" discography");
|
|
658
|
+
if (isListPage && !queryWantsList) {
|
|
659
|
+
s -= 2500;
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
// 1. Title matching (exact or with parenthetical disambiguation)
|
|
663
|
+
// Ignore "The ", "A ", "An " at the start for matching
|
|
664
|
+
const cleanTitle = title.replace(/^(the|a|an)\s+/i, '');
|
|
665
|
+
const cleanNormalized = normalizedBase.replace(/^(the|a|an)\s+/i, '');
|
|
666
|
+
|
|
667
|
+
if (cleanTitle === cleanNormalized) {
|
|
668
|
+
s += 1000;
|
|
669
|
+
} else if (cleanTitle.startsWith(cleanNormalized + " (")) {
|
|
670
|
+
s += 800; // Match for "Base Title (Anything)"
|
|
671
|
+
}
|
|
672
|
+
|
|
673
|
+
// 2. Parenthetical matching
|
|
674
|
+
// If the user provided "(book)", and we find a page with info containing "book", give a bonus.
|
|
675
|
+
if (paren) {
|
|
676
|
+
const parenLower = paren.toLowerCase();
|
|
677
|
+
if (title.includes(parenLower)) s += 500;
|
|
678
|
+
if (snippet.includes(parenLower)) s += 200;
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
// Music disambiguation: prefer musician/band pages over generic title definitions.
|
|
682
|
+
const musicCtx = contextIndicatesMusic(context);
|
|
683
|
+
const bizCtx = contextIndicatesBusiness(context);
|
|
684
|
+
if (musicCtx) {
|
|
685
|
+
if (title.includes("(musician)") || title.includes("(singer)") || title.includes("(band)")) s += 1600;
|
|
686
|
+
if (/\b(singer|musician|songwriter|rapper|band)\b/.test(snippet)) s += 800;
|
|
687
|
+
// Penalize royalty-title definition pages when user context is music.
|
|
688
|
+
if (title === normalized && /\b(male ruler|monarch|royal|noble)\b/.test(snippet)) s -= 1600;
|
|
689
|
+
}
|
|
690
|
+
// Business disambiguation: prefer entrepreneur/business pages; penalize musicians.
|
|
691
|
+
if (bizCtx) {
|
|
692
|
+
if (title.includes("(businessman)") || title.includes("(entrepreneur)") || title.includes("(businesswoman)")) s += 1600;
|
|
693
|
+
if (/\b(entrepreneur|businessman|businesswoman|investor|executive|chief executive|ceo|founder|co-founder)\b/.test(snippet)) s += 900;
|
|
694
|
+
if (title.includes("(musician)") || title.includes("(singer)") || title.includes("(band)")) s -= 1400;
|
|
695
|
+
if (/\b(singer|musician|songwriter|rapper|band)\b/.test(snippet)) s -= 900;
|
|
696
|
+
}
|
|
697
|
+
|
|
698
|
+
// 2. Context matching
|
|
699
|
+
if (context) {
|
|
700
|
+
const words = context.toLowerCase().split(/\s+/).filter(w => w.length > 2);
|
|
701
|
+
words.forEach(word => {
|
|
702
|
+
if (title.includes(word)) s += 100;
|
|
703
|
+
if (snippet.includes(word)) s += 50;
|
|
704
|
+
});
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
// 3. Media penalties
|
|
708
|
+
const suffixes = ["(TV series)", "(film)", "(miniseries)", "(series)", "(movie)", "(documentary)", "(episode)"];
|
|
709
|
+
const isMedia = suffixes.some(suf => title.includes(suf.toLowerCase())) || isMediaTitle(title);
|
|
710
|
+
if (isMedia) {
|
|
711
|
+
if (avoidMedia) s -= 800;
|
|
712
|
+
else s -= 400;
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
// 4. Term scoring
|
|
716
|
+
const sportsTerms = ['football', 'soccer', 'rugby', 'cricket', 'goalkeeper', 'striker', 'club', 'fc', 'afc', 'baseball', 'mlb', 'pcl', 'outfield', 'pitcher'];
|
|
717
|
+
sportsTerms.forEach(t => {
|
|
718
|
+
const regex = new RegExp(`\\b${t}\\b`, 'i');
|
|
719
|
+
if (regex.test(title) || regex.test(snippet)) s -= 400;
|
|
720
|
+
});
|
|
721
|
+
|
|
722
|
+
// Contextual boost: if context clearly implies film/TV, upweight media pages
|
|
723
|
+
const filmContext = (context || '').toLowerCase().match(/\b(film|movie|director|screenplay|starring|cast|ridley scott|screenwriter|cinematography|box office)\b/);
|
|
724
|
+
if (filmContext) {
|
|
725
|
+
if (title.includes('(film)') || title.includes('(movie)') || title.includes('(tv') || title.includes('(television)')) {
|
|
726
|
+
s += 1200;
|
|
727
|
+
}
|
|
728
|
+
if (title.includes('(2000 film)') || title.includes('(199') || title.includes('(20')) {
|
|
729
|
+
s += 600; // gentle year-specific nudge, not title-specific
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
if (/born\s\d{4}/.test(snippet)) s += 80;
|
|
734
|
+
|
|
735
|
+
// General Infrastructure/Geographic penalty when searching for a proper person-like name
|
|
736
|
+
if (looksLikePersonName) {
|
|
737
|
+
const infraTerms = ['airport', 'station', 'stadium', 'university', 'bridge', 'plaza', 'square', 'park', 'boulevard', 'avenue', 'road', 'highway', 'complex', 'tower'];
|
|
738
|
+
infraTerms.forEach(t => {
|
|
739
|
+
if (title.includes(t)) s -= 2000;
|
|
740
|
+
});
|
|
741
|
+
}
|
|
742
|
+
|
|
743
|
+
// Penalize non-Latin characters if the query is Latin (prevents Japanese/Chinese/etc. titles on en.wikipedia)
|
|
744
|
+
const isLatinQuery = !/[^\u0000-\u024F]/.test(cleanQuery);
|
|
745
|
+
const titleHasNonLatin = /[^\u0000-\u024F]/.test(title);
|
|
746
|
+
if (isLatinQuery && titleHasNonLatin) {
|
|
747
|
+
s -= 2000;
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
return s;
|
|
751
|
+
};
|
|
752
|
+
|
|
753
|
+
const scored = results.map((r: any, idx: number) => ({ r, score: scoreResult(r, idx) })).sort((a: any, b: any) => b.score - a.score);
|
|
754
|
+
bestTitle = scored[0]?.r?.title || query;
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
const titleNameParts = bestTitle.toLowerCase().split(/[\s-]+/).filter(w => w.length > 2);
|
|
758
|
+
// Require at least one full word match, not just a substring overlap
|
|
759
|
+
const hasFullWordMatch = queryNameParts.some(q => titleNameParts.includes(q));
|
|
760
|
+
const hasOverlap = queryNameParts.some(q => titleNameParts.some(t => t.includes(q) || q.includes(t)));
|
|
761
|
+
|
|
762
|
+
// 4. Resolve the best matching page from the search results, skipping disambiguation pages.
|
|
763
|
+
const candidates = searchData.query?.search?.length ? scored.map((s: any) => s.r.title) : [query];
|
|
764
|
+
|
|
765
|
+
for (const titleToTry of candidates) {
|
|
766
|
+
if (queryNameParts.length > 0) {
|
|
767
|
+
const candidateParts = titleToTry.toLowerCase().split(/[\s-]+/).filter(w => w.length > 2);
|
|
768
|
+
|
|
769
|
+
// STRICT PERSON MATCHING:
|
|
770
|
+
// If we are looking for a person (query has 2+ name parts),
|
|
771
|
+
// require ALL significant query tokens to be present in the candidate title tokens.
|
|
772
|
+
// This prevents "Perry Neubauer" from matching "Jeff Neubauer".
|
|
773
|
+
if (queryNameParts.length >= 2) {
|
|
774
|
+
const allMatch = queryNameParts.every(q => candidateParts.includes(q));
|
|
775
|
+
// Special exemption: if the candidate title is a single word and it is the FIRST result
|
|
776
|
+
// and it is one of the query parts, allow it (e.g. "Napoleon").
|
|
777
|
+
const isFirstMatch = titleToTry === candidates[0];
|
|
778
|
+
const queryNameMatchesTitle = queryNameParts.some(q => candidateParts.includes(q));
|
|
779
|
+
const isShortFamousName = candidateParts.length === 1 && queryNameMatchesTitle;
|
|
780
|
+
|
|
781
|
+
if (!allMatch && !(isFirstMatch && isShortFamousName)) {
|
|
782
|
+
// console.log(`⚠️ [Wiki] Skipping title "${titleToTry}" for query "${cleanQuery}" (not all name parts match).`);
|
|
783
|
+
continue;
|
|
784
|
+
}
|
|
785
|
+
} else if (queryLastName && !candidateParts.includes(queryLastName)) {
|
|
786
|
+
// console.log(`⚠️ [Wiki] Skipping title "${titleToTry}" for query "${cleanQuery}" (missing last-name match).`);
|
|
787
|
+
continue;
|
|
788
|
+
}
|
|
789
|
+
}
|
|
790
|
+
const summaryUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=extracts|pageprops&exintro&explaintext&titles=${encodeURIComponent(titleToTry)}&redirects=1&origin=*`;
|
|
791
|
+
const summaryRes = await fetch(summaryUrl);
|
|
792
|
+
const summaryData = await summaryRes.json();
|
|
793
|
+
const pages = summaryData.query?.pages;
|
|
794
|
+
|
|
795
|
+
if (pages) {
|
|
796
|
+
const page = Object.values(pages)[0] as any;
|
|
797
|
+
if (page && !page.missing && !(page.pageprops && page.pageprops.disambiguation !== undefined)) {
|
|
798
|
+
const fullExtract = page.extract || "";
|
|
799
|
+
|
|
800
|
+
// Guard: if a title that looks like an org/venue redirects to a person page, ignore it.
|
|
801
|
+
if (looksLikeOrgTitle(cleanQuery) && String(page.title || "").toLowerCase() !== cleanQuery.toLowerCase()) {
|
|
802
|
+
if (looksLikePersonExtract(fullExtract)) {
|
|
803
|
+
// console.log(`⚠️ [Wiki] Ignoring redirect mismatch for org-ish query "${cleanQuery}" -> "${page.title}"`);
|
|
804
|
+
continue; // Try next search result
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
// Split by double newline to get the first paragraph
|
|
809
|
+
let paragraphs = fullExtract.split(/\n\n|\r\n\r\n/);
|
|
810
|
+
let firstParagraph = paragraphs[0].trim();
|
|
811
|
+
|
|
812
|
+
if (!firstParagraph || firstParagraph.length > 1500) {
|
|
813
|
+
const lines = fullExtract.split(/\n|\r/);
|
|
814
|
+
if (lines[0].trim()) firstParagraph = lines[0].trim();
|
|
815
|
+
}
|
|
816
|
+
|
|
817
|
+
if (firstParagraph.length > 1000) {
|
|
818
|
+
const truncated = firstParagraph.substring(0, 1000);
|
|
819
|
+
const lastPeriod = truncated.lastIndexOf('.');
|
|
820
|
+
if (lastPeriod > 500) {
|
|
821
|
+
firstParagraph = truncated.substring(0, lastPeriod + 1);
|
|
822
|
+
} else {
|
|
823
|
+
firstParagraph = truncated + "...";
|
|
824
|
+
}
|
|
825
|
+
}
|
|
826
|
+
|
|
827
|
+
const finalExtract = firstParagraph || null;
|
|
828
|
+
if (!finalExtract || finalExtract.length < 50) {
|
|
829
|
+
// console.log(`⚠️ [Wiki] Extract for "${page.title}" too short (${finalExtract?.length || 0} chars). Skipping.`);
|
|
830
|
+
continue; // Try next search result
|
|
831
|
+
}
|
|
832
|
+
|
|
833
|
+
if (queryNameParts.length >= 2) {
|
|
834
|
+
const pageParts = String(page.title || "").toLowerCase().split(/[\s-]+/).filter(w => w.length > 2);
|
|
835
|
+
const allMatch = queryNameParts.every(q => pageParts.includes(q));
|
|
836
|
+
if (!allMatch) {
|
|
837
|
+
// console.log(`⚠️ [Wiki] Skipping resolved title "${page.title}" for "${cleanQuery}" (not all name parts match).`);
|
|
838
|
+
continue;
|
|
839
|
+
}
|
|
840
|
+
} else if (queryLastName) {
|
|
841
|
+
const pageParts = String(page.title || "").toLowerCase().split(/[\s-]+/).filter(w => w.length > 2);
|
|
842
|
+
if (!pageParts.includes(queryLastName)) {
|
|
843
|
+
// console.log(`⚠️ [Wiki] Skipping resolved title "${page.title}" for "${cleanQuery}" (missing last-name match).`);
|
|
844
|
+
continue;
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
|
|
848
|
+
// console.log(`✅ [Wiki] Found summary for "${page.title}": "${finalExtract?.substring(0, 100)}..." (${finalExtract?.length || 0} chars)`);
|
|
849
|
+
|
|
850
|
+
if (avoidMedia && isMediaTitle(page.title)) {
|
|
851
|
+
const retryQuery = `${cleanQuery} ${context || 'person'}`;
|
|
852
|
+
// console.log(`⚠️ [Wiki] Media page returned for "${cleanQuery}". Retrying with "${retryQuery}".`);
|
|
853
|
+
const retry = await fetchWikipediaSummary(retryQuery, context, visited, depth + 1);
|
|
854
|
+
if (retry.extract) return retry;
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
let year: number | null = null;
|
|
858
|
+
const yearMatch = (finalExtract || '').match(/\b(18|19|20)\d{2}\b/);
|
|
859
|
+
if (yearMatch) {
|
|
860
|
+
year = parseInt(yearMatch[0], 10);
|
|
861
|
+
}
|
|
862
|
+
|
|
863
|
+
return { extract: finalExtract, pageid: page.pageid || null, title: page.title || null, year, mentioningPageTitles: null, searchContext: null };
|
|
864
|
+
}
|
|
865
|
+
}
|
|
866
|
+
}
|
|
867
|
+
|
|
868
|
+
const validResults = results
|
|
869
|
+
.filter((r: any) => {
|
|
870
|
+
const snip = (r.snippet || "").toLowerCase();
|
|
871
|
+
const q = cleanQuery.toLowerCase();
|
|
872
|
+
if (snip.includes(q)) return true;
|
|
873
|
+
const parts = q.split(/\s+/).filter(p => p.length > 2);
|
|
874
|
+
if (parts.length >= 2) return parts.every(p => snip.includes(p));
|
|
875
|
+
return false;
|
|
876
|
+
});
|
|
877
|
+
|
|
878
|
+
const searchContext = validResults
|
|
879
|
+
.slice(0, 3)
|
|
880
|
+
.map((r: any) => (r.snippet || '').replace(/<[^>]*>/g, '').replace(/"/g, '"').trim())
|
|
881
|
+
.filter(Boolean)
|
|
882
|
+
.join(" ... ");
|
|
883
|
+
|
|
884
|
+
const mentioningPageTitles = validResults
|
|
885
|
+
.map((r: any) => r.title)
|
|
886
|
+
.filter((t: string) => !t.toLowerCase().startsWith('list of '))
|
|
887
|
+
.slice(0, 3);
|
|
888
|
+
|
|
889
|
+
if (searchContext.length > 50) {
|
|
890
|
+
// console.log(`ℹ️ [Wiki] No direct article match, using search snippets from ${mentioningPageTitles.join(', ')} as context for "${cleanQuery}".`);
|
|
891
|
+
return {
|
|
892
|
+
extract: searchContext,
|
|
893
|
+
pageid: null,
|
|
894
|
+
title: query,
|
|
895
|
+
mentioningPageTitles,
|
|
896
|
+
searchContext
|
|
897
|
+
};
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
// console.log(`❌ [Wiki] No summary found for "${bestTitle}" via search. Attempting direct lookup for "${cleanQuery}".`);
|
|
901
|
+
|
|
902
|
+
// Direct lookup fallback (reuse helper)
|
|
903
|
+
|
|
904
|
+
// 2. Try Title Case (e.g. "andrew schloss" -> "Andrew Schloss")
|
|
905
|
+
const toTitleCase = (str: string) => {
|
|
906
|
+
return str.replace(/\w\S*/g, (txt) => txt.charAt(0).toUpperCase() + txt.substr(1).toLowerCase());
|
|
907
|
+
};
|
|
908
|
+
const titleCased = toTitleCase(cleanQuery);
|
|
909
|
+
if (titleCased !== cleanQuery) {
|
|
910
|
+
// console.log(`⚠️ [Wiki] Direct lookup failed given casing. Retrying with Title Case: "${titleCased}"`);
|
|
911
|
+
const titleMatch = await tryDirectLookup(titleCased);
|
|
912
|
+
if (titleMatch) return titleMatch;
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
// console.log(`❌ [Wiki] No summary found for "${bestTitle}" matches.`);
|
|
916
|
+
}
|
|
917
|
+
} catch (e) {
|
|
918
|
+
console.error(`❌ [Wiki] Error fetching summary for "${query}":`, e);
|
|
919
|
+
}
|
|
920
|
+
// Final fallback: if context was provided and failed, retry once with no context
|
|
921
|
+
if (context && !triedNoContext) {
|
|
922
|
+
// console.log(`⚠️ [Wiki] Retrying "${query}" without context (previous attempt returned empty).`);
|
|
923
|
+
return await fetchWikipediaSummary(query, undefined, visited, depth + 1, true);
|
|
924
|
+
}
|
|
925
|
+
return { extract: null, pageid: null, title: null };
|
|
926
|
+
};
|
|
927
|
+
|
|
928
|
+
// Fetch a longer plain-text extract (not just the intro) to help find evidence snippets.
|
|
929
|
+
// Returns at most maxChars characters of the page extract.
|
|
930
|
+
export const fetchWikipediaExtract = async (
|
|
931
|
+
title: string,
|
|
932
|
+
maxChars: number = 12000
|
|
933
|
+
): Promise<{ extract: string | null; pageid: number | null; title: string | null }> => {
|
|
934
|
+
try {
|
|
935
|
+
// Note: exchars is intentionally omitted — the Wikipedia API mis-truncates short articles
|
|
936
|
+
// when exchars is set (returns fewer chars than the article actually contains). We fetch
|
|
937
|
+
// the full extract and truncate client-side instead.
|
|
938
|
+
const url = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=extracts|pageprops&explaintext&titles=${encodeURIComponent(title)}&redirects=1&origin=*`;
|
|
939
|
+
// Hard cap: a hung Wikipedia response must not strand graph expansion spinners indefinitely.
|
|
940
|
+
const res = await fetchWithTimeout(url, {}, 25_000);
|
|
941
|
+
const data = await res.json();
|
|
942
|
+
const pages = data.query?.pages;
|
|
943
|
+
if (!pages) return { extract: null, pageid: null, title: null };
|
|
944
|
+
const page = Object.values(pages)[0] as any;
|
|
945
|
+
if (page && !page.missing && !(page.pageprops && page.pageprops.disambiguation !== undefined)) {
|
|
946
|
+
// Guard: if a title that looks like an org/venue redirects to a person page, ignore it.
|
|
947
|
+
if (looksLikeOrgTitle(title) && String(page.title || "").toLowerCase() !== String(title).toLowerCase()) {
|
|
948
|
+
const full = String(page.extract || "");
|
|
949
|
+
if (looksLikePersonExtract(full)) return { extract: null, pageid: null, title: null };
|
|
950
|
+
}
|
|
951
|
+
const rawExtract: string | null = page.extract || null;
|
|
952
|
+
const extract = rawExtract && rawExtract.length > maxChars ? rawExtract.slice(0, maxChars) : rawExtract;
|
|
953
|
+
return { extract, pageid: page.pageid || null, title: page.title || null };
|
|
954
|
+
}
|
|
955
|
+
} catch (e) {
|
|
956
|
+
// console.warn("fetchWikipediaExtract failed:", title, e);
|
|
957
|
+
}
|
|
958
|
+
return { extract: null, pageid: null, title: null };
|
|
959
|
+
};
|
|
960
|
+
|
|
961
|
+
// Shape of the key-people data resolved from a single Wikidata entity.
// Each role list holds English labels; property mappings match the claim
// extraction in fetchWikidataKeyPeopleForTitle (P112/P1037/P169/P3342).
type WikidataKeyPeople = {
  // The resolved Wikidata item id (e.g. "Q95").
  wikidataId: string;
  // Labels from P112 (founder).
  founders: string[];
  // Labels from P1037 (director / manager).
  directors: string[];
  // Labels from P169 (chief executive officer).
  ceos: string[];
  // Labels from P3342 (significant person / key person).
  keyPeople: string[];
};
|
|
968
|
+
|
|
969
|
+
const extractWikidataItemIds = (claims: any, prop: string): string[] => {
|
|
970
|
+
const arr = claims?.[prop] || [];
|
|
971
|
+
const ids: string[] = [];
|
|
972
|
+
for (const c of arr) {
|
|
973
|
+
const v = c?.mainsnak?.datavalue?.value;
|
|
974
|
+
const id = v?.id;
|
|
975
|
+
if (typeof id === "string" && /^Q\d+$/.test(id)) ids.push(id);
|
|
976
|
+
}
|
|
977
|
+
return ids;
|
|
978
|
+
};
|
|
979
|
+
|
|
980
|
+
// Fetch cast/performer labels from Wikidata (P161) for a given title.
|
|
981
|
+
export const fetchWikidataCastForTitle = async (title: string, limit: number = 12): Promise<string[]> => {
|
|
982
|
+
const controller = new AbortController();
|
|
983
|
+
const timeoutId = setTimeout(() => controller.abort(), 5000);
|
|
984
|
+
const signal = controller.signal;
|
|
985
|
+
|
|
986
|
+
try {
|
|
987
|
+
let wikidataId: string | null = null;
|
|
988
|
+
try {
|
|
989
|
+
const pagepropsUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&titles=${encodeURIComponent(title)}&redirects=1&origin=*`;
|
|
990
|
+
const ppRes = await fetch(pagepropsUrl, { signal });
|
|
991
|
+
const ppData = await ppRes.json();
|
|
992
|
+
const pages = ppData?.query?.pages;
|
|
993
|
+
if (pages) {
|
|
994
|
+
const page = Object.values(pages)[0] as any;
|
|
995
|
+
const candidate = page?.pageprops?.wikibase_item;
|
|
996
|
+
if (typeof candidate === "string" && /^Q\d+$/.test(candidate)) {
|
|
997
|
+
wikidataId = candidate;
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
} catch {
|
|
1001
|
+
// ignore; fall through to search
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
if (!wikidataId) {
|
|
1005
|
+
wikidataId = await resolveWikidataIdBySearch(title, signal);
|
|
1006
|
+
}
|
|
1007
|
+
if (!wikidataId) return [];
|
|
1008
|
+
|
|
1009
|
+
const entityUrl = `https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=claims&ids=${encodeURIComponent(wikidataId)}&origin=*`;
|
|
1010
|
+
const entRes = await fetch(entityUrl, { signal });
|
|
1011
|
+
const entData = await entRes.json();
|
|
1012
|
+
const claims = entData?.entities?.[wikidataId]?.claims;
|
|
1013
|
+
if (!claims) return [];
|
|
1014
|
+
|
|
1015
|
+
const castIds = extractWikidataItemIds(claims, "P161");
|
|
1016
|
+
if (!castIds.length) return [];
|
|
1017
|
+
|
|
1018
|
+
const labelMap = await fetchWikidataLabels(castIds, signal);
|
|
1019
|
+
const labels = castIds
|
|
1020
|
+
.map(id => labelMap[id])
|
|
1021
|
+
.filter((x): x is string => typeof x === "string" && x.trim().length > 0);
|
|
1022
|
+
|
|
1023
|
+
return Array.from(new Set(labels)).slice(0, limit);
|
|
1024
|
+
} catch {
|
|
1025
|
+
return [];
|
|
1026
|
+
} finally {
|
|
1027
|
+
clearTimeout(timeoutId);
|
|
1028
|
+
}
|
|
1029
|
+
};
|
|
1030
|
+
|
|
1031
|
+
const fetchWikidataLabels = async (ids: string[], signal: AbortSignal): Promise<Record<string, string>> => {
|
|
1032
|
+
const out: Record<string, string> = {};
|
|
1033
|
+
const uniq = Array.from(new Set(ids)).filter(Boolean);
|
|
1034
|
+
for (let i = 0; i < uniq.length; i += 50) {
|
|
1035
|
+
const chunk = uniq.slice(i, i + 50);
|
|
1036
|
+
try {
|
|
1037
|
+
const url = `https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=labels&languages=en&ids=${encodeURIComponent(chunk.join("|"))}&origin=*`;
|
|
1038
|
+
const res = await fetch(url, { signal });
|
|
1039
|
+
const data = await res.json();
|
|
1040
|
+
const entities = data?.entities || {};
|
|
1041
|
+
for (const [id, ent] of Object.entries<any>(entities)) {
|
|
1042
|
+
const label = ent?.labels?.en?.value;
|
|
1043
|
+
if (label) out[id] = label;
|
|
1044
|
+
}
|
|
1045
|
+
} catch {
|
|
1046
|
+
// ignore partial failures
|
|
1047
|
+
}
|
|
1048
|
+
}
|
|
1049
|
+
return out;
|
|
1050
|
+
};
|
|
1051
|
+
|
|
1052
|
+
const resolveWikidataIdBySearch = async (label: string, signal: AbortSignal): Promise<string | null> => {
|
|
1053
|
+
try {
|
|
1054
|
+
const url = `https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&language=en&limit=8&search=${encodeURIComponent(label)}&origin=*`;
|
|
1055
|
+
const res = await fetch(url, { signal });
|
|
1056
|
+
const data = await res.json();
|
|
1057
|
+
const results: any[] = data?.search || [];
|
|
1058
|
+
if (!results.length) return null;
|
|
1059
|
+
|
|
1060
|
+
const normalized = label.trim().toLowerCase();
|
|
1061
|
+
const mustContainMuseum = /\bmuseum\b/i.test(label);
|
|
1062
|
+
const scored = results.map(r => {
|
|
1063
|
+
const lab = String(r?.label || "");
|
|
1064
|
+
const desc = String(r?.description || "");
|
|
1065
|
+
const l = lab.trim().toLowerCase();
|
|
1066
|
+
const d = desc.trim().toLowerCase();
|
|
1067
|
+
let s = 0;
|
|
1068
|
+
if (l === normalized) s += 1000;
|
|
1069
|
+
if (l.includes(normalized)) s += 300;
|
|
1070
|
+
if (mustContainMuseum && (l.includes("museum") || d.includes("museum"))) s += 500;
|
|
1071
|
+
if (looksLikeOrgTitle(label) && (d.includes("museum") || d.includes("company") || d.includes("organisation") || d.includes("organization"))) s += 120;
|
|
1072
|
+
return { id: r?.id, score: s };
|
|
1073
|
+
}).sort((a, b) => b.score - a.score);
|
|
1074
|
+
|
|
1075
|
+
const best = scored[0]?.id;
|
|
1076
|
+
return typeof best === "string" && /^Q\d+$/.test(best) ? best : null;
|
|
1077
|
+
} catch {
|
|
1078
|
+
return null;
|
|
1079
|
+
}
|
|
1080
|
+
};
|
|
1081
|
+
|
|
1082
|
+
/**
 * Fetch founders / directors / CEOs / key people for an org-like Wikipedia
 * title by resolving the title to a Wikidata entity and reading its
 * key-people claims (P112, P1037, P169, P3342).
 *
 * Results — including negative lookups — are memoized on a window-level map
 * (`window.__wikidataPeopleCache`), so repeat calls for the same title are
 * free. Best-effort: any network/parse failure caches and returns null.
 * NOTE(review): the window-backed cache assumes a browser environment —
 * confirm if this module is ever loaded server-side.
 *
 * @param title English Wikipedia page title (or free-text label).
 * @returns Role lists with English labels, or null when no Q-id resolves,
 *          the entity has no claims, or every role list comes back empty.
 */
export const fetchWikidataKeyPeopleForTitle = async (title: string): Promise<WikidataKeyPeople | null> => {
  // Cache key is case/whitespace-normalized so "WNDR Museum" and "wndr museum" share an entry.
  const cacheKey = `wikidata_key_people|${(title || "").trim().toLowerCase()}`;
  if (!(window as any).__wikidataPeopleCache) (window as any).__wikidataPeopleCache = new Map<string, WikidataKeyPeople | null>();
  const cache: Map<string, WikidataKeyPeople | null> = (window as any).__wikidataPeopleCache;
  if (cache.has(cacheKey)) return cache.get(cacheKey) || null;

  // Abort the whole lookup chain after 6s so a slow API never stalls the caller.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 6000);
  const signal = controller.signal;

  try {
    // 1) Resolve Wikidata Q-id from the English Wikipedia page.
    let wikidataId: string | null = null;
    try {
      const pagepropsUrl = `https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&titles=${encodeURIComponent(title)}&redirects=1&origin=*`;
      const ppRes = await fetch(pagepropsUrl, { signal });
      const ppData = await ppRes.json();
      const pages = ppData?.query?.pages;
      if (pages) {
        const page = Object.values(pages)[0] as any;
        const resolvedTitle = String(page?.title || "");
        const candidate = page?.pageprops?.wikibase_item;
        // If the "Wikipedia title" redirects to an unrelated person page, ignore it and fall back to Wikidata search.
        // (Exception: a resolved title still containing "museum" is treated as a legitimate redirect.)
        const mismatch =
          looksLikeOrgTitle(title) &&
          resolvedTitle &&
          resolvedTitle.toLowerCase() !== String(title).toLowerCase() &&
          !/\bmuseum\b/i.test(resolvedTitle);
        if (!mismatch && typeof candidate === "string" && /^Q\d+$/.test(candidate)) {
          wikidataId = candidate;
        }
      }
    } catch {
      // ignore and fall back to search
    }

    // Fall back: label search (handles Wikipedia redirects like "WNDR Museum" -> a person).
    if (!wikidataId) {
      wikidataId = await resolveWikidataIdBySearch(title, signal);
    }
    if (!wikidataId) {
      // Cache the miss so we don't re-query a title known to have no entity.
      cache.set(cacheKey, null);
      return null;
    }

    // 2) Pull key-people claims.
    const entityUrl = `https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=claims&ids=${encodeURIComponent(wikidataId)}&origin=*`;
    const entRes = await fetch(entityUrl, { signal });
    const entData = await entRes.json();
    const entity = entData?.entities?.[wikidataId];
    const claims = entity?.claims;
    if (!claims) {
      cache.set(cacheKey, null);
      return null;
    }

    // Wikidata properties:
    // - P112: founder
    // - P1037: director/manager
    // - P169: chief executive officer
    // - P3342: significant person / key person
    const founderIds = extractWikidataItemIds(claims, "P112");
    const directorIds = extractWikidataItemIds(claims, "P1037");
    const ceoIds = extractWikidataItemIds(claims, "P169");
    const keyPersonIds = extractWikidataItemIds(claims, "P3342");

    // Resolve every role's ids to English labels in one batched pass.
    const labelMap = await fetchWikidataLabels(
      [...founderIds, ...directorIds, ...ceoIds, ...keyPersonIds],
      signal
    );

    // Map ids -> labels, dropping unresolved/blank labels and deduping.
    const toLabels = (ids: string[]) =>
      Array.from(new Set(ids.map(id => labelMap[id]).filter((x): x is string => typeof x === "string" && x.trim().length > 0)));

    const result: WikidataKeyPeople = {
      wikidataId,
      founders: toLabels(founderIds),
      directors: toLabels(directorIds),
      ceos: toLabels(ceoIds),
      keyPeople: toLabels(keyPersonIds)
    };

    // Truthy when at least one role list is non-empty (number used as boolean).
    const hasAny =
      result.founders.length || result.directors.length || result.ceos.length || result.keyPeople.length;

    cache.set(cacheKey, hasAny ? result : null);
    return hasAny ? result : null;
  } catch {
    // Network/abort failures are cached as misses to avoid hammering the API.
    cache.set(cacheKey, null);
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
};
|