@terraleiloa/opportunity-extraction 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,20 @@
1
+ # @humandata/opportunity-extraction
2
+
3
+ Reusable pipeline for:
4
+
5
+ 1. Discovering instruction document URLs from an opportunity page (LLM + regex fallback, Grants.gov Simpler API optional).
6
+ 2. Fetching instruction text via injected `DocumentFetcher`.
7
+ 3. Chunk analysis + synthesis to produce `applicationSections` (same contract as `LLM_Instruction.md` / document-upload API).
8
+
9
+ ## Public API
10
+
11
+ - `extractOpportunitySections(input, deps)` — main orchestrator.
12
+ - `findApplicationInstructionUrls`, `getGrantsGovInstructionUrlsFromApi` — lower-level discovery.
13
+ - `parseSectionsFromAnalysisResponse`, `mapToAppSections` — parse Phase-2 JSON.
14
+ - `createOpenAiChatLlmClient(openai)` — OpenAI v4 adapter.
15
+
16
+ See [examples/second-app/README.md](./examples/second-app/README.md).
17
+
18
+ ## Tests
19
+
20
+ From repo root: `pnpm test:opportunity-extraction` (after install).
package/dist/index.cjs ADDED
@@ -0,0 +1,612 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+
20
+ // src/index.ts
21
+ var index_exports = {};
22
+ __export(index_exports, {
23
+ createOpenAiChatLlmClient: () => createOpenAiChatLlmClient,
24
+ defaultFetchHtml: () => defaultFetchHtml,
25
+ defaultSplitIntoChunks: () => defaultSplitIntoChunks,
26
+ extractGrantSectionsFromCombinedText: () => extractGrantSectionsFromCombinedText,
27
+ extractOpportunityIdFromUrl: () => extractOpportunityIdFromUrl,
28
+ extractOpportunitySections: () => extractOpportunitySections,
29
+ extractTextFromHtml: () => extractTextFromHtml,
30
+ findApplicationInstructionUrls: () => findApplicationInstructionUrls,
31
+ getGrantsGovInstructionUrlsFromApi: () => getGrantsGovInstructionUrlsFromApi,
32
+ getSourceSpecificGuidance: () => getSourceSpecificGuidance,
33
+ getUrlDiscoveryPrompt: () => getUrlDiscoveryPrompt,
34
+ mapToAppSections: () => mapToAppSections,
35
+ parseSectionsFromAnalysisResponse: () => parseSectionsFromAnalysisResponse,
36
+ resolveSourceType: () => resolveSourceType,
37
+ synthesizeSectionsFromDocumentText: () => synthesizeSectionsFromDocumentText,
38
+ validateAndResolveUrl: () => validateAndResolveUrl
39
+ });
40
+ module.exports = __toCommonJS(index_exports);
41
+
42
+ // src/adapters/resolveSourceType.ts
43
/**
 * Decide which opportunity source a page belongs to.
 *
 * An explicit (truthy) hint always wins. Otherwise the lower-cased URL is
 * scanned for known substrings, in a fixed priority order.
 *
 * @param {string} pageUrl - URL of the opportunity page.
 * @param {string} [hint] - Caller-supplied source type; returned as-is when truthy.
 * @returns {string} The hint, or one of "grantsgov", "samgov", "calgrants", "bespoke".
 */
function resolveSourceType(pageUrl, hint) {
  if (hint) {
    return hint;
  }
  const haystack = pageUrl.toLowerCase();
  // Order matters: the first rule with a matching needle decides the result.
  const rules = [
    ["grantsgov", ["grants.gov"]],
    ["samgov", ["sam.gov"]],
    ["calgrants", ["calgrants", "california"]]
  ];
  const hit = rules.find(([, needles]) => needles.some((needle) => haystack.includes(needle)));
  return hit ? hit[0] : "bespoke";
}
51
+
52
+ // src/discovery/html.ts
53
/**
 * Resolve a possibly-relative URL against a base URL.
 *
 * @param {string} baseUrl - URL the relative reference is resolved against.
 * @param {string} relativeUrl - Relative or absolute URL reference.
 * @returns {string} The absolute href, or `relativeUrl` unchanged when it cannot be parsed.
 */
function resolveUrl(baseUrl, relativeUrl) {
  let absolute = relativeUrl;
  try {
    absolute = new URL(relativeUrl, baseUrl).href;
  } catch {
    // Unparseable input: hand the reference back untouched.
  }
  return absolute;
}
60
/**
 * Reduce an HTML string to its visible text.
 *
 * `<script>` and `<style>` elements are dropped first, then every remaining
 * tag is discarded and the text runs between tags are joined with single
 * spaces. HTML entities are left as they appear in the markup.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} Whitespace-normalised text content.
 */
function extractTextFromHtml(html) {
  const scriptBlock = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
  const styleBlock = /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi;
  const withoutCode = html.replace(scriptBlock, "").replace(styleBlock, "");
  // Tokenise into tags and text runs; only the text runs are kept.
  const tokens = withoutCode.match(/<[^>]+>|([^<]+)/g) ?? [];
  return tokens
    .filter((token) => !token.startsWith("<"))
    .map((token) => token.trim())
    .filter((token) => token.length > 0)
    .join(" ")
    .replace(/\s+/g, " ")
    .trim();
}
74
/**
 * Turn a URL reference into an absolute http(s) URL, or reject it.
 *
 * References that already start with `http://` or `https://` are returned
 * verbatim once they parse; anything else is first resolved against
 * `baseUrl`. Any scheme other than http/https (javascript:, data:, file:,
 * mailto:, ...) is rejected.
 *
 * @param {string} url - Absolute or relative URL reference.
 * @param {string} baseUrl - Base used to resolve relative references.
 * @returns {string|null} The usable URL, or null when invalid or not http(s).
 */
function validateAndResolveUrl(url, baseUrl) {
  try {
    const isAbsoluteHttp = url.startsWith("http://") || url.startsWith("https://");
    const candidate = isAbsoluteHttp ? url : resolveUrl(baseUrl, url);
    // Parsing both validates the candidate and exposes its scheme.
    const scheme = new URL(candidate).protocol.toLowerCase();
    return scheme === "http:" || scheme === "https:" ? candidate : null;
  } catch {
    return null;
  }
}
93
+
94
+ // src/discovery/sourceGuidance.ts
95
/**
 * Return the prompt fragment with source-specific discovery instructions.
 *
 * Only own keys of the guidance table are honoured, so values such as
 * "constructor" or "toString" fall back to the bespoke guidance instead of
 * leaking an inherited Object.prototype member into the prompt.
 *
 * @param {string} [sourceType] - "grantsgov", "calgrants", "samgov" or "bespoke".
 * @returns {string} Markdown-formatted guidance; bespoke guidance for unknown or missing types.
 */
function getSourceSpecificGuidance(sourceType) {
  const sourceGuidance = {
    grantsgov: `**GRANTS.GOV SPECIFIC INSTRUCTIONS:**

- Look for the "View Opportunity" link or opportunity synopsis
- Find the full solicitation/FOA document (usually a PDF)
- Check for required application guides or instructions
- Note if submission is through Grants.gov Workspace or another system
- Extract CFDA/ALN numbers if present
- Look for synergy descriptions at grants.gov and also simpler.grants.gov (the newer site)`,
    calgrants: `**CALGRANTS SPECIFIC INSTRUCTIONS:**

- Look for the grant URL or application portal link
- Find downloadable guidelines or instructions (often PDF)
- Check for California-specific registration requirements
- Note the application submission portal or email requirements
- Extract any state-specific compliance requirements`,
    samgov: `**SAM.GOV SPECIFIC INSTRUCTIONS:**

- Look for the full solicitation or RFP documents
- Check for amendments or modifications
- Find Section L (Instructions) and Section M (Evaluation) if present
- Note contract vehicle type and set-aside status
- Extract NAICS code requirements and size standards`,
    bespoke: `**CUSTOM OPPORTUNITY INSTRUCTIONS:**

- Follow the provided URL to find application instructions
- Look for any embedded or linked application guidelines
- Extract contact information for questions
- Note any unique submission requirements`
  };
  const key = sourceType ?? "bespoke";
  // Own-property check: a plain `table[key]` would also resolve inherited members.
  const isKnown = typeof key === "string" && Object.prototype.hasOwnProperty.call(sourceGuidance, key);
  return isKnown ? sourceGuidance[key] : sourceGuidance.bespoke;
}
128
+
129
+ // src/discovery/findInstructionUrls.ts
130
/**
 * Unwrap a Markdown code fence around a JSON payload.
 *
 * A "```json" fence is tried first, then a bare "```" fence. When neither
 * yields a non-empty capture the trimmed input is returned unchanged.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The fenced content (trimmed), or the trimmed input.
 */
function stripJsonCodeFences(text) {
  const trimmed = text.trim();
  const fences = [
    ["```json", /```json\n?([\s\S]*?)\n?```/],
    ["```", /```\n?([\s\S]*?)\n?```/]
  ];
  for (const [marker, pattern] of fences) {
    if (!trimmed.includes(marker)) {
      continue;
    }
    const inner = trimmed.match(pattern)?.[1];
    if (inner) {
      return inner.trim();
    }
  }
  return trimmed;
}
142
/**
 * Build the user prompt asking the model to list and rank instruction URLs.
 *
 * The prompt embeds the page URL and the source-specific guidance returned by
 * `getSourceSpecificGuidance`; the page content is appended by the caller.
 *
 * @param {string} pageUrl - URL of the opportunity page being analysed.
 * @param {string} [sourceType] - Source type forwarded to `getSourceSpecificGuidance`.
 * @returns {string} The prompt text, ending with the lead-in to the page content.
 */
function getUrlDiscoveryPrompt(pageUrl, sourceType) {
  const sourceSpecific = getSourceSpecificGuidance(sourceType);
  const lines = [
    "You are analyzing a government grant/contract opportunity page. Your task is to:",
    "",
    "1. **Extract all unique URLs** found on this page (including links in the HTML, buttons, data attributes, etc.)",
    "2. **Identify which URL(s) contain the full application instructions** - these are typically:",
    "- Full announcement pages (often at files.simpler.grants.gov, grants.nih.gov, or agency-specific sites)",
    '- PDF documents with "instructions", "guidance", "solicitation", "FOA", or "RFA" in the name',
    "- Application guide pages",
    "- Program summary pages",
    "",
    "3. **Rank URLs by likelihood** of containing complete instructions (most likely first, return top 1-2)",
    "",
    "Return your response as a JSON object with this structure:",
    "{",
    '"instructionUrls": [',
    "{",
    '"url": "most likely instruction URL",',
    '"confidence": "high|medium|low",',
    '"reason": "why this URL likely contains instructions"',
    "},",
    "...",
    "],",
    '"uniqueUrls": ["url1", "url2", ...]',
    "}",
    "",
    `**URL TO ANALYZE: ${pageUrl}**`,
    "",
    `${sourceSpecific}`,
    "",
    "Extract URLs from the following HTML content:"
  ];
  return lines.join("\n");
}

// System message that pins the JSON response contract for URL discovery.
var SYSTEM_MESSAGE_URL_DISCOVERY = 'You are a web scraping assistant that extracts URLs from HTML content and identifies which URLs are most likely to contain application instructions. Return only valid JSON with the structure: { "instructionUrls": [...], "uniqueUrls": [...] }.';
175
/**
 * Regex-based fallback for instruction URL discovery (used when no LLM is
 * available or its answer is unusable).
 *
 * Scans the raw markup for absolute http(s) URLs, keeps those whose text
 * contains an instruction-like keyword, and returns up to `maxCandidates`
 * validated URLs in document order.
 *
 * Because the URLs come from raw HTML, `&amp;` is decoded back to `&` and
 * trailing sentence punctuation is removed before use; otherwise links
 * copied from attributes or running text would not be fetchable.
 *
 * @param {string} html - Raw HTML of the opportunity page.
 * @param {string} baseUrl - Page URL, forwarded to `validateAndResolveUrl`.
 * @param {number} [maxCandidates] - Maximum number of URLs to return; no cap when undefined.
 * @returns {string[]} Unique candidate URLs.
 */
function fallbackExtractInstructionUrls(html, baseUrl, maxCandidates) {
  const urlPattern = /https?:\/\/[^\s"'<>)]+/gi;
  const keywordPattern = /(instruction|guide|solicitation|foa|rfa|announcement|full)/i;
  const accepted = [];
  const seen = new Set();
  for (const rawMatch of html.match(urlPattern) ?? []) {
    if (accepted.length >= maxCandidates) {
      break;
    }
    // Undo attribute escaping and drop punctuation that belongs to the prose.
    const cleaned = rawMatch.replace(/&amp;/gi, "&").replace(/[.,;:!?]+$/, "");
    if (seen.has(cleaned)) {
      continue;
    }
    seen.add(cleaned);
    if (!keywordPattern.test(cleaned)) {
      continue;
    }
    const resolved = validateAndResolveUrl(cleaned, baseUrl);
    if (resolved) {
      accepted.push(resolved);
    }
  }
  return accepted;
}
187
// Sort weight for the model-reported confidence labels (higher sorts first).
var CONFIDENCE_ORDER = { high: 3, medium: 2, low: 1 };

// Rank a model candidate; unknown, missing or inherited labels count as "low".
function rankInstructionCandidate(item) {
  const label = String(item?.confidence).toLowerCase();
  return Object.prototype.hasOwnProperty.call(CONFIDENCE_ORDER, label) ? CONFIDENCE_ORDER[label] : 1;
}

// Collect up to `limit` unique http(s) link targets from href attributes in the markup.
function collectHrefTargets(html, pageUrl, limit) {
  const targets = [];
  const seen = new Set();
  const hrefPattern = /\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
  for (const match of html.matchAll(hrefPattern)) {
    if (targets.length >= limit) {
      break;
    }
    const raw = (match[1] ?? match[2] ?? "").replace(/&amp;/gi, "&").trim();
    // Empty values and same-page fragments carry no document location.
    if (raw.length === 0 || raw.startsWith("#")) {
      continue;
    }
    const resolved = validateAndResolveUrl(raw, pageUrl);
    if (resolved && !seen.has(resolved)) {
      seen.add(resolved);
      targets.push(resolved);
    }
  }
  return targets;
}

/**
 * Find the URLs most likely to hold the full application instructions.
 *
 * When `llmCaller` is supplied, the model receives the page text plus the
 * list of link targets found in the markup (tag stripping alone would hide
 * every href from it) and returns ranked candidates. Candidates are sorted by
 * confidence, then validated and de-duplicated until `maxCandidates` usable
 * URLs are collected, so an invalid or repeated top entry no longer consumes
 * a slot. If the model is absent, fails, or yields nothing usable, the regex
 * fallback is used instead.
 *
 * @param {object} options
 * @param {string} options.pageUrl - URL of the opportunity page.
 * @param {string} options.html - Raw HTML of the page.
 * @param {string} [options.sourceType] - Source type for prompt guidance.
 * @param {(messages: Array<{role: string, content: string}>) => Promise<string>} [options.llmCaller]
 *   Chat-style model caller returning the raw response text.
 * @param {number} [options.maxCandidates=2] - Maximum number of URLs to return.
 * @returns {Promise<string[]>} Absolute http(s) URLs, best candidate first.
 */
async function findApplicationInstructionUrls(options) {
  const { pageUrl, html, sourceType, llmCaller, maxCandidates = 2 } = options;
  if (llmCaller) {
    const textContent = extractTextFromHtml(html);
    const prompt = getUrlDiscoveryPrompt(pageUrl, sourceType);
    let userContent = `${prompt}

HTML Content:
${textContent.substring(0, 5e4)}`;
    const linkTargets = collectHrefTargets(html, pageUrl, 200);
    if (linkTargets.length > 0) {
      userContent += `

Links found in the HTML (href targets resolved against the page URL):
${linkTargets.join("\n")}`;
    }
    try {
      const response = await llmCaller([
        { role: "system", content: SYSTEM_MESSAGE_URL_DISCOVERY },
        { role: "user", content: userContent }
      ]);
      const parsed = JSON.parse(stripJsonCodeFences(response));
      const candidates = Array.isArray(parsed?.instructionUrls) ? parsed.instructionUrls : [];
      const ranked = [...candidates].sort((a, b) => rankInstructionCandidate(b) - rankInstructionCandidate(a));
      const resolved = [];
      const seen = new Set();
      for (const item of ranked) {
        if (resolved.length >= maxCandidates) {
          break;
        }
        const url = item?.url;
        if (!url || typeof url !== "string") {
          continue;
        }
        const usable = validateAndResolveUrl(url, pageUrl);
        if (usable && !seen.has(usable)) {
          seen.add(usable);
          resolved.push(usable);
        }
      }
      if (resolved.length > 0) {
        return resolved;
      }
    } catch {
      // Deliberate best effort: a failed call or malformed JSON falls through
      // to the regex fallback below.
    }
  }
  return fallbackExtractInstructionUrls(html, pageUrl, maxCandidates);
}
228
+
229
+ // src/discovery/grantsGovApi.ts
230
// Locate the attachment list in the response shapes this client understands.
function pickSimplerApiAttachments(data) {
  if (Array.isArray(data.attachments)) {
    return data.attachments;
  }
  if (data.data && Array.isArray(data.data.attachments)) {
    return data.data.attachments;
  }
  if (data.package_details) {
    const packages = Array.isArray(data.package_details) ? data.package_details : [data.package_details];
    // The last package is treated as the most recent; the list may be empty.
    const mostRecent = packages[packages.length - 1];
    if (Array.isArray(mostRecent?.attachments)) {
      return mostRecent.attachments;
    }
    if (Array.isArray(mostRecent?.instruction_files)) {
      return mostRecent.instruction_files;
    }
  }
  return [];
}

/**
 * Ask the Simpler Grants.gov API for an opportunity's instruction documents.
 *
 * PDF/HTML attachments whose file name (or flags) look instruction-related are
 * returned first. If none qualify, the opportunity's additional-info URL is
 * used. Explicit instruction/PDF URLs on the record are appended when not
 * already present.
 *
 * The request is aborted after 30 seconds; the timer stays armed until the
 * response body has been read so a stalled body cannot hang the caller.
 *
 * @param {string} opportunityId - Opportunity id; URL-encoded into the request path.
 * @param {string} apiKey - Value sent in the `X-API-Key` header.
 * @returns {Promise<string[]>} Candidate instruction URLs; empty when the opportunity is not found (404).
 * @throws {Error} On 401/403, 429, any other non-OK status, a non-JSON response,
 *   or when the request is aborted or fails at the network level.
 */
async function getGrantsGovInstructionUrlsFromApi(opportunityId, apiKey) {
  const url = `https://api.simpler.grants.gov/v1/opportunities/${encodeURIComponent(String(opportunityId))}`;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 3e4);
  try {
    const response = await fetch(url, {
      method: "GET",
      headers: {
        "X-API-Key": apiKey,
        "Content-Type": "application/json"
      },
      signal: controller.signal
    });
    if (!response.ok) {
      if (response.status === 404) return [];
      if (response.status === 401 || response.status === 403) {
        throw new Error("Unauthorized: Invalid API key or insufficient permissions");
      }
      if (response.status === 429) {
        throw new Error("Rate limit exceeded: Too many requests");
      }
      throw new Error(`Simpler API error: ${response.status} ${response.statusText}`);
    }
    const contentType = response.headers.get("content-type") ?? "";
    if (!contentType.includes("application/json")) {
      throw new Error("Simpler API returned non-JSON response");
    }
    // A literal JSON `null` body is treated like an empty record.
    const data = (await response.json()) ?? {};
    const instructionUrls = [];
    for (const att of pickSimplerApiAttachments(data)) {
      if (!att || typeof att !== "object") continue;
      const downloadUrl = att.download_path ?? att.download_url ?? att.file_url ?? att.url ?? att.href;
      if (!downloadUrl || typeof downloadUrl !== "string") continue;
      const filename = String(att.file_name ?? att.filename ?? att.name ?? "").toLowerCase();
      const type = String(att.type ?? att.contentType ?? att.content_type ?? "").toLowerCase();
      const isPdf = type.includes("pdf") || filename.endsWith(".pdf");
      const isHtml = type.includes("html") || filename.endsWith(".html") || filename.endsWith(".htm");
      if (!isPdf && !isHtml) continue;
      const keywords = ["instruction", "guide", "application", "package", "solicitation", "foa", "rfa", "announcement"];
      const isInstructionRelated = keywords.some((word) => filename.includes(word)) || att.isInstruction === true || att.category === "instruction";
      if (isInstructionRelated) instructionUrls.push(downloadUrl);
    }
    const isUsable = (value) => typeof value === "string" && value.length > 0;
    if (instructionUrls.length === 0) {
      const additionalInfoUrl = data.additional_info_url ?? data.additionalInfoUrl ?? data.data?.additional_info_url ?? data.data?.additionalInfoUrl;
      if (isUsable(additionalInfoUrl)) instructionUrls.push(additionalInfoUrl);
    }
    const instructionUrl = data.instruction_url ?? data.instructionUrl ?? data.data?.instruction_url ?? data.data?.instructionUrl;
    if (isUsable(instructionUrl) && !instructionUrls.includes(instructionUrl)) instructionUrls.push(instructionUrl);
    const pdfUrl = data.pdf_url ?? data.pdfUrl ?? data.data?.pdf_url ?? data.data?.pdfUrl;
    if (isUsable(pdfUrl) && !instructionUrls.includes(pdfUrl)) instructionUrls.push(pdfUrl);
    return instructionUrls;
  } finally {
    clearTimeout(timeoutId);
  }
}
298
+
299
+ // src/discovery/opportunityId.ts
300
/**
 * Pull the Grants.gov opportunity id out of a page URL.
 *
 * Recognised forms, tried in order: a classic `search-results-detail/<digits>`
 * path, a Simpler `opportunity/<hex-and-dashes>` path, and an
 * `oppId=<digits>` query parameter.
 *
 * @param {string} url - Opportunity page URL.
 * @returns {string|null} The id, or null when no form matches or `url` is not a string.
 */
function extractOpportunityIdFromUrl(url) {
  const patterns = [
    /grants\.gov\/search-results-detail\/(\d+)/i,
    /simpler\.grants\.gov\/opportunity\/([a-f0-9-]+)/i,
    /oppId=(\d+)/i
  ];
  try {
    for (const pattern of patterns) {
      const id = url.match(pattern)?.[1];
      if (id) {
        return id;
      }
    }
  } catch {
    // `url` without a usable .match (e.g. null) counts as "no id".
  }
  return null;
}
313
+
314
+ // src/section-extraction/parseSections.ts
315
/**
 * Extract the well-formed `applicationSections` entries from a Phase-2 response.
 *
 * The response may be bare JSON or JSON inside a Markdown code fence. Entries
 * are kept only when they are objects with a string `name`, a boolean
 * `required` and an array `requirements`.
 *
 * @param {string} rawResponse - Raw model output.
 * @returns {object[]} Valid section objects; empty for non-string, empty, or unparseable input.
 */
function parseSectionsFromAnalysisResponse(rawResponse) {
  if (typeof rawResponse !== "string" || rawResponse.length === 0) {
    return [];
  }
  let payload = rawResponse.trim();
  // Exactly one fence style is tried: the json-tagged one when present, else the bare one.
  let fence = null;
  if (payload.includes("```json")) {
    fence = /```json\n?([\s\S]*?)\n?```/;
  } else if (payload.includes("```")) {
    fence = /```\n?([\s\S]*?)\n?```/;
  }
  const inner = fence ? payload.match(fence)?.[1] : undefined;
  if (inner) {
    payload = inner.trim();
  }
  try {
    const { applicationSections } = JSON.parse(payload);
    if (!Array.isArray(applicationSections)) {
      return [];
    }
    const isValidSection = (s) =>
      typeof s === "object" &&
      s !== null &&
      typeof s.name === "string" &&
      typeof s.required === "boolean" &&
      Array.isArray(s.requirements);
    return applicationSections.filter(isValidSection);
  } catch {
    return [];
  }
}
343
/**
 * Convert parsed grant sections into the application-section shape.
 *
 * The id is the lower-cased name with whitespace runs replaced by "-". Note
 * that the source `pageLimit` is surfaced under the `wordLimit` key.
 *
 * @param {object[]} sections - Sections as returned by `parseSectionsFromAnalysisResponse`.
 * @returns {object[]} Objects with id, title, description, requirements, wordLimit, required.
 */
function mapToAppSections(sections) {
  return sections.map(({ name, description, requirements, pageLimit, required }) => ({
    id: name.toLowerCase().replace(/\s+/g, "-"),
    title: name,
    description: description ?? "",
    requirements: Array.isArray(requirements) ? requirements : [],
    wordLimit: pageLimit ?? "Not specified",
    required: required ?? true
  }));
}
353
+
354
+ // src/section-extraction/synthesizeFromChunks.ts
355
+ var PHASE1_SYSTEM = "You are a specialized AI writing assistant for creating Request for Proposals (RFPs) for federal, state, and local government agencies. Your primary function is to help government procurement and grant professionals draft clear, compliant, and effective solicitation documents for grants and contracts. Extract the eligibility section, method of application, websites to register at, links to application guides, required items for application and key information from this document chunk, and return a comprehensive summary of it. Focus on the specific content in this chunk and note if it contains eligibility section, method of application, websites to register at, links to application guides, required items for application or other key information.";
356
+ var PHASE2_SYSTEM = 'You are a specialized AI writing assistant for creating Request for Proposals (RFPs) for federal, state, and local government agencies. Your primary function is to help government procurement and grant professionals draft clear, compliant, and effective solicitation documents for grants and contracts. You have been given analyses of different chunks of a document. Please synthesize these analyses into one comprehensive, coherent summary that covers the entire document. Focus on eligibility criteria, requirements, and key information across all chunks. Additionally, identify and extract all required grant application sections from this document. Return your response as a JSON object with the following structure: { "analysis": "comprehensive summary", "eligibilitySection": "extracted eligibility requirements", "methodOfApplication": "how to apply", "websitesToRegister": ["list of websites"], "applicationGuides": ["list of guides"], "requiredItems": ["list of required items"], "keyInformation": "other important information", "applicationSections": [{"name": "section name", "required": true/false, "pageLimit": "page limit if specified", "description": "what should be included", "requirements": ["list of specific requirements"]}] }';
357
// Maximum number of characters per chunk produced by the default splitter.
var DEFAULT_CHUNK_SIZE = 12e3;

/**
 * Default splitter: cut the text into consecutive fixed-size character windows.
 *
 * Boundaries fall purely on character counts; no attempt is made to respect
 * word or sentence boundaries. Text that fits in one window (including the
 * empty string) yields a single chunk.
 *
 * @param {string} text - Document text.
 * @returns {Promise<string[]>} Chunks in document order.
 */
async function defaultSplitIntoChunks(text) {
  const pieceCount = Math.max(1, Math.ceil(text.length / DEFAULT_CHUNK_SIZE));
  return Array.from({ length: pieceCount }, (_, index) =>
    text.slice(index * DEFAULT_CHUNK_SIZE, (index + 1) * DEFAULT_CHUNK_SIZE)
  );
}
366
/**
 * Two-phase LLM pass over the instruction text.
 *
 * Phase 1 summarises the chunks in batches, sequentially and in document
 * order; Phase 2 merges the batch summaries into one JSON response.
 *
 * @param {object} options
 * @param {(messages: Array<{role: string, content: string}>) => Promise<string>} options.llm
 *   Chat-style model caller returning the raw response text.
 * @param {string} options.combinedDocumentText - All instruction text to analyse.
 * @param {(text: string) => (string[]|Promise<string[]>)} [options.splitIntoChunks]
 *   Custom splitter; `defaultSplitIntoChunks` is used when omitted.
 * @param {number} [options.maxChunksPerRequest=3] - Chunks per Phase-1 call. Values
 *   that are not finite or are below 1 fall back to 3 (previously 0 or NaN made
 *   the batching loop never advance); fractional values are rounded down.
 * @returns {Promise<{rawPhase2: string, artifacts: object}>} Raw Phase-2 text and the
 *   fields `parseStructuredArtifacts` extracted from it.
 * @throws Whatever `options.llm` or the splitter rejects with.
 */
async function synthesizeSectionsFromDocumentText(options) {
  const { llm, combinedDocumentText, splitIntoChunks, maxChunksPerRequest = 3 } = options;
  const requestedBatch = Number(maxChunksPerRequest);
  const batchSize = Number.isFinite(requestedBatch) && requestedBatch >= 1 ? Math.floor(requestedBatch) : 3;
  const splitter = splitIntoChunks ?? defaultSplitIntoChunks;
  const chunks = await splitter(combinedDocumentText);
  let combinedAnalysis = "";
  for (let i = 0; i < chunks.length; i += batchSize) {
    const currentChunks = chunks.slice(i, i + batchSize);
    const chunkText = currentChunks.join("\n\n--- CHUNK SEPARATOR ---\n\n");
    const chunkAnalysis = await llm([
      { role: "system", content: PHASE1_SYSTEM },
      {
        role: "user",
        content: `Please analyze this document chunk and extract the eligibility section, method of application, websites to register at, links to application guides, required items for application and key information. Provide a comprehensive summary.

Document chunk content:
${chunkText}`
      }
    ]);
    // Label each summary with the 1-based chunk range it covers.
    combinedAnalysis += `

--- CHUNK ${i + 1}-${Math.min(i + batchSize, chunks.length)} ANALYSIS ---

${chunkAnalysis}`;
  }
  const rawPhase2 = await llm([
    { role: "system", content: PHASE2_SYSTEM },
    {
      role: "user",
      content: `Please synthesize the following chunk analyses into one comprehensive summary of the entire document:

${combinedAnalysis}`
    }
  ]);
  const artifacts = parseStructuredArtifacts(rawPhase2);
  return { rawPhase2, artifacts };
}
402
/**
 * Pull the summary fields out of the Phase-2 JSON response.
 *
 * Code fences are unwrapped with the same tolerant patterns used by the other
 * response parsers in this file (the newline after the opening fence and
 * before the closing fence is optional), so single-line fenced JSON is parsed
 * too.
 *
 * @param {string} raw - Raw Phase-2 model output.
 * @returns {object} Object with analysis, eligibilitySection, methodOfApplication,
 *   websitesToRegister, applicationGuides, requiredItems and keyInformation; the
 *   list fields default to []. An empty object when `raw` is not a string, is not
 *   valid JSON, or does not decode to an object.
 */
function parseStructuredArtifacts(raw) {
  if (typeof raw !== "string") {
    return {};
  }
  let jsonString = raw.trim();
  if (jsonString.includes("```json")) {
    const m = jsonString.match(/```json\n?([\s\S]*?)\n?```/);
    if (m?.[1]) jsonString = m[1].trim();
  } else if (jsonString.includes("```")) {
    const m = jsonString.match(/```\n?([\s\S]*?)\n?```/);
    if (m?.[1]) jsonString = m[1].trim();
  }
  try {
    const parsed = JSON.parse(jsonString);
    if (parsed === null || typeof parsed !== "object") {
      return {};
    }
    const listOrEmpty = (value) => (Array.isArray(value) ? value : []);
    return {
      analysis: parsed.analysis,
      eligibilitySection: parsed.eligibilitySection,
      methodOfApplication: parsed.methodOfApplication,
      websitesToRegister: listOrEmpty(parsed.websitesToRegister),
      applicationGuides: listOrEmpty(parsed.applicationGuides),
      requiredItems: listOrEmpty(parsed.requiredItems),
      keyInformation: parsed.keyInformation
    };
  } catch {
    return {};
  }
}
426
/**
 * Run the two-phase synthesis and also parse the application sections from
 * its raw Phase-2 output.
 *
 * @param {object} options - Forwarded unchanged to `synthesizeSectionsFromDocumentText`.
 * @returns {Promise<{rawPhase2: string, artifacts: object, grantSections: object[]}>}
 */
async function extractGrantSectionsFromCombinedText(options) {
  const synthesis = await synthesizeSectionsFromDocumentText(options);
  return {
    rawPhase2: synthesis.rawPhase2,
    artifacts: synthesis.artifacts,
    grantSections: parseSectionsFromAnalysisResponse(synthesis.rawPhase2)
  };
}
431
+
432
+ // src/pipeline/defaultFetchHtml.ts
433
/**
 * Default page fetcher: GET the URL with a browser-like User-Agent and return
 * the response body as text.
 *
 * The request is aborted after 30 seconds, matching the timeout used for the
 * Simpler API call in this file, so a stalled server cannot hang the
 * pipeline. The timer stays armed until the body has been read.
 *
 * @param {string} url - Page URL to download.
 * @returns {Promise<string>} Response body.
 * @throws {Error} On a non-OK status, a network failure, or when the timeout aborts the request.
 */
async function defaultFetchHtml(url) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 3e4);
  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
      },
      redirect: "follow",
      signal: controller.signal
    });
    if (!response.ok) {
      throw new Error(`Failed to fetch URL: ${response.status} ${response.statusText}`);
    }
    // Awaited inside the try so the abort timer also covers the body download.
    return await response.text();
  } finally {
    clearTimeout(timeoutId);
  }
}
445
+
446
+ // src/pipeline/extractOpportunitySections.ts
447
/**
 * Main orchestrator: discover instruction documents for an opportunity page,
 * fetch their text, and synthesise the application sections.
 *
 * Steps:
 *  1. For Grants.gov pages with an API key, ask the Simpler API for documents.
 *  2. Otherwise (or when that yields nothing) discover URLs from the page HTML.
 *  3. Fetch each candidate through `deps.fetchDocument`.
 *  4. If no document could be loaded, fall back to the page's own text.
 *  5. Run chunk analysis + synthesis and map the result to app sections.
 *
 * HTML downloaded during discovery is kept and reused for the step-4 fallback,
 * so the opportunity page is requested at most once per call.
 *
 * @param {object} input
 * @param {string} input.pageUrl - Opportunity page URL (surrounding whitespace is trimmed).
 * @param {string} [input.html] - Pre-fetched page HTML; skips the download when provided.
 * @param {string} [input.sourceType] - Source type hint for `resolveSourceType`.
 * @param {object} deps
 * @param {Function} deps.llm - Chat-style model caller used for discovery and synthesis.
 * @param {Function} deps.fetchDocument - Async `(url)` loader; the result is read as
 *   `{ finalUrl, filename?, text }`, and a falsy result means "could not fetch".
 * @param {Function} [deps.fetchHtml] - Page downloader; defaults to `defaultFetchHtml`.
 * @param {Function} [deps.splitIntoChunks] - Custom chunk splitter.
 * @param {string} [deps.grantsGovApiKey] - Enables the Simpler API path.
 * @param {number} [deps.maxInstructionUrls=2] - Maximum instruction URLs to try.
 * @returns {Promise<object>} `{ success, sections, artifacts, instructionUrlsTried,
 *   warnings, error? }`. Fetch problems are reported through `success: false` / `error`.
 * @throws Whatever `deps.llm` rejects with during synthesis (not converted to a result).
 */
async function extractOpportunitySections(input, deps) {
  const warnings = [];
  const instructionUrlsTried = [];
  const pageUrl = input.pageUrl.trim();
  const sourceType = resolveSourceType(pageUrl, input.sourceType);
  const maxUrls = deps.maxInstructionUrls ?? 2;
  const fetchHtml = deps.fetchHtml ?? defaultFetchHtml;
  const describeError = (e) => (e instanceof Error ? e.message : String(e));
  const failure = (error, resultWarnings = warnings) => ({
    success: false,
    sections: [],
    artifacts: {},
    instructionUrlsTried,
    warnings: resultWarnings,
    error
  });
  const pageContentPart = (text) => `

--- Opportunity page content ---

${text.substring(0, 5e4)}`;
  // Page HTML downloaded by this call (never input.html); reused by the fallback.
  let fetchedPageHtml;
  let candidateUrls = [];
  if (sourceType === "grantsgov" && deps.grantsGovApiKey) {
    const id = extractOpportunityIdFromUrl(pageUrl);
    if (id) {
      try {
        candidateUrls = await getGrantsGovInstructionUrlsFromApi(id, deps.grantsGovApiKey);
        candidateUrls = candidateUrls.slice(0, maxUrls);
        for (const u of candidateUrls) {
          instructionUrlsTried.push({ url: u, fetched: false });
        }
      } catch (e) {
        // API trouble is not fatal: fall through to HTML-based discovery.
        warnings.push(`Simpler API: ${describeError(e)}`);
      }
    }
  }
  if (candidateUrls.length === 0) {
    let html = input.html;
    if (html == null) {
      try {
        fetchedPageHtml = await fetchHtml(pageUrl);
      } catch (e) {
        return failure(describeError(e));
      }
      html = fetchedPageHtml;
    }
    const discovered = await findApplicationInstructionUrls({
      pageUrl,
      html,
      sourceType,
      llmCaller: deps.llm,
      maxCandidates: maxUrls
    });
    candidateUrls = discovered;
    for (const u of discovered) {
      instructionUrlsTried.push({ url: u, fetched: false });
    }
  }
  const parts = [];
  for (const u of candidateUrls) {
    try {
      const doc = await deps.fetchDocument(u);
      if (doc) {
        const idx = instructionUrlsTried.findIndex((x) => x.url === u);
        if (idx >= 0) {
          instructionUrlsTried[idx] = { ...instructionUrlsTried[idx], fetched: true };
        }
        parts.push(
          `

--- Content from ${doc.finalUrl}${doc.filename ? ` (${doc.filename})` : ""} ---

${doc.text.substring(0, 5e4)}`
        );
      } else {
        warnings.push(`Failed to fetch document: ${u}`);
      }
    } catch (e) {
      warnings.push(`Error fetching ${u}: ${describeError(e)}`);
    }
  }
  if (parts.length === 0) {
    if (input.html) {
      const t = extractTextFromHtml(input.html);
      if (t.length > 100) {
        parts.push(pageContentPart(t));
        warnings.push("Used opportunity page HTML only; no instruction documents could be fetched.");
      }
    } else {
      try {
        // Reuse the copy downloaded for discovery instead of requesting the page again.
        const html = fetchedPageHtml ?? await fetchHtml(pageUrl);
        const t = extractTextFromHtml(html);
        if (t.length > 100) {
          parts.push(pageContentPart(t));
          warnings.push("Used fetched opportunity page only; instruction documents could not be loaded.");
        }
      } catch (e) {
        return failure(describeError(e));
      }
    }
  }
  if (parts.length === 0) {
    const message = "No instruction text available for synthesis.";
    return failure(message, [...warnings, message]);
  }
  const combined = parts.join("\n\n");
  const { grantSections, artifacts, rawPhase2 } = await extractGrantSectionsFromCombinedText({
    llm: deps.llm,
    combinedDocumentText: combined,
    splitIntoChunks: deps.splitIntoChunks
  });
  const sections = mapToAppSections(grantSections);
  return {
    success: sections.length > 0 || Boolean(artifacts.analysis),
    sections,
    artifacts: {
      ...artifacts,
      // Fall back to the raw Phase-2 text when no structured analysis was parsed.
      analysis: artifacts.analysis ?? rawPhase2
    },
    instructionUrlsTried,
    warnings
  };
}
580
+
581
+ // src/llm/openaiChat.ts
582
/**
 * Adapt an OpenAI SDK client to the chat-style LLM caller used by this package.
 *
 * @param {object} openai - Client exposing `chat.completions.create(request)`.
 * @param {object} [options]
 * @param {string} [options.model="gpt-4o"] - Model name sent with each request.
 * @param {number} [options.maxTokens=4000] - Value sent as `max_tokens`.
 * @returns {(messages: Array<{role: string, content: string}>) => Promise<string>}
 *   Caller resolving to the first choice's message content, or "" when absent.
 */
function createOpenAiChatLlmClient(openai, options) {
  const settings = {
    model: options?.model ?? "gpt-4o",
    maxTokens: options?.maxTokens ?? 4e3
  };
  return async function callChatCompletion(messages) {
    const request = { model: settings.model, messages, max_tokens: settings.maxTokens };
    const { choices } = await openai.chat.completions.create(request);
    return choices[0]?.message?.content ?? "";
  };
}
594
+ // Annotate the CommonJS export names for ESM import in node:
595
+ 0 && (module.exports = {
596
+ createOpenAiChatLlmClient,
597
+ defaultFetchHtml,
598
+ defaultSplitIntoChunks,
599
+ extractGrantSectionsFromCombinedText,
600
+ extractOpportunityIdFromUrl,
601
+ extractOpportunitySections,
602
+ extractTextFromHtml,
603
+ findApplicationInstructionUrls,
604
+ getGrantsGovInstructionUrlsFromApi,
605
+ getSourceSpecificGuidance,
606
+ getUrlDiscoveryPrompt,
607
+ mapToAppSections,
608
+ parseSectionsFromAnalysisResponse,
609
+ resolveSourceType,
610
+ synthesizeSectionsFromDocumentText,
611
+ validateAndResolveUrl
612
+ });