@terraleiloa/opportunity-extraction 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,213 @@
1
+ /**
2
+ * Source-agnostic contracts for opportunity instruction discovery and section extraction.
3
+ */
4
+ type SourceType = "grantsgov" | "calgrants" | "samgov" | "bespoke"; // Closed set of opportunity source identifiers; optional on OpportunityInput and returned by resolveSourceType.
5
+ interface LLMMessage { // One chat message in the array handed to an LLMClient.
6
+ role: "system" | "user" | "assistant"; // Speaker of the message; limited to these three literals.
7
+ content: string; // Message text.
8
+ }
9
+ /** Generic chat completion; plug in OpenAI, Anthropic, etc. Receives an LLMMessage array and resolves to a single string (presumably the model's reply text; confirm against the implementation). */
10
+ type LLMClient = (messages: LLMMessage[]) => Promise<string>;
11
+ interface FetchedDocument { // Value a DocumentFetcher resolves to for a fetched document.
12
+ /** Original request URL */
13
+ url: string;
14
+ /** Final URL after redirects */
15
+ finalUrl: string;
16
+ /** Extracted plain text (HTML stripped, PDF parsed) */
17
+ text: string;
18
+ mimeType: string; // Required. NOTE(review): the declaration does not say whether this is the response header value or a detected type; confirm.
19
+ filename?: string; // Optional; may be absent.
20
+ }
21
+ type DocumentFetcher = (url: string) => Promise<FetchedDocument | null>; // Caller-supplied fetcher; resolves to a FetchedDocument or null (presumably null when the URL cannot be fetched or parsed; confirm).
22
+ /**
23
+ * Fetch raw HTML for a page (needed for link discovery). Receives the page URL and resolves to the HTML as a string.
24
+ * If omitted, the pipeline uses a built-in fetch with a browser-like User-Agent. NOTE(review): defaultFetchHtml has this same signature and is presumably that built-in; confirm.
25
+ */
26
+ type HtmlFetcher = (url: string) => Promise<string>;
27
+ interface OpportunityInput { // Describes the opportunity page passed as `input` to extractOpportunitySections.
28
+ pageUrl: string; // Required URL of the opportunity page.
29
+ sourceType?: SourceType; // Optional source hint. NOTE(review): presumably derived from pageUrl when omitted (see resolveSourceType); confirm.
30
+ /** When set, skips fetching the opportunity page */
31
+ html?: string;
32
+ }
33
+ interface OpportunityExtractionDeps { // Dependencies passed as `deps` to extractOpportunitySections; only llm and fetchDocument are required.
34
+ llm: LLMClient; // Required chat-completion callback.
35
+ /** Fetch and parse instruction documents (PDF/HTML → text). */
36
+ fetchDocument: DocumentFetcher;
37
+ /**
38
+ * Optional: fetch raw HTML for the opportunity listing page.
39
+ * Used when `html` is not provided on the input.
40
+ */
41
+ fetchHtml?: HtmlFetcher;
42
+ /** Grants.gov Simpler API key (optional; enables API-first discovery). */
43
+ grantsGovApiKey?: string;
44
+ maxInstructionUrls?: number; // Optional. NOTE(review): presumably caps how many instruction URLs are tried; the default is not visible here.
45
+ /** Override default chunking for Phase 1 section synthesis. Same signature as defaultSplitIntoChunks. */
46
+ splitIntoChunks?: (text: string) => Promise<string[]>;
47
+ }
48
+ interface NormalizedGrantSection { // Element type of SectionExtractionResult.sections. NOTE(review): field-for-field identical to MappedAppSection.
49
+ id: string;
50
+ title: string;
51
+ description: string;
52
+ requirements: string[];
53
+ wordLimit: string | number; // Either a string or a number. NOTE(review): the format and units of the string form are not specified here; confirm.
54
+ required: boolean;
55
+ }
56
+ interface InstructionUrlAttempt { // Element type of SectionExtractionResult.instructionUrlsTried; only url is required.
57
+ url: string;
58
+ confidence?: string; // Plain string here, whereas InstructionUrlCandidate.confidence is "high" | "medium" | "low".
59
+ reason?: string;
60
+ fetched?: boolean; // NOTE(review): presumably true when the document at url was fetched successfully; confirm.
61
+ }
62
+ interface SectionExtractionArtifacts { // Auxiliary output returned alongside sections by the synthesis functions and in SectionExtractionResult.artifacts; every field is optional.
63
+ analysis?: string;
64
+ eligibilitySection?: string;
65
+ methodOfApplication?: string;
66
+ websitesToRegister?: string[];
67
+ applicationGuides?: string[];
68
+ requiredItems?: string[];
69
+ keyInformation?: string;
70
+ }
71
+ interface SectionExtractionResult { // Value that extractOpportunitySections resolves to.
72
+ success: boolean;
73
+ sections: NormalizedGrantSection[];
74
+ artifacts: SectionExtractionArtifacts;
75
+ instructionUrlsTried: InstructionUrlAttempt[];
76
+ warnings: string[];
77
+ error?: string; // Optional. NOTE(review): presumably populated when success is false; confirm.
78
+ }
79
+
80
+ /**
81
+ * Resolves source type from URL when not explicitly provided (matches handler registry behavior). Always returns a SourceType. NOTE(review): presumably `hint` is returned unchanged when supplied; confirm.
82
+ */
83
+ declare function resolveSourceType(pageUrl: string, hint?: SourceType): SourceType;
84
+
85
+ /**
86
+ * End-to-end: discover instruction URLs, fetch text, run chunk+synthesis, return normalized sections. Takes the page description (`input`) and the injected dependencies (`deps`); resolves to a SectionExtractionResult.
87
+ */
88
+ declare function extractOpportunitySections(input: OpportunityInput, deps: OpportunityExtractionDeps): Promise<SectionExtractionResult>;
89
+
90
+ /**
91
+ * Browser-like fetch for raw HTML (link discovery). Has the same signature as HtmlFetcher: takes a URL and resolves to a string.
92
+ */
93
+ declare function defaultFetchHtml(url: string): Promise<string>;
94
+
95
+ /**
96
+ * Extracts text content from HTML (strips script/style, normalizes spaces). Synchronous: takes an HTML string and returns a string.
97
+ */
98
+ declare function extractTextFromHtml(html: string): string;
99
+ /**
100
+ * Validates and resolves a URL against a base. Returns null for invalid or non-http(s) URLs. Otherwise returns the resolved URL as a string. NOTE(review): `url` presumably may be relative to `baseUrl`; confirm.
101
+ */
102
+ declare function validateAndResolveUrl(url: string, baseUrl: string): string | null;
103
+
104
+ /**
105
+ * Extracts opportunity ID from various grants.gov URL formats. Returns the ID as a string, or null (presumably when no ID can be found in `url`; confirm).
106
+ */
107
+ declare function extractOpportunityIdFromUrl(url: string): string | null;
108
+
109
+ declare function getSourceSpecificGuidance(sourceType?: SourceType): string; // Returns a guidance string for the optional source type. NOTE(review): presumably prompt text; the result when sourceType is omitted is not visible here.
110
+
111
+ interface InstructionUrlCandidate { // A candidate URL with a three-level confidence and a reason. NOTE(review): exported, but no function declared in this file takes or returns it.
112
+ url: string;
113
+ confidence: "high" | "medium" | "low";
114
+ reason: string;
115
+ }
116
+ interface FindInstructionUrlsOptions { // Options object for findApplicationInstructionUrls; pageUrl and html are required.
117
+ pageUrl: string;
118
+ html: string;
119
+ sourceType?: SourceType;
120
+ llmCaller?: (messages: Array<{ // Optional callback. NOTE(review): this inline type is structurally the same as LLMClient.
121
+ role: "system" | "user" | "assistant";
122
+ content: string;
123
+ }>) => Promise<string>;
124
+ maxCandidates?: number; // Optional. NOTE(review): presumably caps the number of URLs returned; the default is not visible here.
125
+ }
126
+ declare function getUrlDiscoveryPrompt(pageUrl: string, sourceType?: SourceType): string; // Returns a string for the page URL and optional source type. NOTE(review): presumably the URL-discovery prompt sent to the LLM; confirm.
127
+ /**
128
+ * Finds URLs that are most likely to contain full application instructions from an opportunity page. Takes a FindInstructionUrlsOptions object and resolves to an array of URL strings.
129
+ */
130
+ declare function findApplicationInstructionUrls(options: FindInstructionUrlsOptions): Promise<string[]>;
131
+
132
+ /**
133
+ * Grants.gov Simpler API – instruction URL discovery without scraping the listing page. Both `opportunityId` and `apiKey` are required strings; resolves to an array of URL strings.
134
+ */
135
+ declare function getGrantsGovInstructionUrlsFromApi(opportunityId: string, apiKey: string): Promise<string[]>;
136
+
137
+ interface GrantApplicationSection { // Section shape returned by parseSectionsFromAnalysisResponse and accepted by mapToAppSections.
138
+ name: string;
139
+ required: boolean;
140
+ pageLimit?: string; // Optional and string-typed, unlike MappedAppSection.wordLimit, which is string | number.
141
+ description: string;
142
+ requirements: string[];
143
+ }
144
+ interface MappedAppSection { // Element type returned by mapToAppSections. NOTE(review): field-for-field identical to NormalizedGrantSection.
145
+ id: string;
146
+ title: string;
147
+ description: string;
148
+ requirements: string[];
149
+ wordLimit: string | number;
150
+ required: boolean;
151
+ }
152
+ /**
153
+ * Parses Phase 2 synthesis LLM response and returns applicationSections. Synchronous; takes the raw response string. NOTE(review): behavior on malformed input (throw vs. empty array) is not visible here; confirm.
154
+ */
155
+ declare function parseSectionsFromAnalysisResponse(rawResponse: string): GrantApplicationSection[];
156
+ declare function mapToAppSections(sections: GrantApplicationSection[]): MappedAppSection[]; // Converts GrantApplicationSection[] to MappedAppSection[]. NOTE(review): the field mapping (presumably name to title and pageLimit to wordLimit) is not visible here; confirm.
157
+
158
+ /**
159
+ * Simple fixed-size chunking when no custom splitter is provided. Async: resolves to the array of chunk strings. NOTE(review): the chunk size is not visible in this declaration.
160
+ */
161
+ declare function defaultSplitIntoChunks(text: string): Promise<string[]>;
162
+ interface SynthesizeOptions { // Options for synthesizeSectionsFromDocumentText and extractGrantSectionsFromCombinedText. NOTE(review): not included in the export list.
163
+ llm: LLMClient; // Required chat-completion callback.
164
+ combinedDocumentText: string; // Required input text.
165
+ splitIntoChunks?: (text: string) => Promise<string[]>; // Optional; same signature as defaultSplitIntoChunks.
166
+ maxChunksPerRequest?: number; // Optional. NOTE(review): presumably limits chunks sent per LLM call; the default is not visible here.
167
+ }
168
+ /**
169
+ * Phase 1 + Phase 2 synthesis (matches document-upload route behavior). Resolves to an object with `rawPhase2` and `artifacts`; no parsed sections are included in this return type.
170
+ */
171
+ declare function synthesizeSectionsFromDocumentText(options: SynthesizeOptions): Promise<{
172
+ rawPhase2: string; // NOTE(review): presumably the unparsed Phase 2 LLM response; confirm.
173
+ artifacts: SectionExtractionArtifacts;
174
+ }>;
175
+ /**
176
+ * Full path: chunk analyses → Phase 2 → parsed sections + artifacts. Return type is that of synthesizeSectionsFromDocumentText plus `grantSections`.
177
+ */
178
+ declare function extractGrantSectionsFromCombinedText(options: SynthesizeOptions): Promise<{
179
+ rawPhase2: string; // NOTE(review): presumably the unparsed Phase 2 LLM response; confirm.
180
+ artifacts: SectionExtractionArtifacts;
181
+ grantSections: GrantApplicationSection[]; // The parsed sections.
182
+ }>;
183
+
184
+ type ChatCompletionsCreate = (args: { // Structural type of the `create` function that createOpenAiChatLlmClient requires at openai.chat.completions.create. NOTE(review): not included in the export list.
185
+ model: string;
186
+ messages: unknown[]; // Element type is left as unknown.
187
+ max_tokens?: number;
188
+ response_format?: {
189
+ type: "json_object"; // The only value this type allows.
190
+ };
191
+ }) => Promise<{
192
+ choices: Array<{
193
+ message?: {
194
+ content?: string | null; // May be missing or null.
195
+ };
196
+ }>;
197
+ }>;
198
+ /**
199
+ * Adapts an OpenAI client (v4) to {@link LLMClient}. Only `openai.chat.completions.create` is required of the client object.
200
+ * Use with `import OpenAI from "openai"` in the host application. `options`, `options.model` and `options.maxTokens` are all optional; their defaults are not visible in this declaration.
201
+ */
202
+ declare function createOpenAiChatLlmClient(openai: {
203
+ chat: {
204
+ completions: {
205
+ create: ChatCompletionsCreate;
206
+ };
207
+ };
208
+ }, options?: {
209
+ model?: string;
210
+ maxTokens?: number; // NOTE(review): presumably forwarded as max_tokens; confirm.
211
+ }): LLMClient;
212
+
213
+ export { type DocumentFetcher, type FetchedDocument, type FindInstructionUrlsOptions, type GrantApplicationSection, type HtmlFetcher, type InstructionUrlAttempt, type InstructionUrlCandidate, type LLMClient, type LLMMessage, type MappedAppSection, type NormalizedGrantSection, type OpportunityExtractionDeps, type OpportunityInput, type SectionExtractionArtifacts, type SectionExtractionResult, type SourceType, createOpenAiChatLlmClient, defaultFetchHtml, defaultSplitIntoChunks, extractGrantSectionsFromCombinedText, extractOpportunityIdFromUrl, extractOpportunitySections, extractTextFromHtml, findApplicationInstructionUrls, getGrantsGovInstructionUrlsFromApi, getSourceSpecificGuidance, getUrlDiscoveryPrompt, mapToAppSections, parseSectionsFromAnalysisResponse, resolveSourceType, synthesizeSectionsFromDocumentText, validateAndResolveUrl }; // NOTE(review): SynthesizeOptions and ChatCompletionsCreate appear in exported function signatures but are not exported themselves.
@@ -0,0 +1,213 @@
1
+ /**
2
+ * Source-agnostic contracts for opportunity instruction discovery and section extraction.
3
+ */
4
+ type SourceType = "grantsgov" | "calgrants" | "samgov" | "bespoke"; // Closed set of opportunity source identifiers; optional on OpportunityInput and returned by resolveSourceType.
5
+ interface LLMMessage { // One chat message in the array handed to an LLMClient.
6
+ role: "system" | "user" | "assistant"; // Speaker of the message; limited to these three literals.
7
+ content: string; // Message text.
8
+ }
9
+ /** Generic chat completion; plug in OpenAI, Anthropic, etc. Receives an LLMMessage array and resolves to a single string (presumably the model's reply text; confirm against the implementation). */
10
+ type LLMClient = (messages: LLMMessage[]) => Promise<string>;
11
+ interface FetchedDocument { // Value a DocumentFetcher resolves to for a fetched document.
12
+ /** Original request URL */
13
+ url: string;
14
+ /** Final URL after redirects */
15
+ finalUrl: string;
16
+ /** Extracted plain text (HTML stripped, PDF parsed) */
17
+ text: string;
18
+ mimeType: string; // Required. NOTE(review): the declaration does not say whether this is the response header value or a detected type; confirm.
19
+ filename?: string; // Optional; may be absent.
20
+ }
21
+ type DocumentFetcher = (url: string) => Promise<FetchedDocument | null>; // Caller-supplied fetcher; resolves to a FetchedDocument or null (presumably null when the URL cannot be fetched or parsed; confirm).
22
+ /**
23
+ * Fetch raw HTML for a page (needed for link discovery). Receives the page URL and resolves to the HTML as a string.
24
+ * If omitted, the pipeline uses a built-in fetch with a browser-like User-Agent. NOTE(review): defaultFetchHtml has this same signature and is presumably that built-in; confirm.
25
+ */
26
+ type HtmlFetcher = (url: string) => Promise<string>;
27
+ interface OpportunityInput { // Describes the opportunity page passed as `input` to extractOpportunitySections.
28
+ pageUrl: string; // Required URL of the opportunity page.
29
+ sourceType?: SourceType; // Optional source hint. NOTE(review): presumably derived from pageUrl when omitted (see resolveSourceType); confirm.
30
+ /** When set, skips fetching the opportunity page */
31
+ html?: string;
32
+ }
33
+ interface OpportunityExtractionDeps { // Dependencies passed as `deps` to extractOpportunitySections; only llm and fetchDocument are required.
34
+ llm: LLMClient; // Required chat-completion callback.
35
+ /** Fetch and parse instruction documents (PDF/HTML → text). */
36
+ fetchDocument: DocumentFetcher;
37
+ /**
38
+ * Optional: fetch raw HTML for the opportunity listing page.
39
+ * Used when `html` is not provided on the input.
40
+ */
41
+ fetchHtml?: HtmlFetcher;
42
+ /** Grants.gov Simpler API key (optional; enables API-first discovery). */
43
+ grantsGovApiKey?: string;
44
+ maxInstructionUrls?: number; // Optional. NOTE(review): presumably caps how many instruction URLs are tried; the default is not visible here.
45
+ /** Override default chunking for Phase 1 section synthesis. Same signature as defaultSplitIntoChunks. */
46
+ splitIntoChunks?: (text: string) => Promise<string[]>;
47
+ }
48
+ interface NormalizedGrantSection { // Element type of SectionExtractionResult.sections. NOTE(review): field-for-field identical to MappedAppSection.
49
+ id: string;
50
+ title: string;
51
+ description: string;
52
+ requirements: string[];
53
+ wordLimit: string | number; // Either a string or a number. NOTE(review): the format and units of the string form are not specified here; confirm.
54
+ required: boolean;
55
+ }
56
+ interface InstructionUrlAttempt { // Element type of SectionExtractionResult.instructionUrlsTried; only url is required.
57
+ url: string;
58
+ confidence?: string; // Plain string here, whereas InstructionUrlCandidate.confidence is "high" | "medium" | "low".
59
+ reason?: string;
60
+ fetched?: boolean; // NOTE(review): presumably true when the document at url was fetched successfully; confirm.
61
+ }
62
+ interface SectionExtractionArtifacts { // Auxiliary output returned alongside sections by the synthesis functions and in SectionExtractionResult.artifacts; every field is optional.
63
+ analysis?: string;
64
+ eligibilitySection?: string;
65
+ methodOfApplication?: string;
66
+ websitesToRegister?: string[];
67
+ applicationGuides?: string[];
68
+ requiredItems?: string[];
69
+ keyInformation?: string;
70
+ }
71
+ interface SectionExtractionResult { // Value that extractOpportunitySections resolves to.
72
+ success: boolean;
73
+ sections: NormalizedGrantSection[];
74
+ artifacts: SectionExtractionArtifacts;
75
+ instructionUrlsTried: InstructionUrlAttempt[];
76
+ warnings: string[];
77
+ error?: string; // Optional. NOTE(review): presumably populated when success is false; confirm.
78
+ }
79
+
80
+ /**
81
+ * Resolves source type from URL when not explicitly provided (matches handler registry behavior). Always returns a SourceType. NOTE(review): presumably `hint` is returned unchanged when supplied; confirm.
82
+ */
83
+ declare function resolveSourceType(pageUrl: string, hint?: SourceType): SourceType;
84
+
85
+ /**
86
+ * End-to-end: discover instruction URLs, fetch text, run chunk+synthesis, return normalized sections. Takes the page description (`input`) and the injected dependencies (`deps`); resolves to a SectionExtractionResult.
87
+ */
88
+ declare function extractOpportunitySections(input: OpportunityInput, deps: OpportunityExtractionDeps): Promise<SectionExtractionResult>;
89
+
90
+ /**
91
+ * Browser-like fetch for raw HTML (link discovery). Has the same signature as HtmlFetcher: takes a URL and resolves to a string.
92
+ */
93
+ declare function defaultFetchHtml(url: string): Promise<string>;
94
+
95
+ /**
96
+ * Extracts text content from HTML (strips script/style, normalizes spaces). Synchronous: takes an HTML string and returns a string.
97
+ */
98
+ declare function extractTextFromHtml(html: string): string;
99
+ /**
100
+ * Validates and resolves a URL against a base. Returns null for invalid or non-http(s) URLs. Otherwise returns the resolved URL as a string. NOTE(review): `url` presumably may be relative to `baseUrl`; confirm.
101
+ */
102
+ declare function validateAndResolveUrl(url: string, baseUrl: string): string | null;
103
+
104
+ /**
105
+ * Extracts opportunity ID from various grants.gov URL formats. Returns the ID as a string, or null (presumably when no ID can be found in `url`; confirm).
106
+ */
107
+ declare function extractOpportunityIdFromUrl(url: string): string | null;
108
+
109
+ declare function getSourceSpecificGuidance(sourceType?: SourceType): string; // Returns a guidance string for the optional source type. NOTE(review): presumably prompt text; the result when sourceType is omitted is not visible here.
110
+
111
+ interface InstructionUrlCandidate { // A candidate URL with a three-level confidence and a reason. NOTE(review): exported, but no function declared in this file takes or returns it.
112
+ url: string;
113
+ confidence: "high" | "medium" | "low";
114
+ reason: string;
115
+ }
116
+ interface FindInstructionUrlsOptions { // Options object for findApplicationInstructionUrls; pageUrl and html are required.
117
+ pageUrl: string;
118
+ html: string;
119
+ sourceType?: SourceType;
120
+ llmCaller?: (messages: Array<{ // Optional callback. NOTE(review): this inline type is structurally the same as LLMClient.
121
+ role: "system" | "user" | "assistant";
122
+ content: string;
123
+ }>) => Promise<string>;
124
+ maxCandidates?: number; // Optional. NOTE(review): presumably caps the number of URLs returned; the default is not visible here.
125
+ }
126
+ declare function getUrlDiscoveryPrompt(pageUrl: string, sourceType?: SourceType): string; // Returns a string for the page URL and optional source type. NOTE(review): presumably the URL-discovery prompt sent to the LLM; confirm.
127
+ /**
128
+ * Finds URLs that are most likely to contain full application instructions from an opportunity page. Takes a FindInstructionUrlsOptions object and resolves to an array of URL strings.
129
+ */
130
+ declare function findApplicationInstructionUrls(options: FindInstructionUrlsOptions): Promise<string[]>;
131
+
132
+ /**
133
+ * Grants.gov Simpler API – instruction URL discovery without scraping the listing page. Both `opportunityId` and `apiKey` are required strings; resolves to an array of URL strings.
134
+ */
135
+ declare function getGrantsGovInstructionUrlsFromApi(opportunityId: string, apiKey: string): Promise<string[]>;
136
+
137
+ interface GrantApplicationSection { // Section shape returned by parseSectionsFromAnalysisResponse and accepted by mapToAppSections.
138
+ name: string;
139
+ required: boolean;
140
+ pageLimit?: string; // Optional and string-typed, unlike MappedAppSection.wordLimit, which is string | number.
141
+ description: string;
142
+ requirements: string[];
143
+ }
144
+ interface MappedAppSection { // Element type returned by mapToAppSections. NOTE(review): field-for-field identical to NormalizedGrantSection.
145
+ id: string;
146
+ title: string;
147
+ description: string;
148
+ requirements: string[];
149
+ wordLimit: string | number;
150
+ required: boolean;
151
+ }
152
+ /**
153
+ * Parses Phase 2 synthesis LLM response and returns applicationSections. Synchronous; takes the raw response string. NOTE(review): behavior on malformed input (throw vs. empty array) is not visible here; confirm.
154
+ */
155
+ declare function parseSectionsFromAnalysisResponse(rawResponse: string): GrantApplicationSection[];
156
+ declare function mapToAppSections(sections: GrantApplicationSection[]): MappedAppSection[]; // Converts GrantApplicationSection[] to MappedAppSection[]. NOTE(review): the field mapping (presumably name to title and pageLimit to wordLimit) is not visible here; confirm.
157
+
158
+ /**
159
+ * Simple fixed-size chunking when no custom splitter is provided. Async: resolves to the array of chunk strings. NOTE(review): the chunk size is not visible in this declaration.
160
+ */
161
+ declare function defaultSplitIntoChunks(text: string): Promise<string[]>;
162
+ interface SynthesizeOptions { // Options for synthesizeSectionsFromDocumentText and extractGrantSectionsFromCombinedText. NOTE(review): not included in the export list.
163
+ llm: LLMClient; // Required chat-completion callback.
164
+ combinedDocumentText: string; // Required input text.
165
+ splitIntoChunks?: (text: string) => Promise<string[]>; // Optional; same signature as defaultSplitIntoChunks.
166
+ maxChunksPerRequest?: number; // Optional. NOTE(review): presumably limits chunks sent per LLM call; the default is not visible here.
167
+ }
168
+ /**
169
+ * Phase 1 + Phase 2 synthesis (matches document-upload route behavior). Resolves to an object with `rawPhase2` and `artifacts`; no parsed sections are included in this return type.
170
+ */
171
+ declare function synthesizeSectionsFromDocumentText(options: SynthesizeOptions): Promise<{
172
+ rawPhase2: string; // NOTE(review): presumably the unparsed Phase 2 LLM response; confirm.
173
+ artifacts: SectionExtractionArtifacts;
174
+ }>;
175
+ /**
176
+ * Full path: chunk analyses → Phase 2 → parsed sections + artifacts. Return type is that of synthesizeSectionsFromDocumentText plus `grantSections`.
177
+ */
178
+ declare function extractGrantSectionsFromCombinedText(options: SynthesizeOptions): Promise<{
179
+ rawPhase2: string; // NOTE(review): presumably the unparsed Phase 2 LLM response; confirm.
180
+ artifacts: SectionExtractionArtifacts;
181
+ grantSections: GrantApplicationSection[]; // The parsed sections.
182
+ }>;
183
+
184
+ type ChatCompletionsCreate = (args: { // Structural type of the `create` function that createOpenAiChatLlmClient requires at openai.chat.completions.create. NOTE(review): not included in the export list.
185
+ model: string;
186
+ messages: unknown[]; // Element type is left as unknown.
187
+ max_tokens?: number;
188
+ response_format?: {
189
+ type: "json_object"; // The only value this type allows.
190
+ };
191
+ }) => Promise<{
192
+ choices: Array<{
193
+ message?: {
194
+ content?: string | null; // May be missing or null.
195
+ };
196
+ }>;
197
+ }>;
198
+ /**
199
+ * Adapts an OpenAI client (v4) to {@link LLMClient}. Only `openai.chat.completions.create` is required of the client object.
200
+ * Use with `import OpenAI from "openai"` in the host application. `options`, `options.model` and `options.maxTokens` are all optional; their defaults are not visible in this declaration.
201
+ */
202
+ declare function createOpenAiChatLlmClient(openai: {
203
+ chat: {
204
+ completions: {
205
+ create: ChatCompletionsCreate;
206
+ };
207
+ };
208
+ }, options?: {
209
+ model?: string;
210
+ maxTokens?: number; // NOTE(review): presumably forwarded as max_tokens; confirm.
211
+ }): LLMClient;
212
+
213
+ export { type DocumentFetcher, type FetchedDocument, type FindInstructionUrlsOptions, type GrantApplicationSection, type HtmlFetcher, type InstructionUrlAttempt, type InstructionUrlCandidate, type LLMClient, type LLMMessage, type MappedAppSection, type NormalizedGrantSection, type OpportunityExtractionDeps, type OpportunityInput, type SectionExtractionArtifacts, type SectionExtractionResult, type SourceType, createOpenAiChatLlmClient, defaultFetchHtml, defaultSplitIntoChunks, extractGrantSectionsFromCombinedText, extractOpportunityIdFromUrl, extractOpportunitySections, extractTextFromHtml, findApplicationInstructionUrls, getGrantsGovInstructionUrlsFromApi, getSourceSpecificGuidance, getUrlDiscoveryPrompt, mapToAppSections, parseSectionsFromAnalysisResponse, resolveSourceType, synthesizeSectionsFromDocumentText, validateAndResolveUrl }; // NOTE(review): SynthesizeOptions and ChatCompletionsCreate appear in exported function signatures but are not exported themselves.