@dromney/mapthis 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +56 -0
- package/dist/ai/index.cjs +474 -0
- package/dist/ai/index.cjs.map +1 -0
- package/dist/ai/index.d.cts +117 -0
- package/dist/ai/index.d.ts +117 -0
- package/dist/ai/index.js +447 -0
- package/dist/ai/index.js.map +1 -0
- package/dist/domain-CZ-L-ntu.d.ts +163 -0
- package/dist/domain-Dc1wSTkf.d.cts +163 -0
- package/dist/errors-Bw97z_4m.d.cts +12 -0
- package/dist/errors-Bw97z_4m.d.ts +12 -0
- package/dist/generate/index.cjs +222 -0
- package/dist/generate/index.cjs.map +1 -0
- package/dist/generate/index.d.cts +140 -0
- package/dist/generate/index.d.ts +140 -0
- package/dist/generate/index.js +220 -0
- package/dist/generate/index.js.map +1 -0
- package/dist/geocoding/index.cjs +90 -0
- package/dist/geocoding/index.cjs.map +1 -0
- package/dist/geocoding/index.d.cts +36 -0
- package/dist/geocoding/index.d.ts +36 -0
- package/dist/geocoding/index.js +86 -0
- package/dist/geocoding/index.js.map +1 -0
- package/dist/index.cjs +546 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +5 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +469 -0
- package/dist/index.js.map +1 -0
- package/dist/parser-CzXzpmVv.d.cts +111 -0
- package/dist/parser-N7-fNxeu.d.ts +111 -0
- package/dist/react/index.cjs +394 -0
- package/dist/react/index.cjs.map +1 -0
- package/dist/react/index.js +383 -0
- package/dist/react/index.js.map +1 -0
- package/dist/schemas-Dy5coqXo.d.cts +484 -0
- package/dist/schemas-Dy5coqXo.d.ts +484 -0
- package/dist/scrape/index.cjs +133 -0
- package/dist/scrape/index.cjs.map +1 -0
- package/dist/scrape/index.d.cts +60 -0
- package/dist/scrape/index.d.ts +60 -0
- package/dist/scrape/index.js +125 -0
- package/dist/scrape/index.js.map +1 -0
- package/dist/search/index.cjs +76 -0
- package/dist/search/index.cjs.map +1 -0
- package/dist/search/index.d.cts +75 -0
- package/dist/search/index.d.ts +75 -0
- package/dist/search/index.js +71 -0
- package/dist/search/index.js.map +1 -0
- package/dist/types/index.cjs +215 -0
- package/dist/types/index.cjs.map +1 -0
- package/dist/types/index.d.cts +4 -0
- package/dist/types/index.d.ts +4 -0
- package/dist/types/index.js +171 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types-BhqKlq0k.d.ts +31 -0
- package/dist/types-rFjK5YcJ.d.cts +31 -0
- package/dist/utils/index.cjs +335 -0
- package/dist/utils/index.cjs.map +1 -0
- package/dist/utils/index.d.cts +363 -0
- package/dist/utils/index.d.ts +363 -0
- package/dist/utils/index.js +301 -0
- package/dist/utils/index.js.map +1 -0
- package/package.json +150 -0
|
@@ -0,0 +1,474 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var zod = require('zod');
|
|
4
|
+
var htmlToText$1 = require('html-to-text');
|
|
5
|
+
var jsTiktoken = require('js-tiktoken');
|
|
6
|
+
var OpenAI = require('openai');
|
|
7
|
+
|
|
8
|
+
/**
 * Bundler interop helper: returns `e` unchanged when it is flagged as an ES
 * module (`__esModule` truthy), otherwise wraps it as `{ default: e }`.
 * Falsy inputs are wrapped too.
 */
function _interopDefault(e) {
  const flaggedAsEsModule = Boolean(e && e.__esModule);
  if (flaggedAsEsModule) {
    return e;
  }
  return { default: e };
}
|
|
9
|
+
|
|
10
|
+
var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
|
|
11
|
+
|
|
12
|
+
// src/types/errors.ts
|
|
13
|
+
/**
 * Root error type of this module; `AiError` and `ScrapeError` extend it.
 * Forwards `message` and `options` (e.g. `{ cause }`) to `Error` and sets
 * `name` to "MapthisError".
 */
var MapthisError = class extends Error {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "MapthisError";
  }
};
|
|
19
|
+
|
|
20
|
+
// src/ai/errors.ts
|
|
21
|
+
/**
 * Base class for the AI-related errors defined in this file.
 * Sets `name` to "AiError"; `message` and `options` go to `MapthisError`.
 */
var AiError = class extends MapthisError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "AiError";
  }
};
|
|
27
|
+
/**
 * Raised when location extraction yields no locations.
 * When `message` is null or undefined it defaults to
 * "No locations found in input".
 */
var NoLocationsFoundError = class extends AiError {
  constructor(msg, opts) {
    const text = msg ?? "No locations found in input";
    super(text, opts);
    this.name = "NoLocationsFoundError";
  }
};
|
|
33
|
+
/**
 * Raised by `summarizeText` when summarization fails; the underlying error,
 * when there is one, is attached as `cause`.
 */
var SummarizeTextError = class extends AiError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "SummarizeTextError";
  }
};
|
|
39
|
+
/**
 * Raised when the OpenAI API reports an error whose message mentions
 * "invalid schema" for the function-call schema.
 */
var InvalidJsonSchemaError = class extends AiError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "InvalidJsonSchemaError";
  }
};
|
|
45
|
+
/**
 * Raised when the model response has no function call, or when the
 * function-call arguments cannot be parsed as JSON.
 */
var AiResponseJsonError = class extends AiError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "AiResponseJsonError";
  }
};
|
|
51
|
+
/**
 * Raised when the OpenAI API rejects a request with a message mentioning
 * "maximum context", i.e. the input was too long for the model.
 */
var AiInputLengthError = class extends AiError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "AiInputLengthError";
  }
};
|
|
57
|
+
/**
 * Raised when the model's JSON output appears to have been cut off before
 * it was complete.
 */
var AiOutputLengthError = class extends AiError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "AiOutputLengthError";
  }
};
|
|
63
|
+
|
|
64
|
+
// src/ai/prompts.ts
|
|
65
|
+
// Version tag reported alongside results produced with the location prompt.
var LOCATIONS_PROMPT_VERSION = "0.0.1";

// Name of the function the model is forced to call for location extraction.
var LOCATIONS_FUNCTIONS_NAME = "get_places";

// System message for the location-extraction call.
var LOCATIONS_SYSTEM_MESSAGE = `You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.
If the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.
For example, for an article about places in greece, you would not return "greece", but if the Acropolis was mentioned, you would return "Acropolis, Athens, Greece"

You ignore extraneous text, such as recommended articles, advertisments, etc.
`;

// JSON schema for the function-call arguments: a title plus a list of
// { address, description } entries. Key order is kept stable because the
// schema is serialized with JSON.stringify elsewhere in this file.
var LOCATIONS_SCHEMA = {
  type: "object",
  required: ["locations", "title"],
  properties: {
    locations: {
      type: "array",
      items: {
        type: "object",
        required: ["address", "description"],
        properties: {
          address: { type: "string" },
          description: { type: "string" }
        }
      }
    },
    title: { type: "string" }
  }
};

// Output-token budget reserved for the location-extraction response.
var LOCATIONS_MAX_OUTPUT_TOKENS = 500;

// Version tag reported alongside results produced with the summary prompt.
var SUMMARIZE_PROMPT_VERSION = "0.0.1";

// Summarization prompt; {RATIO} and {EXAMPLE} are substituted by
// summarizeText before the text chunk is appended.
var SUMMARIZATION_PROMPT = `Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.
For example, text of 1000 characters must output a summary less than {EXAMPLE} characters.
This summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.
Here is the text:
`;
|
|
98
|
+
|
|
99
|
+
// src/scrape/errors.ts
|
|
100
|
+
/**
 * Base class for scraping errors; also thrown directly by `getHtmlFromUrl`
 * for non-OK HTTP responses other than 401/403.
 */
var ScrapeError = class extends MapthisError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "ScrapeError";
  }
};
|
|
106
|
+
/**
 * Raised when a URL fails validation, or when `fetch` throws a `TypeError`
 * for it.
 */
var InvalidUrlError = class extends ScrapeError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "InvalidUrlError";
  }
};
|
|
112
|
+
/**
 * Raised when the target site answers with HTTP 401 or 403.
 */
var HtmlUnauthorizedError = class extends ScrapeError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "HtmlUnauthorizedError";
  }
};
|
|
118
|
+
/**
 * Raised by `htmlToText` when converting HTML to plain text fails.
 */
var HtmlToTextError = class extends ScrapeError {
  constructor(msg, opts) {
    super(msg, opts);
    this.name = "HtmlToTextError";
  }
};
|
|
124
|
+
|
|
125
|
+
// src/scrape/html.ts
|
|
126
|
+
/**
 * Ensures a URL string carries an explicit HTTP(S) scheme.
 *
 * @param url URL-like string, with or without a leading scheme.
 * @returns `url` unchanged when it already starts with `http://` or
 *   `https://` (case-insensitive); otherwise `url` prefixed with `https://`.
 */
var withHttps = (url) => {
  // One case-insensitive test covers both schemes. The previous separate
  // "http" check used `http?`, which also accepted the malformed "htt://".
  const hasScheme = /^https?:\/\//i.test(url);
  return hasScheme ? url : `https://${url}`;
};
|
|
132
|
+
/**
 * Downloads the document at `url` and returns its body as text.
 *
 * @param url Address to fetch; `https://` is prepended when no HTTP(S)
 *   scheme is present.
 * @returns The response body text.
 * @throws InvalidUrlError when the URL fails validation or `fetch` throws a
 *   `TypeError`.
 * @throws HtmlUnauthorizedError when the response status is 401 or 403.
 * @throws ScrapeError for any other non-OK response.
 */
async function getHtmlFromUrl(url) {
  // Keep the caller's casing: URL paths and query strings are case-sensitive,
  // and withHttps() already matches the scheme case-insensitively.
  const urlFixed = withHttps(url);
  const isUrl = zod.z.string().url().safeParse(urlFixed);
  if (!isUrl.success) throw new InvalidUrlError(`Invalid URL: ${url}`);
  try {
    const response = await fetch(urlFixed);
    const html = await response.text();
    if (response.status === 403 || response.status === 401) {
      throw new HtmlUnauthorizedError(
        `${response.status} Not authorized to scrape this website`
      );
    }
    if (!response.ok) {
      throw new ScrapeError(
        `Bad response when getting HTML. Status: ${response.status} ${response.statusText}`
      );
    }
    return html;
  } catch (error) {
    if (error instanceof TypeError) {
      // Preserve the original failure for debugging.
      throw new InvalidUrlError(`Invalid URL. Original error: ${error.message}`, {
        cause: error
      });
    }
    throw error;
  }
}
|
|
157
|
+
// Selectors whose content htmlToText drops (each is mapped to the "skip"
// format). Order is preserved.
var SKIP_SELECTORS = [
  // Media, page chrome and interactive elements
  "img", "header", "footer", "audio", "button", "canvas", "code",
  // Navigation
  "nav", "#nav",
  // Figures
  "figure", "figcaption",
  // Comment sections
  ".comment", ".comments", "#comments",
  // Related-content blocks
  "#related-posts", "#related", ".related", ".related-posts"
];
|
|
177
|
+
/**
 * Converts an HTML document to plain text.
 *
 * Link targets are omitted, every selector in SKIP_SELECTORS is skipped, and
 * the result is normalized: CRLF becomes LF, runs of three or more newlines
 * become a single blank line, and runs of spaces become one space.
 *
 * @param html HTML source.
 * @returns The normalized plain text.
 * @throws HtmlToTextError when the conversion fails; the original error, if
 *   any, is attached as `cause`.
 */
function htmlToText(html) {
  try {
    const converted = htmlToText$1.convert(html, {
      wordwrap: false,
      selectors: [
        {
          selector: "a",
          options: {
            ignoreHref: true,
            hideLinkHrefIfSameAsText: true
          }
        },
        ...SKIP_SELECTORS.map((tag) => ({ selector: tag, format: "skip" }))
      ]
    });
    // A single regex pass per rule handles runs of any length, replacing
    // the fixed number of repeated literal replacements.
    return converted
      .replace(/\r\n/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/ {2,}/g, " ");
  } catch (error) {
    if (error instanceof Error) {
      throw new HtmlToTextError(`HTML to text conversion failed: ${error.message}`, {
        cause: error
      });
    }
    throw new HtmlToTextError("HTML to text conversion failed");
  }
}
|
|
207
|
+
|
|
208
|
+
// src/ai/summarize.ts
|
|
209
|
+
/**
 * Summarizes `text` in chunks so the combined summary, a follow-up prompt
 * and its output fit within the backend's token limit.
 *
 * @param backend Object providing `tokenLimit`, `countTokens(text)` and
 *   `chatCompletion(content)`.
 * @param text Text to summarize.
 * @param finalPrompt Prompt of the follow-up call; only its token count is used.
 * @param finalOutputTokens Tokens reserved for the follow-up call's output.
 * @param chunkOutputSafetyFactor Fraction withheld from each chunk's output budget.
 * @param tokenLimitSafetyFactor Fraction withheld from the backend token limit.
 * @returns The original text and its token count, the joined summary and its
 *   token count, summed input/output token usage, the number of chunks, and
 *   the summarize prompt version.
 * @throws SummarizeTextError when the token budget leaves no room for input
 *   or when a summarization request fails.
 */
async function summarizeText(backend, text, finalPrompt = "", finalOutputTokens = 0, chunkOutputSafetyFactor = 0.3, tokenLimitSafetyFactor = 0.02) {
  const tokenLimit = backend.tokenLimit * (1 - tokenLimitSafetyFactor);
  const textTokens = backend.countTokens(text);
  const summarizePromptTokens = backend.countTokens(SUMMARIZATION_PROMPT);
  const finalPromptTokens = backend.countTokens(finalPrompt);
  const finalInputTokens = tokenLimit - finalOutputTokens;
  const summaryOutputTokens = finalInputTokens - finalPromptTokens;
  const numChunks = Math.max(
    1,
    Math.ceil((textTokens + summaryOutputTokens) / (tokenLimit - summarizePromptTokens))
  );
  const chunkOutputTokens = Math.floor(
    summaryOutputTokens / numChunks * (1 - chunkOutputSafetyFactor)
  );
  const chunkInputTokens = Math.floor(tokenLimit - summarizePromptTokens - chunkOutputTokens);
  // Without a positive input budget the splitter below would fall back to
  // one request per character; fail fast instead (also catches NaN).
  if (!(chunkInputTokens > 0)) {
    throw new SummarizeTextError(
      `Token limit too small to summarize text (chunk input budget: ${chunkInputTokens})`
    );
  }
  // The prompt speaks of a *reduction* factor, i.e. input size / output size.
  // Round up to one decimal (asks for slightly more compression) and never
  // go below 1.
  const reductionFactor = Math.max(
    1,
    Math.ceil(chunkInputTokens / Math.max(1, chunkOutputTokens) * 10) / 10
  );
  const chunks = [];
  let currentChunk = "";
  let currentChunkTokens = 0;
  const nextChunk = () => {
    if (currentChunk) chunks.push(currentChunk);
    currentChunk = "";
    currentChunkTokens = 0;
  };
  for (const line of text.split("\n")) {
    const lineTokens = backend.countTokens(line);
    // +1 accounts for the newline re-added when lines are joined.
    const numLineTokens = lineTokens + 1;
    if (numLineTokens > chunkInputTokens) {
      // A single line exceeds the budget: flush what we have and slice the
      // line by an estimated characters-per-token ratio.
      nextChunk();
      const charsPerToken = line.length / Math.max(1, lineTokens);
      const charsPerChunk = Math.max(1, Math.floor(chunkInputTokens * charsPerToken));
      for (let i = 0; i < line.length; i += charsPerChunk) {
        chunks.push(line.slice(i, i + charsPerChunk));
      }
      continue;
    }
    if (currentChunkTokens + numLineTokens > chunkInputTokens) {
      nextChunk();
    }
    currentChunkTokens += numLineTokens;
    currentChunk += line + "\n";
  }
  nextChunk();
  try {
    const ratioText = reductionFactor.toFixed(1);
    const exampleText = Math.floor(1e3 / reductionFactor).toFixed(0);
    const summaryPrompt = SUMMARIZATION_PROMPT.replace("{RATIO}", ratioText).replace(
      "{EXAMPLE}",
      exampleText
    );
    let inputTokens = 0;
    let outputTokens = 0;
    const summaries = await Promise.all(
      chunks.map(async (chunk) => {
        const response = await backend.chatCompletion(summaryPrompt + chunk);
        inputTokens += response.inputTokens;
        outputTokens += response.outputTokens;
        return response.output;
      })
    );
    const summarized = summaries.join("\n");
    const summarizedTokens = backend.countTokens(summarized);
    return {
      text,
      textTokens,
      summarized,
      summarizedTokens,
      inputTokens,
      outputTokens,
      chunks: chunks.length,
      summarizePromptVersion: SUMMARIZE_PROMPT_VERSION
    };
  } catch (error) {
    if (error instanceof Error) {
      throw new SummarizeTextError(`Failed to summarize text: ${error.message}`, {
        cause: error
      });
    }
    throw new SummarizeTextError("Failed to summarize text (unknown error)");
  }
}
|
|
290
|
+
|
|
291
|
+
// src/ai/locations.ts
|
|
292
|
+
// Concatenation of everything sent with the location-extraction call besides
// the text itself; passed to summarizeText as `finalPrompt`.
var FINAL_PROMPT = [
  LOCATIONS_SYSTEM_MESSAGE,
  JSON.stringify(LOCATIONS_SCHEMA),
  LOCATIONS_FUNCTIONS_NAME
].join("");
|
|
293
|
+
/**
 * Asks the backend to extract locations from `text` and removes duplicate
 * addresses, keeping the first occurrence of each.
 *
 * @param backend Object providing `chatFunctionJson(system, user, schema, name)`.
 * @param text Text to extract locations from.
 * @returns The backend result with `output.locations` replaced by the
 *   de-duplicated list.
 * @throws NoLocationsFoundError when the response contains no usable locations.
 */
async function callLocationExtraction(backend, text) {
  const out = await backend.chatFunctionJson(
    LOCATIONS_SYSTEM_MESSAGE,
    text,
    LOCATIONS_SCHEMA,
    LOCATIONS_FUNCTIONS_NAME
  );
  // The model output is parsed JSON that has not been validated against the
  // schema, so tolerate a missing or non-array `locations` field.
  const rawLocations = Array.isArray(out.output?.locations) ? out.output.locations : [];
  const seen = /* @__PURE__ */ new Set();
  const unique = rawLocations.filter((loc) => {
    // Skip malformed entries: the schema requires a string address.
    if (!loc || typeof loc.address !== "string") return false;
    if (seen.has(loc.address)) return false;
    seen.add(loc.address);
    return true;
  });
  if (unique.length === 0) throw new NoLocationsFoundError();
  out.output.locations = unique;
  return out;
}
|
|
310
|
+
/**
 * Summarizes `text`, extracts locations from the summary, and returns the
 * extraction output together with token accounting for both steps.
 *
 * @param backend Backend forwarded to summarizeText and callLocationExtraction.
 * @param text Plain text to parse.
 * @returns The extraction `output`, per-step and combined token counts, the
 *   original and summarized token counts, and both prompt versions.
 */
async function parseLocationsFromText(backend, text) {
  const summary = await summarizeText(backend, text, FINAL_PROMPT, LOCATIONS_MAX_OUTPUT_TOKENS);
  const extraction = await callLocationExtraction(backend, summary.summarized);
  const summaryIn = summary.inputTokens;
  const summaryOut = summary.outputTokens;
  const locationIn = extraction.inputTokens;
  const locationOut = extraction.outputTokens;
  return {
    output: extraction.output,
    locationInputTokens: locationIn,
    locationOutputTokens: locationOut,
    summaryInputTokens: summaryIn,
    summaryOutputTokens: summaryOut,
    inputTokens: summaryIn + locationIn,
    outputTokens: summaryOut + locationOut,
    textTokens: summary.textTokens,
    summarizedTokens: summary.summarizedTokens,
    summaryPromptVersion: summary.summarizePromptVersion,
    locationPromptVersion: LOCATIONS_PROMPT_VERSION
  };
}
|
|
338
|
+
/**
 * Converts `html` to plain text with htmlToText and parses locations from it.
 */
async function parseLocationsFromHtml(backend, html) {
  const plainText = htmlToText(html);
  return parseLocationsFromText(backend, plainText);
}
|
|
341
|
+
/**
 * Fetches the page at `url` with getHtmlFromUrl and parses locations from
 * its HTML.
 */
async function parseLocationsFromUrl(backend, url) {
  return parseLocationsFromHtml(backend, await getHtmlFromUrl(url));
}
|
|
345
|
+
// Model used by createOpenAiBackend when `config.model` is not set.
var DEFAULT_MODEL = "gpt-4o-mini";

// Token limit used by createOpenAiBackend when `config.tokenLimit` is not set.
var DEFAULT_TOKEN_LIMIT = 128000;
|
|
347
|
+
/**
 * Looks up the tokenizer encoding for `model`; when the lookup throws, the
 * "cl100k_base" encoding is used instead.
 */
function resolveEncoding(model) {
  let encoding;
  try {
    encoding = jsTiktoken.encodingForModel(model);
  } catch {
    encoding = jsTiktoken.getEncoding("cl100k_base");
  }
  return encoding;
}
|
|
354
|
+
/**
 * Builds a backend object on top of the OpenAI chat completions API.
 *
 * @param config `apiKey` plus optional `model` (defaults to DEFAULT_MODEL),
 *   `tokenLimit` (defaults to DEFAULT_TOKEN_LIMIT) and `baseURL`.
 * @returns An object exposing `model`, `tokenLimit`, `countTokens`,
 *   `chatCompletion` and `chatFunctionJson`.
 */
function createOpenAiBackend(config) {
  const model = config.model ?? DEFAULT_MODEL;
  const tokenLimit = config.tokenLimit ?? DEFAULT_TOKEN_LIMIT;
  const openai = new OpenAI__default.default({
    apiKey: config.apiKey,
    // Only pass baseURL when it was provided.
    ...config.baseURL ? { baseURL: config.baseURL } : {}
  });
  const encoding = resolveEncoding(model);
  return {
    model,
    tokenLimit,
    /** Number of tokens `text` encodes to with the resolved encoding. */
    countTokens(text) {
      return encoding.encode(text).length;
    },
    /**
     * Sends `content` as a single user message and returns the first
     * choice's text plus reported token usage (0 when usage is absent).
     */
    async chatCompletion(content) {
      const response = await openai.chat.completions.create({
        messages: [{ role: "user", content }],
        model
      });
      return {
        output: response.choices[0]?.message.content ?? "",
        inputTokens: response.usage?.prompt_tokens ?? 0,
        outputTokens: response.usage?.completion_tokens ?? 0
      };
    },
    /**
     * Forces a call to `functionName` with `schema` and returns the parsed
     * JSON arguments plus token usage.
     *
     * @throws AiResponseJsonError when no function call is returned or its
     *   arguments are not valid JSON.
     * @throws AiOutputLengthError when the arguments were cut off.
     * @throws AiInputLengthError when the API reports a context overflow.
     * @throws InvalidJsonSchemaError when the API rejects the schema.
     * @throws AiError for any other failure.
     */
    async chatFunctionJson(systemMessage, userMessage, schema, functionName) {
      try {
        const completion = await openai.chat.completions.create({
          model,
          messages: [
            { role: "system", content: systemMessage },
            { role: "user", content: userMessage }
          ],
          functions: [{ name: functionName, parameters: schema }],
          function_call: { name: functionName }
        });
        const choice = completion.choices[0];
        const functionCall = choice?.message.function_call;
        if (!functionCall) {
          throw new AiResponseJsonError("No function call in OpenAI response");
        }
        const inputTokens = completion.usage?.prompt_tokens ?? 0;
        const outputTokens = completion.usage?.completion_tokens ?? 0;
        try {
          const output = JSON.parse(functionCall.arguments);
          return { output, inputTokens, outputTokens };
        } catch (error) {
          if (error instanceof SyntaxError) {
            const msg = error.message.toLowerCase();
            // finish_reason "length" is the API's own truncation signal; the
            // message checks remain as a fallback because the wording of
            // JSON.parse errors varies between engine versions.
            const truncated = choice.finish_reason === "length" || msg.includes("unexpected end of json input") || msg.includes("unterminated string in js");
            if (truncated) {
              throw new AiOutputLengthError(
                `Output tokens exceeded: ${error.message}`,
                { cause: error }
              );
            }
            throw new AiResponseJsonError(
              `Failed to parse JSON from OpenAI response: ${error.message}`,
              { cause: error }
            );
          }
          throw new AiResponseJsonError(
            "Failed to parse JSON from OpenAI response (unknown reason)"
          );
        }
      } catch (error) {
        // Errors already classified above pass through untouched.
        if (error instanceof AiError) throw error;
        if (error instanceof OpenAI__default.default.APIError) {
          const msg = error.message.toLowerCase();
          if (msg.includes("maximum context")) {
            throw new AiInputLengthError(`Text too long: ${error.message}`, {
              cause: error
            });
          }
          if (msg.includes("invalid schema")) {
            throw new InvalidJsonSchemaError(`Invalid schema: ${error.message}`, {
              cause: error
            });
          }
          throw new AiError(`OpenAI error: ${error.message}`, { cause: error });
        }
        throw new AiError("OpenAI error (unknown)", {
          cause: error instanceof Error ? error : void 0
        });
      }
    }
  };
}
|
|
440
|
+
|
|
441
|
+
// src/ai/parser.ts
|
|
442
|
+
/**
 * Creates an OpenAI backend from `config` and returns the three parse
 * functions pre-bound to it, so callers pass only the input.
 */
function createLocationParser(config) {
  const backend = createOpenAiBackend(config);
  const parser = {
    parseLocationsFromText: (text) => parseLocationsFromText(backend, text),
    parseLocationsFromHtml: (html) => parseLocationsFromHtml(backend, html),
    parseLocationsFromUrl: (url) => parseLocationsFromUrl(backend, url)
  };
  return parser;
}
|
|
450
|
+
|
|
451
|
+
// Public API of the ai entry point. Kept as plain `exports.X = X`
// assignments so static CommonJS export detection keeps working.
exports.AiError = AiError;
exports.AiInputLengthError = AiInputLengthError;
exports.AiOutputLengthError = AiOutputLengthError;
exports.AiResponseJsonError = AiResponseJsonError;
exports.DEFAULT_MODEL = DEFAULT_MODEL;
exports.DEFAULT_TOKEN_LIMIT = DEFAULT_TOKEN_LIMIT;
exports.InvalidJsonSchemaError = InvalidJsonSchemaError;
exports.LOCATIONS_FUNCTIONS_NAME = LOCATIONS_FUNCTIONS_NAME;
exports.LOCATIONS_MAX_OUTPUT_TOKENS = LOCATIONS_MAX_OUTPUT_TOKENS;
exports.LOCATIONS_PROMPT_VERSION = LOCATIONS_PROMPT_VERSION;
exports.LOCATIONS_SCHEMA = LOCATIONS_SCHEMA;
exports.LOCATIONS_SYSTEM_MESSAGE = LOCATIONS_SYSTEM_MESSAGE;
exports.NoLocationsFoundError = NoLocationsFoundError;
exports.SUMMARIZATION_PROMPT = SUMMARIZATION_PROMPT;
exports.SUMMARIZE_PROMPT_VERSION = SUMMARIZE_PROMPT_VERSION;
exports.SummarizeTextError = SummarizeTextError;
exports.createLocationParser = createLocationParser;
exports.createOpenAiBackend = createOpenAiBackend;
exports.parseLocationsFromHtml = parseLocationsFromHtml;
exports.parseLocationsFromText = parseLocationsFromText;
exports.parseLocationsFromUrl = parseLocationsFromUrl;
exports.summarizeText = summarizeText;
|
|
473
|
+
//# sourceMappingURL=index.cjs.map
|
|
474
|
+
//# sourceMappingURL=index.cjs.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../../src/types/errors.ts","../../src/ai/errors.ts","../../src/ai/prompts.ts","../../src/scrape/errors.ts","../../src/scrape/html.ts","../../src/scrape/text.ts","../../src/ai/summarize.ts","../../src/ai/locations.ts","../../src/ai/openai.ts","../../src/ai/parser.ts"],"names":["z","convert","encodingForModel","getEncoding","OpenAI"],"mappings":";;;;;;;;;;;;AAOO,IAAM,YAAA,GAAN,cAA2B,KAAA,CAAM;AAAA,EACpC,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,cAAA;AAAA,EAChB;AACJ,CAAA;;;ACVO,IAAM,OAAA,GAAN,cAAsB,YAAA,CAAa;AAAA,EACtC,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,SAAA;AAAA,EAChB;AACJ;AAKO,IAAM,qBAAA,GAAN,cAAoC,OAAA,CAAQ;AAAA,EAC/C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,OAAA,IAAW,+BAA+B,OAAO,CAAA;AACvD,IAAA,IAAA,CAAK,IAAA,GAAO,uBAAA;AAAA,EAChB;AACJ;AAKO,IAAM,kBAAA,GAAN,cAAiC,OAAA,CAAQ;AAAA,EAC5C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,oBAAA;AAAA,EAChB;AACJ;AAKO,IAAM,sBAAA,GAAN,cAAqC,OAAA,CAAQ;AAAA,EAChD,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,wBAAA;AAAA,EAChB;AACJ;AAMO,IAAM,mBAAA,GAAN,cAAkC,OAAA,CAAQ;AAAA,EAC7C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,qBAAA;AAAA,EAChB;AACJ;AAKO,IAAM,kBAAA,GAAN,cAAiC,OAAA,CAAQ;AAAA,EAC5C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,oBAAA;AAAA,EAChB;AACJ;AAKO,IAAM,mBAAA,GAAN,cAAkC,OAAA,CAAQ;AAAA,EAC7C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,qBAAA;AAAA,EAChB;AACJ;;;AC5DO,IAAM,wBAAA,GAA2B;AAEjC,IAAM,wBAAA,GAA2B;AAEjC,IAAM,wBAAA,GAA2B,CAAA;AAAA;AAAA;;AAAA;AAAA;AAOjC,IAAM,gBAAA,GAAsC;AAAA,EAC/C,IAAA,EAAM,QAAA;AAAA,EACN,QAAA,EAAU,CAAC,WAAA,EAAa,OAAO,CAAA;AAAA,EAC/B,UAAA,EAAY;AAAA,IACR,SAAA,EAAW;AAAA,MACP,IAAA,EAAM,OAAA;AAAA,MACN,KAAA,EAAO;AAAA,QACH,IAA
A,EAAM,QAAA;AAAA,QACN,QAAA,EAAU,CAAC,SAAA,EAAW,aAAa,CAAA;AAAA,QACnC,UAAA,EAAY;AAAA,UACR,OAAA,EAAS,EAAE,IAAA,EAAM,QAAA,EAAS;AAAA,UAC1B,WAAA,EAAa,EAAE,IAAA,EAAM,QAAA;AAAS;AAClC;AACJ,KACJ;AAAA,IACA,KAAA,EAAO,EAAE,IAAA,EAAM,QAAA;AAAS;AAEhC;AAEO,IAAM,2BAAA,GAA8B;AAMpC,IAAM,wBAAA,GAA2B;AAEjC,IAAM,oBAAA,GAAuB,CAAA;AAAA;AAAA;AAAA;AAAA;;;AC5C7B,IAAM,WAAA,GAAN,cAA0B,YAAA,CAAa;AAAA,EAC1C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,aAAA;AAAA,EAChB;AACJ,CAAA;AAKO,IAAM,eAAA,GAAN,cAA8B,WAAA,CAAY;AAAA,EAC7C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,iBAAA;AAAA,EAChB;AACJ,CAAA;AAMO,IAAM,qBAAA,GAAN,cAAoC,WAAA,CAAY;AAAA,EACnD,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,uBAAA;AAAA,EAChB;AACJ,CAAA;AAKO,IAAM,eAAA,GAAN,cAA8B,WAAA,CAAY;AAAA,EAC7C,WAAA,CAAY,SAAkB,OAAA,EAAwB;AAClD,IAAA,KAAA,CAAM,SAAS,OAAO,CAAA;AACtB,IAAA,IAAA,CAAK,IAAA,GAAO,iBAAA;AAAA,EAChB;AACJ,CAAA;;;ACnCA,IAAM,SAAA,GAAY,CAAC,GAAA,KAAwB;AACvC,EAAA,MAAM,OAAA,GAAU,cAAA,CAAe,IAAA,CAAK,GAAG,CAAA;AACvC,EAAA,MAAM,QAAA,GAAW,eAAA,CAAgB,IAAA,CAAK,GAAG,CAAA;AACzC,EAAA,IAAI,CAAC,OAAA,IAAW,CAAC,QAAA,EAAU,OAAO,WAAW,GAAG,CAAA,CAAA;AAChD,EAAA,OAAO,GAAA;AACX,CAAA;AAgBA,eAAsB,eAAe,GAAA,EAA8B;AAC/D,EAAA,MAAM,QAAA,GAAW,SAAA,CAAU,GAAA,CAAI,WAAA,EAAa,CAAA;AAC5C,EAAA,MAAM,QAAQA,KAAA,CAAE,MAAA,GAAS,GAAA,EAAI,CAAE,UAAU,QAAQ,CAAA;AACjD,EAAA,IAAI,CAAC,MAAM,OAAA,EAAS,MAAM,IAAI,eAAA,CAAgB,CAAA,aAAA,EAAgB,GAAG,CAAA,CAAE,CAAA;AAEnE,EAAA,IAAI;AACA,IAAA,MAAM,QAAA,GAAW,MAAM,KAAA,CAAM,QAAQ,CAAA;AACrC,IAAA,MAAM,IAAA,GAAO,MAAM,QAAA,CAAS,IAAA,EAAK;AACjC,IAAA,IAAI,QAAA,CAAS,MAAA,KAAW,GAAA,IAAO,QAAA,CAAS,WAAW,GAAA,EAAK;AACpD,MAAA,MAAM,IAAI,qBAAA;AAAA,QACN,CAAA,EAAG,SAAS,MAAM,CAAA,sCAAA;AAAA,OACtB;AAAA,IACJ;AACA,IAAA,IAAI,CAAC,SAAS,EAAA,EAAI;AACd,MAAA,MAAM,IAAI,WAAA;AAAA,QACN,CAAA,wCAAA,EAA2C,QAAA,CAAS,MAAM,CAAA,CAAA,EAAI,SAAS,UAAU,CAAA;AAAA,OACrF;AAAA,IACJ;AACA,IAAA,OAAO,IAAA;AAAA,EACX,SAAS,KAAA,EAAO;AACZ,IAAA,IA
AI,iBAAiB,SAAA,EAAW;AAC5B,MAAA,MAAM,IAAI,eAAA,CAAgB,CAAA,6BAAA,EAAgC,KAAA,CAAM,OAAO,CAAA,CAAE,CAAA;AAAA,IAC7E;AACA,IAAA,MAAM,KAAA;AAAA,EACV;AACJ;AC7CA,IAAM,cAAA,GAAiB;AAAA,EACnB,KAAA;AAAA,EACA,QAAA;AAAA,EACA,QAAA;AAAA,EACA,OAAA;AAAA,EACA,QAAA;AAAA,EACA,QAAA;AAAA,EACA,MAAA;AAAA,EACA,KAAA;AAAA,EACA,MAAA;AAAA,EACA,QAAA;AAAA,EACA,YAAA;AAAA,EACA,UAAA;AAAA,EACA,WAAA;AAAA,EACA,WAAA;AAAA,EACA,gBAAA;AAAA,EACA,UAAA;AAAA,EACA,UAAA;AAAA,EACA;AACJ,CAAA;AAYO,SAAS,WAAW,IAAA,EAAsB;AAC7C,EAAA,IAAI;AACA,IAAA,IAAI,GAAA,GAAMC,qBAAQ,IAAA,EAAM;AAAA,MACpB,QAAA,EAAU,KAAA;AAAA,MACV,SAAA,EAAW;AAAA,QACP;AAAA,UACI,QAAA,EAAU,GAAA;AAAA,UACV,OAAA,EAAS;AAAA,YACL,UAAA,EAAY,IAAA;AAAA,YACZ,wBAAA,EAA0B;AAAA;AAC9B,SACJ;AAAA,QACA,GAAG,cAAA,CAAe,GAAA,CAAI,CAAC,GAAA,MAAS,EAAE,QAAA,EAAU,GAAA,EAAK,MAAA,EAAQ,MAAA,EAAO,CAAE;AAAA;AACtE,KACH,CAAA;AACD,IAAA,GAAA,GAAM,GAAA,CAAI,UAAA,CAAW,MAAA,EAAQ,IAAI,CAAA;AAGjC,IAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,EAAA,EAAI,CAAA,EAAA,EAAK;AACzB,MAAA,GAAA,GAAM,GAAA,CAAI,UAAA,CAAW,QAAA,EAAU,MAAM,CAAA;AACrC,MAAA,GAAA,GAAM,GAAA,CAAI,UAAA,CAAW,IAAA,EAAM,GAAG,CAAA;AAAA,IAClC;AACA,IAAA,OAAO,GAAA;AAAA,EACX,SAAS,KAAA,EAAO;AACZ,IAAA,IAAI,iBAAiB,KAAA,EAAO;AACxB,MAAA,MAAM,IAAI,eAAA,CAAgB,CAAA,gCAAA,EAAmC,KAAA,CAAM,OAAO,CAAA,CAAA,EAAI;AAAA,QAC1E,KAAA,EAAO;AAAA,OACV,CAAA;AAAA,IACL;AACA,IAAA,MAAM,IAAI,gBAAgB,gCAAgC,CAAA;AAAA,EAC9D;AACJ;;;AC/BA,eAAsB,aAAA,CAClB,OAAA,EACA,IAAA,EACA,WAAA,GAAc,EAAA,EACd,oBAAoB,CAAA,EACpB,uBAAA,GAA0B,GAAA,EAC1B,sBAAA,GAAyB,IAAA,EACF;AACvB,EAAA,MAAM,UAAA,GAAa,OAAA,CAAQ,UAAA,IAAc,CAAA,GAAI,sBAAA,CAAA;AAC7C,EAAA,MAAM,UAAA,GAAa,OAAA,CAAQ,WAAA,CAAY,IAAI,CAAA;AAC3C,EAAA,MAAM,qBAAA,GAAwB,OAAA,CAAQ,WAAA,CAAY,oBAAoB,CAAA;AACtE,EAAA,MAAM,iBAAA,GAAoB,OAAA,CAAQ,WAAA,CAAY,WAAW,CAAA;AAEzD,EAAA,MAAM,mBAAmB,UAAA,GAAa,iBAAA;AACtC,EAAA,MAAM,sBAAsB,gBAAA,GAAmB,iBAAA;AAE/C,EAAA,MAAM,YAAY,IAAA,CAAK,GAAA;AAAA,IACnB,CAAA;AAAA,IACA,IAAA,CAAK,IAAA,CAAA,CAAM,UAAA,GAAa,mBAAA,KAAwB,aAAa,qBAAA,CAAsB;AAAA,GACvF;AACA,EAAA,MAAM,oBAAoB,IAAA,CAAK,KAAA;AAAA,IAC1B,mBAAA,GAAsB,aAAc,CAAA,GAAI,
uBAAA;AAAA,GAC7C;AACA,EAAA,MAAM,gBAAA,GAAmB,IAAA,CAAK,KAAA,CAAM,UAAA,GAAa,wBAAwB,iBAAiB,CAAA;AAC1F,EAAA,MAAM,eAAe,iBAAA,GAAoB,gBAAA;AAKzC,EAAA,MAAM,SAAA,GAAY,IAAA,CAAK,KAAA,CAAM,IAAI,CAAA;AACjC,EAAA,MAAM,SAAmB,EAAC;AAE1B,EAAA,IAAI,YAAA,GAAe,EAAA;AACnB,EAAA,IAAI,kBAAA,GAAqB,CAAA;AACzB,EAAA,MAAM,YAAY,MAAY;AAC1B,IAAA,IAAI,YAAA,EAAc,MAAA,CAAO,IAAA,CAAK,YAAY,CAAA;AAC1C,IAAA,YAAA,GAAe,EAAA;AACf,IAAA,kBAAA,GAAqB,CAAA;AAAA,EACzB,CAAA;AAOA,EAAA,KAAA,MAAW,QAAQ,SAAA,EAAW;AAC1B,IAAA,MAAM,aAAA,GAAgB,OAAA,CAAQ,WAAA,CAAY,IAAI,CAAA,GAAI,CAAA;AAElD,IAAA,IAAI,gBAAgB,gBAAA,EAAkB;AAGlC,MAAA,SAAA,EAAU;AACV,MAAA,MAAM,aAAA,GAAgB,KAAK,MAAA,GAAS,IAAA,CAAK,IAAI,CAAA,EAAG,OAAA,CAAQ,WAAA,CAAY,IAAI,CAAC,CAAA;AACzE,MAAA,MAAM,aAAA,GAAgB,KAAK,GAAA,CAAI,CAAA,EAAG,KAAK,KAAA,CAAM,gBAAA,GAAmB,aAAa,CAAC,CAAA;AAC9E,MAAA,KAAA,IAAS,IAAI,CAAA,EAAG,CAAA,GAAI,IAAA,CAAK,MAAA,EAAQ,KAAK,aAAA,EAAe;AACjD,QAAA,MAAA,CAAO,KAAK,IAAA,CAAK,KAAA,CAAM,CAAA,EAAG,CAAA,GAAI,aAAa,CAAC,CAAA;AAAA,MAChD;AACA,MAAA;AAAA,IACJ;AAEA,IAAA,IAAI,kBAAA,GAAqB,gBAAgB,gBAAA,EAAkB;AACvD,MAAA,SAAA,EAAU;AAAA,IACd;AACA,IAAA,kBAAA,IAAsB,aAAA;AACtB,IAAA,YAAA,IAAgB,IAAA,GAAO,IAAA;AAAA,EAC3B;AACA,EAAA,SAAA,EAAU;AAEV,EAAA,IAAI;AACA,IAAA,MAAM,SAAA,GAAA,CAAa,KAAK,IAAA,CAAK,YAAA,GAAe,EAAE,CAAA,GAAI,EAAA,EAAI,QAAQ,CAAC,CAAA;AAC/D,IAAA,MAAM,WAAA,GAAc,IAAA,CAAK,IAAA,CAAK,GAAA,GAAO,IAAA,CAAK,GAAA,CAAI,YAAA,EAAc,IAAI,CAAC,CAAA,CAAE,OAAA,CAAQ,CAAC,CAAA;AAC5E,IAAA,MAAM,aAAA,GAAgB,oBAAA,CAAqB,OAAA,CAAQ,SAAA,EAAW,SAAS,CAAA,CAAE,OAAA;AAAA,MACrE,WAAA;AAAA,MACA;AAAA,KACJ;AAEA,IAAA,IAAI,WAAA,GAAc,CAAA;AAClB,IAAA,IAAI,YAAA,GAAe,CAAA;AACnB,IAAA,MAAM,SAAA,GAAY,MAAM,OAAA,CAAQ,GAAA;AAAA,MAC5B,MAAA,CAAO,GAAA,CAAI,OAAO,KAAA,KAAU;AACxB,QAAA,MAAM,QAAA,GAAW,MAAM,OAAA,CAAQ,cAAA,CAAe,gBAAgB,KAAK,CAAA;AACnE,QAAA,WAAA,IAAe,QAAA,CAAS,WAAA;AACxB,QAAA,YAAA,IAAgB,QAAA,CAAS,YAAA;AACzB,QAAA,OAAO,QAAA,CAAS,MAAA;AAAA,MACpB,CAAC;AAAA,KACL;AAEA,IAAA,MAAM,UAAA,GAAa,SAAA,CAAU,IAAA,CAAK,IAAI,CAAA;AACtC,IAAA,MAAM,gBAAA,GAAmB,OAAA,CAAQ,WAAA,CAAY,UAAU,CAAA;AAEvD,IAAA,OAAO;AAAA,MACH,IAAA;AAAA,M
ACA,UAAA;AAAA,MACA,UAAA;AAAA,MACA,gBAAA;AAAA,MACA,WAAA;AAAA,MACA,YAAA;AAAA,MACA,QAAQ,MAAA,CAAO,MAAA;AAAA,MACf,sBAAA,EAAwB;AAAA,KAC5B;AAAA,EACJ,SAAS,KAAA,EAAO;AACZ,IAAA,IAAI,iBAAiB,KAAA,EAAO;AACxB,MAAA,MAAM,IAAI,kBAAA,CAAmB,CAAA,0BAAA,EAA6B,KAAA,CAAM,OAAO,CAAA,CAAA,EAAI;AAAA,QACvE,KAAA,EAAO;AAAA,OACV,CAAA;AAAA,IACL;AACA,IAAA,MAAM,IAAI,mBAAmB,0CAA0C,CAAA;AAAA,EAC3E;AACJ;;;AChIA,IAAM,YAAA,GAAe,wBAAA,GAA2B,IAAA,CAAK,SAAA,CAAU,gBAAgB,CAAA,GAAI,wBAAA;AAEnF,eAAe,sBAAA,CAAuB,SAAwB,IAAA,EAAc;AACxE,EAAA,MAAM,GAAA,GAAM,MAAM,OAAA,CAAQ,gBAAA;AAAA,IACtB,wBAAA;AAAA,IACA,IAAA;AAAA,IACA,gBAAA;AAAA,IACA;AAAA,GACJ;AAIA,EAAA,MAAM,IAAA,uBAAW,GAAA,EAAY;AAC7B,EAAA,MAAM,SAAS,GAAA,CAAI,MAAA,CAAO,SAAA,CAAU,MAAA,CAAO,CAAC,GAAA,KAAQ;AAChD,IAAA,IAAI,IAAA,CAAK,GAAA,CAAI,GAAA,CAAI,OAAO,GAAG,OAAO,KAAA;AAClC,IAAA,IAAA,CAAK,GAAA,CAAI,IAAI,OAAO,CAAA;AACpB,IAAA,OAAO,IAAA;AAAA,EACX,CAAC,CAAA;AACD,EAAA,IAAI,MAAA,CAAO,MAAA,KAAW,CAAA,EAAG,MAAM,IAAI,qBAAA,EAAsB;AACzD,EAAA,GAAA,CAAI,OAAO,SAAA,GAAY,MAAA;AAEvB,EAAA,OAAO,GAAA;AACX;AAMA,eAAsB,sBAAA,CAClB,SACA,IAAA,EAC6B;AAC7B,EAAA,MAAM;AAAA,IACF,UAAA;AAAA,IACA,WAAA,EAAa,KAAA;AAAA,IACb,YAAA,EAAc,MAAA;AAAA,IACd,UAAA;AAAA,IACA,gBAAA;AAAA,IACA;AAAA,MACA,MAAM,aAAA,CAAc,OAAA,EAAS,IAAA,EAAM,cAAc,2BAA2B,CAAA;AAEhF,EAAA,MAAM;AAAA,IACF,MAAA;AAAA,IACA,WAAA,EAAa,KAAA;AAAA,IACb,YAAA,EAAc;AAAA,GAClB,GAAI,MAAM,sBAAA,CAAuB,OAAA,EAAS,UAAU,CAAA;AAEpD,EAAA,OAAO;AAAA,IACH,MAAA;AAAA,IACA,mBAAA,EAAqB,KAAA;AAAA,IACrB,oBAAA,EAAsB,MAAA;AAAA,IACtB,kBAAA,EAAoB,KAAA;AAAA,IACpB,mBAAA,EAAqB,MAAA;AAAA,IACrB,aAAa,KAAA,GAAQ,KAAA;AAAA,IACrB,cAAc,MAAA,GAAS,MAAA;AAAA,IACvB,UAAA;AAAA,IACA,gBAAA;AAAA,IACA,oBAAA,EAAsB,sBAAA;AAAA,IACtB,qBAAA,EAAuB;AAAA,GAC3B;AACJ;AAMA,eAAsB,sBAAA,CAClB,SACA,IAAA,EAC6B;AAC7B,EAAA,OAAO,sBAAA,CAAuB,OAAA,EAAS,UAAA,CAAW,IAAI,CAAC,CAAA;AAC3D;AAOA,eAAsB,qBAAA,CAClB,SACA,GAAA,EAC6B;AAC7B,EAAA,MAAM,IAAA,GAAO,MAAM,cAAA,CAAe,GAAG,CAAA;AACrC,EAAA,OAAO,sBAAA,CAAuB,SAAS,IAAI,CAAA;AAC/C;ACvFO,IAAM,aAAA,GAAgB;AACtB,IAAM,mBAAA,GAAsB;AAkCnC,SAAS,gBAAgB,KAAA,EAAyB;AAC9C,EAAA,IAAI;AACA,
IAAA,OAAOC,4BAAiB,KAAsB,CAAA;AAAA,EAClD,CAAA,CAAA,MAAQ;AAEJ,IAAA,OAAOC,uBAAY,aAAa,CAAA;AAAA,EACpC;AACJ;AAKO,SAAS,oBAAoB,MAAA,EAA4C;AAC5E,EAAA,MAAM,KAAA,GAAQ,OAAO,KAAA,IAAS,aAAA;AAC9B,EAAA,MAAM,UAAA,GAAa,OAAO,UAAA,IAAc,mBAAA;AACxC,EAAA,MAAM,MAAA,GAAS,IAAIC,uBAAA,CAAO;AAAA,IACtB,QAAQ,MAAA,CAAO,MAAA;AAAA,IACf,GAAI,OAAO,OAAA,GAAU,EAAE,SAAS,MAAA,CAAO,OAAA,KAAY;AAAC,GACvD,CAAA;AACD,EAAA,MAAM,QAAA,GAAW,gBAAgB,KAAK,CAAA;AAEtC,EAAA,OAAO;AAAA,IACH,KAAA;AAAA,IACA,UAAA;AAAA,IAEA,YAAY,IAAA,EAAsB;AAC9B,MAAA,OAAO,QAAA,CAAS,MAAA,CAAO,IAAI,CAAA,CAAE,MAAA;AAAA,IACjC,CAAA;AAAA,IAEA,MAAM,eAAe,OAAA,EAAgD;AACjE,MAAA,MAAM,QAAA,GAAW,MAAM,MAAA,CAAO,IAAA,CAAK,YAAY,MAAA,CAAO;AAAA,QAClD,UAAU,CAAC,EAAE,IAAA,EAAM,MAAA,EAAQ,SAAS,CAAA;AAAA,QACpC;AAAA,OACH,CAAA;AACD,MAAA,OAAO;AAAA,QACH,QAAQ,QAAA,CAAS,OAAA,CAAQ,CAAC,CAAA,EAAG,QAAQ,OAAA,IAAW,EAAA;AAAA,QAChD,WAAA,EAAa,QAAA,CAAS,KAAA,EAAO,aAAA,IAAiB,CAAA;AAAA,QAC9C,YAAA,EAAc,QAAA,CAAS,KAAA,EAAO,iBAAA,IAAqB;AAAA,OACvD;AAAA,IACJ,CAAA;AAAA,IAEA,MAAM,gBAAA,CACF,aAAA,EACA,WAAA,EACA,QACA,YAAA,EACwB;AACxB,MAAA,IAAI;AACA,QAAA,MAAM,UAAA,GAAa,MAAM,MAAA,CAAO,IAAA,CAAK,YAAY,MAAA,CAAO;AAAA,UACpD,KAAA;AAAA,UACA,QAAA,EAAU;AAAA,YACN,EAAE,IAAA,EAAM,QAAA,EAAU,OAAA,EAAS,aAAA,EAAc;AAAA,YACzC,EAAE,IAAA,EAAM,MAAA,EAAQ,OAAA,EAAS,WAAA;AAAY,WACzC;AAAA,UACA,WAAW,CAAC,EAAE,MAAM,YAAA,EAAc,UAAA,EAAY,QAAQ,CAAA;AAAA,UACtD,aAAA,EAAe,EAAE,IAAA,EAAM,YAAA;AAAa,SACvC,CAAA;AAED,QAAA,MAAM,YAAA,GAAe,UAAA,CAAW,OAAA,CAAQ,CAAC,GAAG,OAAA,CAAQ,aAAA;AACpD,QAAA,IAAI,CAAC,YAAA,EAAc;AACf,UAAA,MAAM,IAAI,oBAAoB,qCAAqC,CAAA;AAAA,QACvE;AAEA,QAAA,MAAM,WAAA,GAAc,UAAA,CAAW,KAAA,EAAO,aAAA,IAAiB,CAAA;AACvD,QAAA,MAAM,YAAA,GAAe,UAAA,CAAW,KAAA,EAAO,iBAAA,IAAqB,CAAA;AAE5D,QAAA,IAAI;AACA,UAAA,MAAM,MAAA,GAAS,IAAA,CAAK,KAAA,CAAM,YAAA,CAAa,SAAS,CAAA;AAChD,UAAA,OAAO,EAAE,MAAA,EAAQ,WAAA,EAAa,YAAA,EAAa;AAAA,QAC/C,SAAS,KAAA,EAAO;AACZ,UAAA,IAAI,iBAAiB,WAAA,EAAa;AAC9B,YAAA,MAAM,GAAA,GAAM,KAAA,CAAM,OAAA,CAAQ,WAAA,EAAY;AACtC,YAAA,IACI,IAAI,QAAA,CAAS,8BAA8B,KAC3C,GAAA,CAAI,QAAA,CAAS,2BAA2B,CAAA,EAC1C;AACE,cAAA,MAAM,IAAI,mBAA
A;AAAA,gBACN,CAAA,wBAAA,EAA2B,MAAM,OAAO,CAAA,CAAA;AAAA,gBACxC,EAAE,OAAO,KAAA;AAAM,eACnB;AAAA,YACJ;AACA,YAAA,MAAM,IAAI,mBAAA;AAAA,cACN,CAAA,2CAAA,EAA8C,MAAM,OAAO,CAAA,CAAA;AAAA,cAC3D,EAAE,OAAO,KAAA;AAAM,aACnB;AAAA,UACJ;AACA,UAAA,MAAM,IAAI,mBAAA;AAAA,YACN;AAAA,WACJ;AAAA,QACJ;AAAA,MACJ,SAAS,KAAA,EAAO;AACZ,QAAA,IAAI,KAAA,YAAiB,SAAS,MAAM,KAAA;AACpC,QAAA,IAAI,KAAA,YAAiBA,wBAAO,QAAA,EAAU;AAClC,UAAA,MAAM,GAAA,GAAM,KAAA,CAAM,OAAA,CAAQ,WAAA,EAAY;AACtC,UAAA,IAAI,GAAA,CAAI,QAAA,CAAS,iBAAiB,CAAA,EAAG;AACjC,YAAA,MAAM,IAAI,kBAAA,CAAmB,CAAA,eAAA,EAAkB,KAAA,CAAM,OAAO,CAAA,CAAA,EAAI;AAAA,cAC5D,KAAA,EAAO;AAAA,aACV,CAAA;AAAA,UACL;AACA,UAAA,IAAI,GAAA,CAAI,QAAA,CAAS,gBAAgB,CAAA,EAAG;AAChC,YAAA,MAAM,IAAI,sBAAA,CAAuB,CAAA,gBAAA,EAAmB,KAAA,CAAM,OAAO,CAAA,CAAA,EAAI;AAAA,cACjE,KAAA,EAAO;AAAA,aACV,CAAA;AAAA,UACL;AACA,UAAA,MAAM,IAAI,QAAQ,CAAA,cAAA,EAAiB,KAAA,CAAM,OAAO,CAAA,CAAA,EAAI,EAAE,KAAA,EAAO,KAAA,EAAO,CAAA;AAAA,QACxE;AACA,QAAA,MAAM,IAAI,QAAQ,wBAAA,EAA0B;AAAA,UACxC,KAAA,EAAO,KAAA,YAAiB,KAAA,GAAQ,KAAA,GAAQ;AAAA,SAC3C,CAAA;AAAA,MACL;AAAA,IACJ;AAAA,GACJ;AACJ;;;AC5HO,SAAS,qBAAqB,MAAA,EAA8C;AAC/E,EAAA,MAAM,OAAA,GAAU,oBAAoB,MAAM,CAAA;AAC1C,EAAA,OAAO;AAAA,IACH,sBAAA,EAAwB,CAAC,IAAA,KAAS,sBAAA,CAAuB,SAAS,IAAI,CAAA;AAAA,IACtE,sBAAA,EAAwB,CAAC,IAAA,KAAS,sBAAA,CAAuB,SAAS,IAAI,CAAA;AAAA,IACtE,qBAAA,EAAuB,CAAC,GAAA,KAAQ,qBAAA,CAAsB,SAAS,GAAG;AAAA,GACtE;AACJ","file":"index.cjs","sourcesContent":["/**\n * Base error class for every error thrown by the mapthis package.\n *\n * All feature-specific error classes (scrape, search, ai, geocoding) extend\n * this, so consumers can do `catch (e) { if (e instanceof MapthisError) ... 
}`\n * to distinguish library errors from other exceptions.\n */\nexport class MapthisError extends Error {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"MapthisError\"\n }\n}\n","import { MapthisError } from \"../types/errors\"\n\nexport class AiError extends MapthisError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"AiError\"\n }\n}\n\n/**\n * The LLM returned zero locations after deduplication.\n */\nexport class NoLocationsFoundError extends AiError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message ?? \"No locations found in input\", options)\n this.name = \"NoLocationsFoundError\"\n }\n}\n\n/**\n * Summarization pipeline failed (one of the chunk calls rejected).\n */\nexport class SummarizeTextError extends AiError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"SummarizeTextError\"\n }\n}\n\n/**\n * A JSON schema passed to the LLM was rejected as invalid by the provider.\n */\nexport class InvalidJsonSchemaError extends AiError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"InvalidJsonSchemaError\"\n }\n}\n\n/**\n * The LLM response could not be parsed as JSON (typically because the output\n * was truncated or malformed).\n */\nexport class AiResponseJsonError extends AiError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"AiResponseJsonError\"\n }\n}\n\n/**\n * Input text exceeded the model's context window.\n */\nexport class AiInputLengthError extends AiError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"AiInputLengthError\"\n }\n}\n\n/**\n * The LLM hit the output token limit before completing its JSON response.\n */\nexport class AiOutputLengthError extends AiError {\n constructor(message?: string, 
options?: ErrorOptions) {\n super(message, options)\n this.name = \"AiOutputLengthError\"\n }\n}\n","import type { JSONSchema7Object } from \"json-schema\"\n\n/**\n * Semantic version of the location-extraction system prompt. Stored alongside\n * each place record in the private app so results can be re-run when the\n * prompt is updated. Bump when you change {@link LOCATIONS_SYSTEM_MESSAGE} or\n * {@link LOCATIONS_SCHEMA} in a way that meaningfully changes output.\n */\nexport const LOCATIONS_PROMPT_VERSION = \"0.0.1\"\n\nexport const LOCATIONS_FUNCTIONS_NAME = \"get_places\"\n\nexport const LOCATIONS_SYSTEM_MESSAGE = `You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.\nIf the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.\nFor example, for an article about places in greece, you would not return \"greece\", but if the Acropolis was mentioned, you would return \"Acropolis, Athens, Greece\"\n\nYou ignore extraneous text, such as recommended articles, advertisments, etc.\n`\n\nexport const LOCATIONS_SCHEMA: JSONSchema7Object = {\n type: \"object\",\n required: [\"locations\", \"title\"],\n properties: {\n locations: {\n type: \"array\",\n items: {\n type: \"object\",\n required: [\"address\", \"description\"],\n properties: {\n address: { type: \"string\" },\n description: { type: \"string\" },\n },\n },\n },\n title: { type: \"string\" },\n },\n}\n\nexport const LOCATIONS_MAX_OUTPUT_TOKENS = 500\n\n/**\n * Semantic version of the summarization prompt. Bump when meaningful changes\n * are made to {@link SUMMARIZATION_PROMPT}.\n */\nexport const SUMMARIZE_PROMPT_VERSION = \"0.0.1\"\n\nexport const SUMMARIZATION_PROMPT = `Summarize the given text. 
The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.\nFor example, text of 1000 characters must output a summary less than {EXAMPLE} characters.\nThis summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.\nHere is the text:\n`\n","import { MapthisError } from \"../types/errors\"\n\nexport class ScrapeError extends MapthisError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"ScrapeError\"\n }\n}\n\n/**\n * Thrown when a provided URL fails validation or cannot be parsed by `fetch`.\n */\nexport class InvalidUrlError extends ScrapeError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"InvalidUrlError\"\n }\n}\n\n/**\n * Thrown when a remote server returns 401 or 403 for an HTML fetch, meaning\n * the page does not permit anonymous scraping.\n */\nexport class HtmlUnauthorizedError extends ScrapeError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"HtmlUnauthorizedError\"\n }\n}\n\n/**\n * Thrown when the underlying `html-to-text` conversion fails.\n */\nexport class HtmlToTextError extends ScrapeError {\n constructor(message?: string, options?: ErrorOptions) {\n super(message, options)\n this.name = \"HtmlToTextError\"\n }\n}\n","import { z } from \"zod\"\nimport { HtmlUnauthorizedError, InvalidUrlError, ScrapeError } from \"./errors\"\n\nconst withHttps = (url: string): string => {\n const hasHttp = /^http?:\\/\\//i.test(url)\n const hasHttps = /^https?:\\/\\//i.test(url)\n if (!hasHttp && !hasHttps) return `https://${url}`\n return url\n}\n\n/**\n * Fetch the raw HTML of a URL, performing light validation first.\n *\n * - URLs without a scheme are prefixed with `https://`.\n * - URL shape is 
validated via Zod before the network call.\n * - 401/403 responses throw {@link HtmlUnauthorizedError}.\n * - Other non-OK responses throw {@link ScrapeError} with the status text.\n * - `TypeError` from `fetch` (typically malformed URL) is rethrown as\n * {@link InvalidUrlError}.\n *\n * This function does not set any special headers, user agent, or cookies.\n * Sites with aggressive bot-detection may return empty bodies or errors; the\n * caller is responsible for retry / user-agent rotation strategies if needed.\n */\nexport async function getHtmlFromUrl(url: string): Promise<string> {\n const urlFixed = withHttps(url.toLowerCase())\n const isUrl = z.string().url().safeParse(urlFixed)\n if (!isUrl.success) throw new InvalidUrlError(`Invalid URL: ${url}`)\n\n try {\n const response = await fetch(urlFixed)\n const html = await response.text()\n if (response.status === 403 || response.status === 401) {\n throw new HtmlUnauthorizedError(\n `${response.status} Not authorized to scrape this website`,\n )\n }\n if (!response.ok) {\n throw new ScrapeError(\n `Bad response when getting HTML. Status: ${response.status} ${response.statusText}`,\n )\n }\n return html\n } catch (error) {\n if (error instanceof TypeError) {\n throw new InvalidUrlError(`Invalid URL. 
Original error: ${error.message}`)\n }\n throw error\n }\n}\n","import { convert } from \"html-to-text\"\nimport { HtmlToTextError } from \"./errors\"\nimport { getHtmlFromUrl } from \"./html\"\n\nconst SKIP_SELECTORS = [\n \"img\",\n \"header\",\n \"footer\",\n \"audio\",\n \"button\",\n \"canvas\",\n \"code\",\n \"nav\",\n \"#nav\",\n \"figure\",\n \"figcaption\",\n \".comment\",\n \".comments\",\n \"#comments\",\n \"#related-posts\",\n \"#related\",\n \".related\",\n \".related-posts\",\n] as const\n\n/**\n * Convert raw HTML to plain text suitable for LLM consumption.\n *\n * The converter is configured to:\n * - strip anchor hrefs when they duplicate the link text;\n * - skip navigation, footer, comments, related-post widgets, images, and\n * other non-content elements;\n * - collapse repeated blank lines and double spaces so token count doesn't\n * balloon from whitespace.\n */\nexport function htmlToText(html: string): string {\n try {\n let out = convert(html, {\n wordwrap: false,\n selectors: [\n {\n selector: \"a\",\n options: {\n ignoreHref: true,\n hideLinkHrefIfSameAsText: true,\n },\n },\n ...SKIP_SELECTORS.map((tag) => ({ selector: tag, format: \"skip\" })),\n ],\n })\n out = out.replaceAll(\"\\r\\n\", \"\\n\")\n // Iteratively collapse blank lines and double spaces. 
20 passes is\n // overkill for any realistic input but cheap and bounds worst case.\n for (let i = 0; i < 20; i++) {\n out = out.replaceAll(\"\\n\\n\\n\", \"\\n\\n\")\n out = out.replaceAll(\" \", \" \")\n }\n return out\n } catch (error) {\n if (error instanceof Error) {\n throw new HtmlToTextError(`HTML to text conversion failed: ${error.message}`, {\n cause: error,\n })\n }\n throw new HtmlToTextError(\"HTML to text conversion failed\")\n }\n}\n\n/**\n * Convenience: fetch a URL and convert its HTML to plain text in one call.\n *\n * Equivalent to `htmlToText(await getHtmlFromUrl(url))`.\n */\nexport async function getTextFromUrl(url: string): Promise<string> {\n const html = await getHtmlFromUrl(url)\n return htmlToText(html)\n}\n","import { SummarizeTextError } from \"./errors\"\nimport type { OpenAiBackend } from \"./openai\"\nimport { SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION } from \"./prompts\"\nimport type { SummarizedText } from \"./types\"\n\n/**\n * Reduce a large text to a summary that will fit inside the model's context\n * window alongside a known downstream prompt (e.g. the location-extraction\n * call).\n *\n * Algorithm overview\n * ------------------\n * We need to fit both the summary and a downstream \"final\" prompt into the\n * model's context window. Given:\n *\n * - `token_limit` — model context window (with a safety factor)\n * - `text_tokens` — size of the input text\n * - `summarize_prompt_tokens` — size of the summarization prompt itself\n * - `final_prompt_tokens` — size of the downstream prompt\n * - `final_output_tokens` — expected size of the downstream response\n *\n * We split the input into chunks, summarize each in parallel, and concatenate\n * the results. 
The per-chunk size is chosen so that the combined summary\n * leaves room for the downstream prompt:\n *\n * final_input_tokens = token_limit - final_output_tokens\n * summary_output_tokens = final_input_tokens - final_prompt_tokens\n * num_chunks = ceil((text_tokens + summary_output_tokens)\n * / (token_limit - summarize_prompt_tokens))\n * chunk_output_tokens = summary_output_tokens / num_chunks\n * chunk_input_tokens = token_limit - summarize_prompt_tokens - chunk_output_tokens\n *\n * A safety factor is applied to chunk output sizes because LLMs routinely\n * overshoot explicit length instructions.\n */\nexport async function summarizeText(\n backend: OpenAiBackend,\n text: string,\n finalPrompt = \"\",\n finalOutputTokens = 0,\n chunkOutputSafetyFactor = 0.3,\n tokenLimitSafetyFactor = 0.02,\n): Promise<SummarizedText> {\n const tokenLimit = backend.tokenLimit * (1 - tokenLimitSafetyFactor)\n const textTokens = backend.countTokens(text)\n const summarizePromptTokens = backend.countTokens(SUMMARIZATION_PROMPT)\n const finalPromptTokens = backend.countTokens(finalPrompt)\n\n const finalInputTokens = tokenLimit - finalOutputTokens\n const summaryOutputTokens = finalInputTokens - finalPromptTokens\n\n const numChunks = Math.max(\n 1,\n Math.ceil((textTokens + summaryOutputTokens) / (tokenLimit - summarizePromptTokens)),\n )\n const chunkOutputTokens = Math.floor(\n (summaryOutputTokens / numChunks) * (1 - chunkOutputSafetyFactor),\n )\n const chunkInputTokens = Math.floor(tokenLimit - summarizePromptTokens - chunkOutputTokens)\n const summaryRatio = chunkOutputTokens / chunkInputTokens\n\n // Split the text into chunks that each fit inside `chunkInputTokens`.\n // Lines longer than a single chunk are force-split by token, not by\n // character, so UTF-8 multibyte characters aren't corrupted.\n const textLines = text.split(\"\\n\")\n const chunks: string[] = []\n\n let currentChunk = \"\"\n let currentChunkTokens = 0\n const nextChunk = (): void => {\n if 
(currentChunk) chunks.push(currentChunk)\n currentChunk = \"\"\n currentChunkTokens = 0\n }\n\n // We need an encoder for the force-split path. The backend's countTokens\n // gives us length but not the encoded array; for force-splitting we only\n // need to chunk by an approximate token count using the same rounding.\n // To keep this function backend-agnostic, we fall back to character-based\n // splitting for over-long lines (rare edge case in practice).\n for (const line of textLines) {\n const numLineTokens = backend.countTokens(line) + 1 // +1 for the newline\n\n if (numLineTokens > chunkInputTokens) {\n // Long line: flush current chunk, then split this line by\n // approximate token ratio into sub-chunks.\n nextChunk()\n const charsPerToken = line.length / Math.max(1, backend.countTokens(line))\n const charsPerChunk = Math.max(1, Math.floor(chunkInputTokens * charsPerToken))\n for (let i = 0; i < line.length; i += charsPerChunk) {\n chunks.push(line.slice(i, i + charsPerChunk))\n }\n continue\n }\n\n if (currentChunkTokens + numLineTokens > chunkInputTokens) {\n nextChunk()\n }\n currentChunkTokens += numLineTokens\n currentChunk += line + \"\\n\"\n }\n nextChunk()\n\n try {\n const ratioText = (Math.ceil(summaryRatio * 10) / 10).toFixed(1)\n const exampleText = Math.ceil(1000 / Math.max(summaryRatio, 0.01)).toFixed(0)\n const summaryPrompt = SUMMARIZATION_PROMPT.replace(\"{RATIO}\", ratioText).replace(\n \"{EXAMPLE}\",\n exampleText,\n )\n\n let inputTokens = 0\n let outputTokens = 0\n const summaries = await Promise.all(\n chunks.map(async (chunk) => {\n const response = await backend.chatCompletion(summaryPrompt + chunk)\n inputTokens += response.inputTokens\n outputTokens += response.outputTokens\n return response.output\n }),\n )\n\n const summarized = summaries.join(\"\\n\")\n const summarizedTokens = backend.countTokens(summarized)\n\n return {\n text,\n textTokens,\n summarized,\n summarizedTokens,\n inputTokens,\n outputTokens,\n chunks: 
chunks.length,\n summarizePromptVersion: SUMMARIZE_PROMPT_VERSION,\n }\n } catch (error) {\n if (error instanceof Error) {\n throw new SummarizeTextError(`Failed to summarize text: ${error.message}`, {\n cause: error,\n })\n }\n throw new SummarizeTextError(\"Failed to summarize text (unknown error)\")\n }\n}\n","import { getHtmlFromUrl } from \"../scrape/html\"\nimport { htmlToText } from \"../scrape/text\"\nimport type { ParsedLocations } from \"../types/domain\"\nimport { NoLocationsFoundError } from \"./errors\"\nimport type { OpenAiBackend } from \"./openai\"\nimport {\n LOCATIONS_FUNCTIONS_NAME,\n LOCATIONS_MAX_OUTPUT_TOKENS,\n LOCATIONS_PROMPT_VERSION,\n LOCATIONS_SCHEMA,\n LOCATIONS_SYSTEM_MESSAGE,\n} from \"./prompts\"\nimport { summarizeText } from \"./summarize\"\nimport type { GetLocationsResponse } from \"./types\"\n\nconst FINAL_PROMPT = LOCATIONS_SYSTEM_MESSAGE + JSON.stringify(LOCATIONS_SCHEMA) + LOCATIONS_FUNCTIONS_NAME\n\nasync function callLocationExtraction(backend: OpenAiBackend, text: string) {\n const out = await backend.chatFunctionJson<ParsedLocations>(\n LOCATIONS_SYSTEM_MESSAGE,\n text,\n LOCATIONS_SCHEMA,\n LOCATIONS_FUNCTIONS_NAME,\n )\n\n // LLM frequently returns the same address twice across different prompts.\n // Dedupe by address string.\n const seen = new Set<string>()\n const unique = out.output.locations.filter((loc) => {\n if (seen.has(loc.address)) return false\n seen.add(loc.address)\n return true\n })\n if (unique.length === 0) throw new NoLocationsFoundError()\n out.output.locations = unique\n\n return out\n}\n\n/**\n * Parse locations from freeform text. 
Two-stage: summarize to fit the context\n * window, then a function-calling JSON completion for extraction.\n */\nexport async function parseLocationsFromText(\n backend: OpenAiBackend,\n text: string,\n): Promise<GetLocationsResponse> {\n const {\n summarized,\n inputTokens: sumIn,\n outputTokens: sumOut,\n textTokens,\n summarizedTokens,\n summarizePromptVersion,\n } = await summarizeText(backend, text, FINAL_PROMPT, LOCATIONS_MAX_OUTPUT_TOKENS)\n\n const {\n output,\n inputTokens: locIn,\n outputTokens: locOut,\n } = await callLocationExtraction(backend, summarized)\n\n return {\n output,\n locationInputTokens: locIn,\n locationOutputTokens: locOut,\n summaryInputTokens: sumIn,\n summaryOutputTokens: sumOut,\n inputTokens: sumIn + locIn,\n outputTokens: sumOut + locOut,\n textTokens,\n summarizedTokens,\n summaryPromptVersion: summarizePromptVersion,\n locationPromptVersion: LOCATIONS_PROMPT_VERSION,\n }\n}\n\n/**\n * Parse locations from HTML. Equivalent to `htmlToText` followed by\n * {@link parseLocationsFromText}.\n */\nexport async function parseLocationsFromHtml(\n backend: OpenAiBackend,\n html: string,\n): Promise<GetLocationsResponse> {\n return parseLocationsFromText(backend, htmlToText(html))\n}\n\n/**\n * Parse locations from a URL. Equivalent to fetching the HTML, converting to\n * text, and calling {@link parseLocationsFromText}. 
Throws any scrape errors\n * unchanged.\n */\nexport async function parseLocationsFromUrl(\n backend: OpenAiBackend,\n url: string,\n): Promise<GetLocationsResponse> {\n const html = await getHtmlFromUrl(url)\n return parseLocationsFromHtml(backend, html)\n}\n","import { encodingForModel, getEncoding, type Tiktoken, type TiktokenModel } from \"js-tiktoken\"\nimport OpenAI from \"openai\"\nimport type { JSONSchema7Object } from \"json-schema\"\nimport {\n AiInputLengthError,\n AiOutputLengthError,\n AiResponseJsonError,\n AiError,\n InvalidJsonSchemaError,\n} from \"./errors\"\nimport type { OpenAIOutput } from \"./types\"\n\nexport const DEFAULT_MODEL = \"gpt-4o-mini\"\nexport const DEFAULT_TOKEN_LIMIT = 128000\n\n/**\n * An OpenAI-backed inference context shared by the summarizer and location\n * parser. Instances are produced by {@link createOpenAiBackend} so API keys\n * and model choices live in the consumer's code, never in module-level state.\n */\nexport interface OpenAiBackend {\n readonly model: string\n readonly tokenLimit: number\n countTokens(text: string): number\n chatCompletion(content: string): Promise<OpenAIOutput<string>>\n chatFunctionJson<T>(\n systemMessage: string,\n userMessage: string,\n schema: JSONSchema7Object,\n functionName: string,\n ): Promise<OpenAIOutput<T>>\n}\n\nexport interface OpenAiBackendConfig {\n /** OpenAI API key. Passed directly to the SDK. */\n apiKey: string\n /** Model name passed to the completions API. Defaults to `gpt-4o-mini`. */\n model?: string\n /** Context-window budget used by the summarizer. Defaults to 128k. */\n tokenLimit?: number\n /**\n * Optional base URL for the OpenAI API. 
Use this to route through an\n * OpenAI-compatible gateway (Azure, Together, etc.).\n */\n baseURL?: string\n}\n\nfunction resolveEncoding(model: string): Tiktoken {\n try {\n return encodingForModel(model as TiktokenModel)\n } catch {\n // Fallback for models tiktoken does not recognize by name.\n return getEncoding(\"cl100k_base\")\n }\n}\n\n/**\n * Construct an {@link OpenAiBackend} bound to a specific API key and model.\n */\nexport function createOpenAiBackend(config: OpenAiBackendConfig): OpenAiBackend {\n const model = config.model ?? DEFAULT_MODEL\n const tokenLimit = config.tokenLimit ?? DEFAULT_TOKEN_LIMIT\n const openai = new OpenAI({\n apiKey: config.apiKey,\n ...(config.baseURL ? { baseURL: config.baseURL } : {}),\n })\n const encoding = resolveEncoding(model)\n\n return {\n model,\n tokenLimit,\n\n countTokens(text: string): number {\n return encoding.encode(text).length\n },\n\n async chatCompletion(content: string): Promise<OpenAIOutput<string>> {\n const response = await openai.chat.completions.create({\n messages: [{ role: \"user\", content }],\n model,\n })\n return {\n output: response.choices[0]?.message.content ?? \"\",\n inputTokens: response.usage?.prompt_tokens ?? 0,\n outputTokens: response.usage?.completion_tokens ?? 0,\n }\n },\n\n async chatFunctionJson<T>(\n systemMessage: string,\n userMessage: string,\n schema: JSONSchema7Object,\n functionName: string,\n ): Promise<OpenAIOutput<T>> {\n try {\n const completion = await openai.chat.completions.create({\n model,\n messages: [\n { role: \"system\", content: systemMessage },\n { role: \"user\", content: userMessage },\n ],\n functions: [{ name: functionName, parameters: schema }],\n function_call: { name: functionName },\n })\n\n const functionCall = completion.choices[0]?.message.function_call\n if (!functionCall) {\n throw new AiResponseJsonError(\"No function call in OpenAI response\")\n }\n\n const inputTokens = completion.usage?.prompt_tokens ?? 
0\n const outputTokens = completion.usage?.completion_tokens ?? 0\n\n try {\n const output = JSON.parse(functionCall.arguments) as T\n return { output, inputTokens, outputTokens }\n } catch (error) {\n if (error instanceof SyntaxError) {\n const msg = error.message.toLowerCase()\n if (\n msg.includes(\"unexpected end of json input\") ||\n msg.includes(\"unterminated string in js\")\n ) {\n throw new AiOutputLengthError(\n `Output tokens exceeded: ${error.message}`,\n { cause: error },\n )\n }\n throw new AiResponseJsonError(\n `Failed to parse JSON from OpenAI response: ${error.message}`,\n { cause: error },\n )\n }\n throw new AiResponseJsonError(\n \"Failed to parse JSON from OpenAI response (unknown reason)\",\n )\n }\n } catch (error) {\n if (error instanceof AiError) throw error\n if (error instanceof OpenAI.APIError) {\n const msg = error.message.toLowerCase()\n if (msg.includes(\"maximum context\")) {\n throw new AiInputLengthError(`Text too long: ${error.message}`, {\n cause: error,\n })\n }\n if (msg.includes(\"invalid schema\")) {\n throw new InvalidJsonSchemaError(`Invalid schema: ${error.message}`, {\n cause: error,\n })\n }\n throw new AiError(`OpenAI error: ${error.message}`, { cause: error })\n }\n throw new AiError(\"OpenAI error (unknown)\", {\n cause: error instanceof Error ? error : undefined,\n })\n }\n },\n }\n}\n","import {\n parseLocationsFromHtml,\n parseLocationsFromText,\n parseLocationsFromUrl,\n} from \"./locations\"\nimport { createOpenAiBackend, type OpenAiBackendConfig } from \"./openai\"\nimport type { GetLocationsResponse } from \"./types\"\n\n/**\n * A configured location-extraction pipeline. 
Hold onto one of these for the\n * lifetime of your app — it wraps an OpenAI client and a tiktoken encoder.\n */\nexport interface LocationParser {\n parseLocationsFromText(text: string): Promise<GetLocationsResponse>\n parseLocationsFromHtml(html: string): Promise<GetLocationsResponse>\n parseLocationsFromUrl(url: string): Promise<GetLocationsResponse>\n}\n\nexport type LocationParserConfig = OpenAiBackendConfig\n\n/**\n * Create a {@link LocationParser} bound to an OpenAI API key.\n *\n * @example\n * ```ts\n * const parser = createLocationParser({\n * apiKey: process.env.OPENAI_API_KEY!,\n * model: \"gpt-4o-mini\",\n * })\n * const { output } = await parser.parseLocationsFromUrl(\n * \"https://example.com/best-restaurants-tokyo\",\n * )\n * console.log(output.title, output.locations)\n * ```\n */\nexport function createLocationParser(config: LocationParserConfig): LocationParser {\n const backend = createOpenAiBackend(config)\n return {\n parseLocationsFromText: (text) => parseLocationsFromText(backend, text),\n parseLocationsFromHtml: (html) => parseLocationsFromHtml(backend, html),\n parseLocationsFromUrl: (url) => parseLocationsFromUrl(backend, url),\n }\n}\n"]}
|