@dromney/mapthis 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +56 -0
  3. package/dist/ai/index.cjs +474 -0
  4. package/dist/ai/index.cjs.map +1 -0
  5. package/dist/ai/index.d.cts +117 -0
  6. package/dist/ai/index.d.ts +117 -0
  7. package/dist/ai/index.js +447 -0
  8. package/dist/ai/index.js.map +1 -0
  9. package/dist/domain-CZ-L-ntu.d.ts +163 -0
  10. package/dist/domain-Dc1wSTkf.d.cts +163 -0
  11. package/dist/errors-Bw97z_4m.d.cts +12 -0
  12. package/dist/errors-Bw97z_4m.d.ts +12 -0
  13. package/dist/generate/index.cjs +222 -0
  14. package/dist/generate/index.cjs.map +1 -0
  15. package/dist/generate/index.d.cts +140 -0
  16. package/dist/generate/index.d.ts +140 -0
  17. package/dist/generate/index.js +220 -0
  18. package/dist/generate/index.js.map +1 -0
  19. package/dist/geocoding/index.cjs +90 -0
  20. package/dist/geocoding/index.cjs.map +1 -0
  21. package/dist/geocoding/index.d.cts +36 -0
  22. package/dist/geocoding/index.d.ts +36 -0
  23. package/dist/geocoding/index.js +86 -0
  24. package/dist/geocoding/index.js.map +1 -0
  25. package/dist/index.cjs +546 -0
  26. package/dist/index.cjs.map +1 -0
  27. package/dist/index.d.cts +5 -0
  28. package/dist/index.d.ts +5 -0
  29. package/dist/index.js +469 -0
  30. package/dist/index.js.map +1 -0
  31. package/dist/parser-CzXzpmVv.d.cts +111 -0
  32. package/dist/parser-N7-fNxeu.d.ts +111 -0
  33. package/dist/react/index.cjs +394 -0
  34. package/dist/react/index.cjs.map +1 -0
  35. package/dist/react/index.js +383 -0
  36. package/dist/react/index.js.map +1 -0
  37. package/dist/schemas-Dy5coqXo.d.cts +484 -0
  38. package/dist/schemas-Dy5coqXo.d.ts +484 -0
  39. package/dist/scrape/index.cjs +133 -0
  40. package/dist/scrape/index.cjs.map +1 -0
  41. package/dist/scrape/index.d.cts +60 -0
  42. package/dist/scrape/index.d.ts +60 -0
  43. package/dist/scrape/index.js +125 -0
  44. package/dist/scrape/index.js.map +1 -0
  45. package/dist/search/index.cjs +76 -0
  46. package/dist/search/index.cjs.map +1 -0
  47. package/dist/search/index.d.cts +75 -0
  48. package/dist/search/index.d.ts +75 -0
  49. package/dist/search/index.js +71 -0
  50. package/dist/search/index.js.map +1 -0
  51. package/dist/types/index.cjs +215 -0
  52. package/dist/types/index.cjs.map +1 -0
  53. package/dist/types/index.d.cts +4 -0
  54. package/dist/types/index.d.ts +4 -0
  55. package/dist/types/index.js +171 -0
  56. package/dist/types/index.js.map +1 -0
  57. package/dist/types-BhqKlq0k.d.ts +31 -0
  58. package/dist/types-rFjK5YcJ.d.cts +31 -0
  59. package/dist/utils/index.cjs +335 -0
  60. package/dist/utils/index.cjs.map +1 -0
  61. package/dist/utils/index.d.cts +363 -0
  62. package/dist/utils/index.d.ts +363 -0
  63. package/dist/utils/index.js +301 -0
  64. package/dist/utils/index.js.map +1 -0
  65. package/package.json +150 -0
@@ -0,0 +1,117 @@
1
+ import { M as MapthisError } from '../errors-Bw97z_4m.cjs';
2
+ import { O as OpenAiBackend, S as SummarizedText, G as GetLocationsResponse } from '../parser-CzXzpmVv.cjs';
3
+ export { D as DEFAULT_MODEL, a as DEFAULT_TOKEN_LIMIT, b as GenericAIJsonRequest, L as LocationParser, c as LocationParserConfig, d as OpenAIOutput, e as OpenAiBackendConfig, f as createLocationParser, g as createOpenAiBackend } from '../parser-CzXzpmVv.cjs';
4
+ import { JSONSchema7Object } from 'json-schema';
5
+ import '../domain-Dc1wSTkf.cjs';
6
+ import '../schemas-Dy5coqXo.cjs';
7
+ import 'zod';
8
+
9
+ declare class AiError extends MapthisError {
10
+ constructor(message?: string, options?: ErrorOptions);
11
+ }
12
+ /**
13
+ * The LLM returned zero locations after deduplication.
14
+ */
15
+ declare class NoLocationsFoundError extends AiError {
16
+ constructor(message?: string, options?: ErrorOptions);
17
+ }
18
+ /**
19
+ * Summarization pipeline failed (one of the chunk calls rejected).
20
+ */
21
+ declare class SummarizeTextError extends AiError {
22
+ constructor(message?: string, options?: ErrorOptions);
23
+ }
24
+ /**
25
+ * A JSON schema passed to the LLM was rejected as invalid by the provider.
26
+ */
27
+ declare class InvalidJsonSchemaError extends AiError {
28
+ constructor(message?: string, options?: ErrorOptions);
29
+ }
30
+ /**
31
+ * The LLM response could not be parsed as JSON (typically because the output
32
+ * was truncated or malformed).
33
+ */
34
+ declare class AiResponseJsonError extends AiError {
35
+ constructor(message?: string, options?: ErrorOptions);
36
+ }
37
+ /**
38
+ * Input text exceeded the model's context window.
39
+ */
40
+ declare class AiInputLengthError extends AiError {
41
+ constructor(message?: string, options?: ErrorOptions);
42
+ }
43
+ /**
44
+ * The LLM hit the output token limit before completing its JSON response.
45
+ */
46
+ declare class AiOutputLengthError extends AiError {
47
+ constructor(message?: string, options?: ErrorOptions);
48
+ }
49
+
50
+ /**
51
+ * Semantic version of the location-extraction system prompt. Stored alongside
52
+ * each place record in the private app so results can be re-run when the
53
+ * prompt is updated. Bump when you change {@link LOCATIONS_SYSTEM_MESSAGE} or
54
+ * {@link LOCATIONS_SCHEMA} in a way that meaningfully changes output.
55
+ */
56
+ declare const LOCATIONS_PROMPT_VERSION = "0.0.1";
57
+ declare const LOCATIONS_FUNCTIONS_NAME = "get_places";
58
+ declare const LOCATIONS_SYSTEM_MESSAGE = "You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.\nIf the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.\nFor example, for an article about places in greece, you would not return \"greece\", but if the Acropolis was mentioned, you would return \"Acropolis, Athens, Greece\"\n\nYou ignore extraneous text, such as recommended articles, advertisments, etc.\n";
59
+ declare const LOCATIONS_SCHEMA: JSONSchema7Object;
60
+ declare const LOCATIONS_MAX_OUTPUT_TOKENS = 500;
61
+ /**
62
+ * Semantic version of the summarization prompt. Bump when meaningful changes
63
+ * are made to {@link SUMMARIZATION_PROMPT}.
64
+ */
65
+ declare const SUMMARIZE_PROMPT_VERSION = "0.0.1";
66
+ declare const SUMMARIZATION_PROMPT = "Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.\nFor example, text of 1000 characters must output a summary less than {EXAMPLE} characters.\nThis summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.\nHere is the text:\n";
67
+
68
+ /**
69
+ * Reduce a large text to a summary that will fit inside the model's context
70
+ * window alongside a known downstream prompt (e.g. the location-extraction
71
+ * call).
72
+ *
73
+ * Algorithm overview
74
+ * ------------------
75
+ * We need to fit both the summary and a downstream "final" prompt into the
76
+ * model's context window. Given:
77
+ *
78
+ * - `token_limit` — model context window (with a safety factor)
79
+ * - `text_tokens` — size of the input text
80
+ * - `summarize_prompt_tokens` — size of the summarization prompt itself
81
+ * - `final_prompt_tokens` — size of the downstream prompt
82
+ * - `final_output_tokens` — expected size of the downstream response
83
+ *
84
+ * We split the input into chunks, summarize each in parallel, and concatenate
85
+ * the results. The per-chunk size is chosen so that the combined summary
86
+ * leaves room for the downstream prompt:
87
+ *
88
+ * final_input_tokens = token_limit - final_output_tokens
89
+ * summary_output_tokens = final_input_tokens - final_prompt_tokens
90
+ * num_chunks = ceil((text_tokens + summary_output_tokens)
91
+ * / (token_limit - summarize_prompt_tokens))
92
+ * chunk_output_tokens = summary_output_tokens / num_chunks
93
+ * chunk_input_tokens = token_limit - summarize_prompt_tokens - chunk_output_tokens
94
+ *
95
+ * A safety factor is applied to chunk output sizes because LLMs routinely
96
+ * overshoot explicit length instructions.
97
+ */
98
+ declare function summarizeText(backend: OpenAiBackend, text: string, finalPrompt?: string, finalOutputTokens?: number, chunkOutputSafetyFactor?: number, tokenLimitSafetyFactor?: number): Promise<SummarizedText>;
99
+
100
+ /**
101
+ * Parse locations from freeform text. Two-stage: summarize to fit the context
102
+ * window, then a function-calling JSON completion for extraction.
103
+ */
104
+ declare function parseLocationsFromText(backend: OpenAiBackend, text: string): Promise<GetLocationsResponse>;
105
+ /**
106
+ * Parse locations from HTML. Equivalent to `htmlToText` followed by
107
+ * {@link parseLocationsFromText}.
108
+ */
109
+ declare function parseLocationsFromHtml(backend: OpenAiBackend, html: string): Promise<GetLocationsResponse>;
110
+ /**
111
+ * Parse locations from a URL. Equivalent to fetching the HTML, converting to
112
+ * text, and calling {@link parseLocationsFromText}. Throws any scrape errors
113
+ * unchanged.
114
+ */
115
+ declare function parseLocationsFromUrl(backend: OpenAiBackend, url: string): Promise<GetLocationsResponse>;
116
+
117
+ export { AiError, AiInputLengthError, AiOutputLengthError, AiResponseJsonError, GetLocationsResponse, InvalidJsonSchemaError, LOCATIONS_FUNCTIONS_NAME, LOCATIONS_MAX_OUTPUT_TOKENS, LOCATIONS_PROMPT_VERSION, LOCATIONS_SCHEMA, LOCATIONS_SYSTEM_MESSAGE, NoLocationsFoundError, OpenAiBackend, SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION, SummarizeTextError, SummarizedText, parseLocationsFromHtml, parseLocationsFromText, parseLocationsFromUrl, summarizeText };
@@ -0,0 +1,117 @@
1
+ import { M as MapthisError } from '../errors-Bw97z_4m.js';
2
+ import { O as OpenAiBackend, S as SummarizedText, G as GetLocationsResponse } from '../parser-N7-fNxeu.js';
3
+ export { D as DEFAULT_MODEL, a as DEFAULT_TOKEN_LIMIT, b as GenericAIJsonRequest, L as LocationParser, c as LocationParserConfig, d as OpenAIOutput, e as OpenAiBackendConfig, f as createLocationParser, g as createOpenAiBackend } from '../parser-N7-fNxeu.js';
4
+ import { JSONSchema7Object } from 'json-schema';
5
+ import '../domain-CZ-L-ntu.js';
6
+ import '../schemas-Dy5coqXo.js';
7
+ import 'zod';
8
+
9
+ declare class AiError extends MapthisError {
10
+ constructor(message?: string, options?: ErrorOptions);
11
+ }
12
+ /**
13
+ * The LLM returned zero locations after deduplication.
14
+ */
15
+ declare class NoLocationsFoundError extends AiError {
16
+ constructor(message?: string, options?: ErrorOptions);
17
+ }
18
+ /**
19
+ * Summarization pipeline failed (one of the chunk calls rejected).
20
+ */
21
+ declare class SummarizeTextError extends AiError {
22
+ constructor(message?: string, options?: ErrorOptions);
23
+ }
24
+ /**
25
+ * A JSON schema passed to the LLM was rejected as invalid by the provider.
26
+ */
27
+ declare class InvalidJsonSchemaError extends AiError {
28
+ constructor(message?: string, options?: ErrorOptions);
29
+ }
30
+ /**
31
+ * The LLM response could not be parsed as JSON (typically because the output
32
+ * was truncated or malformed).
33
+ */
34
+ declare class AiResponseJsonError extends AiError {
35
+ constructor(message?: string, options?: ErrorOptions);
36
+ }
37
+ /**
38
+ * Input text exceeded the model's context window.
39
+ */
40
+ declare class AiInputLengthError extends AiError {
41
+ constructor(message?: string, options?: ErrorOptions);
42
+ }
43
+ /**
44
+ * The LLM hit the output token limit before completing its JSON response.
45
+ */
46
+ declare class AiOutputLengthError extends AiError {
47
+ constructor(message?: string, options?: ErrorOptions);
48
+ }
49
+
50
+ /**
51
+ * Semantic version of the location-extraction system prompt. Stored alongside
52
+ * each place record in the private app so results can be re-run when the
53
+ * prompt is updated. Bump when you change {@link LOCATIONS_SYSTEM_MESSAGE} or
54
+ * {@link LOCATIONS_SCHEMA} in a way that meaningfully changes output.
55
+ */
56
+ declare const LOCATIONS_PROMPT_VERSION = "0.0.1";
57
+ declare const LOCATIONS_FUNCTIONS_NAME = "get_places";
58
+ declare const LOCATIONS_SYSTEM_MESSAGE = "You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.\nIf the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.\nFor example, for an article about places in greece, you would not return \"greece\", but if the Acropolis was mentioned, you would return \"Acropolis, Athens, Greece\"\n\nYou ignore extraneous text, such as recommended articles, advertisments, etc.\n";
59
+ declare const LOCATIONS_SCHEMA: JSONSchema7Object;
60
+ declare const LOCATIONS_MAX_OUTPUT_TOKENS = 500;
61
+ /**
62
+ * Semantic version of the summarization prompt. Bump when meaningful changes
63
+ * are made to {@link SUMMARIZATION_PROMPT}.
64
+ */
65
+ declare const SUMMARIZE_PROMPT_VERSION = "0.0.1";
66
+ declare const SUMMARIZATION_PROMPT = "Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.\nFor example, text of 1000 characters must output a summary less than {EXAMPLE} characters.\nThis summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.\nHere is the text:\n";
67
+
68
+ /**
69
+ * Reduce a large text to a summary that will fit inside the model's context
70
+ * window alongside a known downstream prompt (e.g. the location-extraction
71
+ * call).
72
+ *
73
+ * Algorithm overview
74
+ * ------------------
75
+ * We need to fit both the summary and a downstream "final" prompt into the
76
+ * model's context window. Given:
77
+ *
78
+ * - `token_limit` — model context window (with a safety factor)
79
+ * - `text_tokens` — size of the input text
80
+ * - `summarize_prompt_tokens` — size of the summarization prompt itself
81
+ * - `final_prompt_tokens` — size of the downstream prompt
82
+ * - `final_output_tokens` — expected size of the downstream response
83
+ *
84
+ * We split the input into chunks, summarize each in parallel, and concatenate
85
+ * the results. The per-chunk size is chosen so that the combined summary
86
+ * leaves room for the downstream prompt:
87
+ *
88
+ * final_input_tokens = token_limit - final_output_tokens
89
+ * summary_output_tokens = final_input_tokens - final_prompt_tokens
90
+ * num_chunks = ceil((text_tokens + summary_output_tokens)
91
+ * / (token_limit - summarize_prompt_tokens))
92
+ * chunk_output_tokens = summary_output_tokens / num_chunks
93
+ * chunk_input_tokens = token_limit - summarize_prompt_tokens - chunk_output_tokens
94
+ *
95
+ * A safety factor is applied to chunk output sizes because LLMs routinely
96
+ * overshoot explicit length instructions.
97
+ */
98
+ declare function summarizeText(backend: OpenAiBackend, text: string, finalPrompt?: string, finalOutputTokens?: number, chunkOutputSafetyFactor?: number, tokenLimitSafetyFactor?: number): Promise<SummarizedText>;
99
+
100
+ /**
101
+ * Parse locations from freeform text. Two-stage: summarize to fit the context
102
+ * window, then a function-calling JSON completion for extraction.
103
+ */
104
+ declare function parseLocationsFromText(backend: OpenAiBackend, text: string): Promise<GetLocationsResponse>;
105
+ /**
106
+ * Parse locations from HTML. Equivalent to `htmlToText` followed by
107
+ * {@link parseLocationsFromText}.
108
+ */
109
+ declare function parseLocationsFromHtml(backend: OpenAiBackend, html: string): Promise<GetLocationsResponse>;
110
+ /**
111
+ * Parse locations from a URL. Equivalent to fetching the HTML, converting to
112
+ * text, and calling {@link parseLocationsFromText}. Throws any scrape errors
113
+ * unchanged.
114
+ */
115
+ declare function parseLocationsFromUrl(backend: OpenAiBackend, url: string): Promise<GetLocationsResponse>;
116
+
117
+ export { AiError, AiInputLengthError, AiOutputLengthError, AiResponseJsonError, GetLocationsResponse, InvalidJsonSchemaError, LOCATIONS_FUNCTIONS_NAME, LOCATIONS_MAX_OUTPUT_TOKENS, LOCATIONS_PROMPT_VERSION, LOCATIONS_SCHEMA, LOCATIONS_SYSTEM_MESSAGE, NoLocationsFoundError, OpenAiBackend, SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION, SummarizeTextError, SummarizedText, parseLocationsFromHtml, parseLocationsFromText, parseLocationsFromUrl, summarizeText };
@@ -0,0 +1,447 @@
1
+ import { z } from 'zod';
2
+ import { convert } from 'html-to-text';
3
+ import { encodingForModel, getEncoding } from 'js-tiktoken';
4
+ import OpenAI from 'openai';
5
+
6
+ // src/types/errors.ts
7
+ var MapthisError = class extends Error {
8
+ constructor(message, options) {
9
+ super(message, options);
10
+ this.name = "MapthisError";
11
+ }
12
+ };
13
+
14
+ // src/ai/errors.ts
15
/** Base class for every failure raised by the AI (LLM) layer. */
class AiError extends MapthisError {
  constructor(message, options) {
    super(message, options);
    this.name = "AiError";
  }
}

/** The LLM returned zero locations after deduplication. */
class NoLocationsFoundError extends AiError {
  constructor(message, options) {
    // Unlike its siblings, this error supplies a default message.
    super(message ?? "No locations found in input", options);
    this.name = "NoLocationsFoundError";
  }
}

/** The summarization pipeline failed (one of the chunk calls rejected). */
class SummarizeTextError extends AiError {
  constructor(message, options) {
    super(message, options);
    this.name = "SummarizeTextError";
  }
}

/** A JSON schema passed to the LLM was rejected as invalid by the provider. */
class InvalidJsonSchemaError extends AiError {
  constructor(message, options) {
    super(message, options);
    this.name = "InvalidJsonSchemaError";
  }
}

/**
 * The LLM response could not be parsed as JSON (typically because the output
 * was truncated or malformed).
 */
class AiResponseJsonError extends AiError {
  constructor(message, options) {
    super(message, options);
    this.name = "AiResponseJsonError";
  }
}

/** Input text exceeded the model's context window. */
class AiInputLengthError extends AiError {
  constructor(message, options) {
    super(message, options);
    this.name = "AiInputLengthError";
  }
}

/** The LLM hit the output token limit before completing its JSON response. */
class AiOutputLengthError extends AiError {
  constructor(message, options) {
    super(message, options);
    this.name = "AiOutputLengthError";
  }
}
57
+
58
+ // src/ai/prompts.ts
59
// Semantic version of the location-extraction prompt. Bump it when
// LOCATIONS_SYSTEM_MESSAGE or LOCATIONS_SCHEMA changes in a way that
// meaningfully changes output; parseLocationsFromText returns it as
// `locationPromptVersion`.
+ var LOCATIONS_PROMPT_VERSION = "0.0.1";
60
// Function name sent to the backend's function-calling request for extraction.
+ var LOCATIONS_FUNCTIONS_NAME = "get_places";
61
// System message for the extraction call. This is runtime text sent to the
// model: any edit (including typo fixes) changes model input and calls for a
// LOCATIONS_PROMPT_VERSION bump.
+ var LOCATIONS_SYSTEM_MESSAGE = `You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.
62
+ If the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.
63
+ For example, for an article about places in greece, you would not return "greece", but if the Acropolis was mentioned, you would return "Acropolis, Athens, Greece"
64
+
65
+ You ignore extraneous text, such as recommended articles, advertisments, etc.
66
+ `;
67
// JSON schema passed as the function's `parameters`: a required `title`
// string plus a required `locations` array of `{ address, description }`.
+ var LOCATIONS_SCHEMA = {
68
+ type: "object",
69
+ required: ["locations", "title"],
70
+ properties: {
71
+ locations: {
72
+ type: "array",
73
+ items: {
74
+ type: "object",
75
+ required: ["address", "description"],
76
+ properties: {
77
+ address: { type: "string" },
78
+ description: { type: "string" }
79
+ }
80
+ }
81
+ },
82
+ title: { type: "string" }
83
+ }
84
+ };
85
// Output-token budget reserved for the extraction response when sizing the
// summary (passed to summarizeText as `finalOutputTokens`).
// NOTE(review): in the code visible here it is not sent to the API as a hard
// cap — confirm whether that is intended.
+ var LOCATIONS_MAX_OUTPUT_TOKENS = 500;
86
// Semantic version of the summarization prompt. Bump it when
// SUMMARIZATION_PROMPT changes meaningfully; summarizeText returns it as
// `summarizePromptVersion`.
+ var SUMMARIZE_PROMPT_VERSION = "0.0.1";
87
// Summarization prompt template. summarizeText substitutes the {RATIO} and
// {EXAMPLE} placeholders and appends the chunk text directly after the final
// line, so the trailing newline is significant.
+ var SUMMARIZATION_PROMPT = `Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.
88
+ For example, text of 1000 characters must output a summary less than {EXAMPLE} characters.
89
+ This summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.
90
+ Here is the text:
91
+ `;
92
+
93
+ // src/scrape/errors.ts
94
/** Base class for failures while fetching a page or converting it to text. */
class ScrapeError extends MapthisError {
  constructor(message, options) {
    super(message, options);
    this.name = "ScrapeError";
  }
}

/**
 * The supplied URL failed validation, or the fetch call itself raised a
 * `TypeError` (see `getHtmlFromUrl`).
 */
class InvalidUrlError extends ScrapeError {
  constructor(message, options) {
    super(message, options);
    this.name = "InvalidUrlError";
  }
}

/** The remote server answered 401 or 403 when the page was requested. */
class HtmlUnauthorizedError extends ScrapeError {
  constructor(message, options) {
    super(message, options);
    this.name = "HtmlUnauthorizedError";
  }
}

/** Converting HTML to plain text failed (see `htmlToText`). */
class HtmlToTextError extends ScrapeError {
  constructor(message, options) {
    super(message, options);
    this.name = "HtmlToTextError";
  }
}
118
+
119
+ // src/scrape/html.ts
120
/**
 * Ensure a URL string carries an explicit HTTP(S) scheme.
 *
 * Input that already starts with `http://` or `https://` (any letter case) is
 * returned unchanged; anything else is prefixed with `https://`.
 *
 * @param url Raw URL or bare host/path such as `example.com/page`.
 * @returns The URL with a scheme guaranteed to be present.
 */
var withHttps = (url) => {
  // A single case-insensitive test covers both schemes. The previous separate
  // `/^http?:\/\//` check made the "p" optional and so also accepted "htt://".
  if (/^https?:\/\//i.test(url)) return url;
  return `https://${url}`;
};
126
+ async function getHtmlFromUrl(url) {
127
+ const urlFixed = withHttps(url.toLowerCase());
128
+ const isUrl = z.string().url().safeParse(urlFixed);
129
+ if (!isUrl.success) throw new InvalidUrlError(`Invalid URL: ${url}`);
130
+ try {
131
+ const response = await fetch(urlFixed);
132
+ const html = await response.text();
133
+ if (response.status === 403 || response.status === 401) {
134
+ throw new HtmlUnauthorizedError(
135
+ `${response.status} Not authorized to scrape this website`
136
+ );
137
+ }
138
+ if (!response.ok) {
139
+ throw new ScrapeError(
140
+ `Bad response when getting HTML. Status: ${response.status} ${response.statusText}`
141
+ );
142
+ }
143
+ return html;
144
+ } catch (error) {
145
+ if (error instanceof TypeError) {
146
+ throw new InvalidUrlError(`Invalid URL. Original error: ${error.message}`);
147
+ }
148
+ throw error;
149
+ }
150
+ }
151
+ var SKIP_SELECTORS = [
152
+ "img",
153
+ "header",
154
+ "footer",
155
+ "audio",
156
+ "button",
157
+ "canvas",
158
+ "code",
159
+ "nav",
160
+ "#nav",
161
+ "figure",
162
+ "figcaption",
163
+ ".comment",
164
+ ".comments",
165
+ "#comments",
166
+ "#related-posts",
167
+ "#related",
168
+ ".related",
169
+ ".related-posts"
170
+ ];
171
+ function htmlToText(html) {
172
+ try {
173
+ let out = convert(html, {
174
+ wordwrap: false,
175
+ selectors: [
176
+ {
177
+ selector: "a",
178
+ options: {
179
+ ignoreHref: true,
180
+ hideLinkHrefIfSameAsText: true
181
+ }
182
+ },
183
+ ...SKIP_SELECTORS.map((tag) => ({ selector: tag, format: "skip" }))
184
+ ]
185
+ });
186
+ out = out.replaceAll("\r\n", "\n");
187
+ for (let i = 0; i < 20; i++) {
188
+ out = out.replaceAll("\n\n\n", "\n\n");
189
+ out = out.replaceAll(" ", " ");
190
+ }
191
+ return out;
192
+ } catch (error) {
193
+ if (error instanceof Error) {
194
+ throw new HtmlToTextError(`HTML to text conversion failed: ${error.message}`, {
195
+ cause: error
196
+ });
197
+ }
198
+ throw new HtmlToTextError("HTML to text conversion failed");
199
+ }
200
+ }
201
+
202
+ // src/ai/summarize.ts
203
+ async function summarizeText(backend, text, finalPrompt = "", finalOutputTokens = 0, chunkOutputSafetyFactor = 0.3, tokenLimitSafetyFactor = 0.02) {
204
+ const tokenLimit = backend.tokenLimit * (1 - tokenLimitSafetyFactor);
205
+ const textTokens = backend.countTokens(text);
206
+ const summarizePromptTokens = backend.countTokens(SUMMARIZATION_PROMPT);
207
+ const finalPromptTokens = backend.countTokens(finalPrompt);
208
+ const finalInputTokens = tokenLimit - finalOutputTokens;
209
+ const summaryOutputTokens = finalInputTokens - finalPromptTokens;
210
+ const numChunks = Math.max(
211
+ 1,
212
+ Math.ceil((textTokens + summaryOutputTokens) / (tokenLimit - summarizePromptTokens))
213
+ );
214
+ const chunkOutputTokens = Math.floor(
215
+ summaryOutputTokens / numChunks * (1 - chunkOutputSafetyFactor)
216
+ );
217
+ const chunkInputTokens = Math.floor(tokenLimit - summarizePromptTokens - chunkOutputTokens);
218
+ const summaryRatio = chunkOutputTokens / chunkInputTokens;
219
+ const textLines = text.split("\n");
220
+ const chunks = [];
221
+ let currentChunk = "";
222
+ let currentChunkTokens = 0;
223
+ const nextChunk = () => {
224
+ if (currentChunk) chunks.push(currentChunk);
225
+ currentChunk = "";
226
+ currentChunkTokens = 0;
227
+ };
228
+ for (const line of textLines) {
229
+ const numLineTokens = backend.countTokens(line) + 1;
230
+ if (numLineTokens > chunkInputTokens) {
231
+ nextChunk();
232
+ const charsPerToken = line.length / Math.max(1, backend.countTokens(line));
233
+ const charsPerChunk = Math.max(1, Math.floor(chunkInputTokens * charsPerToken));
234
+ for (let i = 0; i < line.length; i += charsPerChunk) {
235
+ chunks.push(line.slice(i, i + charsPerChunk));
236
+ }
237
+ continue;
238
+ }
239
+ if (currentChunkTokens + numLineTokens > chunkInputTokens) {
240
+ nextChunk();
241
+ }
242
+ currentChunkTokens += numLineTokens;
243
+ currentChunk += line + "\n";
244
+ }
245
+ nextChunk();
246
+ try {
247
+ const ratioText = (Math.ceil(summaryRatio * 10) / 10).toFixed(1);
248
+ const exampleText = Math.ceil(1e3 / Math.max(summaryRatio, 0.01)).toFixed(0);
249
+ const summaryPrompt = SUMMARIZATION_PROMPT.replace("{RATIO}", ratioText).replace(
250
+ "{EXAMPLE}",
251
+ exampleText
252
+ );
253
+ let inputTokens = 0;
254
+ let outputTokens = 0;
255
+ const summaries = await Promise.all(
256
+ chunks.map(async (chunk) => {
257
+ const response = await backend.chatCompletion(summaryPrompt + chunk);
258
+ inputTokens += response.inputTokens;
259
+ outputTokens += response.outputTokens;
260
+ return response.output;
261
+ })
262
+ );
263
+ const summarized = summaries.join("\n");
264
+ const summarizedTokens = backend.countTokens(summarized);
265
+ return {
266
+ text,
267
+ textTokens,
268
+ summarized,
269
+ summarizedTokens,
270
+ inputTokens,
271
+ outputTokens,
272
+ chunks: chunks.length,
273
+ summarizePromptVersion: SUMMARIZE_PROMPT_VERSION
274
+ };
275
+ } catch (error) {
276
+ if (error instanceof Error) {
277
+ throw new SummarizeTextError(`Failed to summarize text: ${error.message}`, {
278
+ cause: error
279
+ });
280
+ }
281
+ throw new SummarizeTextError("Failed to summarize text (unknown error)");
282
+ }
283
+ }
284
+
285
+ // src/ai/locations.ts
286
// Stand-in for the fixed part of the extraction request (system message,
// serialized schema, function name). It is only token-counted: it is passed to
// summarizeText as `finalPrompt` so the summary leaves room for it.
+ var FINAL_PROMPT = LOCATIONS_SYSTEM_MESSAGE + JSON.stringify(LOCATIONS_SCHEMA) + LOCATIONS_FUNCTIONS_NAME;
287
+ async function callLocationExtraction(backend, text) {
288
+ const out = await backend.chatFunctionJson(
289
+ LOCATIONS_SYSTEM_MESSAGE,
290
+ text,
291
+ LOCATIONS_SCHEMA,
292
+ LOCATIONS_FUNCTIONS_NAME
293
+ );
294
+ const seen = /* @__PURE__ */ new Set();
295
+ const unique = out.output.locations.filter((loc) => {
296
+ if (seen.has(loc.address)) return false;
297
+ seen.add(loc.address);
298
+ return true;
299
+ });
300
+ if (unique.length === 0) throw new NoLocationsFoundError();
301
+ out.output.locations = unique;
302
+ return out;
303
+ }
304
+ async function parseLocationsFromText(backend, text) {
305
+ const {
306
+ summarized,
307
+ inputTokens: sumIn,
308
+ outputTokens: sumOut,
309
+ textTokens,
310
+ summarizedTokens,
311
+ summarizePromptVersion
312
+ } = await summarizeText(backend, text, FINAL_PROMPT, LOCATIONS_MAX_OUTPUT_TOKENS);
313
+ const {
314
+ output,
315
+ inputTokens: locIn,
316
+ outputTokens: locOut
317
+ } = await callLocationExtraction(backend, summarized);
318
+ return {
319
+ output,
320
+ locationInputTokens: locIn,
321
+ locationOutputTokens: locOut,
322
+ summaryInputTokens: sumIn,
323
+ summaryOutputTokens: sumOut,
324
+ inputTokens: sumIn + locIn,
325
+ outputTokens: sumOut + locOut,
326
+ textTokens,
327
+ summarizedTokens,
328
+ summaryPromptVersion: summarizePromptVersion,
329
+ locationPromptVersion: LOCATIONS_PROMPT_VERSION
330
+ };
331
+ }
332
+ async function parseLocationsFromHtml(backend, html) {
333
+ return parseLocationsFromText(backend, htmlToText(html));
334
+ }
335
+ async function parseLocationsFromUrl(backend, url) {
336
+ const html = await getHtmlFromUrl(url);
337
+ return parseLocationsFromHtml(backend, html);
338
+ }
339
+ var DEFAULT_MODEL = "gpt-4o-mini";
340
+ var DEFAULT_TOKEN_LIMIT = 128e3;
341
+ function resolveEncoding(model) {
342
+ try {
343
+ return encodingForModel(model);
344
+ } catch {
345
+ return getEncoding("cl100k_base");
346
+ }
347
+ }
348
+ function createOpenAiBackend(config) {
349
+ const model = config.model ?? DEFAULT_MODEL;
350
+ const tokenLimit = config.tokenLimit ?? DEFAULT_TOKEN_LIMIT;
351
+ const openai = new OpenAI({
352
+ apiKey: config.apiKey,
353
+ ...config.baseURL ? { baseURL: config.baseURL } : {}
354
+ });
355
+ const encoding = resolveEncoding(model);
356
+ return {
357
+ model,
358
+ tokenLimit,
359
+ countTokens(text) {
360
+ return encoding.encode(text).length;
361
+ },
362
+ async chatCompletion(content) {
363
+ const response = await openai.chat.completions.create({
364
+ messages: [{ role: "user", content }],
365
+ model
366
+ });
367
+ return {
368
+ output: response.choices[0]?.message.content ?? "",
369
+ inputTokens: response.usage?.prompt_tokens ?? 0,
370
+ outputTokens: response.usage?.completion_tokens ?? 0
371
+ };
372
+ },
373
+ async chatFunctionJson(systemMessage, userMessage, schema, functionName) {
374
+ try {
375
+ const completion = await openai.chat.completions.create({
376
+ model,
377
+ messages: [
378
+ { role: "system", content: systemMessage },
379
+ { role: "user", content: userMessage }
380
+ ],
381
+ functions: [{ name: functionName, parameters: schema }],
382
+ function_call: { name: functionName }
383
+ });
384
+ const functionCall = completion.choices[0]?.message.function_call;
385
+ if (!functionCall) {
386
+ throw new AiResponseJsonError("No function call in OpenAI response");
387
+ }
388
+ const inputTokens = completion.usage?.prompt_tokens ?? 0;
389
+ const outputTokens = completion.usage?.completion_tokens ?? 0;
390
+ try {
391
+ const output = JSON.parse(functionCall.arguments);
392
+ return { output, inputTokens, outputTokens };
393
+ } catch (error) {
394
+ if (error instanceof SyntaxError) {
395
+ const msg = error.message.toLowerCase();
396
+ if (msg.includes("unexpected end of json input") || msg.includes("unterminated string in js")) {
397
+ throw new AiOutputLengthError(
398
+ `Output tokens exceeded: ${error.message}`,
399
+ { cause: error }
400
+ );
401
+ }
402
+ throw new AiResponseJsonError(
403
+ `Failed to parse JSON from OpenAI response: ${error.message}`,
404
+ { cause: error }
405
+ );
406
+ }
407
+ throw new AiResponseJsonError(
408
+ "Failed to parse JSON from OpenAI response (unknown reason)"
409
+ );
410
+ }
411
+ } catch (error) {
412
+ if (error instanceof AiError) throw error;
413
+ if (error instanceof OpenAI.APIError) {
414
+ const msg = error.message.toLowerCase();
415
+ if (msg.includes("maximum context")) {
416
+ throw new AiInputLengthError(`Text too long: ${error.message}`, {
417
+ cause: error
418
+ });
419
+ }
420
+ if (msg.includes("invalid schema")) {
421
+ throw new InvalidJsonSchemaError(`Invalid schema: ${error.message}`, {
422
+ cause: error
423
+ });
424
+ }
425
+ throw new AiError(`OpenAI error: ${error.message}`, { cause: error });
426
+ }
427
+ throw new AiError("OpenAI error (unknown)", {
428
+ cause: error instanceof Error ? error : void 0
429
+ });
430
+ }
431
+ }
432
+ };
433
+ }
434
+
435
+ // src/ai/parser.ts
436
+ function createLocationParser(config) {
437
+ const backend = createOpenAiBackend(config);
438
+ return {
439
+ parseLocationsFromText: (text) => parseLocationsFromText(backend, text),
440
+ parseLocationsFromHtml: (html) => parseLocationsFromHtml(backend, html),
441
+ parseLocationsFromUrl: (url) => parseLocationsFromUrl(backend, url)
442
+ };
443
+ }
444
+
445
+ export { AiError, AiInputLengthError, AiOutputLengthError, AiResponseJsonError, DEFAULT_MODEL, DEFAULT_TOKEN_LIMIT, InvalidJsonSchemaError, LOCATIONS_FUNCTIONS_NAME, LOCATIONS_MAX_OUTPUT_TOKENS, LOCATIONS_PROMPT_VERSION, LOCATIONS_SCHEMA, LOCATIONS_SYSTEM_MESSAGE, NoLocationsFoundError, SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION, SummarizeTextError, createLocationParser, createOpenAiBackend, parseLocationsFromHtml, parseLocationsFromText, parseLocationsFromUrl, summarizeText };
446
+ //# sourceMappingURL=index.js.map
447
+ //# sourceMappingURL=index.js.map