@dromney/mapthis 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +56 -0
- package/dist/ai/index.cjs +474 -0
- package/dist/ai/index.cjs.map +1 -0
- package/dist/ai/index.d.cts +117 -0
- package/dist/ai/index.d.ts +117 -0
- package/dist/ai/index.js +447 -0
- package/dist/ai/index.js.map +1 -0
- package/dist/domain-CZ-L-ntu.d.ts +163 -0
- package/dist/domain-Dc1wSTkf.d.cts +163 -0
- package/dist/errors-Bw97z_4m.d.cts +12 -0
- package/dist/errors-Bw97z_4m.d.ts +12 -0
- package/dist/generate/index.cjs +222 -0
- package/dist/generate/index.cjs.map +1 -0
- package/dist/generate/index.d.cts +140 -0
- package/dist/generate/index.d.ts +140 -0
- package/dist/generate/index.js +220 -0
- package/dist/generate/index.js.map +1 -0
- package/dist/geocoding/index.cjs +90 -0
- package/dist/geocoding/index.cjs.map +1 -0
- package/dist/geocoding/index.d.cts +36 -0
- package/dist/geocoding/index.d.ts +36 -0
- package/dist/geocoding/index.js +86 -0
- package/dist/geocoding/index.js.map +1 -0
- package/dist/index.cjs +546 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +5 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +469 -0
- package/dist/index.js.map +1 -0
- package/dist/parser-CzXzpmVv.d.cts +111 -0
- package/dist/parser-N7-fNxeu.d.ts +111 -0
- package/dist/react/index.cjs +394 -0
- package/dist/react/index.cjs.map +1 -0
- package/dist/react/index.js +383 -0
- package/dist/react/index.js.map +1 -0
- package/dist/schemas-Dy5coqXo.d.cts +484 -0
- package/dist/schemas-Dy5coqXo.d.ts +484 -0
- package/dist/scrape/index.cjs +133 -0
- package/dist/scrape/index.cjs.map +1 -0
- package/dist/scrape/index.d.cts +60 -0
- package/dist/scrape/index.d.ts +60 -0
- package/dist/scrape/index.js +125 -0
- package/dist/scrape/index.js.map +1 -0
- package/dist/search/index.cjs +76 -0
- package/dist/search/index.cjs.map +1 -0
- package/dist/search/index.d.cts +75 -0
- package/dist/search/index.d.ts +75 -0
- package/dist/search/index.js +71 -0
- package/dist/search/index.js.map +1 -0
- package/dist/types/index.cjs +215 -0
- package/dist/types/index.cjs.map +1 -0
- package/dist/types/index.d.cts +4 -0
- package/dist/types/index.d.ts +4 -0
- package/dist/types/index.js +171 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types-BhqKlq0k.d.ts +31 -0
- package/dist/types-rFjK5YcJ.d.cts +31 -0
- package/dist/utils/index.cjs +335 -0
- package/dist/utils/index.cjs.map +1 -0
- package/dist/utils/index.d.cts +363 -0
- package/dist/utils/index.d.ts +363 -0
- package/dist/utils/index.js +301 -0
- package/dist/utils/index.js.map +1 -0
- package/package.json +150 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { M as MapthisError } from '../errors-Bw97z_4m.cjs';
|
|
2
|
+
import { O as OpenAiBackend, S as SummarizedText, G as GetLocationsResponse } from '../parser-CzXzpmVv.cjs';
|
|
3
|
+
export { D as DEFAULT_MODEL, a as DEFAULT_TOKEN_LIMIT, b as GenericAIJsonRequest, L as LocationParser, c as LocationParserConfig, d as OpenAIOutput, e as OpenAiBackendConfig, f as createLocationParser, g as createOpenAiBackend } from '../parser-CzXzpmVv.cjs';
|
|
4
|
+
import { JSONSchema7Object } from 'json-schema';
|
|
5
|
+
import '../domain-Dc1wSTkf.cjs';
|
|
6
|
+
import '../schemas-Dy5coqXo.cjs';
|
|
7
|
+
import 'zod';
|
|
8
|
+
|
|
9
|
+
/**
 * Base class of the AI module's error hierarchy. It extends the package-wide
 * `MapthisError`, and every other error class declared in this file extends it.
 */
declare class AiError extends MapthisError {
|
|
10
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* The LLM returned zero locations after deduplication.
|
|
14
|
+
*/
|
|
15
|
+
declare class NoLocationsFoundError extends AiError {
|
|
16
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Summarization pipeline failed (one of the chunk calls rejected).
|
|
20
|
+
*/
|
|
21
|
+
declare class SummarizeTextError extends AiError {
|
|
22
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* A JSON schema passed to the LLM was rejected as invalid by the provider.
|
|
26
|
+
*/
|
|
27
|
+
declare class InvalidJsonSchemaError extends AiError {
|
|
28
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* The LLM response could not be parsed as JSON (typically because the output
|
|
32
|
+
* was truncated or malformed).
|
|
33
|
+
*/
|
|
34
|
+
declare class AiResponseJsonError extends AiError {
|
|
35
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* Input text exceeded the model's context window.
|
|
39
|
+
*/
|
|
40
|
+
declare class AiInputLengthError extends AiError {
|
|
41
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* The LLM hit the output token limit before completing its JSON response.
|
|
45
|
+
*/
|
|
46
|
+
declare class AiOutputLengthError extends AiError {
|
|
47
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Semantic version of the location-extraction system prompt. Stored alongside
|
|
52
|
+
* each place record in the private app so results can be re-run when the
|
|
53
|
+
* prompt is updated. Bump when you change {@link LOCATIONS_SYSTEM_MESSAGE} or
|
|
54
|
+
* {@link LOCATIONS_SCHEMA} in a way that meaningfully changes output.
|
|
55
|
+
*/
|
|
56
|
+
declare const LOCATIONS_PROMPT_VERSION = "0.0.1";
|
|
57
|
+
declare const LOCATIONS_FUNCTIONS_NAME = "get_places";
|
|
58
|
+
declare const LOCATIONS_SYSTEM_MESSAGE = "You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.\nIf the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.\nFor example, for an article about places in greece, you would not return \"greece\", but if the Acropolis was mentioned, you would return \"Acropolis, Athens, Greece\"\n\nYou ignore extraneous text, such as recommended articles, advertisments, etc.\n";
|
|
59
|
+
declare const LOCATIONS_SCHEMA: JSONSchema7Object;
|
|
60
|
+
declare const LOCATIONS_MAX_OUTPUT_TOKENS = 500;
|
|
61
|
+
/**
|
|
62
|
+
* Semantic version of the summarization prompt. Bump when meaningful changes
|
|
63
|
+
* are made to {@link SUMMARIZATION_PROMPT}.
|
|
64
|
+
*/
|
|
65
|
+
declare const SUMMARIZE_PROMPT_VERSION = "0.0.1";
|
|
66
|
+
declare const SUMMARIZATION_PROMPT = "Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.\nFor example, text of 1000 characters must output a summary less than {EXAMPLE} characters.\nThis summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.\nHere is the text:\n";
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Reduce a large text to a summary that will fit inside the model's context
|
|
70
|
+
* window alongside a known downstream prompt (e.g. the location-extraction
|
|
71
|
+
* call).
|
|
72
|
+
*
|
|
73
|
+
* Algorithm overview
|
|
74
|
+
* ------------------
|
|
75
|
+
* We need to fit both the summary and a downstream "final" prompt into the
|
|
76
|
+
* model's context window. Given:
|
|
77
|
+
*
|
|
78
|
+
* - `token_limit` — model context window (with a safety factor)
|
|
79
|
+
* - `text_tokens` — size of the input text
|
|
80
|
+
* - `summarize_prompt_tokens` — size of the summarization prompt itself
|
|
81
|
+
* - `final_prompt_tokens` — size of the downstream prompt
|
|
82
|
+
* - `final_output_tokens` — expected size of the downstream response
|
|
83
|
+
*
|
|
84
|
+
* We split the input into chunks, summarize each in parallel, and concatenate
|
|
85
|
+
* the results. The per-chunk size is chosen so that the combined summary
|
|
86
|
+
* leaves room for the downstream prompt:
|
|
87
|
+
*
|
|
88
|
+
* final_input_tokens = token_limit - final_output_tokens
|
|
89
|
+
* summary_output_tokens = final_input_tokens - final_prompt_tokens
|
|
90
|
+
* num_chunks = ceil((text_tokens + summary_output_tokens)
|
|
91
|
+
* / (token_limit - summarize_prompt_tokens))
|
|
92
|
+
* chunk_output_tokens = summary_output_tokens / num_chunks
|
|
93
|
+
* chunk_input_tokens = token_limit - summarize_prompt_tokens - chunk_output_tokens
|
|
94
|
+
*
|
|
95
|
+
* A safety factor is applied to chunk output sizes because LLMs routinely
|
|
96
|
+
* overshoot explicit length instructions.
|
|
97
|
+
*/
|
|
98
|
+
declare function summarizeText(backend: OpenAiBackend, text: string, finalPrompt?: string, finalOutputTokens?: number, chunkOutputSafetyFactor?: number, tokenLimitSafetyFactor?: number): Promise<SummarizedText>;
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Parse locations from freeform text. Two-stage: summarize to fit the context
|
|
102
|
+
* window, then a function-calling JSON completion for extraction.
|
|
103
|
+
*/
|
|
104
|
+
declare function parseLocationsFromText(backend: OpenAiBackend, text: string): Promise<GetLocationsResponse>;
|
|
105
|
+
/**
|
|
106
|
+
* Parse locations from HTML. Equivalent to `htmlToText` followed by
|
|
107
|
+
* {@link parseLocationsFromText}.
|
|
108
|
+
*/
|
|
109
|
+
declare function parseLocationsFromHtml(backend: OpenAiBackend, html: string): Promise<GetLocationsResponse>;
|
|
110
|
+
/**
|
|
111
|
+
* Parse locations from a URL. Equivalent to fetching the HTML, converting to
|
|
112
|
+
* text, and calling {@link parseLocationsFromText}. Throws any scrape errors
|
|
113
|
+
* unchanged.
|
|
114
|
+
*/
|
|
115
|
+
declare function parseLocationsFromUrl(backend: OpenAiBackend, url: string): Promise<GetLocationsResponse>;
|
|
116
|
+
|
|
117
|
+
export { AiError, AiInputLengthError, AiOutputLengthError, AiResponseJsonError, GetLocationsResponse, InvalidJsonSchemaError, LOCATIONS_FUNCTIONS_NAME, LOCATIONS_MAX_OUTPUT_TOKENS, LOCATIONS_PROMPT_VERSION, LOCATIONS_SCHEMA, LOCATIONS_SYSTEM_MESSAGE, NoLocationsFoundError, OpenAiBackend, SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION, SummarizeTextError, SummarizedText, parseLocationsFromHtml, parseLocationsFromText, parseLocationsFromUrl, summarizeText };
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import { M as MapthisError } from '../errors-Bw97z_4m.js';
|
|
2
|
+
import { O as OpenAiBackend, S as SummarizedText, G as GetLocationsResponse } from '../parser-N7-fNxeu.js';
|
|
3
|
+
export { D as DEFAULT_MODEL, a as DEFAULT_TOKEN_LIMIT, b as GenericAIJsonRequest, L as LocationParser, c as LocationParserConfig, d as OpenAIOutput, e as OpenAiBackendConfig, f as createLocationParser, g as createOpenAiBackend } from '../parser-N7-fNxeu.js';
|
|
4
|
+
import { JSONSchema7Object } from 'json-schema';
|
|
5
|
+
import '../domain-CZ-L-ntu.js';
|
|
6
|
+
import '../schemas-Dy5coqXo.js';
|
|
7
|
+
import 'zod';
|
|
8
|
+
|
|
9
|
+
/**
 * Base class of the AI module's error hierarchy. It extends the package-wide
 * `MapthisError`, and every other error class declared in this file extends it.
 */
declare class AiError extends MapthisError {
|
|
10
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
11
|
+
}
|
|
12
|
+
/**
|
|
13
|
+
* The LLM returned zero locations after deduplication.
|
|
14
|
+
*/
|
|
15
|
+
declare class NoLocationsFoundError extends AiError {
|
|
16
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Summarization pipeline failed (one of the chunk calls rejected).
|
|
20
|
+
*/
|
|
21
|
+
declare class SummarizeTextError extends AiError {
|
|
22
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* A JSON schema passed to the LLM was rejected as invalid by the provider.
|
|
26
|
+
*/
|
|
27
|
+
declare class InvalidJsonSchemaError extends AiError {
|
|
28
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* The LLM response could not be parsed as JSON (typically because the output
|
|
32
|
+
* was truncated or malformed).
|
|
33
|
+
*/
|
|
34
|
+
declare class AiResponseJsonError extends AiError {
|
|
35
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* Input text exceeded the model's context window.
|
|
39
|
+
*/
|
|
40
|
+
declare class AiInputLengthError extends AiError {
|
|
41
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* The LLM hit the output token limit before completing its JSON response.
|
|
45
|
+
*/
|
|
46
|
+
declare class AiOutputLengthError extends AiError {
|
|
47
|
+
constructor(message?: string, options?: ErrorOptions);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Semantic version of the location-extraction system prompt. Stored alongside
|
|
52
|
+
* each place record in the private app so results can be re-run when the
|
|
53
|
+
* prompt is updated. Bump when you change {@link LOCATIONS_SYSTEM_MESSAGE} or
|
|
54
|
+
* {@link LOCATIONS_SCHEMA} in a way that meaningfully changes output.
|
|
55
|
+
*/
|
|
56
|
+
declare const LOCATIONS_PROMPT_VERSION = "0.0.1";
|
|
57
|
+
declare const LOCATIONS_FUNCTIONS_NAME = "get_places";
|
|
58
|
+
declare const LOCATIONS_SYSTEM_MESSAGE = "You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.\nIf the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.\nFor example, for an article about places in greece, you would not return \"greece\", but if the Acropolis was mentioned, you would return \"Acropolis, Athens, Greece\"\n\nYou ignore extraneous text, such as recommended articles, advertisments, etc.\n";
|
|
59
|
+
declare const LOCATIONS_SCHEMA: JSONSchema7Object;
|
|
60
|
+
declare const LOCATIONS_MAX_OUTPUT_TOKENS = 500;
|
|
61
|
+
/**
|
|
62
|
+
* Semantic version of the summarization prompt. Bump when meaningful changes
|
|
63
|
+
* are made to {@link SUMMARIZATION_PROMPT}.
|
|
64
|
+
*/
|
|
65
|
+
declare const SUMMARIZE_PROMPT_VERSION = "0.0.1";
|
|
66
|
+
declare const SUMMARIZATION_PROMPT = "Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.\nFor example, text of 1000 characters must output a summary less than {EXAMPLE} characters.\nThis summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.\nHere is the text:\n";
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Reduce a large text to a summary that will fit inside the model's context
|
|
70
|
+
* window alongside a known downstream prompt (e.g. the location-extraction
|
|
71
|
+
* call).
|
|
72
|
+
*
|
|
73
|
+
* Algorithm overview
|
|
74
|
+
* ------------------
|
|
75
|
+
* We need to fit both the summary and a downstream "final" prompt into the
|
|
76
|
+
* model's context window. Given:
|
|
77
|
+
*
|
|
78
|
+
* - `token_limit` — model context window (with a safety factor)
|
|
79
|
+
* - `text_tokens` — size of the input text
|
|
80
|
+
* - `summarize_prompt_tokens` — size of the summarization prompt itself
|
|
81
|
+
* - `final_prompt_tokens` — size of the downstream prompt
|
|
82
|
+
* - `final_output_tokens` — expected size of the downstream response
|
|
83
|
+
*
|
|
84
|
+
* We split the input into chunks, summarize each in parallel, and concatenate
|
|
85
|
+
* the results. The per-chunk size is chosen so that the combined summary
|
|
86
|
+
* leaves room for the downstream prompt:
|
|
87
|
+
*
|
|
88
|
+
* final_input_tokens = token_limit - final_output_tokens
|
|
89
|
+
* summary_output_tokens = final_input_tokens - final_prompt_tokens
|
|
90
|
+
* num_chunks = ceil((text_tokens + summary_output_tokens)
|
|
91
|
+
* / (token_limit - summarize_prompt_tokens))
|
|
92
|
+
* chunk_output_tokens = summary_output_tokens / num_chunks
|
|
93
|
+
* chunk_input_tokens = token_limit - summarize_prompt_tokens - chunk_output_tokens
|
|
94
|
+
*
|
|
95
|
+
* A safety factor is applied to chunk output sizes because LLMs routinely
|
|
96
|
+
* overshoot explicit length instructions.
|
|
97
|
+
*/
|
|
98
|
+
declare function summarizeText(backend: OpenAiBackend, text: string, finalPrompt?: string, finalOutputTokens?: number, chunkOutputSafetyFactor?: number, tokenLimitSafetyFactor?: number): Promise<SummarizedText>;
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Parse locations from freeform text. Two-stage: summarize to fit the context
|
|
102
|
+
* window, then a function-calling JSON completion for extraction.
|
|
103
|
+
*/
|
|
104
|
+
declare function parseLocationsFromText(backend: OpenAiBackend, text: string): Promise<GetLocationsResponse>;
|
|
105
|
+
/**
|
|
106
|
+
* Parse locations from HTML. Equivalent to `htmlToText` followed by
|
|
107
|
+
* {@link parseLocationsFromText}.
|
|
108
|
+
*/
|
|
109
|
+
declare function parseLocationsFromHtml(backend: OpenAiBackend, html: string): Promise<GetLocationsResponse>;
|
|
110
|
+
/**
|
|
111
|
+
* Parse locations from a URL. Equivalent to fetching the HTML, converting to
|
|
112
|
+
* text, and calling {@link parseLocationsFromText}. Throws any scrape errors
|
|
113
|
+
* unchanged.
|
|
114
|
+
*/
|
|
115
|
+
declare function parseLocationsFromUrl(backend: OpenAiBackend, url: string): Promise<GetLocationsResponse>;
|
|
116
|
+
|
|
117
|
+
export { AiError, AiInputLengthError, AiOutputLengthError, AiResponseJsonError, GetLocationsResponse, InvalidJsonSchemaError, LOCATIONS_FUNCTIONS_NAME, LOCATIONS_MAX_OUTPUT_TOKENS, LOCATIONS_PROMPT_VERSION, LOCATIONS_SCHEMA, LOCATIONS_SYSTEM_MESSAGE, NoLocationsFoundError, OpenAiBackend, SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION, SummarizeTextError, SummarizedText, parseLocationsFromHtml, parseLocationsFromText, parseLocationsFromUrl, summarizeText };
|
package/dist/ai/index.js
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { convert } from 'html-to-text';
|
|
3
|
+
import { encodingForModel, getEncoding } from 'js-tiktoken';
|
|
4
|
+
import OpenAI from 'openai';
|
|
5
|
+
|
|
6
|
+
// src/types/errors.ts
|
|
7
|
+
/**
 * Root of the package's error hierarchy; the AI and scrape error families in
 * this file both derive from it.
 */
class MapthisError extends Error {
  /**
   * @param message Human-readable description, forwarded to `Error`.
   * @param options Standard `Error` options (such as `cause`), forwarded to `Error`.
   */
  constructor(message, options) {
    super(message, options);
    // Explicit name so logs show the concrete class rather than "Error".
    this.name = "MapthisError";
  }
}
|
|
13
|
+
|
|
14
|
+
// src/ai/errors.ts
|
|
15
|
+
/**
 * Base class for errors raised by the AI pipeline. The more specific AI
 * errors in this file extend it.
 */
class AiError extends MapthisError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "AiError";
  }
}
|
|
21
|
+
/**
 * Raised when location extraction yields zero locations after deduplication.
 */
class NoLocationsFoundError extends AiError {
  /**
   * @param message Optional description; falls back to a fixed default text
   *   when `null` or `undefined`.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message ?? "No locations found in input", options);
    this.name = "NoLocationsFoundError";
  }
}
|
|
27
|
+
/**
 * Raised when the summarization pipeline fails, e.g. when one of the
 * per-chunk completion calls rejects.
 */
class SummarizeTextError extends AiError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "SummarizeTextError";
  }
}
|
|
33
|
+
/**
 * Signals that a JSON schema passed to the LLM was rejected as invalid by the
 * provider.
 */
class InvalidJsonSchemaError extends AiError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "InvalidJsonSchemaError";
  }
}
|
|
39
|
+
/**
 * Signals that the LLM response could not be used as JSON, typically because
 * the output was truncated or malformed. It is also thrown when the response
 * carries no function call.
 */
class AiResponseJsonError extends AiError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "AiResponseJsonError";
  }
}
|
|
45
|
+
/**
 * Signals that the input text exceeded the model's context window.
 */
class AiInputLengthError extends AiError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "AiInputLengthError";
  }
}
|
|
51
|
+
/**
 * Signals that the LLM hit its output token limit before completing the JSON
 * response.
 */
class AiOutputLengthError extends AiError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "AiOutputLengthError";
  }
}
|
|
57
|
+
|
|
58
|
+
// src/ai/prompts.ts
|
|
59
|
+
/**
 * Version tag of the location-extraction prompt. Bump it when
 * LOCATIONS_SYSTEM_MESSAGE or LOCATIONS_SCHEMA changes in a way that
 * meaningfully changes output.
 */
var LOCATIONS_PROMPT_VERSION = "0.0.1";

/** Function name passed to the function-calling completion. */
var LOCATIONS_FUNCTIONS_NAME = "get_places";

/**
 * System message for location extraction. Stored as one entry per line; the
 * trailing empty entry produces the final newline of the original text.
 */
var LOCATIONS_SYSTEM_MESSAGE = [
  "You parse a list of place addresses, corresponding brief one-sentence descriptions, and a descriptive title (preferably directly quoted) from text. These addresses might be countries, cities, attractions, parks, bars, etc.",
  "If the text seems to be listing places that are all within a single parent place, you don't return the parent place, but you do add it to all the childrens' addresses.",
  'For example, for an article about places in greece, you would not return "greece", but if the Acropolis was mentioned, you would return "Acropolis, Athens, Greece"',
  "",
  "You ignore extraneous text, such as recommended articles, advertisments, etc.",
  ""
].join("\n");

/** JSON schema of the function-call arguments: a title plus a list of address/description pairs. */
var LOCATIONS_SCHEMA = {
  type: "object",
  required: ["locations", "title"],
  properties: {
    locations: {
      type: "array",
      items: {
        type: "object",
        required: ["address", "description"],
        properties: {
          address: { type: "string" },
          description: { type: "string" }
        }
      }
    },
    title: { type: "string" }
  }
};

/** Output-token allowance reserved for the location-extraction response. */
var LOCATIONS_MAX_OUTPUT_TOKENS = 500;

/** Version tag of the summarization prompt. Bump it when SUMMARIZATION_PROMPT meaningfully changes. */
var SUMMARIZE_PROMPT_VERSION = "0.0.1";

/**
 * Summarization prompt template. `{RATIO}` and `{EXAMPLE}` are placeholders
 * substituted by the summarizer before the chunk text is appended.
 */
var SUMMARIZATION_PROMPT = [
  "Summarize the given text. The summary must reduce the number of characters from the original text by at least a factor of {RATIO}.",
  "For example, text of 1000 characters must output a summary less than {EXAMPLE} characters.",
  "This summary will be used to extract a list of locations and addresses, so keep any text that seems to be a list of locations/places, a location and paragraph describing it, or otherwise clearly a location name or address.",
  "Here is the text:",
  ""
].join("\n");
|
|
92
|
+
|
|
93
|
+
// src/scrape/errors.ts
|
|
94
|
+
/**
 * Base class for errors raised while fetching or converting web pages. It is
 * also thrown directly for non-OK HTTP responses.
 */
class ScrapeError extends MapthisError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "ScrapeError";
  }
}
|
|
100
|
+
/**
 * Raised when a URL fails validation or when `fetch` rejects with a
 * `TypeError` for it.
 */
class InvalidUrlError extends ScrapeError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "InvalidUrlError";
  }
}
|
|
106
|
+
/**
 * Raised when the target site answers with HTTP 401 or 403.
 */
class HtmlUnauthorizedError extends ScrapeError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "HtmlUnauthorizedError";
  }
}
|
|
112
|
+
/**
 * Raised when converting HTML to plain text fails.
 */
class HtmlToTextError extends ScrapeError {
  /**
   * @param message Human-readable description.
   * @param options Standard `Error` options (such as `cause`).
   */
  constructor(message, options) {
    super(message, options);
    this.name = "HtmlToTextError";
  }
}
|
|
118
|
+
|
|
119
|
+
// src/scrape/html.ts
|
|
120
|
+
/**
 * Ensure a URL string carries an explicit HTTP(S) scheme.
 *
 * @param url URL with or without a leading `http://` / `https://`.
 * @returns `url` unchanged when it already starts with `http://` or
 *   `https://` (case-insensitive), otherwise `url` prefixed with `https://`.
 */
var withHttps = (url) => {
  // A single test covers both schemes. The former extra `/^http?:\/\//` check
  // made the "p" optional, so it also accepted the bogus scheme "htt://".
  const hasScheme = /^https?:\/\//i.test(url);
  return hasScheme ? url : `https://${url}`;
};
|
|
126
|
+
/**
 * Fetch the raw HTML of a page.
 *
 * @param url Page address; `https://` is prepended when no HTTP(S) scheme is present.
 * @returns The response body as text.
 * @throws InvalidUrlError when the URL fails validation or `fetch` rejects
 *   with a `TypeError`.
 * @throws HtmlUnauthorizedError when the server answers 401 or 403.
 * @throws ScrapeError for any other non-OK response.
 */
async function getHtmlFromUrl(url) {
  // The URL is deliberately NOT lowercased: only the scheme and host are
  // case-insensitive, while paths and query strings are case-sensitive, so
  // lowercasing the whole string could request a different resource.
  const urlFixed = withHttps(url);
  const isUrl = z.string().url().safeParse(urlFixed);
  if (!isUrl.success) throw new InvalidUrlError(`Invalid URL: ${url}`);
  try {
    const response = await fetch(urlFixed);
    const html = await response.text();
    if (response.status === 403 || response.status === 401) {
      throw new HtmlUnauthorizedError(
        `${response.status} Not authorized to scrape this website`
      );
    }
    if (!response.ok) {
      throw new ScrapeError(
        `Bad response when getting HTML. Status: ${response.status} ${response.statusText}`
      );
    }
    return html;
  } catch (error) {
    if (error instanceof TypeError) {
      // Keep the underlying failure reachable through `cause`.
      throw new InvalidUrlError(`Invalid URL. Original error: ${error.message}`, {
        cause: error
      });
    }
    // Scrape errors thrown above (and anything else) propagate unchanged.
    throw error;
  }
}
|
|
151
|
+
/**
 * CSS selectors whose content is dropped during HTML-to-text conversion.
 * The groups are flattened in their original order.
 */
var SKIP_SELECTORS = [
  // Media, page chrome, navigation and figures.
  ...["img", "header", "footer", "audio", "button", "canvas", "code", "nav", "#nav", "figure", "figcaption"],
  // Comment sections.
  ...[".comment", ".comments", "#comments"],
  // "Related" sections.
  ...["#related-posts", "#related", ".related", ".related-posts"]
];
|
|
171
|
+
/**
 * Convert an HTML document to plain text.
 *
 * Link targets are omitted, every selector in `SKIP_SELECTORS` is skipped, and
 * whitespace is normalized afterwards: CRLF becomes LF, runs of three or more
 * newlines shrink to one blank line, and runs of spaces shrink to one space.
 *
 * @param html Raw HTML markup.
 * @returns The normalized plain text.
 * @throws HtmlToTextError when the conversion throws.
 */
function htmlToText(html) {
  try {
    const converted = convert(html, {
      wordwrap: false,
      selectors: [
        {
          selector: "a",
          options: {
            ignoreHref: true,
            hideLinkHrefIfSameAsText: true
          }
        },
        ...SKIP_SELECTORS.map((tag) => ({ selector: tag, format: "skip" }))
      ]
    });
    // Global regexes collapse runs of any length in a single pass. The former
    // fixed 20-iteration replaceAll loop could leave very long runs partly
    // collapsed, and its space replacement was a no-op as written.
    return converted
      .replaceAll("\r\n", "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/ {2,}/g, " ");
  } catch (error) {
    if (error instanceof Error) {
      throw new HtmlToTextError(`HTML to text conversion failed: ${error.message}`, {
        cause: error
      });
    }
    throw new HtmlToTextError("HTML to text conversion failed");
  }
}
|
|
201
|
+
|
|
202
|
+
// src/ai/summarize.ts
|
|
203
|
+
/**
 * Summarize `text` in parallel chunks so that the combined summary fits into
 * the model's context window together with a downstream prompt.
 *
 * @param backend Object providing `tokenLimit`, `countTokens(text)` and
 *   `chatCompletion(content)`.
 * @param text Text to summarize; it is split on newlines into chunks.
 * @param finalPrompt Downstream prompt; only its token count is used here.
 * @param finalOutputTokens Tokens reserved for the downstream response.
 * @param chunkOutputSafetyFactor Fraction shaved off each chunk's output budget.
 * @param tokenLimitSafetyFactor Fraction shaved off the backend's token limit.
 * @returns The original text, the joined summary, their token counts, the
 *   summed input/output token usage, the chunk count and the prompt version.
 * @throws SummarizeTextError when the token budgets leave no room for a chunk
 *   or when any chunk completion fails.
 */
async function summarizeText(backend, text, finalPrompt = "", finalOutputTokens = 0, chunkOutputSafetyFactor = 0.3, tokenLimitSafetyFactor = 0.02) {
  const tokenLimit = backend.tokenLimit * (1 - tokenLimitSafetyFactor);
  const textTokens = backend.countTokens(text);
  const summarizePromptTokens = backend.countTokens(SUMMARIZATION_PROMPT);
  const finalPromptTokens = backend.countTokens(finalPrompt);
  const finalInputTokens = tokenLimit - finalOutputTokens;
  const summaryOutputTokens = finalInputTokens - finalPromptTokens;
  const numChunks = Math.max(
    1,
    Math.ceil((textTokens + summaryOutputTokens) / (tokenLimit - summarizePromptTokens))
  );
  const chunkOutputTokens = Math.floor(
    summaryOutputTokens / numChunks * (1 - chunkOutputSafetyFactor)
  );
  const chunkInputTokens = Math.floor(tokenLimit - summarizePromptTokens - chunkOutputTokens);
  // With a non-positive budget every line would count as oversized and be cut
  // into one-character chunks, so fail early with a clear message instead.
  if (chunkInputTokens <= 0 || chunkOutputTokens <= 0) {
    throw new SummarizeTextError(
      `Failed to summarize text: token limit ${backend.tokenLimit} leaves no room for summarization chunks`
    );
  }

  const chunks = [];
  let currentChunk = "";
  let currentChunkTokens = 0;
  const nextChunk = () => {
    if (currentChunk) chunks.push(currentChunk);
    currentChunk = "";
    currentChunkTokens = 0;
  };
  for (const line of text.split("\n")) {
    const lineTokens = backend.countTokens(line);
    // +1 presumably accounts for the newline appended to each line below.
    const numLineTokens = lineTokens + 1;
    if (numLineTokens > chunkInputTokens) {
      // A single line larger than a chunk is cut by characters, using the
      // line's own characters-per-token average.
      nextChunk();
      const charsPerToken = line.length / Math.max(1, lineTokens);
      const charsPerChunk = Math.max(1, Math.floor(chunkInputTokens * charsPerToken));
      for (let i = 0; i < line.length; i += charsPerChunk) {
        chunks.push(line.slice(i, i + charsPerChunk));
      }
      continue;
    }
    if (currentChunkTokens + numLineTokens > chunkInputTokens) {
      nextChunk();
    }
    currentChunkTokens += numLineTokens;
    currentChunk += line + "\n";
  }
  nextChunk();

  try {
    // The prompt asks to reduce the text "by at least a factor of {RATIO}", so
    // the factor is input/output and never below 1. The former output/input
    // ratio told the model it could emit more than 1000 characters for 1000
    // characters of input whenever real reduction was needed.
    const reductionFactor = Math.max(1, chunkInputTokens / chunkOutputTokens);
    const roundedFactor = Math.ceil(reductionFactor * 10) / 10;
    const ratioText = roundedFactor.toFixed(1);
    const exampleText = Math.floor(1e3 / roundedFactor).toFixed(0);
    const summaryPrompt = SUMMARIZATION_PROMPT.replace("{RATIO}", ratioText).replace(
      "{EXAMPLE}",
      exampleText
    );
    const responses = await Promise.all(
      chunks.map((chunk) => backend.chatCompletion(summaryPrompt + chunk))
    );
    let inputTokens = 0;
    let outputTokens = 0;
    for (const response of responses) {
      inputTokens += response.inputTokens;
      outputTokens += response.outputTokens;
    }
    const summarized = responses.map((response) => response.output).join("\n");
    const summarizedTokens = backend.countTokens(summarized);
    return {
      text,
      textTokens,
      summarized,
      summarizedTokens,
      inputTokens,
      outputTokens,
      chunks: chunks.length,
      summarizePromptVersion: SUMMARIZE_PROMPT_VERSION
    };
  } catch (error) {
    if (error instanceof Error) {
      throw new SummarizeTextError(`Failed to summarize text: ${error.message}`, {
        cause: error
      });
    }
    throw new SummarizeTextError("Failed to summarize text (unknown error)");
  }
}
|
|
284
|
+
|
|
285
|
+
// src/ai/locations.ts
|
|
286
|
+
// Concatenation of everything sent with the extraction call besides the text
// itself. It is passed to summarizeText as `finalPrompt`, where only its token
// count is used.
var FINAL_PROMPT = [
  LOCATIONS_SYSTEM_MESSAGE,
  JSON.stringify(LOCATIONS_SCHEMA),
  LOCATIONS_FUNCTIONS_NAME
].join("");
|
|
287
|
+
/**
 * Run the function-calling extraction on `text` and drop duplicate addresses.
 *
 * @param backend Object providing `chatFunctionJson(...)`.
 * @param text Text (normally the summary) to extract locations from.
 * @returns The backend result, with `output.locations` replaced by the
 *   deduplicated list (first occurrence of each address wins).
 * @throws NoLocationsFoundError when no location remains after deduplication.
 */
async function callLocationExtraction(backend, text) {
  const result = await backend.chatFunctionJson(
    LOCATIONS_SYSTEM_MESSAGE,
    text,
    LOCATIONS_SCHEMA,
    LOCATIONS_FUNCTIONS_NAME
  );
  const knownAddresses = new Set();
  const deduplicated = [];
  for (const location of result.output.locations) {
    if (knownAddresses.has(location.address)) continue;
    knownAddresses.add(location.address);
    deduplicated.push(location);
  }
  if (deduplicated.length === 0) throw new NoLocationsFoundError();
  // The backend result is updated in place, as before.
  result.output.locations = deduplicated;
  return result;
}
|
|
304
|
+
/**
 * Extract locations from free-form text in two stages: summarize the text to
 * fit the context window, then run the function-calling extraction on the
 * summary.
 *
 * @param backend Backend used for both stages.
 * @param text Free-form input text.
 * @returns The extraction output plus per-stage and total token usage, the
 *   input and summary token counts, and both prompt versions.
 */
async function parseLocationsFromText(backend, text) {
  const summary = await summarizeText(backend, text, FINAL_PROMPT, LOCATIONS_MAX_OUTPUT_TOKENS);
  const extraction = await callLocationExtraction(backend, summary.summarized);
  return {
    output: extraction.output,
    locationInputTokens: extraction.inputTokens,
    locationOutputTokens: extraction.outputTokens,
    summaryInputTokens: summary.inputTokens,
    summaryOutputTokens: summary.outputTokens,
    // Totals across both stages.
    inputTokens: summary.inputTokens + extraction.inputTokens,
    outputTokens: summary.outputTokens + extraction.outputTokens,
    textTokens: summary.textTokens,
    summarizedTokens: summary.summarizedTokens,
    summaryPromptVersion: summary.summarizePromptVersion,
    locationPromptVersion: LOCATIONS_PROMPT_VERSION
  };
}
|
|
332
|
+
/**
 * Extract locations from HTML: convert the markup to plain text, then
 * delegate to parseLocationsFromText.
 *
 * @param backend Backend used for summarization and extraction.
 * @param html Raw HTML markup.
 * @returns The result of parseLocationsFromText for the converted text.
 */
async function parseLocationsFromHtml(backend, html) {
  const plainText = htmlToText(html);
  return parseLocationsFromText(backend, plainText);
}
|
|
335
|
+
/**
 * Extract locations from a web page: fetch its HTML, then delegate to
 * parseLocationsFromHtml. Fetch errors propagate unchanged.
 *
 * @param backend Backend used for summarization and extraction.
 * @param url Address of the page to fetch.
 * @returns The result of parseLocationsFromHtml for the fetched markup.
 */
async function parseLocationsFromUrl(backend, url) {
  return parseLocationsFromHtml(backend, await getHtmlFromUrl(url));
}
|
|
339
|
+
// Model name createOpenAiBackend uses when `config.model` is null or undefined.
var DEFAULT_MODEL = "gpt-4o-mini";
|
|
340
|
+
// Token limit createOpenAiBackend exposes when `config.tokenLimit` is null or undefined.
var DEFAULT_TOKEN_LIMIT = 128000;
|
|
341
|
+
/**
 * Picks the tokenizer encoding for `model`.
 *
 * @param model - Model name passed to `encodingForModel`.
 * @returns The encoding from `encodingForModel`, or the "cl100k_base" encoding
 *   when that call throws (presumably for model names it does not recognize).
 */
function resolveEncoding(model) {
  let encoding;
  try {
    encoding = encodingForModel(model);
  } catch {
    encoding = getEncoding("cl100k_base");
  }
  return encoding;
}
|
|
348
|
+
function createOpenAiBackend(config) {
|
|
349
|
+
const model = config.model ?? DEFAULT_MODEL;
|
|
350
|
+
const tokenLimit = config.tokenLimit ?? DEFAULT_TOKEN_LIMIT;
|
|
351
|
+
const openai = new OpenAI({
|
|
352
|
+
apiKey: config.apiKey,
|
|
353
|
+
...config.baseURL ? { baseURL: config.baseURL } : {}
|
|
354
|
+
});
|
|
355
|
+
const encoding = resolveEncoding(model);
|
|
356
|
+
return {
|
|
357
|
+
model,
|
|
358
|
+
tokenLimit,
|
|
359
|
+
countTokens(text) {
|
|
360
|
+
return encoding.encode(text).length;
|
|
361
|
+
},
|
|
362
|
+
async chatCompletion(content) {
|
|
363
|
+
const response = await openai.chat.completions.create({
|
|
364
|
+
messages: [{ role: "user", content }],
|
|
365
|
+
model
|
|
366
|
+
});
|
|
367
|
+
return {
|
|
368
|
+
output: response.choices[0]?.message.content ?? "",
|
|
369
|
+
inputTokens: response.usage?.prompt_tokens ?? 0,
|
|
370
|
+
outputTokens: response.usage?.completion_tokens ?? 0
|
|
371
|
+
};
|
|
372
|
+
},
|
|
373
|
+
async chatFunctionJson(systemMessage, userMessage, schema, functionName) {
|
|
374
|
+
try {
|
|
375
|
+
const completion = await openai.chat.completions.create({
|
|
376
|
+
model,
|
|
377
|
+
messages: [
|
|
378
|
+
{ role: "system", content: systemMessage },
|
|
379
|
+
{ role: "user", content: userMessage }
|
|
380
|
+
],
|
|
381
|
+
functions: [{ name: functionName, parameters: schema }],
|
|
382
|
+
function_call: { name: functionName }
|
|
383
|
+
});
|
|
384
|
+
const functionCall = completion.choices[0]?.message.function_call;
|
|
385
|
+
if (!functionCall) {
|
|
386
|
+
throw new AiResponseJsonError("No function call in OpenAI response");
|
|
387
|
+
}
|
|
388
|
+
const inputTokens = completion.usage?.prompt_tokens ?? 0;
|
|
389
|
+
const outputTokens = completion.usage?.completion_tokens ?? 0;
|
|
390
|
+
try {
|
|
391
|
+
const output = JSON.parse(functionCall.arguments);
|
|
392
|
+
return { output, inputTokens, outputTokens };
|
|
393
|
+
} catch (error) {
|
|
394
|
+
if (error instanceof SyntaxError) {
|
|
395
|
+
const msg = error.message.toLowerCase();
|
|
396
|
+
if (msg.includes("unexpected end of json input") || msg.includes("unterminated string in js")) {
|
|
397
|
+
throw new AiOutputLengthError(
|
|
398
|
+
`Output tokens exceeded: ${error.message}`,
|
|
399
|
+
{ cause: error }
|
|
400
|
+
);
|
|
401
|
+
}
|
|
402
|
+
throw new AiResponseJsonError(
|
|
403
|
+
`Failed to parse JSON from OpenAI response: ${error.message}`,
|
|
404
|
+
{ cause: error }
|
|
405
|
+
);
|
|
406
|
+
}
|
|
407
|
+
throw new AiResponseJsonError(
|
|
408
|
+
"Failed to parse JSON from OpenAI response (unknown reason)"
|
|
409
|
+
);
|
|
410
|
+
}
|
|
411
|
+
} catch (error) {
|
|
412
|
+
if (error instanceof AiError) throw error;
|
|
413
|
+
if (error instanceof OpenAI.APIError) {
|
|
414
|
+
const msg = error.message.toLowerCase();
|
|
415
|
+
if (msg.includes("maximum context")) {
|
|
416
|
+
throw new AiInputLengthError(`Text too long: ${error.message}`, {
|
|
417
|
+
cause: error
|
|
418
|
+
});
|
|
419
|
+
}
|
|
420
|
+
if (msg.includes("invalid schema")) {
|
|
421
|
+
throw new InvalidJsonSchemaError(`Invalid schema: ${error.message}`, {
|
|
422
|
+
cause: error
|
|
423
|
+
});
|
|
424
|
+
}
|
|
425
|
+
throw new AiError(`OpenAI error: ${error.message}`, { cause: error });
|
|
426
|
+
}
|
|
427
|
+
throw new AiError("OpenAI error (unknown)", {
|
|
428
|
+
cause: error instanceof Error ? error : void 0
|
|
429
|
+
});
|
|
430
|
+
}
|
|
431
|
+
}
|
|
432
|
+
};
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
// src/ai/parser.ts
|
|
436
|
+
/**
 * Creates an OpenAI backend from `config` and returns the three parse entry
 * points pre-bound to it.
 *
 * @param config - Passed unchanged to `createOpenAiBackend`.
 * @returns An object with `parseLocationsFromText`, `parseLocationsFromHtml`
 *   and `parseLocationsFromUrl`, each taking only the input value.
 */
function createLocationParser(config) {
  const backend = createOpenAiBackend(config);
  const parser = {
    parseLocationsFromText(text) {
      return parseLocationsFromText(backend, text);
    },
    parseLocationsFromHtml(html) {
      return parseLocationsFromHtml(backend, html);
    },
    parseLocationsFromUrl(url) {
      return parseLocationsFromUrl(backend, url);
    }
  };
  return parser;
}
|
|
444
|
+
|
|
445
|
+
export { AiError, AiInputLengthError, AiOutputLengthError, AiResponseJsonError, DEFAULT_MODEL, DEFAULT_TOKEN_LIMIT, InvalidJsonSchemaError, LOCATIONS_FUNCTIONS_NAME, LOCATIONS_MAX_OUTPUT_TOKENS, LOCATIONS_PROMPT_VERSION, LOCATIONS_SCHEMA, LOCATIONS_SYSTEM_MESSAGE, NoLocationsFoundError, SUMMARIZATION_PROMPT, SUMMARIZE_PROMPT_VERSION, SummarizeTextError, createLocationParser, createOpenAiBackend, parseLocationsFromHtml, parseLocationsFromText, parseLocationsFromUrl, summarizeText };
|
|
446
|
+
//# sourceMappingURL=index.js.map
|
|
447
|
+
//# sourceMappingURL=index.js.map
|