@oh-my-pi/pi-coding-agent 13.3.6 → 13.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +115 -0
- package/package.json +9 -18
- package/scripts/format-prompts.ts +7 -172
- package/src/capability/mcp.ts +5 -0
- package/src/cli/args.ts +1 -0
- package/src/config/prompt-templates.ts +9 -55
- package/src/config/settings-schema.ts +24 -0
- package/src/discovery/builtin.ts +1 -0
- package/src/discovery/codex.ts +1 -2
- package/src/discovery/helpers.ts +0 -5
- package/src/discovery/mcp-json.ts +2 -0
- package/src/internal-urls/docs-index.generated.ts +1 -1
- package/src/lsp/client.ts +8 -0
- package/src/lsp/config.ts +2 -3
- package/src/lsp/index.ts +379 -99
- package/src/lsp/render.ts +21 -31
- package/src/lsp/types.ts +21 -8
- package/src/lsp/utils.ts +193 -1
- package/src/mcp/config-writer.ts +3 -0
- package/src/mcp/config.ts +1 -0
- package/src/mcp/oauth-flow.ts +3 -1
- package/src/mcp/types.ts +5 -0
- package/src/modes/components/settings-defs.ts +9 -0
- package/src/modes/components/status-line.ts +1 -1
- package/src/modes/controllers/mcp-command-controller.ts +6 -2
- package/src/modes/interactive-mode.ts +8 -1
- package/src/modes/theme/mermaid-cache.ts +4 -4
- package/src/modes/theme/theme.ts +33 -0
- package/src/prompts/system/custom-system-prompt.md +0 -10
- package/src/prompts/system/subagent-user-prompt.md +2 -0
- package/src/prompts/system/system-prompt.md +12 -9
- package/src/prompts/tools/ast-find.md +20 -0
- package/src/prompts/tools/ast-replace.md +21 -0
- package/src/prompts/tools/bash.md +2 -0
- package/src/prompts/tools/hashline.md +26 -8
- package/src/prompts/tools/lsp.md +22 -5
- package/src/prompts/tools/task.md +0 -1
- package/src/sdk.ts +11 -5
- package/src/session/agent-session.ts +293 -83
- package/src/system-prompt.ts +3 -34
- package/src/task/executor.ts +8 -7
- package/src/task/index.ts +8 -55
- package/src/task/template.ts +2 -4
- package/src/task/types.ts +0 -5
- package/src/task/worktree.ts +6 -2
- package/src/tools/ast-find.ts +316 -0
- package/src/tools/ast-replace.ts +294 -0
- package/src/tools/bash.ts +2 -1
- package/src/tools/browser.ts +2 -8
- package/src/tools/fetch.ts +55 -18
- package/src/tools/index.ts +8 -0
- package/src/tools/jtd-to-json-schema.ts +29 -13
- package/src/tools/path-utils.ts +34 -0
- package/src/tools/python.ts +2 -1
- package/src/tools/renderers.ts +4 -0
- package/src/tools/ssh.ts +2 -1
- package/src/tools/submit-result.ts +143 -44
- package/src/tools/todo-write.ts +34 -0
- package/src/tools/tool-timeouts.ts +29 -0
- package/src/utils/mime.ts +37 -14
- package/src/utils/prompt-format.ts +172 -0
- package/src/web/scrapers/arxiv.ts +12 -12
- package/src/web/scrapers/go-pkg.ts +2 -2
- package/src/web/scrapers/iacr.ts +17 -9
- package/src/web/scrapers/readthedocs.ts +3 -3
- package/src/web/scrapers/twitter.ts +11 -11
- package/src/web/scrapers/wikipedia.ts +4 -5
- package/src/utils/ignore-files.ts +0 -119
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
* Subagents must call this tool to finish and return structured JSON output.
|
|
5
5
|
*/
|
|
6
6
|
import type { AgentTool, AgentToolContext, AgentToolResult, AgentToolUpdateCallback } from "@oh-my-pi/pi-agent-core";
|
|
7
|
+
import { enforceStrictSchema, sanitizeSchemaForStrictMode } from "@oh-my-pi/pi-ai/utils/typebox-helpers";
|
|
7
8
|
import type { Static, TSchema } from "@sinclair/typebox";
|
|
8
9
|
import { Type } from "@sinclair/typebox";
|
|
9
10
|
import Ajv, { type ErrorObject, type ValidateFunction } from "ajv";
|
|
@@ -51,6 +52,53 @@ function formatAjvErrors(errors: ErrorObject[] | null | undefined): string {
|
|
|
51
52
|
.join("; ");
|
|
52
53
|
}
|
|
53
54
|
|
|
55
|
+
/**
 * Resolve local `$ref` references in a JSON Schema by inlining definitions.
 *
 * Definitions are read from the root-level `$defs` object (or, when that is
 * absent, the root-level `definitions` object). Only references of the form
 * `#/$defs/<name>` or `#/definitions/<name>` are inlined; any other `$ref`
 * (including one whose target is unknown) is left untouched.
 *
 * `$defs`/`definitions` keywords are removed from the output at every schema
 * level, but entries of name-keyed maps such as `properties` are never
 * dropped, even if a property happens to be called `definitions` or `$defs`.
 *
 * Keywords that sit next to a `$ref` (for example `description`) are kept and
 * take precedence over the same keyword in the inlined definition.
 *
 * A definition that (directly or indirectly) references itself is cut at the
 * point of recursion: the recursive reference is replaced by its sibling
 * keywords only (an empty schema when there are none).
 *
 * @param schema JSON Schema object to process.
 * @returns The same object when there are no usable root definitions,
 *          otherwise a new object with references inlined.
 */
function resolveSchemaRefs(schema: Record<string, unknown>): Record<string, unknown> {
	const isPlainObject = (value: unknown): value is Record<string, unknown> =>
		value !== null && typeof value === "object" && !Array.isArray(value);

	// A Map (not a plain object) so that names such as "toString" or
	// "constructor" never resolve to members inherited from Object.prototype.
	const defs = new Map<string, Record<string, unknown>>();
	const defsObj = schema.$defs ?? schema.definitions;
	if (isPlainObject(defsObj)) {
		for (const [name, def] of Object.entries(defsObj)) {
			if (isPlainObject(def)) defs.set(name, def);
		}
	}
	if (defs.size === 0) return schema;

	// Keywords whose value maps arbitrary names to subschemas. Their keys are
	// user-chosen names, not schema keywords, so they must never be filtered.
	const nameKeyedKeywords = new Set(["properties", "patternProperties", "dependentSchemas"]);
	const localRef = /^#\/(?:\$defs|definitions)\/(.+)$/;
	// Names currently being expanded; used to break reference cycles.
	const inlining = new Set<string>();

	/** Copy a schema object's keywords, inlining nested refs and dropping definition tables. */
	function inlineKeywords(obj: Record<string, unknown>, omitRef: boolean): Record<string, unknown> {
		const result: Record<string, unknown> = {};
		for (const [key, value] of Object.entries(obj)) {
			if (key === "$defs" || key === "definitions") continue;
			if (omitRef && key === "$ref") continue;
			if (nameKeyedKeywords.has(key) && isPlainObject(value)) {
				const named: Record<string, unknown> = {};
				for (const [name, subschema] of Object.entries(value)) {
					named[name] = inline(subschema);
				}
				result[key] = named;
				continue;
			}
			result[key] = inline(value);
		}
		return result;
	}

	function inline(node: unknown): unknown {
		if (node === null || typeof node !== "object") return node;
		if (Array.isArray(node)) return node.map(item => inline(item));
		const obj = node as Record<string, unknown>;
		const ref = obj.$ref;
		if (typeof ref === "string") {
			const name = localRef.exec(ref)?.[1];
			const def = name === undefined ? undefined : defs.get(name);
			if (name !== undefined && def !== undefined) {
				const siblings = inlineKeywords(obj, true);
				// Recursive reference: stop expanding, keep only the local keywords.
				if (inlining.has(name)) return siblings;
				inlining.add(name);
				try {
					const resolved = inline(def) as Record<string, unknown>;
					return { ...resolved, ...siblings };
				} finally {
					inlining.delete(name);
				}
			}
		}
		return inlineKeywords(obj, false);
	}

	return inline(schema) as Record<string, unknown>;
}
|
|
101
|
+
|
|
54
102
|
export class SubmitResultTool implements AgentTool<TSchema, SubmitResultDetails> {
|
|
55
103
|
readonly name = "submit_result";
|
|
56
104
|
readonly label = "Submit Result";
|
|
@@ -58,51 +106,96 @@ export class SubmitResultTool implements AgentTool<TSchema, SubmitResultDetails>
|
|
|
58
106
|
"Finish the task with structured JSON output. Call exactly once at the end of the task.\n\n" +
|
|
59
107
|
"If you cannot complete the task, call with an error message payload.";
|
|
60
108
|
readonly parameters: TSchema;
|
|
61
|
-
|
|
109
|
+
strict = true;
|
|
110
|
+
lenientArgValidation = true;
|
|
62
111
|
|
|
63
112
|
readonly #validate?: ValidateFunction;
|
|
64
|
-
|
|
113
|
+
#schemaValidationFailures = 0;
|
|
65
114
|
|
|
66
115
|
constructor(session: ToolSession) {
|
|
67
|
-
const
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
116
|
+
const createParameters = (dataSchema: TSchema): TSchema =>
|
|
117
|
+
Type.Object(
|
|
118
|
+
{
|
|
119
|
+
result: Type.Union([
|
|
120
|
+
Type.Object({ data: dataSchema }, { description: "Successfully completed the task" }),
|
|
121
|
+
Type.Object({
|
|
122
|
+
error: Type.String({ description: "Error message when the task cannot be completed" }),
|
|
123
|
+
}),
|
|
124
|
+
]),
|
|
125
|
+
},
|
|
126
|
+
{
|
|
127
|
+
additionalProperties: false,
|
|
128
|
+
description: "Submit either `data` for success or `error` for failure",
|
|
129
|
+
},
|
|
130
|
+
) as TSchema;
|
|
131
|
+
|
|
132
|
+
let validate: ValidateFunction | undefined;
|
|
133
|
+
let dataSchema: TSchema;
|
|
134
|
+
let parameters: TSchema;
|
|
135
|
+
let strict = true;
|
|
136
|
+
|
|
137
|
+
try {
|
|
138
|
+
const schemaResult = normalizeSchema(session.outputSchema);
|
|
139
|
+
// Convert JTD to JSON Schema if needed (auto-detected)
|
|
140
|
+
const normalizedSchema =
|
|
141
|
+
schemaResult.normalized !== undefined ? jtdToJsonSchema(schemaResult.normalized) : undefined;
|
|
142
|
+
let schemaError = schemaResult.error;
|
|
143
|
+
|
|
144
|
+
if (!schemaError && normalizedSchema === false) {
|
|
145
|
+
schemaError = "boolean false schema rejects all outputs";
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
if (normalizedSchema !== undefined && normalizedSchema !== false && !schemaError) {
|
|
149
|
+
try {
|
|
150
|
+
validate = ajv.compile(normalizedSchema as Record<string, unknown> | boolean);
|
|
151
|
+
} catch (err) {
|
|
152
|
+
schemaError = err instanceof Error ? err.message : String(err);
|
|
153
|
+
}
|
|
78
154
|
}
|
|
155
|
+
|
|
156
|
+
const schemaHint = formatSchema(normalizedSchema ?? session.outputSchema);
|
|
157
|
+
const schemaDescription = schemaError
|
|
158
|
+
? `Structured JSON output (output schema invalid; accepting unconstrained object): ${schemaError}`
|
|
159
|
+
: `Structured output matching the schema:\n${schemaHint}`;
|
|
160
|
+
const sanitizedSchema =
|
|
161
|
+
!schemaError &&
|
|
162
|
+
normalizedSchema != null &&
|
|
163
|
+
typeof normalizedSchema === "object" &&
|
|
164
|
+
!Array.isArray(normalizedSchema)
|
|
165
|
+
? sanitizeSchemaForStrictMode(normalizedSchema as Record<string, unknown>)
|
|
166
|
+
: !schemaError && normalizedSchema === true
|
|
167
|
+
? {}
|
|
168
|
+
: undefined;
|
|
169
|
+
|
|
170
|
+
if (sanitizedSchema !== undefined) {
|
|
171
|
+
const resolved = resolveSchemaRefs({
|
|
172
|
+
...sanitizedSchema,
|
|
173
|
+
description: schemaDescription,
|
|
174
|
+
});
|
|
175
|
+
dataSchema = Type.Unsafe(resolved);
|
|
176
|
+
} else {
|
|
177
|
+
dataSchema = Type.Record(Type.String(), Type.Any(), {
|
|
178
|
+
description: schemaError ? schemaDescription : "Structured JSON output (no schema specified)",
|
|
179
|
+
});
|
|
180
|
+
}
|
|
181
|
+
parameters = createParameters(dataSchema);
|
|
182
|
+
const strictParameters = enforceStrictSchema(parameters as unknown as Record<string, unknown>);
|
|
183
|
+
JSON.stringify(strictParameters);
|
|
184
|
+
// Verify the final parameters compile with AJV (catches unresolved $ref, etc.)
|
|
185
|
+
ajv.compile(parameters as Record<string, unknown>);
|
|
186
|
+
} catch (err) {
|
|
187
|
+
const errorMsg = err instanceof Error ? err.message : String(err);
|
|
188
|
+
dataSchema = Type.Record(Type.String(), Type.Any(), {
|
|
189
|
+
description: `Structured JSON output (schema processing failed: ${errorMsg})`,
|
|
190
|
+
});
|
|
191
|
+
parameters = createParameters(dataSchema);
|
|
192
|
+
validate = undefined;
|
|
193
|
+
strict = false;
|
|
79
194
|
}
|
|
80
195
|
|
|
81
|
-
this.#
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
// Use actual schema if provided, otherwise fall back to Type.Any
|
|
86
|
-
// Merge description into the JSON schema for better tool documentation
|
|
87
|
-
const dataSchema = normalizedSchema
|
|
88
|
-
? Type.Unsafe({
|
|
89
|
-
...(normalizedSchema as object),
|
|
90
|
-
description: `Structured output matching the schema:\n${schemaHint}`,
|
|
91
|
-
})
|
|
92
|
-
: Type.Record(Type.String(), Type.Any(), { description: "Structured JSON output (no schema specified)" });
|
|
93
|
-
|
|
94
|
-
this.parameters = Type.Object(
|
|
95
|
-
{
|
|
96
|
-
result: Type.Union([
|
|
97
|
-
Type.Object({ data: dataSchema }, { description: "Successfully completed the task" }),
|
|
98
|
-
Type.Object({ error: Type.String({ description: "Error message when the task cannot be completed" }) }),
|
|
99
|
-
]),
|
|
100
|
-
},
|
|
101
|
-
{
|
|
102
|
-
additionalProperties: false,
|
|
103
|
-
description: "Submit either `data` for success or `error` for failure",
|
|
104
|
-
},
|
|
105
|
-
);
|
|
196
|
+
this.#validate = validate;
|
|
197
|
+
this.parameters = parameters;
|
|
198
|
+
this.strict = strict;
|
|
106
199
|
}
|
|
107
200
|
|
|
108
201
|
async execute(
|
|
@@ -130,20 +223,26 @@ export class SubmitResultTool implements AgentTool<TSchema, SubmitResultDetails>
|
|
|
130
223
|
}
|
|
131
224
|
|
|
132
225
|
const status = errorMessage !== undefined ? "aborted" : "success";
|
|
226
|
+
let schemaValidationOverridden = false;
|
|
133
227
|
if (status === "success") {
|
|
134
228
|
if (data === undefined || data === null) {
|
|
135
229
|
throw new Error("data is required when submit_result indicates success");
|
|
136
230
|
}
|
|
137
|
-
if (this.#schemaError) {
|
|
138
|
-
throw new Error(`Invalid output schema: ${this.#schemaError}`);
|
|
139
|
-
}
|
|
140
231
|
if (this.#validate && !this.#validate(data)) {
|
|
141
|
-
|
|
232
|
+
this.#schemaValidationFailures++;
|
|
233
|
+
if (this.#schemaValidationFailures <= 1) {
|
|
234
|
+
throw new Error(`Output does not match schema: ${formatAjvErrors(this.#validate.errors)}`);
|
|
235
|
+
}
|
|
236
|
+
schemaValidationOverridden = true;
|
|
142
237
|
}
|
|
143
238
|
}
|
|
144
239
|
|
|
145
|
-
const responseText =
|
|
146
|
-
|
|
240
|
+
const responseText =
|
|
241
|
+
status === "aborted"
|
|
242
|
+
? `Task aborted: ${errorMessage}`
|
|
243
|
+
: schemaValidationOverridden
|
|
244
|
+
? `Result submitted (schema validation overridden after ${this.#schemaValidationFailures} failed attempt(s)).`
|
|
245
|
+
: "Result submitted.";
|
|
147
246
|
return {
|
|
148
247
|
content: [{ type: "text", text: responseText }],
|
|
149
248
|
details: { data, status, error: errorMessage },
|
package/src/tools/todo-write.ts
CHANGED
|
@@ -161,6 +161,23 @@ function clonePhases(phases: TodoPhase[]): TodoPhase[] {
|
|
|
161
161
|
return phases.map(phase => ({ ...phase, tasks: phase.tasks.map(task => ({ ...task })) }));
|
|
162
162
|
}
|
|
163
163
|
|
|
164
|
+
/**
 * Enforce the "exactly one active task" rule across all phases, in place.
 *
 * Walking tasks in phase order: the first `in_progress` task stays active and
 * every later `in_progress` task is reset to `pending`. When no task is
 * `in_progress` at all, the first `pending` task (if any) is promoted to
 * `in_progress`. Nothing happens for an empty task list.
 */
function normalizeInProgressTask(phases: TodoPhase[]): void {
	let activeSeen = false;
	let earliestPending: TodoPhase["tasks"][number] | undefined;

	for (const phase of phases) {
		for (const task of phase.tasks) {
			if (task.status === "in_progress") {
				if (activeSeen) {
					// Only one task may be active: demote every later one.
					task.status = "pending";
				} else {
					activeSeen = true;
				}
			} else if (task.status === "pending" && earliestPending === undefined) {
				earliestPending = task;
			}
		}
	}

	if (!activeSeen && earliestPending !== undefined) {
		earliestPending.status = "in_progress";
	}
}
|
|
180
|
+
|
|
164
181
|
export function getLatestTodoPhasesFromEntries(entries: SessionEntry[]): TodoPhase[] {
|
|
165
182
|
for (let i = entries.length - 1; i >= 0; i--) {
|
|
166
183
|
const entry = entries[i];
|
|
@@ -246,6 +263,7 @@ function applyOps(file: TodoFile, ops: TodoWriteParams["ops"]): { file: TodoFile
|
|
|
246
263
|
}
|
|
247
264
|
}
|
|
248
265
|
|
|
266
|
+
normalizeInProgressTask(file.phases);
|
|
249
267
|
return { file, errors };
|
|
250
268
|
}
|
|
251
269
|
|
|
@@ -253,6 +271,14 @@ function formatSummary(phases: TodoPhase[], errors: string[]): string {
|
|
|
253
271
|
const tasks = phases.flatMap(p => p.tasks);
|
|
254
272
|
if (tasks.length === 0) return errors.length > 0 ? `Errors: ${errors.join("; ")}` : "Todo list cleared.";
|
|
255
273
|
|
|
274
|
+
const remainingByPhase = phases
|
|
275
|
+
.map(phase => ({
|
|
276
|
+
name: phase.name,
|
|
277
|
+
tasks: phase.tasks.filter(task => task.status === "pending" || task.status === "in_progress"),
|
|
278
|
+
}))
|
|
279
|
+
.filter(phase => phase.tasks.length > 0);
|
|
280
|
+
const remainingTasks = remainingByPhase.flatMap(phase => phase.tasks.map(task => ({ ...task, phase: phase.name })));
|
|
281
|
+
|
|
256
282
|
// Find current phase
|
|
257
283
|
let currentIdx = phases.findIndex(p => p.tasks.some(t => t.status === "pending" || t.status === "in_progress"));
|
|
258
284
|
if (currentIdx === -1) currentIdx = phases.length - 1;
|
|
@@ -261,6 +287,14 @@ function formatSummary(phases: TodoPhase[], errors: string[]): string {
|
|
|
261
287
|
|
|
262
288
|
const lines: string[] = [];
|
|
263
289
|
if (errors.length > 0) lines.push(`Errors: ${errors.join("; ")}`);
|
|
290
|
+
if (remainingTasks.length === 0) {
|
|
291
|
+
lines.push("Remaining items: none.");
|
|
292
|
+
} else {
|
|
293
|
+
lines.push(`Remaining items (${remainingTasks.length}):`);
|
|
294
|
+
for (const task of remainingTasks) {
|
|
295
|
+
lines.push(` - ${task.id} ${task.content} [${task.status}] (${task.phase})`);
|
|
296
|
+
}
|
|
297
|
+
}
|
|
264
298
|
lines.push(
|
|
265
299
|
`Phase ${currentIdx + 1}/${phases.length} "${current.name}" — ${done}/${current.tasks.length} tasks complete`,
|
|
266
300
|
);
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/** Timeout limits for a single tool. All values are in seconds. */
export interface ToolTimeoutConfig {
	/** Default timeout in seconds when agent omits the field */
	default: number;
	/** Minimum allowed timeout in seconds */
	min: number;
	/** Maximum allowed timeout in seconds (per-tool ceiling) */
	max: number;
}

/** Per-tool timeout limits, keyed by tool name. */
export const TOOL_TIMEOUTS = {
	bash: { default: 300, min: 1, max: 3600 },
	python: { default: 30, min: 1, max: 600 },
	browser: { default: 30, min: 1, max: 120 },
	ssh: { default: 60, min: 1, max: 3600 },
	fetch: { default: 20, min: 1, max: 45 },
	lsp: { default: 20, min: 5, max: 60 },
} as const satisfies Record<string, ToolTimeoutConfig>;

/** Name of a tool that has an entry in {@link TOOL_TIMEOUTS}. */
export type ToolWithTimeout = keyof typeof TOOL_TIMEOUTS;

/**
 * Clamp a raw timeout to the allowed range for a tool.
 *
 * @param tool Tool whose limits apply.
 * @param rawTimeout Requested timeout in seconds. When it is omitted (or is
 *                   `NaN`, which cannot be clamped), the tool's default is used.
 * @returns A timeout in seconds within `[min, max]` for the tool.
 */
export function clampTimeout(tool: ToolWithTimeout, rawTimeout?: number): number {
	const config = TOOL_TIMEOUTS[tool];
	// `??` alone lets NaN through, and Math.min/Math.max propagate NaN, which
	// would hand callers an unusable timeout. Treat NaN like a missing value.
	const timeout = rawTimeout == null || Number.isNaN(rawTimeout) ? config.default : rawTimeout;
	return Math.max(config.min, Math.min(config.max, timeout));
}
|
package/src/utils/mime.ts
CHANGED
|
@@ -1,9 +1,42 @@
|
|
|
1
1
|
import * as fs from "node:fs/promises";
|
|
2
|
-
import { fileTypeFromBuffer } from "file-type";
|
|
3
2
|
|
|
4
|
-
const
|
|
3
|
+
const FILE_TYPE_SNIFF_BYTES = 12;

/**
 * Identify a supported image format from the leading bytes of a file.
 *
 * Recognizes JPEG, PNG, GIF and WebP by their magic numbers. A format is only
 * reported when `bytesRead` covers its whole signature.
 *
 * @param buf Buffer holding the first bytes of the file.
 * @param bytesRead Number of valid bytes at the start of `buf`.
 * @returns The MIME type, or `null` when no known signature matches.
 */
function detectMimeFromBytes(buf: Buffer, bytesRead: number): string | null {
	// True when `buf` contains exactly `expected` starting at `offset`.
	const hasBytesAt = (offset: number, expected: readonly number[]): boolean =>
		expected.every((byte, index) => buf[offset + index] === byte);

	// JPEG: FF D8 FF
	if (bytesRead >= 3 && hasBytesAt(0, [0xff, 0xd8, 0xff])) {
		return "image/jpeg";
	}
	// PNG: 89 "PNG" CR LF 1A LF
	if (bytesRead >= 8 && hasBytesAt(0, [0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) {
		return "image/png";
	}
	// GIF: "GIF8"
	if (bytesRead >= 4 && hasBytesAt(0, [0x47, 0x49, 0x46, 0x38])) {
		return "image/gif";
	}
	// WebP: "RIFF" at offset 0 and "WEBP" at offset 8 (bytes 4-7 are not checked)
	if (bytesRead >= 12 && hasBytesAt(0, [0x52, 0x49, 0x46, 0x46]) && hasBytesAt(8, [0x57, 0x45, 0x42, 0x50])) {
		return "image/webp";
	}
	return null;
}
|
|
7
40
|
|
|
8
41
|
export async function detectSupportedImageMimeTypeFromFile(filePath: string): Promise<string | null> {
|
|
9
42
|
const fileHandle = await fs.open(filePath, "r");
|
|
@@ -13,17 +46,7 @@ export async function detectSupportedImageMimeTypeFromFile(filePath: string): Pr
|
|
|
13
46
|
if (bytesRead === 0) {
|
|
14
47
|
return null;
|
|
15
48
|
}
|
|
16
|
-
|
|
17
|
-
const fileType = await fileTypeFromBuffer(buffer.subarray(0, bytesRead));
|
|
18
|
-
if (!fileType) {
|
|
19
|
-
return null;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
if (!IMAGE_MIME_TYPES.has(fileType.mime)) {
|
|
23
|
-
return null;
|
|
24
|
-
}
|
|
25
|
-
|
|
26
|
-
return fileType.mime;
|
|
49
|
+
return detectMimeFromBytes(buffer, bytesRead);
|
|
27
50
|
} finally {
|
|
28
51
|
await fileHandle.close();
|
|
29
52
|
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/** Whether content is formatted before or after template rendering. */
export type PromptRenderPhase = "pre-render" | "post-render";

/** Options accepted by the prompt formatter. All fields are optional. */
export interface PromptFormatOptions {
	/** Phase the content is in when it is formatted. */
	renderPhase?: PromptRenderPhase;
	/** Replace common ASCII symbol sequences with Unicode symbols. */
	replaceAsciiSymbols?: boolean;
	/** Wrap RFC 2119 keywords in Markdown bold markers. */
	boldRfc2119Keywords?: boolean;
}

/** A line that is a single opening XML tag; capture group 1 is the tag name. */
const OPENING_XML = /^<([a-z_-]+)(?:\s+[^>]*)?>$/;
/** A line that is a single closing XML tag; capture group 1 is the tag name. */
const CLOSING_XML = /^<\/([a-z_-]+)>$/;
/** Start of a Handlebars block: `{{#if}}`, `{{#has}}`, `{{#list}}`, ... */
const OPENING_HBS = /^\{\{#/;
/** End of a Handlebars block: `{{/if}}`, `{{/has}}`, `{{/list}}`, ... */
const CLOSING_HBS = /^\{\{\//;
/** Markdown list item: `- `, `* ` or `1. ` */
const LIST_ITEM = /^(?:[-*]\s|\d+\.\s)/;
/** Markdown code fence */
const CODE_FENCE = /^```/;
/** Markdown table row: starts and ends with a pipe */
const TABLE_ROW = /^\|.*\|$/;
/** Markdown table separator row, e.g. `|---|---|` */
const TABLE_SEP = /^\|[-:\s|]+\|$/;
|
|
25
|
+
|
|
26
|
+
/** RFC 2119 requirement keywords; longer phrases come first so "MUST NOT" wins over "MUST". */
const RFC2119_KEYWORDS = /\b(?:MUST NOT|SHOULD NOT|SHALL NOT|RECOMMENDED|REQUIRED|OPTIONAL|SHOULD|SHALL|MUST|MAY)\b/g;

/**
 * Wrap every RFC 2119 keyword in `line` with Markdown bold markers.
 *
 * A keyword that is already directly enclosed in `**…**` is left unchanged so
 * that formatting the same text twice does not stack markers.
 */
function boldRfc2119Keywords(line: string): string {
	return line.replace(RFC2119_KEYWORDS, (keyword: string, start: number, whole: string) => {
		const end = start + keyword.length;
		const boldBefore = start >= 2 && whole.startsWith("**", start - 2);
		const boldAfter = whole.startsWith("**", end);
		return boldBefore && boldAfter ? keyword : `**${keyword}**`;
	});
}
|
|
42
|
+
|
|
43
|
+
/** Strip the padding around every cell of a Markdown table row. */
function compactTableRow(line: string): string {
	const compacted: string[] = [];
	for (const cell of line.split("|")) {
		compacted.push(cell.trim());
	}
	return compacted.join("|");
}
|
|
48
|
+
|
|
49
|
+
/**
 * Normalize a Markdown table separator row.
 *
 * Every non-empty cell becomes `---`, keeping a leading and/or trailing `:`
 * alignment marker; empty cells are discarded.
 */
function compactTableSep(line: string): string {
	const markers: string[] = [];
	for (const segment of line.split("|")) {
		const cell = segment.trim();
		if (cell === "") continue;
		const leftColon = cell.startsWith(":") ? ":" : "";
		const rightColon = cell.endsWith(":") ? ":" : "";
		markers.push(`${leftColon}---${rightColon}`);
	}
	return `|${markers.join("|")}|`;
}
|
|
63
|
+
|
|
64
|
+
/**
 * Replace common ASCII symbol sequences with their Unicode counterparts.
 *
 * Substitutions run one after another in the listed order, so `<->` is
 * handled before `->` and `<-`.
 */
function replaceCommonAsciiSymbols(line: string): string {
	const substitutions: ReadonlyArray<readonly [RegExp, string]> = [
		[/\.{3}/g, "…"],
		[/<->/g, "↔"],
		[/->/g, "→"],
		[/<-/g, "←"],
		[/!=/g, "≠"],
		[/<=/g, "≤"],
		[/>=/g, "≥"],
	];
	let text = line;
	for (const [pattern, symbol] of substitutions) {
		text = text.replace(pattern, symbol);
	}
	return text;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Normalize the layout of prompt text.
 *
 * Per line, outside fenced code blocks (which are copied through with only
 * trailing whitespace removed):
 * - trailing whitespace is removed;
 * - optionally, ASCII symbol sequences are replaced and RFC 2119 keywords bolded;
 * - a closing XML tag that matches the most recent unindented opening tag is
 *   de-indented;
 * - in the pre-render phase, lines starting with `{{` are de-indented and
 *   emitted verbatim (no symbol replacement, so template expressions stay intact);
 * - table rows and separator rows are compacted.
 *
 * Blank lines are dropped when they precede a list item, directly follow an
 * opening XML tag (or Handlebars block opener in pre-render), directly precede
 * a closing XML tag (or Handlebars block closer in pre-render), or follow
 * another blank line. Trailing blank lines are removed from the result.
 *
 * @param content Prompt text to format.
 * @param options Formatting switches; see {@link PromptFormatOptions}.
 * @returns The formatted text, joined with `\n`.
 */
export function formatPromptContent(content: string, options: PromptFormatOptions = {}): string {
	const {
		renderPhase = "post-render",
		replaceAsciiSymbols = false,
		boldRfc2119Keywords: shouldBoldRfc2119 = false,
	} = options;
	const isPreRender = renderPhase === "pre-render";
	const lines = content.split("\n");
	const result: string[] = [];
	let inCodeBlock = false;
	// Names of currently open, unindented XML tags (innermost last).
	const topLevelTags: string[] = [];

	const dropTrailingBlankLines = (): void => {
		while (result.length > 0 && result[result.length - 1].trim() === "") {
			result.pop();
		}
	};

	for (let i = 0; i < lines.length; i++) {
		// `raw` and `trimmed` always hold the ORIGINAL text; every structural
		// decision below is made on them so symbol replacement cannot skew it.
		const raw = lines[i].trimEnd();
		const trimmed = raw.trimStart();

		if (CODE_FENCE.test(trimmed)) {
			inCodeBlock = !inCodeBlock;
			result.push(raw);
			continue;
		}

		if (inCodeBlock) {
			result.push(raw);
			continue;
		}

		let line = replaceAsciiSymbols ? replaceCommonAsciiSymbols(raw) : raw;

		// Indentation is judged on the original text. Comparing the length of the
		// symbol-replaced line against `trimmed` would misfire whenever a
		// replacement shortened the line (e.g. "..." -> "…").
		const isIndented = raw.length !== trimmed.length;
		const openingMatch = trimmed.endsWith("/>") ? null : OPENING_XML.exec(trimmed);
		if (openingMatch && !isIndented) {
			topLevelTags.push(openingMatch[1]);
		}

		const closingMatch = CLOSING_XML.exec(trimmed);
		if (closingMatch) {
			if (topLevelTags.length > 0 && topLevelTags[topLevelTags.length - 1] === closingMatch[1]) {
				line = trimmed;
				topLevelTags.pop();
			}
			// Otherwise the closing tag keeps its indentation.
		} else if (isPreRender && trimmed.startsWith("{{")) {
			// Template expressions are emitted verbatim, without symbol replacement.
			line = trimmed;
		} else if (TABLE_SEP.test(trimmed)) {
			line = compactTableSep(trimmed);
		} else if (TABLE_ROW.test(trimmed)) {
			// Compact the (possibly symbol-replaced) text so `replaceAsciiSymbols`
			// applies to table cells like it does to every other line.
			line = compactTableRow(line.trimStart());
		}

		if (shouldBoldRfc2119) {
			line = boldRfc2119Keywords(line);
		}

		if (trimmed === "") {
			const prevLine = result[result.length - 1]?.trim() ?? "";
			const nextLine = lines[i + 1]?.trim() ?? "";

			if (LIST_ITEM.test(nextLine)) {
				continue;
			}
			if (OPENING_XML.test(prevLine) || (isPreRender && OPENING_HBS.test(prevLine))) {
				continue;
			}
			if (CLOSING_XML.test(nextLine) || (isPreRender && CLOSING_HBS.test(nextLine))) {
				continue;
			}
			// Collapse runs of blank lines (also drops leading blank lines).
			if (prevLine === "") {
				continue;
			}
		}

		if (closingMatch || (isPreRender && CLOSING_HBS.test(trimmed))) {
			dropTrailingBlankLines();
		}

		result.push(line);
	}

	dropTrailingBlankLines();

	return result.join("\n");
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
2
|
import type { RenderResult, SpecialHandler } from "./types";
|
|
3
3
|
import { buildResult, loadPage } from "./types";
|
|
4
4
|
import { convertWithMarkitdown, fetchBinary } from "./utils";
|
|
@@ -31,22 +31,22 @@ export const handleArxiv: SpecialHandler = async (
|
|
|
31
31
|
if (!result.ok) return null;
|
|
32
32
|
|
|
33
33
|
// Parse the Atom feed response
|
|
34
|
-
const doc =
|
|
34
|
+
const doc = parseHTML(result.content).document;
|
|
35
35
|
const entry = doc.querySelector("entry");
|
|
36
36
|
|
|
37
37
|
if (!entry) return null;
|
|
38
38
|
|
|
39
|
-
const title = entry.querySelector("title")?.
|
|
40
|
-
const summary = entry.querySelector("summary")?.
|
|
41
|
-
const authors = entry
|
|
42
|
-
.
|
|
43
|
-
.
|
|
44
|
-
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
|
|
39
|
+
const title = entry.querySelector("title")?.textContent?.trim()?.replace(/\s+/g, " ");
|
|
40
|
+
const summary = entry.querySelector("summary")?.textContent?.trim();
|
|
41
|
+
const authors = Array.from(entry.querySelectorAll("author name") as Iterable<{ textContent: string | null }>)
|
|
42
|
+
.map(n => n.textContent?.trim())
|
|
43
|
+
.filter((name): name is string => Boolean(name));
|
|
44
|
+
const published = entry.querySelector("published")?.textContent?.trim()?.split("T")[0];
|
|
45
|
+
const categories = Array.from(
|
|
46
|
+
entry.querySelectorAll("category") as Iterable<{ getAttribute: (name: string) => string | null }>,
|
|
47
|
+
)
|
|
48
48
|
.map(c => c.getAttribute("term"))
|
|
49
|
-
.filter(Boolean);
|
|
49
|
+
.filter((term): term is string => Boolean(term));
|
|
50
50
|
const pdfLink = entry.querySelector('link[title="pdf"]')?.getAttribute("href");
|
|
51
51
|
|
|
52
52
|
let md = `# ${title || "arXiv Paper"}\n\n`;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { tryParseJson } from "@oh-my-pi/pi-utils";
|
|
2
|
-
import {
|
|
2
|
+
import { parseHTML } from "linkedom";
|
|
3
3
|
import type { RenderResult, SpecialHandler } from "./types";
|
|
4
4
|
import { buildResult, htmlToBasicMarkdown, loadPage } from "./types";
|
|
5
5
|
|
|
@@ -97,7 +97,7 @@ export const handleGoPkg: SpecialHandler = async (
|
|
|
97
97
|
});
|
|
98
98
|
}
|
|
99
99
|
|
|
100
|
-
const doc =
|
|
100
|
+
const doc = parseHTML(pageResult.content).document;
|
|
101
101
|
|
|
102
102
|
// Extract actual module path from breadcrumb or header
|
|
103
103
|
const breadcrumb = doc.querySelector(".go-Breadcrumb");
|
package/src/web/scrapers/iacr.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
2
|
import type { RenderResult, SpecialHandler } from "./types";
|
|
3
3
|
import { buildResult, loadPage } from "./types";
|
|
4
4
|
import { convertWithMarkitdown, fetchBinary } from "./utils";
|
|
@@ -30,22 +30,30 @@ export const handleIacr: SpecialHandler = async (
|
|
|
30
30
|
|
|
31
31
|
if (!result.ok) return null;
|
|
32
32
|
|
|
33
|
-
const doc =
|
|
33
|
+
const doc = parseHTML(result.content).document;
|
|
34
34
|
|
|
35
35
|
// Extract metadata from the page
|
|
36
36
|
const title =
|
|
37
|
-
doc.querySelector("h3.mb-3")?.
|
|
37
|
+
doc.querySelector("h3.mb-3")?.textContent?.trim() ||
|
|
38
38
|
doc.querySelector('meta[name="citation_title"]')?.getAttribute("content");
|
|
39
|
-
const authors =
|
|
40
|
-
.querySelectorAll('meta[name="citation_author"]')
|
|
39
|
+
const authors = Array.from(
|
|
40
|
+
doc.querySelectorAll('meta[name="citation_author"]') as Iterable<{
|
|
41
|
+
getAttribute: (name: string) => string | null;
|
|
42
|
+
}>,
|
|
43
|
+
)
|
|
41
44
|
.map(m => m.getAttribute("content"))
|
|
42
|
-
.filter(Boolean);
|
|
45
|
+
.filter((author): author is string => Boolean(author));
|
|
43
46
|
// Abstract is in <p> after <h5>Abstract</h5>
|
|
44
|
-
const abstractHeading =
|
|
47
|
+
const abstractHeading = Array.from(
|
|
48
|
+
doc.querySelectorAll("h5") as Iterable<{
|
|
49
|
+
textContent: string | null;
|
|
50
|
+
parentElement?: { querySelector: (selector: string) => { textContent: string | null } | null } | null;
|
|
51
|
+
}>,
|
|
52
|
+
).find(h => h.textContent?.includes("Abstract"));
|
|
45
53
|
const abstract =
|
|
46
|
-
abstractHeading?.
|
|
54
|
+
abstractHeading?.parentElement?.querySelector("p")?.textContent?.trim() ||
|
|
47
55
|
doc.querySelector('meta[name="description"]')?.getAttribute("content");
|
|
48
|
-
const keywords = doc.querySelector(".keywords")?.
|
|
56
|
+
const keywords = doc.querySelector(".keywords")?.textContent?.replace("Keywords:", "").trim();
|
|
49
57
|
const pubDate = doc.querySelector('meta[name="citation_publication_date"]')?.getAttribute("content");
|
|
50
58
|
|
|
51
59
|
let md = `# ${title || "IACR ePrint Paper"}\n\n`;
|