@aria-cli/tools 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +9 -5
- package/src/__tests__/web-fetch-download.test.ts +0 -433
- package/src/__tests__/web-tools.test.ts +0 -619
- package/src/ask-user-interaction.ts +0 -33
- package/src/cache/web-cache.ts +0 -110
- package/src/definitions/arion.ts +0 -118
- package/src/definitions/browser/browser.ts +0 -502
- package/src/definitions/browser/index.ts +0 -5
- package/src/definitions/browser/pw-downloads.ts +0 -142
- package/src/definitions/browser/pw-interactions.ts +0 -282
- package/src/definitions/browser/pw-responses.ts +0 -98
- package/src/definitions/browser/pw-session.ts +0 -405
- package/src/definitions/browser/pw-shared.ts +0 -85
- package/src/definitions/browser/pw-snapshot.ts +0 -383
- package/src/definitions/browser/pw-state.ts +0 -101
- package/src/definitions/browser/types.ts +0 -203
- package/src/definitions/code-intelligence.ts +0 -526
- package/src/definitions/core.ts +0 -118
- package/src/definitions/delegation.ts +0 -567
- package/src/definitions/deploy.ts +0 -73
- package/src/definitions/filesystem.ts +0 -217
- package/src/definitions/frg.ts +0 -67
- package/src/definitions/index.ts +0 -28
- package/src/definitions/memory.ts +0 -150
- package/src/definitions/messaging.ts +0 -734
- package/src/definitions/meta.ts +0 -392
- package/src/definitions/network.ts +0 -179
- package/src/definitions/outlook.ts +0 -318
- package/src/definitions/patch/apply-patch.ts +0 -235
- package/src/definitions/patch/fuzzy-match.ts +0 -217
- package/src/definitions/patch/index.ts +0 -1
- package/src/definitions/patch/patch-parser.ts +0 -297
- package/src/definitions/patch/sandbox-paths.ts +0 -129
- package/src/definitions/process/index.ts +0 -5
- package/src/definitions/process/process-registry.ts +0 -303
- package/src/definitions/process/process.ts +0 -456
- package/src/definitions/process/pty-keys.ts +0 -298
- package/src/definitions/process/session-slug.ts +0 -147
- package/src/definitions/quip.ts +0 -225
- package/src/definitions/search.ts +0 -67
- package/src/definitions/session-history.ts +0 -79
- package/src/definitions/shell.ts +0 -202
- package/src/definitions/slack.ts +0 -211
- package/src/definitions/web.ts +0 -119
- package/src/executors/apply-patch.ts +0 -1035
- package/src/executors/arion.ts +0 -199
- package/src/executors/code-intelligence.ts +0 -1179
- package/src/executors/deploy.ts +0 -1066
- package/src/executors/filesystem.ts +0 -1428
- package/src/executors/frg-freshness.ts +0 -743
- package/src/executors/frg.ts +0 -394
- package/src/executors/index.ts +0 -280
- package/src/executors/learning-meta.ts +0 -1367
- package/src/executors/lsp-client.ts +0 -355
- package/src/executors/memory.ts +0 -978
- package/src/executors/meta.ts +0 -293
- package/src/executors/process-registry.ts +0 -570
- package/src/executors/pty-session-store.ts +0 -43
- package/src/executors/pty.ts +0 -342
- package/src/executors/restart.ts +0 -133
- package/src/executors/search-freshness.ts +0 -249
- package/src/executors/search-types.ts +0 -98
- package/src/executors/search.ts +0 -89
- package/src/executors/self-diagnose.ts +0 -552
- package/src/executors/session-history.ts +0 -435
- package/src/executors/shell-safety.ts +0 -519
- package/src/executors/shell.ts +0 -1243
- package/src/executors/utils.ts +0 -40
- package/src/executors/web.ts +0 -786
- package/src/extraction/content-extraction.ts +0 -281
- package/src/extraction/index.ts +0 -5
- package/src/headless-control-contract.ts +0 -1149
- package/src/index.ts +0 -788
- package/src/local-control-http-auth.ts +0 -2
- package/src/mcp/client.ts +0 -218
- package/src/mcp/connection.ts +0 -568
- package/src/mcp/index.ts +0 -11
- package/src/mcp/jsonrpc.ts +0 -195
- package/src/mcp/types.ts +0 -199
- package/src/network-control-adapter.ts +0 -88
- package/src/network-runtime/address-types.ts +0 -218
- package/src/network-runtime/db-owner-fencing.ts +0 -91
- package/src/network-runtime/delivery-receipts.ts +0 -372
- package/src/network-runtime/direct-endpoint-authority.ts +0 -35
- package/src/network-runtime/index.ts +0 -316
- package/src/network-runtime/local-control-contract.ts +0 -784
- package/src/network-runtime/node-store-contract.ts +0 -46
- package/src/network-runtime/pair-route-contract.ts +0 -97
- package/src/network-runtime/peer-capabilities.ts +0 -48
- package/src/network-runtime/peer-principal-ref.ts +0 -20
- package/src/network-runtime/peer-state-machine.ts +0 -160
- package/src/network-runtime/protocol-schemas.ts +0 -265
- package/src/network-runtime/runtime-bootstrap-contract.ts +0 -83
- package/src/outlook/desktop-session.ts +0 -409
- package/src/policy.ts +0 -171
- package/src/providers/brave.ts +0 -80
- package/src/providers/duckduckgo.ts +0 -199
- package/src/providers/exa.ts +0 -85
- package/src/providers/firecrawl.ts +0 -77
- package/src/providers/index.ts +0 -8
- package/src/providers/jina.ts +0 -70
- package/src/providers/router.ts +0 -121
- package/src/providers/search-provider.ts +0 -74
- package/src/providers/tavily.ts +0 -74
- package/src/quip/desktop-session.ts +0 -435
- package/src/registry/index.ts +0 -1
- package/src/registry/registry.ts +0 -905
- package/src/runtime-socket-local-control-client.ts +0 -632
- package/src/security/dns-normalization.ts +0 -34
- package/src/security/dns-pinning.ts +0 -138
- package/src/security/external-content.ts +0 -129
- package/src/security/ssrf.ts +0 -207
- package/src/slack/desktop-session.ts +0 -493
- package/src/tool-factory.ts +0 -91
- package/src/types.ts +0 -1341
- package/src/utils/retry.ts +0 -163
- package/src/utils/safe-parse-json.ts +0 -176
- package/src/utils/url.ts +0 -20
- package/tests/benchmarks/registry.bench.ts +0 -57
- package/tests/cache/web-cache.test.ts +0 -147
- package/tests/critical-integration.test.ts +0 -1465
- package/tests/definitions/apply-patch.test.ts +0 -586
- package/tests/definitions/browser.test.ts +0 -495
- package/tests/definitions/delegation-pause-resume.test.ts +0 -758
- package/tests/definitions/execution.test.ts +0 -671
- package/tests/definitions/messaging-inbox-scope.test.ts +0 -229
- package/tests/definitions/messaging.test.ts +0 -1468
- package/tests/definitions/outlook.test.ts +0 -30
- package/tests/definitions/process.test.ts +0 -469
- package/tests/definitions/slack.test.ts +0 -28
- package/tests/definitions/tool-inventory.test.ts +0 -218
- package/tests/e2e/delegation-quest-orchestration.e2e.test.ts +0 -433
- package/tests/e2e/memory-tool-discovery-contract.e2e.test.ts +0 -81
- package/tests/executors/apply-patch.test.ts +0 -538
- package/tests/executors/arion.test.ts +0 -309
- package/tests/executors/conversation-primitives.test.ts +0 -250
- package/tests/executors/deploy.test.ts +0 -746
- package/tests/executors/filesystem-tools.test.ts +0 -357
- package/tests/executors/filesystem.test.ts +0 -959
- package/tests/executors/frg-freshness.test.ts +0 -136
- package/tests/executors/frg-merge.test.ts +0 -70
- package/tests/executors/frg-session-content.test.ts +0 -40
- package/tests/executors/frg.test.ts +0 -56
- package/tests/executors/memory-bugfixes.test.ts +0 -257
- package/tests/executors/memory-real-memoria.integration.test.ts +0 -316
- package/tests/executors/memory.test.ts +0 -853
- package/tests/executors/meta-tools.test.ts +0 -411
- package/tests/executors/meta.test.ts +0 -683
- package/tests/executors/path-containment.test.ts +0 -51
- package/tests/executors/process-registry.test.ts +0 -505
- package/tests/executors/pty.test.ts +0 -664
- package/tests/executors/quest-security.test.ts +0 -249
- package/tests/executors/read-file-media.test.ts +0 -230
- package/tests/executors/recall-knowledge-schema.test.ts +0 -209
- package/tests/executors/recall-tags.test.ts +0 -278
- package/tests/executors/remember-null-safety.contract.test.ts +0 -41
- package/tests/executors/restart.test.ts +0 -67
- package/tests/executors/search-unified.test.ts +0 -381
- package/tests/executors/session-history.test.ts +0 -340
- package/tests/executors/session-transcript.test.ts +0 -561
- package/tests/executors/shell-abort.test.ts +0 -416
- package/tests/executors/shell-env-blocklist.test.ts +0 -648
- package/tests/executors/shell-env-process.test.ts +0 -245
- package/tests/executors/shell-process-registry.test.ts +0 -334
- package/tests/executors/shell-tools.test.ts +0 -393
- package/tests/executors/shell.test.ts +0 -690
- package/tests/executors/web-abort-vs-timeout.test.ts +0 -213
- package/tests/executors/web-integration.test.ts +0 -633
- package/tests/executors/web-symlink.test.ts +0 -18
- package/tests/executors/web.test.ts +0 -1400
- package/tests/executors/write-stdin.test.ts +0 -145
- package/tests/extraction/content-extraction.test.ts +0 -153
- package/tests/guards/tools-default-test-lane.integration.test.ts +0 -21
- package/tests/guards/tools-package-test-commands.e2e.test.ts +0 -43
- package/tests/guards/tools-test-lane-manifest.contract.test.ts +0 -76
- package/tests/guards/tools-vitest-workspace-alias.contract.test.ts +0 -63
- package/tests/helpers/async-waits.ts +0 -53
- package/tests/integration/headless-control-contract.integration.test.ts +0 -153
- package/tests/integration/memory-tool-schema-parity.integration.test.ts +0 -67
- package/tests/integration/meta-tools-round-trip.integration.test.ts +0 -506
- package/tests/integration/quest-round-trip.test.ts +0 -303
- package/tests/integration/registry-executor-flow.test.ts +0 -85
- package/tests/integration.test.ts +0 -177
- package/tests/loading-tier.test.ts +0 -126
- package/tests/mcp/client-reconnect.test.ts +0 -267
- package/tests/mcp/connection.test.ts +0 -846
- package/tests/mcp/injectable-logger.test.ts +0 -83
- package/tests/mcp/jsonrpc.test.ts +0 -109
- package/tests/mcp/lifecycle.test.ts +0 -879
- package/tests/network-runtime/address-types.contract.test.ts +0 -143
- package/tests/network-runtime/continuity-bind-schema.contract.test.ts +0 -203
- package/tests/network-runtime/local-control-contract.test.ts +0 -869
- package/tests/network-runtime/local-control-invite-token.contract.test.ts +0 -146
- package/tests/network-runtime/node-store-contract.test.ts +0 -11
- package/tests/network-runtime/pair-protocol-nodeid.contract.test.ts +0 -15
- package/tests/network-runtime/peer-state-machine.contract.test.ts +0 -148
- package/tests/network-runtime/protocol-schemas.contract.test.ts +0 -512
- package/tests/network-runtime/relay-pending-nodeid.contract.test.ts +0 -62
- package/tests/network-runtime/runtime-bootstrap-contract.test.ts +0 -227
- package/tests/network-runtime/runtime-socket-local-control-client.test.ts +0 -621
- package/tests/network-runtime/wait-for-message-script.test.ts +0 -288
- package/tests/parallel.test.ts +0 -71
- package/tests/policy.test.ts +0 -184
- package/tests/print-default-test-lane.ts +0 -14
- package/tests/print-test-lane-manifest.ts +0 -22
- package/tests/providers/brave.test.ts +0 -159
- package/tests/providers/duckduckgo.test.ts +0 -207
- package/tests/providers/exa.test.ts +0 -175
- package/tests/providers/firecrawl.test.ts +0 -168
- package/tests/providers/jina.test.ts +0 -144
- package/tests/providers/router.test.ts +0 -328
- package/tests/providers/tavily.test.ts +0 -165
- package/tests/registry/discovery.test.ts +0 -154
- package/tests/registry/injectable-logger.test.ts +0 -230
- package/tests/registry/input-validation.test.ts +0 -361
- package/tests/registry/interface-completeness.test.ts +0 -85
- package/tests/registry/mcp-integration.test.ts +0 -103
- package/tests/registry/mcp-read-only-hint.test.ts +0 -60
- package/tests/registry/memoria-discovery.test.ts +0 -390
- package/tests/registry/nested-validation.test.ts +0 -283
- package/tests/registry/pseudo-tool-filtering.test.ts +0 -258
- package/tests/registry/registration-lifecycle.test.ts +0 -133
- package/tests/registry-validation.test.ts +0 -424
- package/tests/registry.test.ts +0 -460
- package/tests/security/dns-pinning.test.ts +0 -162
- package/tests/security/external-content.test.ts +0 -144
- package/tests/security/ssrf.test.ts +0 -118
- package/tests/shell-safety-integration.test.ts +0 -32
- package/tests/shell-safety.test.ts +0 -365
- package/tests/slack/desktop-session.test.ts +0 -50
- package/tests/test-lane-manifest.ts +0 -440
- package/tests/test-utils.ts +0 -27
- package/tests/tool-factory.test.ts +0 -188
- package/tests/utils/retry.test.ts +0 -231
- package/tests/utils/url.test.ts +0 -63
- package/tsconfig.cjs.json +0 -24
- package/tsconfig.json +0 -12
- package/vitest.config.ts +0 -55
- package/vitest.e2e.config.ts +0 -24
- package/vitest.integration.config.ts +0 -24
- package/vitest.native.config.ts +0 -24
|
@@ -1,281 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Content Extraction — HTML to Markdown conversion with article detection
|
|
3
|
-
*
|
|
4
|
-
* Three-tier fallback strategy:
|
|
5
|
-
* 1. Readability.js (Mozilla) + Turndown for article-like content
|
|
6
|
-
* 2. Turndown raw HTML if Readability fails
|
|
7
|
-
* 3. Regex-based strip if both fail
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
import { JSDOM } from "jsdom";
|
|
11
|
-
import { Readability } from "@mozilla/readability";
|
|
12
|
-
import TurndownService from "turndown";
|
|
13
|
-
|
|
14
|
-
/** Upper bound, in characters, on any content string returned by this module (50K). */
const MAX_CONTENT_LENGTH = 50_000;

/** Time budget handed to the Readability parse step, in milliseconds. */
const PARSE_TIMEOUT_MS = 10_000;

/** MIME types that are routed through the HTML extraction pipeline. */
const HTML_CONTENT_TYPES = ["text/html", "text/xhtml+xml", "application/xhtml+xml"];

// Counting semaphore that bounds how many JSDOM-backed extractions run at once
// (memory protection). The limit is read from ARIA_MAX_CONCURRENT_EXTRACTIONS
// and defaults to 3 when the variable is unset.
const configuredConcurrentExtractions = Number.parseInt(
  process.env.ARIA_MAX_CONCURRENT_EXTRACTIONS ?? "3",
  10,
);

// Unparseable (NaN), zero, or negative settings fall back to the default of 3.
const MAX_CONCURRENT_EXTRACTIONS =
  !Number.isFinite(configuredConcurrentExtractions) || configuredConcurrentExtractions <= 0
    ? 3
    : configuredConcurrentExtractions;

/** Number of extraction slots currently held. */
let activeExtractions = 0;

/** FIFO of wake-up callbacks for callers waiting on a free slot. */
const extractionQueue: (() => void)[] = [];
|
|
34
|
-
|
|
35
|
-
/**
 * Reserves one extraction slot.
 *
 * Resolves right away while fewer than MAX_CONCURRENT_EXTRACTIONS slots are
 * held; otherwise the caller is parked in extractionQueue until
 * releaseExtractionSlot() wakes it.
 */
async function acquireExtractionSlot(): Promise<void> {
  if (activeExtractions >= MAX_CONCURRENT_EXTRACTIONS) {
    // At capacity: enqueue a callback that claims the slot and then wakes us.
    await new Promise<void>((wake) => {
      extractionQueue.push(() => {
        activeExtractions++;
        wake();
      });
    });
    return;
  }
  activeExtractions++;
}
|
|
47
|
-
|
|
48
|
-
/**
 * Gives back one extraction slot and, if any caller is waiting, hands the slot
 * to the one that has been queued the longest.
 */
function releaseExtractionSlot(): void {
  activeExtractions -= 1;
  // The dequeued callback re-increments the counter on behalf of the waiter.
  extractionQueue.shift()?.();
}
|
|
53
|
-
|
|
54
|
-
/**
 * Shape returned by every extraction entry point in this module.
 */
export interface ExtractedContent {
  /** Title of the page; an empty string when none was found. */
  title: string;
  /** The extracted body text, rendered as Markdown. */
  content: string;
  /** True only when Readability recognised the page as article-like. */
  isArticle: boolean;
}
|
|
65
|
-
|
|
66
|
-
/**
 * Decides whether a Content-Type header value denotes HTML that should go
 * through the Readability/Turndown pipeline.
 *
 * Parameters after the first `;` (such as a charset) are ignored and the MIME
 * type is compared case-insensitively.
 *
 * @param contentType - Raw Content-Type header value, if any
 * @returns true for the HTML MIME types in HTML_CONTENT_TYPES, and also when
 *          the header is missing or empty
 */
export function isHtmlContentType(contentType: string | undefined | null): boolean {
  // No header at all: assume HTML so extraction is still attempted.
  if (!contentType) {
    return true;
  }
  const semicolonAt = contentType.indexOf(";");
  const mimePart = semicolonAt === -1 ? contentType : contentType.slice(0, semicolonAt);
  return HTML_CONTENT_TYPES.includes(mimePart.trim().toLowerCase());
}
|
|
76
|
-
|
|
77
|
-
/**
 * Content-Type-aware entry point for extracting text from a response body.
 *
 * HTML bodies are delegated to extractContent(). Anything else (JSON, PDF,
 * images, ...) bypasses Readability: the raw body is returned, truncated to
 * MAX_CONTENT_LENGTH, or a placeholder naming the MIME type when the body is
 * empty.
 *
 * @param body - The response body text
 * @param url - The URL the body was fetched from
 * @param contentType - The Content-Type header value (optional)
 * @returns The extracted content
 */
export async function extractFromResponse(
  body: string,
  url: string,
  contentType?: string | null,
): Promise<ExtractedContent> {
  if (isHtmlContentType(contentType)) {
    return extractContent(body, url);
  }

  // Non-HTML: report the bare MIME type (header parameters dropped) if the body is empty.
  const declaredMime = contentType?.split(";")[0]?.trim();
  const mimeType = declaredMime || "unknown";
  const rawText = body.slice(0, MAX_CONTENT_LENGTH);

  return {
    title: "",
    content: rawText !== "" ? rawText : `[Non-HTML content: ${mimeType}]`,
    isArticle: false,
  };
}
|
|
105
|
-
|
|
106
|
-
/**
 * Races a promise against a deadline.
 *
 * Settles with the promise's own outcome if it arrives within `ms`
 * milliseconds; otherwise rejects with an Error reading
 * "<label> timed out after <ms>ms". The timer is always cleared once the
 * race is decided.
 *
 * @param promise - The operation to wait for
 * @param ms - Deadline in milliseconds
 * @param label - Name of the operation, used in the timeout message
 */
function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${ms}ms`));
    }, ms);
  });

  return Promise.race([promise, deadline]).finally(() => {
    clearTimeout(timer);
  });
}
|
|
127
|
-
|
|
128
|
-
/**
 * Pulls the text of the first <title> element straight out of raw HTML with a
 * regex, for use when the document is not parsed into a DOM.
 *
 * @param html - Raw HTML source
 * @returns The title with whitespace runs collapsed to single spaces and
 *          trimmed, or "" when no <title> element is found
 */
function extractOversizedHtmlTitle(html: string): string {
  const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
  if (!titleMatch) {
    return "";
  }
  const rawTitle = titleMatch[1];
  return rawTitle === undefined ? "" : rawTitle.replace(/\s+/g, " ").trim();
}
|
|
132
|
-
|
|
133
|
-
/**
 * Converts an HTML document into Markdown, picking the best available strategy.
 *
 * Order of attempts:
 * 1. Readability.js + Turndown, only when the page carries article markers
 * 2. Turndown over the raw <body> markup
 * 3. Regex-based tag stripping of the original HTML
 *
 * Two shortcuts run before any DOM work: blank input yields an empty result,
 * and input longer than twice MAX_CONTENT_LENGTH is regex-stripped directly.
 * Every returned `content` is capped at MAX_CONTENT_LENGTH characters.
 *
 * @param html - The HTML content to extract from
 * @param url - The source URL, passed to JSDOM as the document URL
 * @returns Title, Markdown content, and whether Readability produced the result
 */
export async function extractContent(html: string, url: string): Promise<ExtractedContent> {
  const emptyResult = (): ExtractedContent => ({ title: "", content: "", isArticle: false });

  // Blank or whitespace-only input: nothing to extract.
  if (!html || html.trim().length === 0) {
    return emptyResult();
  }

  // Oversized pages are truncated anyway, so avoid the cost of building a DOM
  // and strip tags from the raw text instead.
  if (html.length > MAX_CONTENT_LENGTH * 2) {
    const oversizedTitle = extractOversizedHtmlTitle(html);
    const strippedText = stripHtmlTags(html);
    return {
      title: oversizedTitle,
      content: strippedText.slice(0, MAX_CONTENT_LENGTH),
      isArticle: false,
    };
  }

  await acquireExtractionSlot();
  try {
    const dom = new JSDOM(html, { url });
    try {
      const { document } = dom.window;
      const title = document.querySelector("title")?.textContent?.trim() || "";

      // Look for semantic article markers on the live document up front;
      // Readability itself is given a clone further down.
      const hasArticleMarkers =
        document.querySelector("article, [role='article'], [role='main'], [itemtype*='Article']") !==
        null;

      const turndown = new TurndownService({
        headingStyle: "atx",
        codeBlockStyle: "fenced",
      });
      // Drop non-content elements during Markdown conversion.
      turndown.remove(["script", "style", "meta", "link", "noscript"]);

      // Shared tail of tiers 1 and 2: HTML -> Markdown -> URL cleanup -> length cap.
      const toMarkdownResult = (
        sourceHtml: string,
        resultTitle: string,
        isArticle: boolean,
      ): ExtractedContent => ({
        title: resultTitle,
        content: normalizeTrailingSlashes(turndown.turndown(sourceHtml)).slice(
          0,
          MAX_CONTENT_LENGTH,
        ),
        isArticle,
      });

      // Tier 1: Readability, attempted only for article-like pages.
      if (hasArticleMarkers) {
        try {
          const reader = new Readability(document.cloneNode(true) as Document);
          // NOTE(review): reader.parse() is evaluated before withTimeout() is
          // called, so if parse() is synchronous the timer cannot interrupt
          // it — confirm whether this timeout is effective.
          const article = await withTimeout(
            Promise.resolve(reader.parse()),
            PARSE_TIMEOUT_MS,
            "Readability.parse()",
          );
          if (article?.content) {
            return toMarkdownResult(article.content, article.title || title, true);
          }
        } catch {
          // Readability threw or timed out; continue with Tier 2.
        }
      }

      // Tier 2: Turndown over the whole body (non-article page or Tier 1 miss).
      try {
        const bodyHtml = document.body?.innerHTML || "";
        if (bodyHtml) {
          return toMarkdownResult(bodyHtml, title, false);
        }
      } catch {
        // Turndown threw; continue with Tier 3.
      }

      // Tier 3: last resort, strip tags from the original HTML with regexes.
      return {
        title,
        content: stripHtmlTags(html).slice(0, MAX_CONTENT_LENGTH),
        isArticle: false,
      };
    } finally {
      // Close the JSDOM window on every exit path.
      dom.window.close();
    }
  } catch {
    // Anything that escaped the tiers (e.g. JSDOM construction) yields an empty result.
    return emptyResult();
  } finally {
    releaseExtractionSlot();
  }
}
|
|
246
|
-
|
|
247
|
-
/**
 * Removes the trailing slash from Markdown link targets that are a bare origin.
 * JSDOM normalizes `https://example.com` to `https://example.com/`, which
 * breaks exact URL matches; this undoes that for origin-only links.
 *
 * A link target is rewritten only when it parses as an absolute URL whose path
 * is `/` and which has no query string and no fragment. Targets with a real
 * path, a query or fragment that ends in `/`, or a relative/unparseable URL
 * are left untouched.
 *
 * @param markdown - Markdown text produced by Turndown
 * @returns The Markdown with origin-only link targets stripped of their trailing slash
 */
function normalizeTrailingSlashes(markdown: string): string {
  // Matches `](<target>/)`; `target` is the link target without its final slash.
  return markdown.replace(/\]\(([^)]+?)\/\)/g, (match: string, target: string) => {
    try {
      const parsed = new URL(`${target}/`);
      // The final slash may belong to a query or fragment (e.g. `?next=/home/`
      // or `#/`); removing it there would change the URL, so require both empty.
      const isBareOrigin = parsed.pathname === "/" && parsed.search === "" && parsed.hash === "";
      if (isBareOrigin) {
        return `](${target})`;
      }
    } catch {
      // Not an absolute URL (relative link, anchor, ...): leave as-is.
    }
    return match;
  });
}
|
|
265
|
-
|
|
266
|
-
/**
 * Reduces HTML to plain text using regexes only (last-resort fallback).
 *
 * <script> and <style> elements are removed together with their contents,
 * every remaining tag is replaced by a space, and whitespace runs are
 * collapsed to single spaces. HTML entities are not decoded.
 *
 * @param html - Raw HTML source
 * @returns The trimmed plain-text rendering
 */
function stripHtmlTags(html: string): string {
  return (
    html
      // Drop script/style elements and their content. The end tag may carry
      // whitespace or attributes (`</script >`, `</style\n>`); matching only
      // the exact `</script>` form would leak the code into the text.
      .replace(/<script\b[^<]*(?:(?!<\/script\b)<[^<]*)*<\/script\b[^>]*>/gi, "")
      .replace(/<style\b[^<]*(?:(?!<\/style\b)<[^<]*)*<\/style\b[^>]*>/gi, "")
      // Replace every remaining tag with a space so adjacent text stays separated
      .replace(/<[^>]+>/g, " ")
      // Collapse whitespace runs
      .replace(/\s+/g, " ")
      .trim()
  );
}
|