wellmarked 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +233 -0
- package/dist/cjs/client.cjs +332 -0
- package/dist/cjs/client.d.ts +173 -0
- package/dist/cjs/errors.cjs +143 -0
- package/dist/cjs/errors.d.ts +66 -0
- package/dist/cjs/index.cjs +31 -0
- package/dist/cjs/index.d.ts +15 -0
- package/dist/cjs/models.cjs +166 -0
- package/dist/cjs/models.d.ts +141 -0
- package/dist/cjs/package.json +3 -0
- package/dist/cjs/version.cjs +4 -0
- package/dist/cjs/version.d.ts +1 -0
- package/dist/esm/client.d.ts +173 -0
- package/dist/esm/client.js +330 -0
- package/dist/esm/errors.d.ts +66 -0
- package/dist/esm/errors.js +130 -0
- package/dist/esm/index.d.ts +15 -0
- package/dist/esm/index.js +15 -0
- package/dist/esm/models.d.ts +141 -0
- package/dist/esm/models.js +154 -0
- package/dist/esm/version.d.ts +1 -0
- package/dist/esm/version.js +1 -0
- package/dist/types/client.d.ts +173 -0
- package/dist/types/errors.d.ts +66 -0
- package/dist/types/index.d.ts +15 -0
- package/dist/types/models.d.ts +141 -0
- package/dist/types/version.d.ts +1 -0
- package/package.json +73 -0
- package/src/client.ts +463 -0
- package/src/errors.ts +162 -0
- package/src/index.ts +45 -0
- package/src/models.ts +311 -0
- package/src/version.ts +1 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Typed response objects returned by the WellMarked SDK.
|
|
3
|
+
*
|
|
4
|
+
* These mirror the JSON shapes documented at https://api.wellmarked.io/docs.
|
|
5
|
+
*
|
|
6
|
+
* Response objects carry only the body fields documented in the API
|
|
7
|
+
* reference — they do not surface HTTP headers. Quota state lives on the
|
|
8
|
+
* account, so use `WellMarked.getUsage()` to read it.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Metadata that accompanies each extracted article.
 *
 * `date` and `retrievedAt` are independent of one another:
 * - `date` is the article's published date, kept as a string. It is often
 *   `null` because not every page publishes one.
 * - `retrievedAt` is the moment WellMarked actually fetched the page and is
 *   populated on every successful extraction.
 */
export interface ExtractionMeta {
    url: string;
    title: string | null;
    author: string | null;
    /** Published date as a string, or `null` when the page does not give one. */
    date: string | null;
    /** Fetch timestamp; populated on every successful extraction. */
    retrievedAt: Date | null;
}
|
|
25
|
+
/**
 * Builds an `ExtractionMeta` from a plain key/value record.
 *
 * Declaration only: the implementation, and therefore the exact keys it
 * reads from `data`, is not visible in this file.
 */
export declare function extractionMetaFromDict(data: Record<string, unknown>): ExtractionMeta;
|
|
26
|
+
/**
 * What `POST /extract` returns.
 */
export interface ExtractResult {
    markdown: string;
    metadata: ExtractionMeta;
    requestId: string;
}
|
|
32
|
+
/**
 * Converts a decoded `POST /extract` response body into an `ExtractResult`.
 *
 * Declaration only: how individual body keys map onto the result fields is
 * decided by the implementation, which is not part of this file.
 */
export declare function extractResultFromResponse(body: Record<string, unknown>): ExtractResult;
|
|
33
|
+
/**
 * A single entry of a bulk job's `results` list.
 *
 * - Success: `markdown` and `metadata` are set, `error` is null.
 * - Per-URL failure: `markdown` and `metadata` are null, and `error` holds
 *   an API error code (for example `target_timeout`).
 */
export interface BulkItem {
    url: string;
    markdown: string | null;
    metadata: ExtractionMeta | null;
    error: string | null;
    /** Whether the item completed successfully (no error and has markdown). */
    readonly ok: boolean;
}
|
|
48
|
+
/**
 * Builds a `BulkItem` from a plain key/value record.
 *
 * Declaration only: the keys read from `data` are defined by the
 * implementation, which is not visible here.
 */
export declare function bulkItemFromDict(data: Record<string, unknown>): BulkItem;
|
|
49
|
+
/**
 * The states a job's `status` field can take. A job counts as finished
 * once it reports `"done"`.
 */
export type JobStatus =
    | "queued"
    | "processing"
    | "done";
|
|
51
|
+
/**
 * Snapshot of a bulk extraction job, as returned by `POST /bulk` and
 * `GET /bulk/{jobId}`.
 *
 * `WellMarked.getJob` and `WellMarked.waitForJob` are polymorphic and
 * return either this type or `CrawlJob`. Check the `kind` discriminator
 * (or use a type guard) before reading crawl-specific fields.
 */
export interface BulkJob {
    /** Discriminator: always `"bulk"` for this type. */
    readonly kind: "bulk";
    jobId: string;
    status: JobStatus;
    total: number;
    completed: number;
    results: BulkItem[];
    createdAt: Date | null;
    finishedAt: Date | null;
    /** Convenience flag: true when `status === "done"`. */
    readonly done: boolean;
}
|
|
71
|
+
/**
 * Converts a decoded bulk-job response body into a `BulkJob`.
 *
 * Declaration only: field mapping and any validation live in the
 * implementation, which is not part of this file.
 */
export declare function bulkJobFromResponse(body: Record<string, unknown>): BulkJob;
|
|
72
|
+
/**
 * A single page of a crawl job's `results` list.
 *
 * Same shape as `BulkItem`, plus `depth`: how far from the root URL the
 * page sits in the breadth-first traversal.
 */
export interface CrawlItem {
    url: string;
    /** Distance from the root URL in the BFS. */
    depth: number;
    markdown: string | null;
    metadata: ExtractionMeta | null;
    error: string | null;
    /** Whether the page completed successfully (no error and has markdown). */
    readonly ok: boolean;
}
|
|
87
|
+
/**
 * Builds a `CrawlItem` from a plain key/value record.
 *
 * Declaration only: the keys read from `data` are defined by the
 * implementation, which is not visible here.
 */
export declare function crawlItemFromDict(data: Record<string, unknown>): CrawlItem;
|
|
88
|
+
/**
 * Non-null values of `CrawlJob.truncatedReason`, naming why a crawl was
 * truncated.
 */
export type TruncatedReason =
    | "page_cap_reached"
    | "quota_exhausted";
|
|
89
|
+
/**
 * Snapshot of a crawl job, as returned by `POST /crawl` and
 * `GET /crawl/{jobId}`.
 *
 * Two fields exist only on crawl jobs:
 * - `truncated`: true when the crawl stopped before exhausting the
 *   frontier (depth/page cap or quota).
 * - `truncatedReason`: `"page_cap_reached"`, `"quota_exhausted"`, or
 *   `null`.
 */
export interface CrawlJob {
    /** Discriminator: always `"crawl"` for this type. */
    readonly kind: "crawl";
    jobId: string;
    status: JobStatus;
    total: number;
    completed: number;
    results: CrawlItem[];
    truncated: boolean;
    truncatedReason: TruncatedReason | null;
    createdAt: Date | null;
    finishedAt: Date | null;
    /** Convenience flag: true when `status === "done"`. */
    readonly done: boolean;
}
|
|
112
|
+
/**
 * Converts a decoded crawl-job response body into a `CrawlJob`.
 *
 * Declaration only: field mapping and any validation live in the
 * implementation, which is not part of this file.
 */
export declare function crawlJobFromResponse(body: Record<string, unknown>): CrawlJob;
|
|
113
|
+
/**
 * Quota state for the current period, as returned by `GET /usage`.
 *
 * Treat this as the source of truth for rate-limit / quota information:
 * the SDK does not expose `X-RateLimit-*` headers on extract/bulk
 * responses, so call `WellMarked.getUsage()` to read it.
 */
export interface Usage {
    plan: string;
    period: string;
    used: number;
    limit: number;
    remaining: number;
}
|
|
127
|
+
/**
 * Converts a decoded `GET /usage` response body into a `Usage` object.
 *
 * Declaration only: the implementation is not part of this file.
 */
export declare function usageFromResponse(body: Record<string, unknown>): Usage;
|
|
128
|
+
/**
 * What `POST /keys/rotate` returns.
 *
 * `apiKey` holds the new raw key. Persist it before letting this object
 * go: there is no recovery flow, and the previous key stops working the
 * moment the rotation call returns 200.
 */
export interface RotatedKey {
    /** The newly minted raw API key. */
    apiKey: string;
    rotatedAt: Date | null;
}
|
|
139
|
+
/**
 * Converts a decoded `POST /keys/rotate` response body into a `RotatedKey`.
 *
 * Declaration only: the implementation is not part of this file.
 */
export declare function rotatedKeyFromResponse(body: Record<string, unknown>): RotatedKey;
|
|
140
|
+
/**
 * Type guard: a `true` result narrows `job` to `BulkJob`.
 *
 * Declaration only. NOTE(review): presumably implemented by checking the
 * `kind` discriminator — confirm against the implementation.
 */
export declare function isBulkJob(job: BulkJob | CrawlJob): job is BulkJob;
|
|
141
|
+
/**
 * Type guard: a `true` result narrows `job` to `CrawlJob`.
 *
 * Declaration only. NOTE(review): presumably implemented by checking the
 * `kind` discriminator — confirm against the implementation.
 */
export declare function isCrawlJob(job: BulkJob | CrawlJob): job is CrawlJob;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * SDK version string, typed as the literal `"1.0.0"`. The client embeds it
 * in the `User-Agent` header as `wellmarked-js/<VERSION>`.
 *
 * NOTE(review): this listing is for the package published as 1.1.0, so the
 * constant looks like it was not bumped for the release — confirm against
 * package.json and the emitted version.js before changing it.
 */
export declare const VERSION = "1.0.0";
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WellMarked client.
|
|
3
|
+
*
|
|
4
|
+
* The client is a thin, typed wrapper around the HTTP API. All endpoint
|
|
5
|
+
* methods are async — there is no separate sync/async split as in the
|
|
6
|
+
* Python SDK because JavaScript I/O is async by default.
|
|
7
|
+
*
|
|
8
|
+
* import { WellMarked } from "wellmarked";
|
|
9
|
+
*
|
|
10
|
+
* const wm = new WellMarked({ apiKey: "wm_..." });
|
|
11
|
+
* const result = await wm.extract("https://example.com/article");
|
|
12
|
+
* console.log(result.markdown);
|
|
13
|
+
*
|
|
14
|
+
* The API key can also be passed via the `WELLMARKED_API_KEY` environment
|
|
15
|
+
* variable (Node.js), in which case `new WellMarked()` is enough.
|
|
16
|
+
*/
|
|
17
|
+
import { APIStatusError } from "./errors.js";
|
|
18
|
+
import { type BulkJob, type CrawlJob, type ExtractResult, type RotatedKey, type Usage } from "./models.js";
|
|
19
|
+
/** Constructor options for `WellMarked`. Every field is optional. */
export interface WellMarkedOptions {
    /**
     * WellMarked API key (`wm_...`). When omitted, the
     * `WELLMARKED_API_KEY` environment variable is used (Node.js only).
     */
    apiKey?: string;
    /** Base URL of the API. Override for testing. */
    baseUrl?: string;
    /** Timeout applied to each request, in milliseconds. Defaults to 30000 (30s). */
    timeoutMs?: number;
    /**
     * Custom `fetch` implementation; the global `fetch` is used when this
     * is omitted. Handy for polyfills, custom agents/proxies, or mocking in
     * tests.
     */
    fetch?: typeof fetch;
    /**
     * Additional headers attached to every request, e.g. an internal
     * correlation id or a custom user agent suffix.
     *
     * Authorization / Content-Type / Accept are reserved: the SDK manages
     * them itself and silently ignores them if they are passed here.
     */
    headers?: Record<string, string>;
}
|
|
43
|
+
/** Options accepted by `WellMarked.extract`. */
export interface ExtractOptions {
    /**
     * Render JS-heavy pages with Playwright. Needs a Pro/Enterprise plan
     * AND `ENABLE_JS_RENDERING=true` on the API instance.
     */
    renderJs?: boolean;
}
|
|
50
|
+
/** Options accepted by `WellMarked.bulk`. */
export interface BulkOptions {
    /** Optional JS-rendering flag for the batch. */
    renderJs?: boolean;
}
|
|
53
|
+
/** Options accepted by `WellMarked.crawl`. */
export interface CrawlOptions {
    /** Maximum BFS depth from the root. Must be >= 0; defaults to 1. */
    depth?: number;
    /** Optional JS-rendering flag for the crawl. */
    renderJs?: boolean;
}
|
|
58
|
+
/** Options accepted by `WellMarked.waitForJob`. */
export interface WaitForJobOptions {
    /** Pause between polls, in milliseconds. Defaults to 2000. */
    pollIntervalMs?: number;
    /**
     * Overall wait budget in milliseconds before timing out. Pass `null`
     * to wait forever. Defaults to 300000 (5 min).
     */
    timeoutMs?: number | null;
}
|
|
64
|
+
/**
 * Typed client for the WellMarked HTTP API. Every endpoint method is
 * asynchronous and resolves to one of the response types from `./models`.
 */
export declare class WellMarked {
    private apiKey;
    private readonly baseUrl;
    private readonly timeoutMs;
    private readonly fetchImpl;
    private readonly extraHeaders;
    constructor(options?: WellMarkedOptions);
    /**
     * Turn a single URL into clean Markdown.
     *
     * Throws:
     * - `RateLimitError` when the monthly plan limit has been reached.
     * - `UnprocessableEntityError` for `no_content`, `target_timeout`, or
     *   `js_rendering_disabled`.
     * - `AuthenticationError` when the API key is missing or invalid.
     */
    extract(url: string, options?: ExtractOptions): Promise<ExtractResult>;
    /**
     * Queue a batch of URLs for concurrent extraction.
     *
     * Resolves right away with `status="queued"`; collect the results by
     * polling `getJob` or blocking on `waitForJob`.
     *
     * Throws:
     * - `PermissionDeniedError` for `plan_not_supported` (Free tier).
     * - `UnprocessableEntityError` for `bulk_cap_exceeded` (50 on Pro).
     * - `RateLimitError` when the batch would exceed the remaining monthly
     *   quota.
     */
    bulk(urls: Iterable<string>, options?: BulkOptions): Promise<BulkJob>;
    /**
     * Look up a job of either kind (bulk or crawl).
     *
     * `GET /bulk/{jobId}` is requested first and the `kind` discriminator
     * of its response inspected. For a crawl job a second request to
     * `GET /crawl/{jobId}` retrieves the full crawl shape (per-item depth
     * and the truncated flags). The result is a `BulkJob` or a `CrawlJob`
     * accordingly.
     *
     * Branch on crawl-specific behavior with `isCrawlJob(job)` or
     * `job.kind === "crawl"`. The shared fields (`status`, `completed`,
     * `total`, `results`, `done`) are available on both types.
     *
     * Jobs are retained for 6 hours after completion.
     */
    getJob(jobId: string): Promise<BulkJob | CrawlJob>;
    /**
     * Wait until a bulk or crawl job reports `status="done"`, or until the
     * timeout elapses.
     *
     * Only the first lookup goes through the polymorphic `getJob` to learn
     * the job's kind; later polls hit the typed endpoint directly, so a
     * crawl job pays the dispatch round-trip just once.
     *
     * Throws:
     * - `Error` with message "did not finish within ..." when the job is
     *   still unfinished after `timeoutMs`.
     */
    waitForJob(jobId: string, options?: WaitForJobOptions): Promise<BulkJob | CrawlJob>;
    /**
     * Start a breadth-first crawl from `url` down to `depth`.
     *
     * Resolves right away with `status="queued"`. Poll with `getJob` or
     * block with `waitForJob`; both accept crawl and bulk jobIds
     * transparently.
     *
     * Plan caps:
     * - Free → `PermissionDeniedError` (`plan_not_supported`)
     * - Pro → max depth 5, up to 1,000 pages per crawl
     * - Enterprise → unlimited depth and pages
     *
     * Throws:
     * - `PermissionDeniedError` for `plan_not_supported` (Free tier).
     * - `UnprocessableEntityError` for `crawl_depth_exceeded`.
     */
    crawl(url: string, options?: CrawlOptions): Promise<CrawlJob>;
    /**
     * Set (or overwrite) a header that is sent with every later request
     * made by this client.
     *
     * Authorization / Content-Type / Accept are reserved; attempts to set
     * them are silently ignored. Use `rotateKey()` to change the bearer
     * token.
     */
    setHeader(name: string, value: string): void;
    /** Drop a header that was added through `headers:` or `setHeader()`. */
    removeHeader(name: string): void;
    /**
     * Fetch usage for the current billing period.
     *
     * This call does not count toward the monthly quota.
     */
    getUsage(): Promise<Usage>;
    /**
     * Issue a new API key; the current key is invalidated immediately.
     *
     * The new raw key is returned in the `apiKey` field. Store it before
     * discarding the result, because there is no recovery flow.
     *
     * The client switches to the new key automatically for subsequent
     * requests.
     *
     * This call does not count toward the monthly quota.
     */
    rotateKey(): Promise<RotatedKey>;
    /**
     * Internal accessor for the current API key, exposed for tests.
     * It is not part of the public, semver-stable surface.
     */
    _getApiKey(): string;
    private request;
}
|
|
173
|
+
export { APIStatusError };
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WellMarked client.
|
|
3
|
+
*
|
|
4
|
+
* The client is a thin, typed wrapper around the HTTP API. All endpoint
|
|
5
|
+
* methods are async — there is no separate sync/async split as in the
|
|
6
|
+
* Python SDK because JavaScript I/O is async by default.
|
|
7
|
+
*
|
|
8
|
+
* import { WellMarked } from "wellmarked";
|
|
9
|
+
*
|
|
10
|
+
* const wm = new WellMarked({ apiKey: "wm_..." });
|
|
11
|
+
* const result = await wm.extract("https://example.com/article");
|
|
12
|
+
* console.log(result.markdown);
|
|
13
|
+
*
|
|
14
|
+
* The API key can also be passed via the `WELLMARKED_API_KEY` environment
|
|
15
|
+
* variable (Node.js), in which case `new WellMarked()` is enough.
|
|
16
|
+
*/
|
|
17
|
+
import { APIConnectionError, APIStatusError, WellMarkedError, fromResponse, } from "./errors.js";
|
|
18
|
+
import { bulkJobFromResponse, crawlJobFromResponse, extractResultFromResponse, rotatedKeyFromResponse, usageFromResponse, } from "./models.js";
|
|
19
|
+
import { VERSION } from "./version.js";
|
|
20
|
+
/** API origin used when the caller does not pass `baseUrl`. */
const DEFAULT_BASE_URL = "https://api.wellmarked.io";
/** Per-request timeout in milliseconds used when `timeoutMs` is not passed. */
const DEFAULT_TIMEOUT_MS = 30000;
/**
 * Lower-cased names of the headers the SDK sets itself. Caller-supplied
 * headers with these names are skipped.
 */
const RESERVED_HEADERS = new Set(["authorization", "content-type", "accept"]);
|
|
27
|
+
/**
 * Pick the API key to use: the explicit argument when it is truthy,
 * otherwise the `WELLMARKED_API_KEY` environment variable.
 *
 * The environment is only consulted when a `process.env` object exists, so
 * the function is safe to call in runtimes without `process`.
 *
 * @param apiKey - Key passed by the caller; may be undefined or empty.
 * @returns The resolved, non-empty key.
 * @throws Error when neither source yields a key.
 */
function resolveApiKey(apiKey) {
    let key = apiKey;
    if (!key && typeof process !== "undefined" && process.env) {
        key = process.env.WELLMARKED_API_KEY;
    }
    if (!key) {
        const message = [
            "No API key provided. Pass apiKey: ... to the client or set the",
            "WELLMARKED_API_KEY environment variable. Generate a key at",
            "https://wellmarked.io.",
        ].join(" ");
        throw new Error(message);
    }
    return key;
}
|
|
39
|
+
/**
 * Headers the SDK always sends: bearer authorization, JSON content type,
 * JSON accept, and a `wellmarked-js/<VERSION>` user agent.
 *
 * @param apiKey - Key placed after `Bearer ` in the Authorization header.
 * @returns A new plain object on every call.
 */
function defaultHeaders(apiKey) {
    const bearer = `Bearer ${apiKey}`;
    const userAgent = `wellmarked-js/${VERSION}`;
    return {
        Authorization: bearer,
        "Content-Type": "application/json",
        Accept: "application/json",
        "User-Agent": userAgent,
    };
}
|
|
47
|
+
/**
 * Build the header map for one request: SDK defaults plus caller extras.
 *
 * Reserved names (Authorization / Content-Type / Accept, in any casing)
 * found in `extra` are skipped. Any other extra replaces a default of the
 * same name regardless of casing. HTTP field names are case-insensitive,
 * so `user-agent` has to override `User-Agent`; leaving both keys in the
 * object would make fetch send the two values combined.
 *
 * @param apiKey - Key placed in the `Authorization: Bearer …` header.
 * @param extra - Optional caller-supplied headers; falsy means "none".
 * @returns A new plain object; `extra` is only read, never modified.
 */
function mergeHeaders(apiKey, extra) {
    const out = defaultHeaders(apiKey);
    if (!extra)
        return out;
    for (const [k, v] of Object.entries(extra)) {
        const lower = k.toLowerCase();
        if (RESERVED_HEADERS.has(lower))
            continue;
        // Remove differently-cased spellings of the same header so the
        // caller's value wins outright instead of being sent alongside.
        for (const existing of Object.keys(out)) {
            if (existing !== k && existing.toLowerCase() === lower)
                delete out[existing];
        }
        out[k] = v;
    }
    return out;
}
|
|
58
|
+
/**
 * Promise-based delay.
 *
 * @param ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
|
|
61
|
+
/**
 * Typed client for the WellMarked HTTP API.
 *
 * Holds the API key, base URL, per-request timeout, the `fetch`
 * implementation and any extra headers, and exposes one async method per
 * endpoint. All network traffic goes through the private `request` method.
 */
export class WellMarked {
    /**
     * @param options - Optional `apiKey`, `baseUrl`, `timeoutMs`, `fetch`
     *   and `headers`.
     * @throws Error when no API key can be resolved or no `fetch`
     *   implementation is available.
     */
    constructor(options = {}) {
        this.apiKey = resolveApiKey(options.apiKey);
        // Trailing slashes are stripped so `${baseUrl}${path}` never
        // contains a double slash.
        this.baseUrl = (options.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
        this.timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
        const f = options.fetch ?? (typeof fetch !== "undefined" ? fetch : undefined);
        if (!f) {
            throw new Error("No fetch implementation available. Pass `fetch:` to the client " +
                "(undici, node-fetch, etc.) or upgrade to Node 18+.");
        }
        // Bind so `this` isn't lost when calling globalThis.fetch.
        this.fetchImpl = f.bind(globalThis);
        this.extraHeaders = {};
        if (options.headers) {
            for (const [k, v] of Object.entries(options.headers)) {
                if (RESERVED_HEADERS.has(k.toLowerCase()))
                    continue;
                // Header names are case-insensitive: keep one spelling only.
                dropHeaderIgnoringCase(this.extraHeaders, k);
                this.extraHeaders[k] = v;
            }
        }
    }
    // ── Endpoints ──────────────────────────────────────────────────────────────
    /**
     * Extract clean Markdown from a single URL (`POST /extract`).
     *
     * `render_js` is sent as `true` only when `options.renderJs === true`.
     *
     * Throws:
     * - `RateLimitError` — monthly plan limit reached.
     * - `UnprocessableEntityError` — `no_content`, `target_timeout`, or
     *   `js_rendering_disabled`.
     * - `AuthenticationError` — missing or invalid API key.
     */
    async extract(url, options = {}) {
        const body = await this.request("POST", "/extract", {
            url,
            render_js: options.renderJs === true,
        });
        return extractResultFromResponse(body);
    }
    /**
     * Submit a batch of URLs for concurrent extraction (`POST /bulk`).
     *
     * Returns immediately with `status="queued"`. Poll with `getJob` or
     * block with `waitForJob` to collect results.
     *
     * A bare string is accepted and treated as a one-URL batch. A string
     * is itself an iterable of characters, so passing it to `Array.from`
     * would submit every character as a separate "URL".
     *
     * Throws:
     * - `Error` — the iterable yields no URLs.
     * - `PermissionDeniedError` — `plan_not_supported` (Free tier).
     * - `UnprocessableEntityError` — `bulk_cap_exceeded` (50 on Pro).
     * - `RateLimitError` — would exceed remaining monthly quota.
     */
    async bulk(urls, options = {}) {
        const urlList = typeof urls === "string" ? [urls] : Array.from(urls);
        if (urlList.length === 0) {
            throw new Error("bulk() requires at least one URL.");
        }
        const body = await this.request("POST", "/bulk", {
            urls: urlList,
            render_js: options.renderJs === true,
        });
        return bulkJobFromResponse(body);
    }
    /**
     * Polymorphic job lookup — works for both bulk and crawl jobs.
     *
     * Calls `GET /bulk/{jobId}` first, then inspects the response's `kind`
     * discriminator field. If the job is actually a crawl, a second request
     * to `GET /crawl/{jobId}` fetches the full crawl shape (with per-item
     * depth and the truncated flags). Returns `BulkJob` or `CrawlJob`
     * accordingly.
     *
     * `jobId` is percent-encoded before being placed in the path, so an id
     * containing `/`, `?` or `#` cannot redirect the request to a
     * different endpoint.
     *
     * Use `isCrawlJob(job)` (or check `job.kind === "crawl"`) to branch on
     * crawl-specific behavior. The shared interface (`status`, `completed`,
     * `total`, `results`, `done`) works on either type.
     *
     * Jobs are retained for 6 hours after completion.
     */
    async getJob(jobId) {
        const id = encodeURIComponent(jobId);
        const body = (await this.request("GET", `/bulk/${id}`));
        // /bulk/{id} answers for any jobId today (the endpoint just serializes
        // results in the bulk shape regardless of stored job_type). The `kind`
        // field tells us whether we got a bulk-shaped response of a crawl
        // job; if so, re-fetch via /crawl/{id} for the proper shape.
        if (body.kind === "crawl") {
            const crawlBody = (await this.request("GET", `/crawl/${id}`));
            return crawlJobFromResponse(crawlBody);
        }
        return bulkJobFromResponse(body);
    }
    /**
     * Block until a job reaches `status="done"` (or timeout). Works for both
     * bulk and crawl jobs.
     *
     * The first call uses the polymorphic `getJob` to discover the job's
     * kind. Subsequent polls go directly to the typed endpoint, so a crawl
     * job only pays the dispatch round-trip once.
     *
     * `options.pollIntervalMs` defaults to 2000. `options.timeoutMs`
     * defaults to 300000; `null` disables the deadline. The deadline is
     * checked before each sleep, so the call can overrun `timeoutMs` by up
     * to one poll interval plus one request.
     *
     * Throws:
     * - `Error` with message "did not finish within ..." — the job didn't
     *   finish before `timeoutMs` elapsed.
     */
    async waitForJob(jobId, options = {}) {
        const pollIntervalMs = options.pollIntervalMs ?? 2000;
        const timeoutMs = options.timeoutMs === undefined ? 300000 : options.timeoutMs;
        const deadline = timeoutMs === null ? null : Date.now() + timeoutMs;
        let job = await this.getJob(jobId);
        const isCrawl = job.kind === "crawl";
        // The kind never changes, so the poll path is computed once.
        const id = encodeURIComponent(jobId);
        const path = isCrawl ? `/crawl/${id}` : `/bulk/${id}`;
        while (!job.done) {
            if (deadline !== null && Date.now() >= deadline) {
                throw new Error(`Job ${jobId} did not finish within ${timeoutMs}ms ` +
                    `(last status: ${job.status}, ${job.completed}/${job.total})`);
            }
            await sleep(pollIntervalMs);
            const body = (await this.request("GET", path));
            job = isCrawl ? crawlJobFromResponse(body) : bulkJobFromResponse(body);
        }
        return job;
    }
    /**
     * Crawl a site starting from `url`, BFS to `depth` (`POST /crawl`).
     *
     * Returns immediately with `status="queued"`. Use `getJob` to poll, or
     * `waitForJob` to block until done — both handle crawl and bulk jobIds
     * transparently.
     *
     * `depth` defaults to 1. Negative values are rejected, and so are
     * `NaN` / `Infinity`, which `JSON.stringify` would otherwise turn into
     * `null` in the request body.
     *
     * Plan caps:
     * - Free → `PermissionDeniedError` (`plan_not_supported`)
     * - Pro → max depth 5, up to 1,000 pages per crawl
     * - Enterprise → unlimited depth and pages
     *
     * Throws:
     * - `Error` — `depth` is negative or not a finite number.
     * - `PermissionDeniedError` — `plan_not_supported` (Free tier).
     * - `UnprocessableEntityError` — `crawl_depth_exceeded`.
     */
    async crawl(url, options = {}) {
        const depth = options.depth ?? 1;
        if (depth < 0) {
            throw new Error("depth must be >= 0.");
        }
        if (typeof depth === "number" && !Number.isFinite(depth)) {
            throw new Error("depth must be a finite number.");
        }
        const body = await this.request("POST", "/crawl", {
            url,
            depth,
            render_js: options.renderJs === true,
        });
        return crawlJobFromResponse(body);
    }
    // ── Custom headers ─────────────────────────────────────────────────────────
    /**
     * Add or replace a per-request header for the rest of this client's life.
     *
     * Names are matched case-insensitively: setting `x-trace` replaces an
     * earlier `X-Trace` rather than adding a second entry.
     *
     * Authorization / Content-Type / Accept are reserved — calls that try
     * to set those are silently ignored. To rotate the bearer token, use
     * `rotateKey()`.
     */
    setHeader(name, value) {
        if (RESERVED_HEADERS.has(name.toLowerCase()))
            return;
        dropHeaderIgnoringCase(this.extraHeaders, name);
        this.extraHeaders[name] = value;
    }
    /**
     * Remove a header previously added via `headers:` or `setHeader()`.
     * The name is matched case-insensitively.
     */
    removeHeader(name) {
        dropHeaderIgnoringCase(this.extraHeaders, name);
    }
    /**
     * Return your usage for the current billing period (`GET /usage`).
     *
     * Does not count toward your monthly quota.
     */
    async getUsage() {
        const body = await this.request("GET", "/usage");
        return usageFromResponse(body);
    }
    /**
     * Mint a new API key (`POST /keys/rotate`). The current key is
     * invalidated immediately.
     *
     * The new raw key is in the returned `apiKey` field — store it before
     * discarding the result. There is no recovery flow.
     *
     * The client auto-swaps to the new key for subsequent requests, but
     * only when the response actually carried a non-empty key.
     *
     * Does not count toward your monthly quota.
     */
    async rotateKey() {
        const body = await this.request("POST", "/keys/rotate");
        const rotated = rotatedKeyFromResponse(body);
        if (rotated.apiKey) {
            this.apiKey = rotated.apiKey;
        }
        return rotated;
    }
    /**
     * Internal: read the current API key. Exposed for tests.
     * Not part of the public, semver-stable surface.
     */
    _getApiKey() {
        return this.apiKey;
    }
    // ── Transport ──────────────────────────────────────────────────────────────
    /**
     * Send one HTTP request and return the decoded JSON body.
     *
     * The timeout timer stays armed until the response body has been fully
     * read. `fetch` resolves as soon as headers arrive, so clearing the
     * timer at that point would let a stalled body hang indefinitely.
     *
     * @param method - HTTP method.
     * @param path - Path appended to the base URL (must start with `/`).
     * @param json - Optional value sent as the JSON request body.
     * @returns Whatever `parseResponse` returns for the status and body.
     * @throws APIConnectionError when the request fails or the body cannot
     *   be read (including timeouts); errors raised by `parseResponse`
     *   propagate unchanged.
     */
    async request(method, path, json) {
        const url = `${this.baseUrl}${path}`;
        const headers = mergeHeaders(this.apiKey, this.extraHeaders);
        const init = { method, headers };
        if (json !== undefined) {
            init.body = JSON.stringify(json);
        }
        let timer = null;
        if (this.timeoutMs > 0 && typeof AbortController !== "undefined") {
            const controller = new AbortController();
            init.signal = controller.signal;
            timer = setTimeout(() => controller.abort(), this.timeoutMs);
        }
        let response;
        let bodyText = "";
        try {
            try {
                response = await this.fetchImpl(url, init);
            }
            catch (err) {
                throw new APIConnectionError(`Could not reach the WellMarked API: ${stringifyError(err)}`, { cause: err });
            }
            try {
                bodyText = await response.text();
            }
            catch (err) {
                throw new APIConnectionError(`Could not read API response body: ${stringifyError(err)}`, { cause: err });
            }
        }
        finally {
            if (timer !== null)
                clearTimeout(timer);
        }
        // An unparseable body is treated like a missing one; parseResponse
        // decides what that means for the given status code.
        let body = null;
        if (bodyText.length > 0) {
            try {
                body = JSON.parse(bodyText);
            }
            catch {
                body = null;
            }
        }
        return parseResponse(response.status, body);
    }
}
/** Delete every key of `headers` that equals `name` ignoring case. */
function dropHeaderIgnoringCase(headers, name) {
    const wanted = name.toLowerCase();
    for (const key of Object.keys(headers)) {
        if (key.toLowerCase() === wanted)
            delete headers[key];
    }
}
|
|
302
|
+
/**
 * Turn a status code plus decoded body into a return value or an error.
 *
 * - 2xx with a body: the body is returned as-is.
 * - 2xx with a `null` body: throws `WellMarkedError`, since every
 *   documented endpoint is expected to answer 2xx with JSON.
 * - Anything else: throws the error built by `fromResponse`, passing the
 *   body's `request_id` along when it is a string.
 *
 * @param statusCode - HTTP status of the response.
 * @param body - Decoded JSON body, or `null` when absent or unparseable.
 */
function parseResponse(statusCode, body) {
    const rawId = body !== null && typeof body === "object" ? body.request_id : undefined;
    const requestId = typeof rawId === "string" ? rawId : undefined;
    const succeeded = statusCode >= 200 && statusCode < 300;
    if (!succeeded) {
        throw fromResponse(statusCode, body, requestId);
    }
    if (body !== null) {
        return body;
    }
    // A null body on 2xx means the server broke its contract (or a
    // middlebox stripped the body). Fail loudly here instead of letting
    // downstream parsing crash on a property access of null.
    throw new WellMarkedError(`API returned HTTP ${statusCode} with no JSON body. ` +
        "This is a contract violation — please report it.", { statusCode });
}
|
|
322
|
+
/**
 * Render a caught value for inclusion in an error message.
 *
 * @param err - Anything that was thrown.
 * @returns `"<name>: <message>"` for `Error` instances, `String(err)` for
 *   everything else.
 */
function stringifyError(err) {
    return err instanceof Error ? `${err.name}: ${err.message}` : String(err);
}
|
|
328
|
+
// Re-export the APIStatusError type so consumers can narrow without
|
|
329
|
+
// pulling from "./errors" directly.
|
|
330
|
+
export { APIStatusError };
|