@ozzylabs/feedradar 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +12 -6
- package/README.md +11 -6
- package/dist/agents/claude-code.d.ts +12 -1
- package/dist/agents/claude-code.d.ts.map +1 -1
- package/dist/agents/claude-code.js +9 -5
- package/dist/agents/claude-code.js.map +1 -1
- package/dist/agents/codex-cli.d.ts +7 -1
- package/dist/agents/codex-cli.d.ts.map +1 -1
- package/dist/agents/codex-cli.js +9 -5
- package/dist/agents/codex-cli.js.map +1 -1
- package/dist/agents/copilot.d.ts +7 -1
- package/dist/agents/copilot.d.ts.map +1 -1
- package/dist/agents/copilot.js +9 -5
- package/dist/agents/copilot.js.map +1 -1
- package/dist/agents/gemini-cli.d.ts +7 -1
- package/dist/agents/gemini-cli.d.ts.map +1 -1
- package/dist/agents/gemini-cli.js +9 -5
- package/dist/agents/gemini-cli.js.map +1 -1
- package/dist/agents/index.d.ts +1 -1
- package/dist/agents/index.d.ts.map +1 -1
- package/dist/agents/types.d.ts +33 -0
- package/dist/agents/types.d.ts.map +1 -1
- package/dist/cli/_progress.d.ts +138 -0
- package/dist/cli/_progress.d.ts.map +1 -0
- package/dist/cli/_progress.js +176 -0
- package/dist/cli/_progress.js.map +1 -0
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +2 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/research.d.ts +18 -20
- package/dist/cli/research.d.ts.map +1 -1
- package/dist/cli/research.js +318 -203
- package/dist/cli/research.js.map +1 -1
- package/dist/cli/review.d.ts +7 -0
- package/dist/cli/review.d.ts.map +1 -1
- package/dist/cli/review.js +46 -1
- package/dist/cli/review.js.map +1 -1
- package/dist/cli/source.d.ts +23 -2
- package/dist/cli/source.d.ts.map +1 -1
- package/dist/cli/source.js +428 -7
- package/dist/cli/source.js.map +1 -1
- package/dist/cli/update.d.ts +7 -0
- package/dist/cli/update.d.ts.map +1 -1
- package/dist/cli/update.js +41 -1
- package/dist/cli/update.js.map +1 -1
- package/dist/cli/watch.d.ts.map +1 -1
- package/dist/cli/watch.js +67 -3
- package/dist/cli/watch.js.map +1 -1
- package/dist/cli/workflow/generate-combined.d.ts +100 -0
- package/dist/cli/workflow/generate-combined.d.ts.map +1 -0
- package/dist/cli/workflow/generate-combined.js +387 -0
- package/dist/cli/workflow/generate-combined.js.map +1 -0
- package/dist/cli/workflow/generate-watch.d.ts +142 -0
- package/dist/cli/workflow/generate-watch.d.ts.map +1 -0
- package/dist/cli/workflow/generate-watch.js +338 -0
- package/dist/cli/workflow/generate-watch.js.map +1 -0
- package/dist/cli/workflow.d.ts +29 -0
- package/dist/cli/workflow.d.ts.map +1 -0
- package/dist/cli/workflow.js +66 -0
- package/dist/cli/workflow.js.map +1 -0
- package/dist/core/feeds/_fetch.d.ts +10 -0
- package/dist/core/feeds/_fetch.d.ts.map +1 -1
- package/dist/core/feeds/_fetch.js +182 -0
- package/dist/core/feeds/_fetch.js.map +1 -1
- package/dist/core/feeds/_jsonpath.d.ts +57 -0
- package/dist/core/feeds/_jsonpath.d.ts.map +1 -0
- package/dist/core/feeds/_jsonpath.js +207 -0
- package/dist/core/feeds/_jsonpath.js.map +1 -0
- package/dist/core/feeds/html-js.d.ts +8 -0
- package/dist/core/feeds/html-js.d.ts.map +1 -1
- package/dist/core/feeds/html-js.js +47 -1
- package/dist/core/feeds/html-js.js.map +1 -1
- package/dist/core/feeds/index.d.ts +1 -1
- package/dist/core/feeds/index.d.ts.map +1 -1
- package/dist/core/feeds/index.js +4 -0
- package/dist/core/feeds/index.js.map +1 -1
- package/dist/core/feeds/json-api.d.ts +29 -0
- package/dist/core/feeds/json-api.d.ts.map +1 -0
- package/dist/core/feeds/json-api.js +860 -0
- package/dist/core/feeds/json-api.js.map +1 -0
- package/dist/core/feeds/json-feed.d.ts +11 -0
- package/dist/core/feeds/json-feed.d.ts.map +1 -0
- package/dist/core/feeds/json-feed.js +242 -0
- package/dist/core/feeds/json-feed.js.map +1 -0
- package/dist/core/feeds/types.d.ts +123 -0
- package/dist/core/feeds/types.d.ts.map +1 -1
- package/dist/core/progress.d.ts +101 -0
- package/dist/core/progress.d.ts.map +1 -0
- package/dist/core/progress.js +212 -0
- package/dist/core/progress.js.map +1 -0
- package/dist/core/recipes.d.ts +138 -0
- package/dist/core/recipes.d.ts.map +1 -0
- package/dist/core/recipes.js +242 -0
- package/dist/core/recipes.js.map +1 -0
- package/dist/core/watcher.d.ts +61 -1
- package/dist/core/watcher.d.ts.map +1 -1
- package/dist/core/watcher.js +99 -2
- package/dist/core/watcher.js.map +1 -1
- package/dist/recipes/aws-whats-new.yaml +87 -0
- package/dist/recipes/dev-to.yaml +40 -0
- package/dist/schemas/index.d.ts +1 -0
- package/dist/schemas/index.d.ts.map +1 -1
- package/dist/schemas/index.js +1 -0
- package/dist/schemas/index.js.map +1 -1
- package/dist/schemas/recipe.d.ts +127 -0
- package/dist/schemas/recipe.d.ts.map +1 -0
- package/dist/schemas/recipe.js +57 -0
- package/dist/schemas/recipe.js.map +1 -0
- package/dist/schemas/source.d.ts +222 -0
- package/dist/schemas/source.d.ts.map +1 -1
- package/dist/schemas/source.js +234 -0
- package/dist/schemas/source.js.map +1 -1
- package/dist/templates/agents/AGENTS.md +33 -3
- package/dist/templates/feedradar.md +23 -8
- package/dist/templates/workflows/combined.template.yaml.tmpl +110 -0
- package/dist/templates/workflows/watch.template.yaml.tmpl +103 -0
- package/package.json +1 -2
|
@@ -0,0 +1,860 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { ItemSchema } from "../../schemas/index.js";
|
|
3
|
+
import { fetchWithRetry } from "./_fetch.js";
|
|
4
|
+
import { selectAll, selectOne } from "./_jsonpath.js";
|
|
5
|
+
import { deriveItemId, deriveStableKey } from "./derive-id.js";
|
|
6
|
+
const USER_AGENT = "feedradar/0.0.0 (+https://github.com/ozzy-labs/feedradar)";
|
|
7
|
+
/**
|
|
8
|
+
* Prefix marking a content-hash entry (vs a real ETag) inside `state.lastEtag`.
|
|
9
|
+
* Mirrors `_html-common.ts` so re-fetches without a server ETag still dedup.
|
|
10
|
+
*/
|
|
11
|
+
const CONTENT_HASH_PREFIX = "sha256:";
|
|
12
|
+
/**
 * Fallback JSONPath candidates for locating the item array when the recipe
 * leaves `jsonSelectors.items` unset (ADR-0012 §D2). `resolveItemsList`
 * tries them in the order listed and adopts the first one that matches
 * anything; the bare-array form `$[*]` is deliberately last.
 */
const DEFAULT_ITEMS_PATHS = ["items", "data", "results", "posts", "entries"]
    .map((key) => `$.${key}[*]`)
    .concat("$[*]");
|
|
24
|
+
/**
 * Default per-field JSONPath candidates, used when the matching
 * `jsonSelectors.<field>` entry is omitted (#174). For every item element the
 * candidates are consulted left to right and the first one producing a
 * non-nullish value wins, so recipes for simple APIs can omit selectors.
 * The adopted candidate is recorded from the first item of a fetch and is
 * reported through `FeedFetchDiag.selectorAdoption` for auditing.
 */
const DEFAULT_FIELD_PATHS = Object.fromEntries([
    ["title", ["title", "name", "headline"]],
    ["link", ["url", "link", "permalink", "html_url"]],
    ["publishedAt", ["publishedAt", "published_at", "date", "created_at", "pubDate"]],
    ["summary", ["summary", "description", "excerpt", "body"]],
].map(([field, keys]) => [field, keys.map((key) => `$.${key}`)]));
|
|
39
|
+
/**
|
|
40
|
+
* Maximum response body size per page. ADR-0012 §D5a hardcodes this so a
|
|
41
|
+
* malformed recipe cannot blow up memory / context window. The cap is
|
|
42
|
+
* intentionally not user-configurable.
|
|
43
|
+
*/
|
|
44
|
+
const RESPONSE_SIZE_CAP_BYTES = 10 * 1024 * 1024; // 10 MB
|
|
45
|
+
/**
 * Expand `${VAR}` placeholders in a header value from `env` (ADR-0012 §D5c).
 *
 * - If any referenced variable is missing, empty, or not a string, the
 *   function returns `undefined` so the caller drops the whole header
 *   (degraded fetch). Public APIs keep working without env wiring while
 *   authenticated APIs surface a 401/403 at runtime.
 * - Only string values are accepted. A plain property read on `env` also
 *   sees inherited members, so `${constructor}` or `${toString}` would
 *   otherwise interpolate a function's source text into the request.
 * - An unterminated `${` is copied through literally.
 * - The returned value MUST NEVER be logged; it may contain secrets.
 *
 * @param {string} raw - Header value template from the recipe.
 * @param {Record<string, string | undefined>} env - Variable lookup table.
 * @returns {string | undefined} The expanded value, or `undefined` when a
 *   placeholder could not be resolved.
 */
function interpolateHeaderValue(raw, env) {
    // Fast path: most headers contain no placeholder at all.
    if (!raw.includes("${"))
        return raw;
    let resolved = "";
    let cursor = 0;
    while (cursor < raw.length) {
        const open = raw.indexOf("${", cursor);
        if (open === -1) {
            resolved += raw.slice(cursor);
            break;
        }
        resolved += raw.slice(cursor, open);
        const close = raw.indexOf("}", open + 2);
        if (close === -1) {
            // Malformed placeholder: keep the remainder verbatim.
            resolved += raw.slice(open);
            break;
        }
        const name = raw.slice(open + 2, close);
        const value = env[name];
        if (typeof value !== "string" || value.length === 0) {
            // ADR-0012 §D5c: unresolved → drop the entire header.
            return undefined;
        }
        resolved += value;
        cursor = close + 1;
    }
    return resolved;
}
|
|
85
|
+
/**
 * Assemble the outgoing request headers for a source.
 *
 * Starts from two defaults (`accept` preferring JSON, and the adapter's
 * `user-agent`) and layers the recipe's `http.headers` on top, so a recipe
 * can override either default. Header names are lower-cased before being
 * stored. A recipe header whose `${VAR}` placeholder cannot be resolved is
 * left out entirely (ADR-0012 §D5c degraded-fetch policy).
 *
 * @param {object} source - Source recipe; `source.http?.headers` is optional.
 * @param {Record<string, string | undefined>} env - Lookup table for `${VAR}`.
 * @returns {Record<string, string>} Header map keyed by lower-case name.
 */
function buildHeaders(source, env) {
    const merged = {
        accept: "application/json, */*;q=0.5",
        "user-agent": USER_AGENT,
    };
    const declared = Object.entries(source.http?.headers ?? {});
    for (const [name, template] of declared) {
        const value = interpolateHeaderValue(template, env);
        if (value === undefined) {
            // Unresolved env placeholder: omit this header.
            continue;
        }
        merged[name.toLowerCase()] = value;
    }
    return merged;
}
|
|
108
|
+
/**
 * Extract the `rel="next"` target from an HTTP `Link` header, for
 * `type: link-header` pagination. Returns `null` when the header is absent
 * or carries no `next` relation (= end of pagination).
 *
 * The header is scanned link-by-link using the `<...>` delimiters rather
 * than by splitting on commas, because commas are legal inside URLs
 * (e.g. `?ids=1,2`). The `rel` parameter may appear anywhere in a link's
 * parameter list, quoted or unquoted, and may hold several space-separated
 * relation types (`rel="next first"`).
 *
 * NOTE on SSRF: an upstream could emit a `Link` header pointing at a
 * loopback or cloud-metadata address. Per the original author's note, the
 * host-allowlist defense (ADR-0012 §D5b) is intended to live in the shared
 * fetch wrapper rather than here, since the same exposure exists for
 * cursor/token pagination and for `source.url` itself.
 *
 * @param {string | null | undefined} value - Raw `Link` header value.
 * @returns {string | null} URL of the next page, exactly as written in the
 *   header, or `null`.
 */
function parseLinkHeader(value) {
    if (!value)
        return null;
    // One match per link: group 1 is the target URL, group 2 is everything
    // up to the next `<` (that link's parameters plus the separating comma).
    const linkPattern = /<([^>]+)>([^<]*)/g;
    // `rel` must start a parameter (string start or after `;`).
    const relPattern = /(?:^|;)\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,"]+))/i;
    for (const [, url, params] of value.matchAll(linkPattern)) {
        const relMatch = relPattern.exec(params);
        if (!relMatch)
            continue;
        const rel = relMatch[1] ?? relMatch[2] ?? "";
        const relations = rel.toLowerCase().split(/\s+/);
        if (relations.includes("next"))
            return url;
    }
    return null;
}
|
|
146
|
+
/**
 * Return `url` with query parameter `name` set to `value`, overwriting any
 * existing parameter of that name. Pagination uses this to thread the page,
 * offset, token and page-size values into request URLs.
 *
 * @param {string} url - Absolute URL; an unparsable value makes `URL` throw.
 * @param {string} name - Query parameter name.
 * @param {unknown} value - Stringified with `String(value)`.
 * @returns {string} The serialized URL.
 */
function setQueryParam(url, name, value) {
    const target = new URL(url);
    const { searchParams } = target;
    searchParams.set(name, String(value));
    return target.href;
}
|
|
156
|
+
/**
 * Locate the item list inside a page body.
 *
 * With an explicit `selectors.items` path, that path is evaluated and its
 * result returned as-is (even when empty). Without one, the default chain
 * (ADR-0012 §D2) is walked and the first candidate that matches at least one
 * element is adopted. If nothing matches, an empty list is returned together
 * with the first default path as a placeholder.
 *
 * @param {object} selectors - Recipe selectors; `items` is optional.
 * @param {unknown} body - Parsed JSON page body.
 * @returns {{ matches: unknown[], path: string }} Matched elements and the
 *   path that produced them (used by debug surfaces such as `source test`).
 */
function resolveItemsList(selectors, body) {
    const { items: explicitPath } = selectors;
    if (!explicitPath) {
        for (const path of DEFAULT_ITEMS_PATHS) {
            const found = selectAll(path, body);
            if (found.length > 0) {
                return { matches: found, path };
            }
        }
        return { matches: [], path: DEFAULT_ITEMS_PATHS[0] };
    }
    return { matches: selectAll(explicitPath, body), path: explicitPath };
}
|
|
173
|
+
/**
 * Read one field from an item element, with default-chain fallback.
 *
 * When the recipe supplied `explicit`, only that path is evaluated and its
 * value returned unchanged (nullish values included). Otherwise the
 * candidates in `DEFAULT_FIELD_PATHS[field]` are tried in order and the first
 * non-nullish value wins; if all miss, `{ value: undefined, path: null }` is
 * returned.
 *
 * The matched path is returned alongside the value so the adapter can record
 * which candidate was adopted and `source test` can report it
 * (e.g. "title ← $.headline adopted").
 *
 * @param {"title" | "link" | "publishedAt" | "summary"} field - Field name.
 * @param {string | undefined} explicit - Recipe-supplied JSONPath, if any.
 * @param {unknown} element - One element matched by the items selector.
 * @returns {{ value: unknown, path: string | null }}
 */
function resolveFieldWithFallback(field, explicit, element) {
    if (!explicit) {
        for (const path of DEFAULT_FIELD_PATHS[field]) {
            const found = selectOne(path, element);
            if (found != null) {
                return { value: found, path };
            }
        }
        return { value: undefined, path: null };
    }
    return { value: selectOne(explicit, element), path: explicit };
}
|
|
197
|
+
/**
 * Resolve an item's `link` value against a base URL (#204).
 *
 * Some JSON APIs (AWS What's New is the motivating case) return per-item
 * links as relative paths such as `/about-aws/whats-new/.../`. Left as-is,
 * those fail the URL validation in `ItemSchema` and every item is dropped.
 * The caller passes `selectors.linkBase` when set, else `source.url`, as the
 * base. Absolute inputs are unaffected because `new URL(abs, base)` ignores
 * the base.
 *
 * `URL` constructor failures are intentionally swallowed: the unresolved
 * input is returned so the bad record is rejected later by schema validation
 * instead of aborting the whole page.
 *
 * @param {string} raw - Link value taken from the item.
 * @param {string} base - Base URL used for relative inputs.
 * @returns {string} The resolved absolute URL, or `raw` if resolution failed.
 */
function resolveLinkUrl(raw, base) {
    let resolved = raw;
    try {
        resolved = new URL(raw, base).href;
    }
    catch {
        // Deliberate: fall through with the raw value (see doc comment).
    }
    return resolved;
}
|
|
222
|
+
/**
 * Coerce a JSON value to a usable string.
 *
 * Strings are trimmed and become `undefined` when nothing is left; numbers
 * and booleans are stringified; everything else (null, undefined, objects,
 * arrays) yields `undefined`.
 *
 * @param {unknown} value - Value read from the JSON payload.
 * @returns {string | undefined}
 */
function coerceString(value) {
    switch (typeof value) {
        case "string": {
            const text = value.trim();
            return text === "" ? undefined : text;
        }
        case "number":
        case "boolean":
            return String(value);
        default:
            return undefined;
    }
}
|
|
234
|
+
/**
 * Coerce a JSON value to an ISO 8601 timestamp string.
 *
 * The value is first passed through `coerceString`; the resulting text is
 * parsed with the built-in date parser. Empty, non-coercible, or unparsable
 * input yields `undefined`.
 *
 * @param {unknown} value - Value read from the JSON payload.
 * @returns {string | undefined} `Date#toISOString()` output, or `undefined`.
 */
function coerceIsoDate(value) {
    const text = coerceString(value);
    if (!text) {
        return undefined;
    }
    const epochMs = Date.parse(text);
    return Number.isNaN(epochMs) ? undefined : new Date(epochMs).toISOString();
}
|
|
244
|
+
/**
 * Convert one element matched by the items selector into the canonical
 * `Item` shape. Returns `null` when the element has no usable link or when
 * the assembled candidate fails `ItemSchema` validation, so a single broken
 * record never aborts the page.
 *
 * `Item.id` derivation follows ADR-0002, in priority order:
 *
 *   1. `selectors.publisherId` (explicit, most stable)
 *   2. the resolved link URL (canonical identifier)
 *   3. a hash of title + publishedAt (fallback)
 *
 * `adoption` is mutated in place. For each defaultable field the first call
 * that reaches it stores the JSONPath that was used (or `null` when every
 * candidate missed); later calls leave the entry untouched. Note that
 * `publishedAt` and `summary` are only reached for elements that have a
 * link, because the missing-link check returns before they are resolved.
 *
 * @param {unknown} element - Raw element from the page body.
 * @param {object} source - Source recipe (`id`, `url` are read).
 * @param {object} selectors - Recipe `jsonSelectors` (all fields optional).
 * @param {string} fetchedAt - Timestamp stamped onto the item.
 * @param {object} adoption - Selector-adoption record, updated in place.
 * @returns {object | null} The validated item, or `null`.
 */
function elementToItem(element, source, selectors, fetchedAt, adoption) {
    // Resolve a defaultable field and remember which path served it the
    // first time that field is seen.
    const resolveAndRecord = (field) => {
        const { value, path } = resolveFieldWithFallback(field, selectors[field], element);
        if (adoption[field] === undefined) {
            adoption[field] = path;
        }
        return value;
    };
    const title = coerceString(resolveAndRecord("title")) ?? "";
    const rawLink = coerceString(resolveAndRecord("link"));
    if (!rawLink) {
        return null;
    }
    // Relative links are resolved against `linkBase`, falling back to the
    // source URL (#204); absolute links come back unchanged.
    const url = resolveLinkUrl(rawLink, selectors.linkBase ?? source.url);
    const publisherId = selectors.publisherId
        ? coerceString(selectOne(selectors.publisherId, element))
        : undefined;
    const publishedAt = coerceIsoDate(resolveAndRecord("publishedAt"));
    const summary = coerceString(resolveAndRecord("summary"));
    const body = selectors.body ? coerceString(selectOne(selectors.body, element)) : undefined;
    // `selectors.tags` is accepted by the schema but not surfaced as a
    // structured field here; tag data stays available inside `raw`.
    const stableKey = deriveStableKey({
        publisherId,
        url,
        fallbackHashInputs: [title, publishedAt],
    });
    const candidate = {
        id: deriveItemId(title, stableKey),
        sourceId: source.id,
        title,
        url,
        fetchedAt,
        raw: element,
    };
    if (publishedAt) {
        candidate.publishedAt = publishedAt;
    }
    // The body selector only fills `summary` when no summary was found; the
    // full body otherwise remains reachable through `raw`.
    const summaryText = summary || body;
    if (summaryText) {
        candidate.summary = summaryText;
    }
    const parsed = ItemSchema.safeParse(candidate);
    return parsed.success ? parsed.data : null;
}
|
|
324
|
+
/**
 * Fetch and decode one page of a paginated JSON API.
 *
 * Conditional GET (`if-none-match`) is attached only when all of these hold:
 * this is page 0, the caller did not set `state.sendConditional` to `false`
 * (backfill mode does, so a stale ETag cannot 304-out a full traversal),
 * `state.etag` is a real ETag rather than a `sha256:` content-hash marker,
 * and the recipe has not already supplied its own `if-none-match`.
 *
 * The size cap is enforced on the UTF-8 byte length of the decoded body so
 * that multi-byte text is measured in the same unit as the cap. The check
 * runs after the body has been read, so it bounds what gets parsed and
 * retained rather than what is transferred.
 *
 * @param {string} url - Page URL.
 * @param {Function} fetchImpl - `fetch`-compatible implementation.
 * @param {Record<string, string>} headers - Base request headers (not mutated).
 * @param {object} pagination - Pagination config; only `type` is read here.
 * @param {number} pageIndex - Zero-based page index.
 * @param {{ etag?: string | null, sendConditional?: boolean }} state
 * @returns {Promise<{ body: unknown, bodyText: string, status: number,
 *   etag: string | null, linkNext: string | null }>} On a 304 the `body` is
 *   `null` and `bodyText` is empty.
 * @throws {Error} On a non-2xx status other than 304, a body over the size
 *   cap, or a body that is not valid JSON (original error kept as `cause`).
 */
async function fetchPage(url, fetchImpl, headers, pagination, pageIndex, state) {
    const requestHeaders = { ...headers };
    if (pageIndex === 0 &&
        state.sendConditional !== false &&
        state.etag &&
        !state.etag.startsWith(CONTENT_HASH_PREFIX) &&
        !("if-none-match" in requestHeaders)) {
        requestHeaders["if-none-match"] = state.etag;
    }
    const response = await fetchWithRetry(fetchImpl, url, { headers: requestHeaders });
    const etag = response.headers.get("etag");
    const linkNext = pagination.type === "link-header" ? parseLinkHeader(response.headers.get("link")) : null;
    if (response.status === 304) {
        return { body: null, bodyText: "", status: 304, etag, linkNext };
    }
    if (response.status < 200 || response.status >= 300) {
        throw new Error(`json-api adapter: HTTP ${response.status} from ${url}`);
    }
    const bodyText = await response.text();
    // `String#length` counts UTF-16 code units, not bytes; measure the encoded
    // size so the comparison and the message really are in bytes.
    const bodyBytes = Buffer.byteLength(bodyText, "utf8");
    if (bodyBytes > RESPONSE_SIZE_CAP_BYTES) {
        throw new Error(`json-api adapter: response too large (${bodyBytes} bytes > ${RESPONSE_SIZE_CAP_BYTES} cap) from ${url}`);
    }
    let parsed;
    try {
        parsed = JSON.parse(bodyText);
    }
    catch (e) {
        throw new Error(`json-api adapter: failed to parse JSON from ${url}: ${e instanceof Error ? e.message : String(e)}`, { cause: e });
    }
    return { body: parsed, bodyText, status: response.status, etag, linkNext };
}
|
|
368
|
+
/**
 * Work out the URL of the next page from the pagination strategy and the
 * page just fetched. Returns `null` when traversal should stop.
 *
 * - `none`: never paginates.
 * - `link-header`: always `null` here; the next URL for that strategy comes
 *   from the response's `Link` header, which `fetchPage` reads.
 * - `page` / `offset`: stop on an empty page; otherwise stamp the next page
 *   number (`start + pageCountSoFar`) or offset
 *   (`start + pageCountSoFar * limit`, where `limit` is `pageSize` or, when
 *   unset, the current page's item count) onto `currentUrl`, plus the
 *   page-size parameter when `pageSize` is configured.
 * - `cursor` / `token`: read the cursor at `nextCursorPath` from the current
 *   body and set it on the recipe URL (`source.url`); stop when the path is
 *   not configured or yields nothing.
 *
 * @param {object} source - Source recipe (`url` is read for cursor/token).
 * @param {object} pagination - Pagination config.
 * @param {string} currentUrl - URL of the page just fetched.
 * @param {unknown} currentBody - Parsed body of the page just fetched.
 * @param {number} currentItemsLength - Items matched on that page.
 * @param {number} pageCountSoFar - Multiplier for the next page/offset.
 * @returns {string | null | undefined} Next URL, `null` to stop, or
 *   `undefined` for an unrecognized `pagination.type`.
 */
function computeNextUrl(source, pagination, currentUrl, currentBody, currentItemsLength, pageCountSoFar) {
    const { type } = pagination;
    if (type === "none" || type === "link-header") {
        return null;
    }
    if (type === "page" || type === "offset") {
        if (currentItemsLength === 0) {
            return null;
        }
        const isPage = type === "page";
        const start = pagination.start ?? 0;
        const limit = pagination.pageSize ?? currentItemsLength;
        const position = isPage ? start + pageCountSoFar : start + pageCountSoFar * limit;
        // The default parameter name equals the strategy name ("page" / "offset").
        const advanced = setQueryParam(currentUrl, pagination.param ?? type, position);
        if (pagination.pageSize === undefined) {
            return advanced;
        }
        const sizeParam = pagination.pageSizeParam ?? (isPage ? "pageSize" : "limit");
        return setQueryParam(advanced, sizeParam, isPage ? pagination.pageSize : limit);
    }
    if (type === "cursor" || type === "token") {
        if (!pagination.nextCursorPath) {
            return null;
        }
        const cursor = coerceString(selectOne(pagination.nextCursorPath, currentBody));
        if (!cursor) {
            return null;
        }
        const param = pagination.param ?? (type === "cursor" ? "after" : "pageToken");
        return setQueryParam(source.url, param, cursor);
    }
    // Unrecognized strategy: no branch applies.
    return undefined;
}
|
|
425
|
+
/**
 * Build the URL for the first page.
 *
 * `page` and `offset` strategies stamp `start` (default 0) and, when
 * configured, `pageSize` onto the recipe URL. Every other recognized
 * strategy uses the recipe URL exactly as written; for `cursor` / `token`
 * any initial cursor is expected to already be part of that URL.
 *
 * @param {object} source - Source recipe (`url` is read).
 * @param {object} pagination - Pagination config.
 * @returns {string | undefined} First-page URL, or `undefined` for an
 *   unrecognized `pagination.type`.
 */
function initialUrl(source, pagination) {
    const { type } = pagination;
    if (type === "page" || type === "offset") {
        // The default parameter name equals the strategy name ("page" / "offset").
        const seeded = setQueryParam(source.url, pagination.param ?? type, pagination.start ?? 0);
        if (pagination.pageSize === undefined) {
            return seeded;
        }
        const sizeParam = pagination.pageSizeParam ?? (type === "page" ? "pageSize" : "limit");
        return setQueryParam(seeded, sizeParam, pagination.pageSize);
    }
    if (["none", "link-header", "cursor", "token"].includes(type)) {
        return source.url;
    }
    // Unrecognized strategy: no branch applies.
    return undefined;
}
|
|
461
|
+
/**
 * Decide how many pages a traversal may fetch at most.
 *
 * Outside backfill mode the recipe's `pagination.maxPages` is returned
 * unchanged and `override` is ignored. In backfill mode an explicit
 * override (the `--max-pages` flag) can lower the recipe cap but never
 * raise it.
 *
 * @param {{ maxPages: number }} pagination - Pagination config.
 * @param {boolean} backfill - Whether this is a backfill run.
 * @param {number | undefined} override - Optional caller-supplied cap.
 * @returns {number} The effective page cap.
 */
function effectiveMaxPages(pagination, backfill, override) {
    const { maxPages } = pagination;
    const overrideApplies = Boolean(backfill) && override !== undefined;
    return overrideApplies ? Math.min(maxPages, override) : maxPages;
}
|
|
477
|
+
/**
 * Inject one facet value into a URL as a templated query parameter.
 *
 * The first `{}` in `facet.template` is replaced by the stringified value
 * and the result is set as `facet.param`, replacing any existing value so a
 * placeholder or default in the recipe URL is not duplicated.
 *
 * The substitution uses a replacer function so the value is inserted
 * literally: with a plain string replacement, sequences such as `$$` or
 * `$&` inside the value would be expanded as replacement patterns.
 *
 * @param {string} rawUrl - Absolute URL to modify.
 * @param {{ param: string, template: string }} facet - Facet spec.
 * @param {string | number} value - Facet value to inject.
 * @returns {string} The serialized URL.
 */
function applyFacetValue(rawUrl, facet, value) {
    const u = new URL(rawUrl);
    const literal = String(value);
    const substituted = facet.template.replace("{}", () => literal);
    u.searchParams.set(facet.param, substituted);
    return u.toString();
}
|
|
488
|
+
/**
 * Enumerate the values of a single facet spec.
 *
 * - `range`: walks `[start, end]` inclusive in increments of `step`. A
 *   missing `step` defaults to 1, as documented for the recipe format; a
 *   step that is not strictly positive is rejected up front because the
 *   walk would otherwise never terminate.
 * - any other type (`enum`): yields `facet.values` verbatim, in order.
 *
 * @param {{ type: string, range?: [number, number], step?: number,
 *   values?: Array<string | number> }} facet - Facet spec.
 * @yields {string | number} Each facet value in turn.
 * @throws {RangeError} When a range facet has a non-positive or NaN step.
 */
function* generateFacetValues(facet) {
    if (facet.type === "range") {
        const [start, end] = facet.range;
        const step = facet.step ?? 1;
        if (!(step > 0)) {
            throw new RangeError(`json-api adapter: facet step must be a positive number (got ${step})`);
        }
        for (let v = start; v <= end; v += step)
            yield v;
        return;
    }
    yield* facet.values;
}
|
|
506
|
+
/**
|
|
507
|
+
* Inner fetch — the original single-axis (pagination-only) traversal. The
|
|
508
|
+
* public adapter delegates here either directly (no facets) or once per
|
|
509
|
+
* facet value (facet sweep mode).
|
|
510
|
+
*
|
|
511
|
+
* `dryRun` is preserved (single-page fetch behaviour) but the public
|
|
512
|
+
* adapter narrows it further in facet sweep mode to "first facet value
|
|
513
|
+
* only" so `source test` does not walk every year.
|
|
514
|
+
*/
|
|
515
|
+
async function fetchSingle(source, options) {
    // Single-axis traversal: walks the pagination of one URL, normalizes
    // every matched element into an item, and returns the collected items
    // together with the next fetch state and a diagnostics payload.
    const { pagination } = source;
    if (!pagination) {
        throw new Error(`json-api adapter: source '${source.id}' has no pagination config`);
    }
    const fetchImpl = options.fetch ?? globalThis.fetch;
    if (typeof fetchImpl !== "function") {
        throw new Error("json-api adapter: no fetch implementation available (Node 22+ required)");
    }
    // `jsonSelectors` is optional (#174); an empty object makes every field
    // fall back to its default selector chain.
    const selectors = source.jsonSelectors ?? {};
    const headers = buildHeaders(source, options.env ?? process.env);
    const previous = options.state;
    const knownIds = new Set(previous?.lastSeenIds ?? []);
    const fetchedAt = new Date().toISOString();
    const isBackfill = options.backfill === true;
    const warn = options.warn ?? (() => { });
    const { onPage } = options;
    const maxPages = effectiveMaxPages(pagination, isBackfill, options.maxPagesOverride);
    let currentUrl = initialUrl(source, pagination);

    const usesLinkHeader = pagination.type === "link-header";
    const usesCursor = pagination.type === "cursor" || pagination.type === "token";
    const usesPageOrOffset = pagination.type === "page" || pagination.type === "offset";
    // One place for "where does the next page live": the Link header for
    // `link-header` pagination, otherwise whatever `computeNextUrl` derives.
    const nextUrlAfter = (page, itemCount, nextIndex) => (usesLinkHeader
        ? page.linkNext
        : computeNextUrl(source, pagination, currentUrl, page.body, itemCount, nextIndex));

    const items = [];
    let lastEtag = null;
    let firstBodyText = null;
    let firstBody = null;
    let notModified = false;
    // Per field: `undefined` = no item normalized yet, string = the path
    // that matched, `null` = no candidate yielded a value. The object is
    // handed to `elementToItem`, which fills it in.
    const adoption = {
        title: undefined,
        link: undefined,
        publishedAt: undefined,
        summary: undefined,
    };
    let itemsPath = null;
    let paginationPreview;
    // Page budget. Dry-run never goes past page 0; backfill may shrink the
    // budget further once `totalPath` has been read from page 0.
    let effectiveCap = options.dryRun === true ? Math.min(maxPages, 1) : maxPages;

    for (let pageIndex = 0; pageIndex < effectiveCap; pageIndex++) {
        const page = await fetchPage(currentUrl, fetchImpl, headers, pagination, pageIndex, {
            etag: previous?.lastEtag,
            // Backfill skips the conditional GET so a stale ETag from an
            // earlier normal run cannot 304-out a full-history traversal.
            sendConditional: !isBackfill,
        });
        const isFirstPage = pageIndex === 0;
        if (isFirstPage) {
            firstBody = page.body;
            firstBodyText = page.bodyText;
            lastEtag = page.etag;
        }
        if (page.status === 304) {
            // On page 0 this is a genuine "not modified"; on a later page it
            // is unusual and simply treated as end-of-pagination.
            notModified = isFirstPage;
            break;
        }

        const listing = resolveItemsList(selectors, page.body);
        if (isFirstPage) {
            itemsPath = listing.path;
        }
        const pageItems = [];
        for (const match of listing.matches) {
            const normalized = elementToItem(match, source, selectors, fetchedAt, adoption);
            if (normalized !== null) {
                pageItems.push(normalized);
            }
        }

        // Pagination preview for `source test`, page 0 only: records the
        // would-be next URL / cursor / Link header without fetching it
        // (#174 state-clean invariant).
        if (isFirstPage) {
            const linkHeaderNext = usesLinkHeader ? page.linkNext : undefined;
            const nextCursor = (usesCursor && pagination.nextCursorPath)
                ? (coerceString(selectOne(pagination.nextCursorPath, page.body)) ?? null)
                : undefined;
            paginationPreview = {
                strategy: pagination.type,
                nextUrl: nextUrlAfter(page, pageItems.length, 1),
            };
            if (linkHeaderNext !== undefined) {
                paginationPreview.linkHeaderNext = linkHeaderNext;
            }
            if (nextCursor !== undefined) {
                paginationPreview.nextCursor = nextCursor;
            }
        }

        // Normal-mode early stop: a page holding an already-seen id means
        // the following pages are older still.
        const sawKnownId = !isBackfill
            && knownIds.size > 0
            && pageItems.some((item) => knownIds.has(item.id));
        items.push(...pageItems);

        // Backfill-mode budget narrowing through `totalPath`, consulted on
        // page 0 only. Done before `onPage` so the first `Page N/M` event
        // already carries the narrowed denominator.
        if (isBackfill && pagination.totalPath && isFirstPage) {
            const rawTotal = selectOne(pagination.totalPath, page.body);
            const total = typeof rawTotal === "number" ? rawTotal : Number(coerceString(rawTotal));
            if (Number.isFinite(total) && total > 0 && pagination.pageSize) {
                const impliedPages = Math.max(1, Math.ceil(total / pagination.pageSize));
                if (impliedPages < effectiveCap) {
                    effectiveCap = impliedPages;
                }
            }
        }

        // Per-page progress (#198). Fired before the exit checks so the page
        // that ends the traversal is still reported.
        if (onPage) {
            onPage({ pageIndex, pageTotal: effectiveCap, items: pageItems.length });
        }

        // Exit checks, in order: empty page (guards against runaway
        // pagination), already-seen id, then the short-page heuristic.
        if (listing.matches.length === 0 || sawKnownId) {
            break;
        }
        // A page/offset page shorter than the declared `pageSize` is the
        // last page. Cursor/token pagination is exempt because `nextCursor`
        // is the authoritative signal there.
        const isShortPage = pagination.pageSize !== undefined
            && usesPageOrOffset
            && listing.matches.length < pagination.pageSize;
        if (isShortPage) {
            break;
        }

        const upcomingUrl = nextUrlAfter(page, pageItems.length, pageIndex + 1);
        if (!upcomingUrl) {
            break;
        }
        currentUrl = upcomingUrl;
    }

    // Tell recipe authors when a default selector chain never produced a
    // value. Fields with an explicit selector are skipped: a miss there is
    // the recipe's own configuration, not the default chain.
    for (const [field, adopted] of Object.entries(adoption)) {
        if (!selectors[field] && adopted === null) {
            warn(`json-api adapter: source '${source.id}' — default selector chain for '${field}' produced no value; consider setting jsonSelectors.${field} explicitly`);
        }
    }

    // Next ETag: server-supplied value first, then a content hash of the
    // page-0 body, otherwise whatever the previous state carried.
    let nextEtag;
    if (lastEtag) {
        nextEtag = lastEtag;
    }
    else if (firstBodyText && firstBodyText.length > 0) {
        nextEtag = `${CONTENT_HASH_PREFIX}${createHash("sha256").update(firstBodyText).digest("hex")}`;
    }
    else {
        nextEtag = previous?.lastEtag;
    }
    // `firstBody` is deliberately retained (and marked used) for future
    // debug surfaces such as printing page 0 when nothing matched.
    void firstBody;

    // Diagnostics for `source test --show-content`: which path won per field
    // (or `null`), plus the pagination preview when page 0 was parsed.
    const diag = {
        selectorAdoption: {
            items: itemsPath ?? null,
            title: adoption.title ?? null,
            link: adoption.link ?? null,
            publishedAt: adoption.publishedAt ?? null,
            summary: adoption.summary ?? null,
        },
        ...(paginationPreview ? { paginationPreview } : {}),
    };
    const state = {
        lastFetchedAt: fetchedAt,
        lastEtag: nextEtag,
    };
    return notModified
        ? { items: [], notModified: true, state, diag }
        : { items, state, diag };
}
|
|
756
|
+
/**
 * Public adapter. When `source.facets` is set, wraps {@link fetchSingle}
 * in an outer facet sweep loop (ADR-0017). Each iteration:
 *
 *   - injects the facet value into the URL via {@link applyFacetValue}
 *   - delegates to {@link fetchSingle} with `facets: undefined` so the
 *     inner traversal sees the modified URL but does not recurse
 *   - disables conditional GET in facet sweep mode (ADR-0017 §State —
 *     per-facet ETag tracking is deferred to a future ADR)
 *   - merges state.lastSeenIds globally across facet values (item IDs are
 *     unique across facets in the documented AWS What's New use case)
 *
 * Inner traversal semantics (`lastSeenIds` early-stop, `pagination.maxPages`
 * cap, `--max-pages` override, `--backfill` full traversal) apply unchanged
 * to each facet value. The outer loop walks every facet value in both
 * normal and `--backfill` modes — normal mode gets the early-stop benefit
 * inside each value but never skips a facet outright (that would silently
 * miss items in a facet whose first page has not changed since last run).
 *
 * Dry-run (`source test`) iterates only the first facet value so the
 * selector adoption preview is meaningful without walking every year.
 *
 * `notModified: true` is reported only when at least one facet value was
 * actually fetched, every fetched value reported not-modified, and no items
 * were collected. A facet spec that generates no values performs no request
 * at all; that case is surfaced through `options.warn` (when provided)
 * instead of being reported as "not modified".
 *
 * Phase 1 limitation: a single facet entry only. Multi-facet (e.g. year ×
 * category) requires composition rules that are out of scope here — see
 * ADR-0017 §Scope.
 *
 * `fetch(source, options = {})`:
 *   - returns the {@link fetchSingle} result directly when the source has
 *     no facets;
 *   - otherwise resolves to `{ items, state, notModified?, diag? }` where
 *     `state.lastEtag` is always `undefined`;
 *   - rejects with an `Error` when more than one facet is declared, and
 *     propagates any error raised by {@link fetchSingle}.
 */
export const jsonApiAdapter = {
    kind: "json-api",
    fetch: async (source, options = {}) => {
        if (!source.facets || Object.keys(source.facets).length === 0) {
            return fetchSingle(source, options);
        }
        const facetEntries = Object.entries(source.facets);
        if (facetEntries.length > 1) {
            // Phase 1 single-facet guard. The schema accepts a record shape for
            // forward-compat, but composing two axes (year × category) needs
            // explicit ordering / dedup semantics that ADR-0017 defers.
            throw new Error(`json-api adapter: source '${source.id}' declares ${facetEntries.length} facets — multi-facet sweep is not supported in Phase 1 (ADR-0017 §Scope)`);
        }
        const [, facetSpec] = facetEntries[0];
        const dryRun = options.dryRun === true;
        // Aggregate items + lastSeenIds across every facet value. ETag is
        // intentionally NOT persisted: a single ETag cannot represent the
        // combined state of N facet values, and re-using last-run's ETag
        // would 304-out the next sweep. Per-facet ETag is future work.
        const aggregatedItems = [];
        const aggregatedSeen = new Set(options.state?.lastSeenIds ?? []);
        let aggregatedDiag;
        let aggregatedNotModified = true;
        // Number of facet values that completed an inner fetch. Needed so an
        // empty sweep is not mistaken for "every facet was not modified".
        let sweptFacetCount = 0;
        const fetchedAt = new Date().toISOString();
        for (const value of generateFacetValues(facetSpec)) {
            const innerUrl = applyFacetValue(source.url, facetSpec, value);
            // Build a "single-axis" view of the source: same id / pagination /
            // selectors but with the facet-stamped URL and `facets: undefined`
            // so the inner fetch does not recurse.
            const innerSource = { ...source, url: innerUrl, facets: undefined };
            // Share the running lastSeenIds set with the inner fetch so the
            // per-facet early-stop heuristic dedupes against items already
            // observed in earlier facets. Conditional GET is disabled: each
            // facet value has its own ETag and re-using the previous value's
            // would silently 304-out the next slice.
            const innerOptions = {
                ...options,
                state: options.state
                    ? {
                        ...options.state,
                        lastEtag: undefined,
                        lastSeenIds: Array.from(aggregatedSeen),
                    }
                    : {
                        sourceId: source.id,
                        lastSeenIds: Array.from(aggregatedSeen),
                    },
            };
            const result = await fetchSingle(innerSource, innerOptions);
            sweptFacetCount += 1;
            // Capture the diag from the FIRST facet value only — it serves as
            // the representative selector-adoption / pagination-preview surface
            // for `source test`. Later iterations overwrite nothing.
            if (aggregatedDiag === undefined) {
                aggregatedDiag = result.diag;
            }
            if (!result.notModified) {
                aggregatedNotModified = false;
            }
            for (const item of result.items) {
                aggregatedItems.push(item);
                aggregatedSeen.add(item.id);
            }
            // Dry-run: walk only the first facet value so `source test` stays
            // cheap and the per-page-0 selector preview is meaningful.
            if (dryRun) {
                break;
            }
        }
        if (sweptFacetCount === 0) {
            // Nothing was requested, so make the empty sweep visible rather
            // than letting it pass as a successful (or "not modified") run.
            options.warn?.(`json-api adapter: source '${source.id}' — facet spec generated no values; nothing was fetched`);
        }
        const everyFacetUnchanged = sweptFacetCount > 0
            && aggregatedNotModified
            && aggregatedItems.length === 0;
        return {
            items: aggregatedItems,
            // ADR-0017 §State: ETag disabled in facet sweep mode. Persist
            // `undefined` so the next run starts fresh.
            state: {
                lastFetchedAt: fetchedAt,
                lastEtag: undefined,
            },
            ...(everyFacetUnchanged ? { notModified: true } : {}),
            ...(aggregatedDiag ? { diag: aggregatedDiag } : {}),
        };
    },
};
|
|
860
|
+
//# sourceMappingURL=json-api.js.map
|