@ozzylabs/feedradar 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/README.ja.md +12 -6
  2. package/README.md +11 -6
  3. package/dist/agents/claude-code.d.ts +12 -1
  4. package/dist/agents/claude-code.d.ts.map +1 -1
  5. package/dist/agents/claude-code.js +9 -5
  6. package/dist/agents/claude-code.js.map +1 -1
  7. package/dist/agents/codex-cli.d.ts +7 -1
  8. package/dist/agents/codex-cli.d.ts.map +1 -1
  9. package/dist/agents/codex-cli.js +9 -5
  10. package/dist/agents/codex-cli.js.map +1 -1
  11. package/dist/agents/copilot.d.ts +7 -1
  12. package/dist/agents/copilot.d.ts.map +1 -1
  13. package/dist/agents/copilot.js +9 -5
  14. package/dist/agents/copilot.js.map +1 -1
  15. package/dist/agents/gemini-cli.d.ts +7 -1
  16. package/dist/agents/gemini-cli.d.ts.map +1 -1
  17. package/dist/agents/gemini-cli.js +9 -5
  18. package/dist/agents/gemini-cli.js.map +1 -1
  19. package/dist/agents/index.d.ts +1 -1
  20. package/dist/agents/index.d.ts.map +1 -1
  21. package/dist/agents/types.d.ts +33 -0
  22. package/dist/agents/types.d.ts.map +1 -1
  23. package/dist/cli/_progress.d.ts +138 -0
  24. package/dist/cli/_progress.d.ts.map +1 -0
  25. package/dist/cli/_progress.js +176 -0
  26. package/dist/cli/_progress.js.map +1 -0
  27. package/dist/cli/index.d.ts.map +1 -1
  28. package/dist/cli/index.js +2 -0
  29. package/dist/cli/index.js.map +1 -1
  30. package/dist/cli/research.d.ts +18 -20
  31. package/dist/cli/research.d.ts.map +1 -1
  32. package/dist/cli/research.js +318 -203
  33. package/dist/cli/research.js.map +1 -1
  34. package/dist/cli/review.d.ts +7 -0
  35. package/dist/cli/review.d.ts.map +1 -1
  36. package/dist/cli/review.js +46 -1
  37. package/dist/cli/review.js.map +1 -1
  38. package/dist/cli/source.d.ts +23 -2
  39. package/dist/cli/source.d.ts.map +1 -1
  40. package/dist/cli/source.js +428 -7
  41. package/dist/cli/source.js.map +1 -1
  42. package/dist/cli/update.d.ts +7 -0
  43. package/dist/cli/update.d.ts.map +1 -1
  44. package/dist/cli/update.js +41 -1
  45. package/dist/cli/update.js.map +1 -1
  46. package/dist/cli/watch.d.ts.map +1 -1
  47. package/dist/cli/watch.js +67 -3
  48. package/dist/cli/watch.js.map +1 -1
  49. package/dist/cli/workflow/generate-combined.d.ts +100 -0
  50. package/dist/cli/workflow/generate-combined.d.ts.map +1 -0
  51. package/dist/cli/workflow/generate-combined.js +387 -0
  52. package/dist/cli/workflow/generate-combined.js.map +1 -0
  53. package/dist/cli/workflow/generate-watch.d.ts +142 -0
  54. package/dist/cli/workflow/generate-watch.d.ts.map +1 -0
  55. package/dist/cli/workflow/generate-watch.js +338 -0
  56. package/dist/cli/workflow/generate-watch.js.map +1 -0
  57. package/dist/cli/workflow.d.ts +29 -0
  58. package/dist/cli/workflow.d.ts.map +1 -0
  59. package/dist/cli/workflow.js +66 -0
  60. package/dist/cli/workflow.js.map +1 -0
  61. package/dist/core/feeds/_fetch.d.ts +10 -0
  62. package/dist/core/feeds/_fetch.d.ts.map +1 -1
  63. package/dist/core/feeds/_fetch.js +182 -0
  64. package/dist/core/feeds/_fetch.js.map +1 -1
  65. package/dist/core/feeds/_jsonpath.d.ts +57 -0
  66. package/dist/core/feeds/_jsonpath.d.ts.map +1 -0
  67. package/dist/core/feeds/_jsonpath.js +207 -0
  68. package/dist/core/feeds/_jsonpath.js.map +1 -0
  69. package/dist/core/feeds/html-js.d.ts +8 -0
  70. package/dist/core/feeds/html-js.d.ts.map +1 -1
  71. package/dist/core/feeds/html-js.js +47 -1
  72. package/dist/core/feeds/html-js.js.map +1 -1
  73. package/dist/core/feeds/index.d.ts +1 -1
  74. package/dist/core/feeds/index.d.ts.map +1 -1
  75. package/dist/core/feeds/index.js +4 -0
  76. package/dist/core/feeds/index.js.map +1 -1
  77. package/dist/core/feeds/json-api.d.ts +29 -0
  78. package/dist/core/feeds/json-api.d.ts.map +1 -0
  79. package/dist/core/feeds/json-api.js +860 -0
  80. package/dist/core/feeds/json-api.js.map +1 -0
  81. package/dist/core/feeds/json-feed.d.ts +11 -0
  82. package/dist/core/feeds/json-feed.d.ts.map +1 -0
  83. package/dist/core/feeds/json-feed.js +242 -0
  84. package/dist/core/feeds/json-feed.js.map +1 -0
  85. package/dist/core/feeds/types.d.ts +123 -0
  86. package/dist/core/feeds/types.d.ts.map +1 -1
  87. package/dist/core/progress.d.ts +101 -0
  88. package/dist/core/progress.d.ts.map +1 -0
  89. package/dist/core/progress.js +212 -0
  90. package/dist/core/progress.js.map +1 -0
  91. package/dist/core/recipes.d.ts +138 -0
  92. package/dist/core/recipes.d.ts.map +1 -0
  93. package/dist/core/recipes.js +242 -0
  94. package/dist/core/recipes.js.map +1 -0
  95. package/dist/core/watcher.d.ts +61 -1
  96. package/dist/core/watcher.d.ts.map +1 -1
  97. package/dist/core/watcher.js +99 -2
  98. package/dist/core/watcher.js.map +1 -1
  99. package/dist/recipes/aws-whats-new.yaml +87 -0
  100. package/dist/recipes/dev-to.yaml +40 -0
  101. package/dist/schemas/index.d.ts +1 -0
  102. package/dist/schemas/index.d.ts.map +1 -1
  103. package/dist/schemas/index.js +1 -0
  104. package/dist/schemas/index.js.map +1 -1
  105. package/dist/schemas/recipe.d.ts +127 -0
  106. package/dist/schemas/recipe.d.ts.map +1 -0
  107. package/dist/schemas/recipe.js +57 -0
  108. package/dist/schemas/recipe.js.map +1 -0
  109. package/dist/schemas/source.d.ts +222 -0
  110. package/dist/schemas/source.d.ts.map +1 -1
  111. package/dist/schemas/source.js +234 -0
  112. package/dist/schemas/source.js.map +1 -1
  113. package/dist/templates/agents/AGENTS.md +33 -3
  114. package/dist/templates/feedradar.md +23 -8
  115. package/dist/templates/workflows/combined.template.yaml.tmpl +110 -0
  116. package/dist/templates/workflows/watch.template.yaml.tmpl +103 -0
  117. package/package.json +1 -2
@@ -0,0 +1,860 @@
1
+ import { createHash } from "node:crypto";
2
+ import { ItemSchema } from "../../schemas/index.js";
3
+ import { fetchWithRetry } from "./_fetch.js";
4
+ import { selectAll, selectOne } from "./_jsonpath.js";
5
+ import { deriveItemId, deriveStableKey } from "./derive-id.js";
6
+ const USER_AGENT = "feedradar/0.0.0 (+https://github.com/ozzy-labs/feedradar)";
7
+ /**
8
+ * Prefix marking a content-hash entry (vs a real ETag) inside `state.lastEtag`.
9
+ * Mirrors `_html-common.ts` so re-fetches without a server ETag still dedup.
10
+ */
11
+ const CONTENT_HASH_PREFIX = "sha256:";
12
+ /**
13
+ * Default selector chain consulted when `jsonSelectors.items` is omitted
14
+ * (ADR-0012 §D2). Resolved against the page-0 response body.
15
+ */
16
+ const DEFAULT_ITEMS_PATHS = [
17
+ "$.items[*]",
18
+ "$.data[*]",
19
+ "$.results[*]",
20
+ "$.posts[*]",
21
+ "$.entries[*]",
22
+ "$[*]",
23
+ ];
24
+ /**
25
+ * Per-field default selector chain consulted when the corresponding
26
+ * `jsonSelectors.<field>` is omitted (#174). For each item element we walk
27
+ * the chain in order and use the first path that yields a non-nullish value;
28
+ * this lets recipes for "simple" APIs (dev.to, generic JSON Feed clones)
29
+ * skip selectors entirely. Adoption is recorded once per fetch (first item)
30
+ * and surfaced via `FeedFetchDiag.selectorAdoption` so users can audit which
31
+ * candidate was picked.
32
+ */
33
+ const DEFAULT_FIELD_PATHS = {
34
+ title: ["$.title", "$.name", "$.headline"],
35
+ link: ["$.url", "$.link", "$.permalink", "$.html_url"],
36
+ publishedAt: ["$.publishedAt", "$.published_at", "$.date", "$.created_at", "$.pubDate"],
37
+ summary: ["$.summary", "$.description", "$.excerpt", "$.body"],
38
+ };
39
+ /**
40
+ * Maximum response body size per page. ADR-0012 §D5a hardcodes this so a
41
+ * malformed recipe cannot blow up memory / context window. The cap is
42
+ * intentionally not user-configurable.
43
+ */
44
+ const RESPONSE_SIZE_CAP_BYTES = 10 * 1024 * 1024; // 10 MB
45
/**
 * Expand `${VAR}` placeholders inside a recipe header value (ADR-0012 §D5c).
 *
 * - A placeholder whose variable is missing, empty, or not a primitive value
 *   makes the whole header unresolved: the function returns `undefined` and
 *   the caller is expected to omit the header (degraded fetch).
 * - An unterminated `${` is kept verbatim as literal text.
 * - The returned value may contain secrets and MUST NEVER be logged.
 *
 * @param {string} raw - Header value as written in the recipe.
 * @param {Record<string, unknown>} env - Variable lookup table (e.g. `process.env`).
 * @returns {string | undefined} The expanded value, or `undefined` when any
 *   placeholder could not be resolved.
 */
function interpolateHeaderValue(raw, env) {
    // Fast path: most header values carry no placeholder at all.
    if (!raw.includes("${"))
        return raw;
    let resolved = "";
    let i = 0;
    while (i < raw.length) {
        const dollar = raw.indexOf("${", i);
        if (dollar === -1) {
            resolved += raw.slice(i);
            break;
        }
        resolved += raw.slice(i, dollar);
        const close = raw.indexOf("}", dollar + 2);
        if (close === -1) {
            // Unterminated placeholder: nothing to interpolate, so keep the
            // remainder exactly as written.
            resolved += raw.slice(dollar);
            break;
        }
        const name = raw.slice(dollar + 2, close);
        const value = env[name];
        // Names such as `constructor` or `__proto__` resolve to inherited
        // functions/objects on a plain env object; stringifying those into a
        // header is never intended, so treat them like an unset variable.
        if (value == null || typeof value === "function" || typeof value === "object") {
            return undefined;
        }
        const text = String(value);
        if (text.length === 0) {
            // ADR-0012 §D5c: unresolved → drop the entire header.
            return undefined;
        }
        resolved += text;
        i = close + 1;
    }
    return resolved;
}
85
/**
 * Assemble the outgoing request headers for a source.
 *
 * Starts from a default `accept` (JSON preferred) and `user-agent`, then
 * layers the recipe's `http.headers` on top with lower-cased names, so a
 * recipe can override either default. Each recipe value goes through
 * `interpolateHeaderValue`; a header whose value comes back `undefined`
 * (unresolved env variable, ADR-0012 §D5c) is left out entirely.
 *
 * @param {object} source - Source recipe; only `source.http?.headers` is read.
 * @param {object} env - Variable lookup table used for interpolation.
 * @returns {Record<string, string>} Header map to pass to fetch.
 */
function buildHeaders(source, env) {
    const outgoing = {
        accept: "application/json, */*;q=0.5",
        "user-agent": USER_AGENT,
    };
    Object.entries(source.http?.headers ?? {}).forEach(([name, template]) => {
        const value = interpolateHeaderValue(template, env);
        // Unresolved placeholder: skip the header (degraded-fetch policy).
        if (value === undefined)
            return;
        outgoing[name.toLowerCase()] = value;
    });
    return outgoing;
}
108
/**
 * Extract the `rel="next"` target from an HTTP `Link` header, e.g.
 * `Link: <url>; rel="next", <...>; rel="prev"`. Used for
 * `type: link-header` pagination.
 *
 * The header is scanned link-by-link rather than split on commas, because a
 * comma is legal inside a target URL and inside quoted parameter values. The
 * `rel` parameter may appear anywhere in a link's parameter list and may hold
 * several space-separated relation types (`rel="next first"`).
 *
 * The target is returned exactly as written; it may be a relative reference.
 *
 * NOTE on SSRF: a malicious or compromised upstream could emit a `Link`
 * header pointing at `http://127.0.0.1:…` / cloud-metadata endpoints. The
 * host-allowlist defense specified in ADR-0012 §D5b is meant to live in the
 * shared fetch wrapper, which sees every request URL regardless of the
 * adapter that produced it; checking here would leave the same gap for
 * `cursor` / `token` pagination and a direct `source.url`.
 *
 * @param {string | null | undefined} value - Raw `Link` header value.
 * @returns {string | null} Target of the first link whose relation types
 *   include `next`, or `null` when there is none (end of pagination).
 */
function parseLinkHeader(value) {
    if (!value)
        return null;
    // One link: `<target>` followed by zero or more `; name[=value]` params,
    // where value is a quoted string (may contain `,` `;` `<`) or a bare token.
    const linkPattern = /<([^>]*)>((?:\s*;\s*[^\s=;,"]+\s*(?:=\s*(?:"(?:[^"\\]|\\.)*"|[^;,"\s]*))?)*)/g;
    for (const [, target, params] of value.matchAll(linkPattern)) {
        if (!target)
            continue;
        const paramPattern = /;\s*([^\s=;,"]+)\s*(?:=\s*(?:"((?:[^"\\]|\\.)*)"|([^;,"\s]*)))?/g;
        for (const [, name, quoted, bare] of params.matchAll(paramPattern)) {
            if (name.toLowerCase() !== "rel")
                continue;
            const rels = (quoted ?? bare ?? "").toLowerCase().split(/\s+/);
            if (rels.includes("next"))
                return target;
            // Only the first `rel` parameter of a link is significant.
            break;
        }
    }
    return null;
}
146
/**
 * Return `url` with the query parameter `name` set to `value`, replacing any
 * existing parameter(s) of that name. Used to thread page / offset / token /
 * pageSize values into pagination URLs.
 *
 * @param {string} url - Absolute URL to modify (must be parseable by `URL`).
 * @param {string} name - Query parameter name.
 * @param {unknown} value - Value; converted with `String()`.
 * @returns {string} The serialized URL.
 */
function setQueryParam(url, name, value) {
    const parsed = new URL(url);
    const text = String(value);
    parsed.searchParams.set(name, text);
    return parsed.href;
}
156
/**
 * Locate the item array inside a page body.
 *
 * When the recipe supplies `selectors.items`, that path is used as-is (even
 * if it matches nothing). Otherwise each entry of `DEFAULT_ITEMS_PATHS` is
 * tried in order and the first one producing at least one match wins
 * (ADR-0012 §D2 default chain). If none match, an empty list is returned
 * together with the first default path.
 *
 * @returns {{ matches: unknown[], path: string }} Matched elements and the
 *   path that produced them (surfaced for debugging, e.g. `source test`).
 */
function resolveItemsList(selectors, body) {
    const explicitPath = selectors.items;
    if (explicitPath) {
        return { matches: selectAll(explicitPath, body), path: explicitPath };
    }
    for (let idx = 0; idx < DEFAULT_ITEMS_PATHS.length; idx += 1) {
        const path = DEFAULT_ITEMS_PATHS[idx];
        const found = selectAll(path, body);
        if (found.length > 0) {
            return { matches: found, path };
        }
    }
    return { matches: [], path: DEFAULT_ITEMS_PATHS[0] };
}
173
/**
 * Read one field from an item element, with default-chain fallback.
 *
 * With a recipe-supplied `explicit` path, that path alone is evaluated and
 * reported. Without one, the candidates in `DEFAULT_FIELD_PATHS[field]` are
 * tried in order and the first that yields a value other than `null` /
 * `undefined` is used.
 *
 * The matched path is returned so the caller can record which candidate was
 * adopted (e.g. to report "title ← $.headline" in `source test`).
 *
 * @returns {{ value: unknown, path: string | null }} `path` is `null` only
 *   when no explicit path was given and every default candidate missed.
 */
function resolveFieldWithFallback(field, explicit, element) {
    if (!explicit) {
        const candidates = DEFAULT_FIELD_PATHS[field];
        for (const path of candidates) {
            const found = selectOne(path, element);
            if (found === undefined || found === null) {
                continue;
            }
            return { value: found, path };
        }
        return { value: undefined, path: null };
    }
    return { value: selectOne(explicit, element), path: explicit };
}
197
/**
 * Resolve an item's `link` value against a base URL (#204).
 *
 * Some JSON APIs return per-item links as relative paths such as
 * `/about-aws/whats-new/.../`; resolving them here yields an absolute URL.
 * An already-absolute `raw` ignores `base` and comes back in serialized form.
 *
 * If `URL` cannot parse the combination, `raw` is returned untouched rather
 * than throwing, so a malformed link can be rejected later by item
 * validation instead of aborting the whole page.
 *
 * @param {string} raw - Link value taken from the item.
 * @param {string | undefined} base - Base URL to resolve against.
 * @returns {string} Absolute URL string, or `raw` when unparseable.
 */
function resolveLinkUrl(raw, base) {
    let resolved = raw;
    try {
        resolved = new URL(raw, base).toString();
    }
    catch {
        // Intentionally ignored: fall back to the raw value.
    }
    return resolved;
}
222
/**
 * Coerce a JSON value to a string.
 *
 * Strings are trimmed and become `undefined` when nothing is left; numbers
 * and booleans are converted with `String()`; everything else (including
 * `null`, `undefined`, objects and arrays) yields `undefined`.
 *
 * @param {unknown} value
 * @returns {string | undefined}
 */
function coerceString(value) {
    switch (typeof value) {
        case "string": {
            const text = value.trim();
            return text.length > 0 ? text : undefined;
        }
        case "number":
        case "boolean":
            return String(value);
        default:
            return undefined;
    }
}
234
/**
 * Coerce a JSON value to an ISO 8601 timestamp.
 *
 * The value is first reduced to a string via `coerceString`; that string is
 * then parsed by the built-in date parser. Anything that is not coercible or
 * does not parse to a valid date yields `undefined`.
 *
 * @param {unknown} value
 * @returns {string | undefined} `Date#toISOString()` output, or `undefined`.
 */
function coerceIsoDate(value) {
    const text = coerceString(value);
    if (text === undefined)
        return undefined;
    const epochMs = Date.parse(text);
    return Number.isNaN(epochMs) ? undefined : new Date(epochMs).toISOString();
}
244
/**
 * Convert one element matched by the items selector into the canonical
 * `Item` shape.
 *
 * Returns `null` when the element has no usable link or when the assembled
 * candidate fails `ItemSchema` validation, so a single broken record does
 * not abort the whole page.
 *
 * `Item.id` is derived through `deriveStableKey` / `deriveItemId` from, in
 * the order given by ADR-0002: the `selectors.publisherId` value, the
 * resolved link URL, and a fallback hash over title + publishedAt.
 *
 * `adoption` is mutated in place: for each of title / link / publishedAt /
 * summary, the first time that field is resolved while its slot is still
 * `undefined`, the slot is set to the path that was used (or `null` when
 * every default candidate missed). Later calls leave filled slots alone.
 * Note that an element rejected for lacking a link fills only the title and
 * link slots.
 *
 * `selectors.tags` is not read here; tag data stays reachable through `raw`.
 */
function elementToItem(element, source, selectors, fetchedAt, adoption) {
    // Resolve a defaultable field and record which path was adopted (once).
    const pick = (field) => {
        const hit = resolveFieldWithFallback(field, selectors[field], element);
        if (adoption[field] === undefined) {
            adoption[field] = hit.path;
        }
        return hit.value;
    };
    const title = coerceString(pick("title")) ?? "";
    const rawLink = coerceString(pick("link"));
    if (!rawLink) {
        return null;
    }
    // Relative links are resolved against `linkBase`, falling back to the
    // source URL (#204); absolute links pass through.
    const url = resolveLinkUrl(rawLink, selectors.linkBase ?? source.url);
    let publisherId;
    if (selectors.publisherId) {
        publisherId = coerceString(selectOne(selectors.publisherId, element));
    }
    const publishedAt = coerceIsoDate(pick("publishedAt"));
    const summary = coerceString(pick("summary"));
    let body;
    if (selectors.body) {
        body = coerceString(selectOne(selectors.body, element));
    }
    const stableKey = deriveStableKey({
        publisherId,
        url,
        fallbackHashInputs: [title, publishedAt],
    });
    const candidate = {
        id: deriveItemId(title, stableKey),
        sourceId: source.id,
        title,
        url,
        fetchedAt,
        raw: element,
    };
    if (publishedAt) {
        candidate.publishedAt = publishedAt;
    }
    // A mapped body only stands in for the summary when no summary resolved.
    const effectiveSummary = summary ?? body;
    if (effectiveSummary) {
        candidate.summary = effectiveSummary;
    }
    const parsed = ItemSchema.safeParse(candidate);
    if (!parsed.success) {
        return null;
    }
    return parsed.data;
}
324
/**
 * Fetch and decode one page of a paginated JSON API.
 *
 * Issues a GET through `fetchWithRetry`, enforces the response size cap,
 * parses the body as JSON, and — for `link-header` pagination — extracts the
 * `rel="next"` target from the `Link` header, resolved against the request
 * URL so relative targets are fetchable.
 *
 * Conditional GET (`if-none-match`) is sent only for page 0, only when the
 * caller has not disabled it (`state.sendConditional === false`, used for
 * backfill so a stale ETag cannot 304-out a full traversal), only when the
 * stored value is a real ETag rather than a `sha256:` content hash, and only
 * when the recipe did not already set that header.
 *
 * @param {string} url - Page URL to request.
 * @param {Function} fetchImpl - fetch implementation handed to `fetchWithRetry`.
 * @param {Record<string, string>} headers - Base request headers (not mutated).
 * @param {{ type: string }} pagination - Pagination config; only `type` is read.
 * @param {number} pageIndex - Zero-based page counter.
 * @param {{ etag?: string | null, sendConditional?: boolean }} state
 * @returns {Promise<{ body: unknown, bodyText: string, status: number,
 *   etag: string | null, linkNext: string | null }>} For a 304, `body` is
 *   `null` and `bodyText` is empty.
 * @throws {Error} On a non-2xx/304 status, an oversized response, or a body
 *   that is not valid JSON (the parse error is attached as `cause`).
 */
async function fetchPage(url, fetchImpl, headers, pagination, pageIndex, state) {
    const requestHeaders = { ...headers };
    if (pageIndex === 0 &&
        state.sendConditional !== false &&
        state.etag &&
        !state.etag.startsWith(CONTENT_HASH_PREFIX) &&
        !("if-none-match" in requestHeaders)) {
        requestHeaders["if-none-match"] = state.etag;
    }
    const response = await fetchWithRetry(fetchImpl, url, { headers: requestHeaders });
    const etag = response.headers.get("etag");
    const rawLinkNext = pagination.type === "link-header" ? parseLinkHeader(response.headers.get("link")) : null;
    // A Link target is a URI-reference and may be relative to the request URL.
    const linkNext = rawLinkNext === null ? null : resolveLinkUrl(rawLinkNext, url);
    if (response.status === 304) {
        return { body: null, bodyText: "", status: 304, etag, linkNext };
    }
    if (response.status < 200 || response.status >= 300) {
        throw new Error(`json-api adapter: HTTP ${response.status} from ${url}`);
    }
    // Reject early when the server already declares an oversized body, so we
    // do not buffer it first. A missing or non-numeric header skips this
    // check (`Number(null)` is 0, NaN compares false).
    const declaredBytes = Number(response.headers.get("content-length"));
    if (declaredBytes > RESPONSE_SIZE_CAP_BYTES) {
        throw new Error(`json-api adapter: response too large (${declaredBytes} bytes > ${RESPONSE_SIZE_CAP_BYTES} cap) from ${url}`);
    }
    const bodyText = await response.text();
    // The cap is in bytes; `String#length` counts UTF-16 units and would
    // under-count multi-byte text by up to 3x.
    const bodyBytes = Buffer.byteLength(bodyText, "utf8");
    if (bodyBytes > RESPONSE_SIZE_CAP_BYTES) {
        throw new Error(`json-api adapter: response too large (${bodyBytes} bytes > ${RESPONSE_SIZE_CAP_BYTES} cap) from ${url}`);
    }
    let parsed;
    try {
        parsed = JSON.parse(bodyText);
    }
    catch (e) {
        throw new Error(`json-api adapter: failed to parse JSON from ${url}: ${e instanceof Error ? e.message : String(e)}`, { cause: e });
    }
    return { body: parsed, bodyText, status: response.status, etag, linkNext };
}
368
/**
 * Work out the URL of the next page from the pagination strategy and the
 * page that was just fetched. `null` means "stop traversing".
 *
 * - `none` / `link-header`: always `null`. For `link-header` the next URL
 *   comes from the response headers, which the caller already has.
 * - `page`: stops on an empty page; otherwise sets the page parameter
 *   (default `page`) to `start + pageCountSoFar`, plus the page-size
 *   parameter (default `pageSize`) when `pagination.pageSize` is set.
 * - `offset`: stops on an empty page; otherwise sets the offset parameter
 *   (default `offset`) to `start + pageCountSoFar * limit`, where `limit` is
 *   `pagination.pageSize` or, failing that, the current page's item count.
 *   The size parameter (default `limit`) is only written when
 *   `pagination.pageSize` is set.
 * - `cursor` / `token`: reads the next cursor from the body via
 *   `pagination.nextCursorPath` and sets it on `source.url` (default
 *   parameter `after` for cursor, `pageToken` for token); stops when the
 *   path is not configured or yields nothing.
 *
 * Any other `pagination.type` falls through and yields `undefined`.
 */
function computeNextUrl(source, pagination, currentUrl, currentBody, currentItemsLength, pageCountSoFar) {
    const { type } = pagination;
    if (type === "none" || type === "link-header") {
        return null;
    }
    if (type === "page" || type === "offset") {
        if (currentItemsLength === 0) {
            return null;
        }
        const origin = pagination.start ?? 0;
        const hasPageSize = pagination.pageSize !== undefined;
        if (type === "page") {
            const paged = setQueryParam(currentUrl, pagination.param ?? "page", origin + pageCountSoFar);
            return hasPageSize
                ? setQueryParam(paged, pagination.pageSizeParam ?? "pageSize", pagination.pageSize)
                : paged;
        }
        const limit = pagination.pageSize ?? currentItemsLength;
        const shifted = setQueryParam(currentUrl, pagination.param ?? "offset", origin + pageCountSoFar * limit);
        return hasPageSize
            ? setQueryParam(shifted, pagination.pageSizeParam ?? "limit", limit)
            : shifted;
    }
    if (type === "cursor" || type === "token") {
        if (!pagination.nextCursorPath) {
            return null;
        }
        const cursor = coerceString(selectOne(pagination.nextCursorPath, currentBody));
        if (!cursor) {
            return null;
        }
        const fallbackParam = type === "cursor" ? "after" : "pageToken";
        return setQueryParam(source.url, pagination.param ?? fallbackParam, cursor);
    }
    return undefined;
}
425
/**
 * Build the URL for page 0.
 *
 * `none`, `link-header`, `cursor` and `token` use the recipe URL exactly as
 * written (any initial cursor/token is expected to be in that URL already).
 * `page` and `offset` stamp `pagination.start` (default 0) into the paging
 * parameter — default name `page` / `offset` — and, when
 * `pagination.pageSize` is set, the page size into the size parameter —
 * default name `pageSize` / `limit`.
 *
 * Any other `pagination.type` yields `undefined`.
 */
function initialUrl(source, pagination) {
    const kind = pagination.type;
    if (kind === "none" || kind === "link-header" || kind === "cursor" || kind === "token") {
        return source.url;
    }
    if (kind !== "page" && kind !== "offset") {
        return undefined;
    }
    const isPage = kind === "page";
    const startParam = pagination.param ?? (isPage ? "page" : "offset");
    const withStart = setQueryParam(source.url, startParam, pagination.start ?? 0);
    if (pagination.pageSize === undefined) {
        return withStart;
    }
    const sizeParam = pagination.pageSizeParam ?? (isPage ? "pageSize" : "limit");
    return setQueryParam(withStart, sizeParam, pagination.pageSize);
}
461
/**
 * Decide how many pages a traversal may fetch at most.
 *
 * Outside backfill mode the recipe's `pagination.maxPages` is used as-is and
 * `override` is ignored. In backfill mode an `override` (the `--max-pages`
 * value) can only lower the recipe cap, never raise it.
 *
 * @param {{ maxPages: number }} pagination
 * @param {boolean} backfill
 * @param {number | undefined} override
 * @returns {number}
 */
function effectiveMaxPages(pagination, backfill, override) {
    const { maxPages } = pagination;
    const overrideApplies = backfill && override !== undefined;
    return overrideApplies ? Math.min(maxPages, override) : maxPages;
}
477
/**
 * Inject one facet value into a source URL.
 *
 * The first `{}` in `facet.template` is replaced by the stringified value and
 * the result is written to the `facet.param` query parameter, replacing any
 * existing value so a placeholder/default in the recipe URL does not double
 * up at fetch time.
 *
 * @param {string} rawUrl - Absolute source URL.
 * @param {{ param: string, template: string }} facet
 * @param {string | number} value - Facet value to substitute.
 * @returns {string} The serialized URL.
 */
function applyFacetValue(rawUrl, facet, value) {
    const u = new URL(rawUrl);
    // Use a replacer function so `$&`, `$$`, `$1`… inside the value are
    // inserted literally instead of being treated as replacement patterns.
    const substituted = facet.template.replace("{}", () => String(value));
    u.searchParams.set(facet.param, substituted);
    return u.toString();
}
488
/**
 * Enumerate the values of a single facet spec.
 *
 * - `range`: walks `[start, end]` inclusive in increments of `facet.step`,
 *   which defaults to 1 when omitted.
 * - anything else (`enum`): yields `facet.values` verbatim, in order.
 *
 * @param {{ type: string, range?: [number, number], step?: number,
 *   values?: Array<string | number> }} facet
 * @yields {string | number}
 * @throws {RangeError} When a range facet has a step that is not a positive
 *   number; without this guard the walk would never terminate.
 */
function* generateFacetValues(facet) {
    if (facet.type === "range") {
        const [start, end] = facet.range;
        const step = facet.step ?? 1;
        // `!(step > 0)` also rejects NaN.
        if (!(step > 0)) {
            throw new RangeError(`json-api adapter: facet range step must be a positive number (got ${step})`);
        }
        for (let v = start; v <= end; v += step)
            yield v;
        return;
    }
    yield* facet.values;
}
506
+ /**
507
+ * Inner fetch — the original single-axis (pagination-only) traversal. The
508
+ * public adapter delegates here either directly (no facets) or once per
509
+ * facet value (facet sweep mode).
510
+ *
511
+ * `dryRun` is preserved (single-page fetch behaviour) but the public
512
+ * adapter narrows it further in facet sweep mode to "first facet value
513
+ * only" so `source test` does not walk every year.
514
+ */
515
+ async function fetchSingle(source, options) {
516
+ if (!source.pagination) {
517
+ throw new Error(`json-api adapter: source '${source.id}' has no pagination config`);
518
+ }
519
+ const fetchImpl = options.fetch ?? globalThis.fetch;
520
+ if (typeof fetchImpl !== "function") {
521
+ throw new Error("json-api adapter: no fetch implementation available (Node 22+ required)");
522
+ }
523
+ const pagination = source.pagination;
524
+ // `jsonSelectors` is optional in the schema (#174). When omitted, every
525
+ // field falls back to its default chain so trivial APIs (dev.to,
526
+ // generic JSON Feed clones) work without a selector block at all.
527
+ const selectors = source.jsonSelectors ?? {};
528
+ const env = options.env ?? process.env;
529
+ const headers = buildHeaders(source, env);
530
+ const previous = options.state;
531
+ const previousSeen = new Set(previous?.lastSeenIds ?? []);
532
+ const fetchedAt = new Date().toISOString();
533
+ const backfill = options.backfill === true;
534
+ const dryRun = options.dryRun === true;
535
+ const warn = options.warn ?? (() => { });
536
+ const onPage = options.onPage;
537
+ const maxPages = effectiveMaxPages(pagination, backfill, options.maxPagesOverride);
538
+ let currentUrl = initialUrl(source, pagination);
539
+ let pageIndex = 0;
540
+ const items = [];
541
+ let lastEtag = null;
542
+ let firstBodyText = null;
543
+ let firstBody = null;
544
+ let notModified = false;
545
+ // `undefined` means "not seen yet"; once we normalize the first item we
546
+ // overwrite each entry with either the matched path (string) or `null`
547
+ // (no candidate yielded a value). The diag payload reports the final
548
+ // state at end-of-fetch.
549
+ const adoption = {
550
+ title: undefined,
551
+ link: undefined,
552
+ publishedAt: undefined,
553
+ summary: undefined,
554
+ };
555
+ let itemsPath = null;
556
+ let paginationPreview;
557
+ // Effective cap may tighten mid-traversal when `totalPath` resolves to a
558
+ // value smaller than the recipe's `maxPages` (backfill early stop).
559
+ let effectiveCap = maxPages;
560
+ // Dry-run mode short-circuits after page 0: we record the diag preview
561
+ // (next URL / Link header / nextCursor) but never fetch page 1.
562
+ if (dryRun)
563
+ effectiveCap = Math.min(effectiveCap, 1);
564
+ while (pageIndex < effectiveCap) {
565
+ const response = await fetchPage(currentUrl, fetchImpl, headers, pagination, pageIndex, {
566
+ etag: previous?.lastEtag,
567
+ // Skip conditional GET in backfill mode so a stale ETag from a
568
+ // previous normal-mode run does not 304-out a requested full-history
569
+ // traversal.
570
+ sendConditional: !backfill,
571
+ });
572
+ if (pageIndex === 0) {
573
+ firstBody = response.body;
574
+ firstBodyText = response.bodyText;
575
+ lastEtag = response.etag;
576
+ if (response.status === 304) {
577
+ notModified = true;
578
+ break;
579
+ }
580
+ }
581
+ if (response.status === 304) {
582
+ // 304 on a later page is unusual but treat as end-of-pagination.
583
+ break;
584
+ }
585
+ const itemsResult = resolveItemsList(selectors, response.body);
586
+ if (pageIndex === 0)
587
+ itemsPath = itemsResult.path;
588
+ const matches = itemsResult.matches;
589
+ const pageItems = matches
590
+ .map((m) => elementToItem(m, source, selectors, fetchedAt, adoption))
591
+ .filter((i) => i !== null);
592
+ // Surface a pagination preview for `source test` on page 0 only. We
593
+ // compute the *would-be* next URL / cursor / Link header but never
594
+ // actually fetch it in dry-run mode (#174 state-clean invariant).
595
+ if (pageIndex === 0) {
596
+ const linkHeaderNext = pagination.type === "link-header" ? response.linkNext : undefined;
597
+ let nextCursor;
598
+ if ((pagination.type === "cursor" || pagination.type === "token") &&
599
+ pagination.nextCursorPath) {
600
+ nextCursor = coerceString(selectOne(pagination.nextCursorPath, response.body)) ?? null;
601
+ }
602
+ let previewNextUrl;
603
+ if (pagination.type === "link-header") {
604
+ previewNextUrl = response.linkNext;
605
+ }
606
+ else {
607
+ previewNextUrl = computeNextUrl(source, pagination, currentUrl, response.body, pageItems.length, 1);
608
+ }
609
+ paginationPreview = {
610
+ strategy: pagination.type,
611
+ nextUrl: previewNextUrl,
612
+ ...(linkHeaderNext !== undefined ? { linkHeaderNext } : {}),
613
+ ...(nextCursor !== undefined ? { nextCursor } : {}),
614
+ };
615
+ }
616
+ // Normal-mode early stop: if this page contains an id we have already
617
+ // seen, the older pages will all be older still — stop paginating.
618
+ let hitSeen = false;
619
+ if (!backfill && previousSeen.size > 0) {
620
+ for (const item of pageItems) {
621
+ if (previousSeen.has(item.id)) {
622
+ hitSeen = true;
623
+ break;
624
+ }
625
+ }
626
+ }
627
+ items.push(...pageItems);
628
+ // Backfill-mode early stop via `totalPath`: if the recipe declared a
629
+ // total-count selector, narrow the page budget so we exit after the
630
+ // implied last page rather than walking the full `maxPages` cap. We
631
+ // only consult `totalPath` on page 0 because the value is unlikely to
632
+ // change mid-traversal and re-evaluating per page would cost an extra
633
+ // JSONPath walk for negligible benefit.
634
+ //
635
+ // Applied BEFORE the `onPage` callback below so the user-visible
636
+ // `Page N/M` denominator already reflects the narrowed cap on the
637
+ // very first page event (otherwise the spinner ratio would jump
638
+ // from `1/20` to `1/2` between page 0 and page 1, which reads as a
639
+ // bug).
640
+ if (backfill && pagination.totalPath && pageIndex === 0) {
641
+ const totalRaw = selectOne(pagination.totalPath, response.body);
642
+ const total = typeof totalRaw === "number" ? totalRaw : Number(coerceString(totalRaw));
643
+ if (Number.isFinite(total) && total > 0 && pagination.pageSize) {
644
+ const computedMax = Math.max(1, Math.ceil(total / pagination.pageSize));
645
+ if (computedMax < effectiveCap) {
646
+ effectiveCap = computedMax;
647
+ }
648
+ }
649
+ }
650
+ // Surface per-page progress to the CLI spinner / non-TTY log (#198).
651
+ // The callback is invoked before any early-exit checks below so the
652
+ // user always sees a final `Page N/N` event for the page that decided
653
+ // termination. `effectiveCap` is the denominator the loop will respect
654
+ // (recipe `maxPages`, narrowed by `totalPath` on page 0 in backfill
655
+ // mode above), so the user-visible ratio shrinks as the budget tightens.
656
+ if (onPage) {
657
+ onPage({
658
+ pageIndex,
659
+ pageTotal: effectiveCap,
660
+ items: pageItems.length,
661
+ });
662
+ }
663
+ // Stop when the page yielded zero items — protects against runaway
664
+ // pagination on broken recipes / empty trailing pages.
665
+ if (matches.length === 0)
666
+ break;
667
+ if (hitSeen)
668
+ break;
669
+ // End-of-pagination heuristic: when the recipe declared a `pageSize`
670
+ // and this page returned fewer matches than that, treat it as the last
671
+ // page. Saves one extra round-trip per source on the common "trailing
672
+ // partial page" case (page 0 of size N, …, page K returns K' < N).
673
+ // Skipped for `cursor` / `token` pagination where `nextCursor` is the
674
+ // authoritative signal — those types may legitimately return fewer
675
+ // items per page than the requested size.
676
+ if (pagination.pageSize !== undefined &&
677
+ (pagination.type === "page" || pagination.type === "offset") &&
678
+ matches.length < pagination.pageSize) {
679
+ break;
680
+ }
681
+ // Compute next URL.
682
+ let nextUrl;
683
+ if (pagination.type === "link-header") {
684
+ nextUrl = response.linkNext;
685
+ }
686
+ else {
687
+ nextUrl = computeNextUrl(source, pagination, currentUrl, response.body, pageItems.length, pageIndex + 1);
688
+ }
689
+ if (!nextUrl)
690
+ break;
691
+ currentUrl = nextUrl;
692
+ pageIndex++;
693
+ }
694
+ // Warn for default-chain fields where every candidate returned null —
695
+ // recipe authors typically want to know the API has a non-standard
696
+ // shape (e.g. `additionalFields.headline` instead of `$.title`). We
697
+ // skip the warning when the recipe explicitly declared the selector
698
+ // (the absence is then on the user, not the default chain).
699
+ for (const field of Object.keys(adoption)) {
700
+ const explicit = selectors[field];
701
+ if (!explicit && adoption[field] === null) {
702
+ warn(`json-api adapter: source '${source.id}' — default selector chain for '${field}' produced no value; consider setting jsonSelectors.${field} explicitly`);
703
+ }
704
+ }
705
+ // Build state. Prefer the server-supplied ETag; otherwise hash the page-0
706
+ // body so re-runs without a server ETag still dedup correctly (mirrors the
707
+ // html adapter's content-hash fallback).
708
+ let nextEtag = previous?.lastEtag;
709
+ if (lastEtag) {
710
+ nextEtag = lastEtag;
711
+ }
712
+ else if (firstBodyText && firstBodyText.length > 0) {
713
+ nextEtag = `${CONTENT_HASH_PREFIX}${createHash("sha256").update(firstBodyText).digest("hex")}`;
714
+ }
715
+ // Avoid unused-variable warnings while keeping `firstBody` available for
716
+ // future debug surfaces (`source test` may want to print the first page
717
+ // body when no items matched).
718
+ void firstBody;
719
+ // Compose diag payload for `source test --show-content`. The selector
720
+ // adoption map reports the JSONPath candidate that won the fallback
721
+ // chain per field (or the recipe-supplied path verbatim, or `null` when
722
+ // every candidate missed). Pagination preview surfaces the next-URL /
723
+ // Link / cursor extraction so users can spot misconfigurations without
724
+ // letting the dry-run actually walk page 1.
725
+ const selectorAdoption = {
726
+ items: itemsPath ?? null,
727
+ title: adoption.title ?? null,
728
+ link: adoption.link ?? null,
729
+ publishedAt: adoption.publishedAt ?? null,
730
+ summary: adoption.summary ?? null,
731
+ };
732
+ const diag = {
733
+ selectorAdoption,
734
+ ...(paginationPreview ? { paginationPreview } : {}),
735
+ };
736
+ if (notModified) {
737
+ return {
738
+ items: [],
739
+ notModified: true,
740
+ state: {
741
+ lastFetchedAt: fetchedAt,
742
+ lastEtag: nextEtag,
743
+ },
744
+ diag,
745
+ };
746
+ }
747
+ return {
748
+ items,
749
+ state: {
750
+ lastFetchedAt: fetchedAt,
751
+ lastEtag: nextEtag,
752
+ },
753
+ diag,
754
+ };
755
+ }
756
/**
 * Public `json-api` adapter.
 *
 * When `source.facets` is missing or empty, the call is a straight
 * pass-through to {@link fetchSingle}. When exactly one facet is declared,
 * the adapter runs a facet sweep (ADR-0017): every value yielded by
 * {@link generateFacetValues} is stamped into the URL with
 * {@link applyFacetValue} and fetched through {@link fetchSingle}.
 *
 * For each facet value:
 *
 * - the inner source carries the facet-stamped URL and `facets: undefined`,
 *   so the inner traversal never re-enters the sweep
 * - `state.lastEtag` is cleared before delegating, keeping conditional GET
 *   out of the sweep (ADR-0017 §State — per-facet ETag tracking is deferred
 *   to a future ADR)
 * - `state.lastSeenIds` is the running union of the caller-supplied ids and
 *   the ids of every item returned by earlier facet values (item IDs are
 *   unique across facets in the documented AWS What's New use case)
 *
 * Every other option is forwarded untouched, so the inner traversal
 * semantics (`lastSeenIds` early-stop, `pagination.maxPages` cap,
 * `--max-pages` override, `--backfill` full traversal) apply unchanged to
 * each facet value. The sweep visits every facet value in both normal and
 * `--backfill` modes; it never skips a facet outright, because that would
 * silently miss items in a facet whose first page has not changed since the
 * last run.
 *
 * Dry-run (`source test`) stops after the first facet value so the selector
 * adoption preview stays meaningful without walking the whole axis.
 *
 * Phase 1 limitation: one facet entry only. Multi-facet sweeps (e.g. year ×
 * category) need composition rules that are out of scope — see ADR-0017
 * §Scope.
 *
 * The sweep result contains `items` (all facet values concatenated),
 * `state` with a fresh `lastFetchedAt` and `lastEtag: undefined`,
 * `notModified: true` only when no inner fetch reported a change and no
 * items were collected, and the `diag` of the first facet value.
 *
 * @throws {Error} When `source.facets` declares more than one facet.
 */
export const jsonApiAdapter = {
    kind: "json-api",
    async fetch(source, options = {}) {
        // No facet axis declared: plain single traversal.
        const hasFacets = Boolean(source.facets) && Object.keys(source.facets).length > 0;
        if (!hasFacets) {
            return fetchSingle(source, options);
        }
        // Phase 1 single-facet guard. The schema accepts a record shape for
        // forward-compat, but composing two axes needs ordering / dedup
        // semantics that ADR-0017 defers.
        const declared = Object.entries(source.facets);
        if (declared.length > 1) {
            throw new Error(`json-api adapter: source '${source.id}' declares ${declared.length} facets — multi-facet sweep is not supported in Phase 1 (ADR-0017 §Scope)`);
        }
        const facetSpec = declared[0][1];
        const stopAfterFirst = options.dryRun === true;
        // Sweep-wide accumulators. No ETag is carried: a single ETag cannot
        // describe the combined state of N facet values, and replaying last
        // run's ETag would 304-out the next sweep.
        const collected = [];
        const seenIds = new Set(options.state?.lastSeenIds ?? []);
        let firstDiag;
        let everySliceUnchanged = true;
        // Timestamp is taken once, before any network traffic.
        const sweepStartedAt = new Date().toISOString();
        for (const facetValue of generateFacetValues(facetSpec)) {
            // Single-axis view of the source: same id / pagination /
            // selectors, facet-stamped URL, and no facets so the inner
            // fetch does not recurse.
            const stampedUrl = applyFacetValue(source.url, facetSpec, facetValue);
            const sliceSource = { ...source, url: stampedUrl, facets: undefined };
            // Hand the inner fetch a snapshot of every id seen so far so its
            // early-stop heuristic also dedupes against earlier facets, and
            // blank the ETag so this slice is never conditionally fetched
            // with another slice's validator.
            const seenSnapshot = Array.from(seenIds);
            let sliceState;
            if (options.state) {
                sliceState = {
                    ...options.state,
                    lastEtag: undefined,
                    lastSeenIds: seenSnapshot,
                };
            }
            else {
                sliceState = {
                    sourceId: source.id,
                    lastSeenIds: seenSnapshot,
                };
            }
            const slice = await fetchSingle(sliceSource, { ...options, state: sliceState });
            // The first slice's diag is the representative selector-adoption
            // / pagination-preview surface for `source test`.
            if (firstDiag === undefined) {
                firstDiag = slice.diag;
            }
            if (!slice.notModified) {
                everySliceUnchanged = false;
            }
            for (const entry of slice.items) {
                collected.push(entry);
                seenIds.add(entry.id);
            }
            // Dry-run: one facet value is enough for the preview.
            if (stopAfterFirst) {
                break;
            }
        }
        // ADR-0017 §State: ETag disabled in facet sweep mode. Persist
        // `undefined` so the next run starts fresh.
        const outcome = {
            items: collected,
            state: {
                lastFetchedAt: sweepStartedAt,
                lastEtag: undefined,
            },
        };
        if (everySliceUnchanged && collected.length === 0) {
            outcome.notModified = true;
        }
        if (firstDiag) {
            outcome.diag = firstDiag;
        }
        return outcome;
    },
};
860
+ //# sourceMappingURL=json-api.js.map