@ozzylabs/feedradar 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +13 -5
- package/README.md +13 -5
- package/dist/cli/doctor.d.ts +83 -0
- package/dist/cli/doctor.d.ts.map +1 -0
- package/dist/cli/doctor.js +260 -0
- package/dist/cli/doctor.js.map +1 -0
- package/dist/cli/index.d.ts.map +1 -1
- package/dist/cli/index.js +2 -2
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/source.d.ts.map +1 -1
- package/dist/cli/source.js +6 -3
- package/dist/cli/source.js.map +1 -1
- package/dist/cli/watch.d.ts +16 -0
- package/dist/cli/watch.d.ts.map +1 -1
- package/dist/cli/watch.js +3 -0
- package/dist/cli/watch.js.map +1 -1
- package/dist/core/feeds/_html-common.d.ts +30 -0
- package/dist/core/feeds/_html-common.d.ts.map +1 -0
- package/dist/core/feeds/_html-common.js +192 -0
- package/dist/core/feeds/_html-common.js.map +1 -0
- package/dist/core/feeds/html-js.d.ts +50 -0
- package/dist/core/feeds/html-js.d.ts.map +1 -0
- package/dist/core/feeds/html-js.js +135 -0
- package/dist/core/feeds/html-js.js.map +1 -0
- package/dist/core/feeds/html.d.ts +1 -7
- package/dist/core/feeds/html.d.ts.map +1 -1
- package/dist/core/feeds/html.js +5 -180
- package/dist/core/feeds/html.js.map +1 -1
- package/dist/core/feeds/index.d.ts.map +1 -1
- package/dist/core/feeds/index.js +2 -0
- package/dist/core/feeds/index.js.map +1 -1
- package/dist/core/playwright-check.d.ts +134 -0
- package/dist/core/playwright-check.d.ts.map +1 -0
- package/dist/core/playwright-check.js +98 -0
- package/dist/core/playwright-check.js.map +1 -0
- package/dist/core/watcher.d.ts +17 -0
- package/dist/core/watcher.d.ts.map +1 -1
- package/dist/core/watcher.js +59 -0
- package/dist/core/watcher.js.map +1 -1
- package/dist/schemas/source.d.ts +42 -0
- package/dist/schemas/source.d.ts.map +1 -1
- package/dist/schemas/source.js +42 -7
- package/dist/schemas/source.js.map +1 -1
- package/dist/templates/agents/AGENTS.md +2 -2
- package/dist/templates/feedradar.md +2 -2
- package/package.json +11 -1
package/dist/cli/watch.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { installChromium, ProbeOptions } from "../core/playwright-check.js";
|
|
1
2
|
import type { Command } from "./index.js";
|
|
2
3
|
export interface WatchIO {
|
|
3
4
|
log?: (message: string) => void;
|
|
@@ -9,6 +10,21 @@ export interface WatchCommandOptions {
|
|
|
9
10
|
io?: WatchIO;
|
|
10
11
|
/** Test seam: override the adapter HTTP fetcher. */
|
|
11
12
|
fetch?: typeof globalThis.fetch;
|
|
13
|
+
/**
|
|
14
|
+
* Test seam: override the Playwright probe used by the lazy `html-js`
|
|
15
|
+
* pre-check. Threaded straight through to `watchRun` — see watcher.ts.
|
|
16
|
+
*/
|
|
17
|
+
playwrightProbeOptions?: ProbeOptions;
|
|
18
|
+
/**
|
|
19
|
+
* Test seam: override `process.env` lookup so the test can toggle
|
|
20
|
+
* `RADAR_AUTO_INSTALL_CHROMIUM=1` deterministically.
|
|
21
|
+
*/
|
|
22
|
+
env?: NodeJS.ProcessEnv;
|
|
23
|
+
/**
|
|
24
|
+
* Test seam: override the Chromium auto-install function. Tests inject a
|
|
25
|
+
* stub that records invocation without spawning the real `npx`.
|
|
26
|
+
*/
|
|
27
|
+
installChromiumImpl?: typeof installChromium;
|
|
12
28
|
}
|
|
13
29
|
/**
|
|
14
30
|
* Implementation of `watch run`.
|
package/dist/cli/watch.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"watch.d.ts","sourceRoot":"","sources":["../../src/cli/watch.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"watch.d.ts","sourceRoot":"","sources":["../../src/cli/watch.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,YAAY,EAAE,MAAM,6BAA6B,CAAC;AAEjF,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAE1C,MAAM,WAAW,OAAO;IACtB,GAAG,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IAChC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;IACjC,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,MAAM,KAAK,IAAI,CAAC;CACnC;AAED,MAAM,WAAW,mBAAmB;IAClC,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,EAAE,CAAC,EAAE,OAAO,CAAC;IACb,oDAAoD;IACpD,KAAK,CAAC,EAAE,OAAO,UAAU,CAAC,KAAK,CAAC;IAChC;;;OAGG;IACH,sBAAsB,CAAC,EAAE,YAAY,CAAC;IACtC;;;OAGG;IACH,GAAG,CAAC,EAAE,MAAM,CAAC,UAAU,CAAC;IACxB;;;OAGG;IACH,mBAAmB,CAAC,EAAE,OAAO,eAAe,CAAC;CAC9C;AA2CD;;;;;;GAMG;AACH,wBAAsB,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,OAAO,GAAE,mBAAwB,GAAG,OAAO,CAAC,MAAM,CAAC,CA+CjG;AAED,eAAO,MAAM,YAAY,EAAE,OAgB1B,CAAC"}
|
package/dist/cli/watch.js
CHANGED
package/dist/cli/watch.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"watch.js","sourceRoot":"","sources":["../../src/cli/watch.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"watch.js","sourceRoot":"","sources":["../../src/cli/watch.ts"],"names":[],"mappings":"AACA,OAAO,EAAuB,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AAqCnE,SAAS,YAAY,CAAC,IAAc;IAClC,MAAM,GAAG,GAAiB,EAAE,CAAC;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QAClB,IAAI,CAAC,KAAK,IAAI,IAAI,CAAC,KAAK,QAAQ,EAAE,CAAC;YACjC,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC;YAChB,SAAS;QACX,CAAC;QACD,IAAI,CAAC,KAAK,UAAU,EAAE,CAAC;YACrB,GAAG,CAAC,QAAQ,GAAG,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC;YACzB,SAAS;QACX,CAAC;QACD,IAAI,CAAC,KAAK,aAAa,EAAE,CAAC;YACxB,GAAG,CAAC,SAAS,GAAG,IAAI,CAAC;YACrB,SAAS;QACX,CAAC;QACD,IAAI,CAAC,EAAE,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;YACxB,MAAM,IAAI,KAAK,CAAC,mBAAmB,CAAC,EAAE,CAAC,CAAC;QAC1C,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,wBAAwB,CAAC,EAAE,CAAC,CAAC;IAC/C,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED,SAAS,cAAc,CAAC,GAAwB;IAC9C,GAAG,CAAC,oCAAoC,CAAC,CAAC;IAC1C,GAAG,CAAC,EAAE,CAAC,CAAC;IACR,GAAG,CAAC,cAAc,CAAC,CAAC;IACpB,GAAG,CAAC,uEAAuE,CAAC,CAAC;IAC7E,GAAG,CAAC,EAAE,CAAC,CAAC;IACR,GAAG,CAAC,kBAAkB,CAAC,CAAC;IACxB,GAAG,CAAC,uDAAuD,CAAC,CAAC;IAC7D,GAAG,CAAC,oFAAoF,CAAC,CAAC;AAC5F,CAAC;AAED;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,IAAc,EAAE,UAA+B,EAAE;IAC9E,MAAM,GAAG,GAAG,OAAO,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,EAAE,CAAC;IACzC,MAAM,GAAG,GAAG,OAAO,CAAC,EAAE,EAAE,GAAG,IAAI,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/D,MAAM,IAAI,GAAG,OAAO,CAAC,EAAE,EAAE,IAAI,IAAI,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IAClE,MAAM,KAAK,GAAG,OAAO,CAAC,EAAE,EAAE,KAAK,IAAI,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAErE,IAAI,MAAoB,CAAC;IACzB,IAAI,CAAC;QACH,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;IAC9B,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,KAAK,CAAC,cAAc,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAClE,OAAO,CAAC,CAAC;IACX,CAAC;IACD,IAAI,MAAM,CAAC,IAAI,EAAE,CAAC;QAChB,cAAc,CAAC,GAAG,CAAC,CAAC;QACpB,OAAO,CAAC,
CAAC;IACX,CAAC;IAED,IAAI,MAAsB,CAAC;IAC3B,IAAI,CAAC;QACH,MAAM,GAAG,MAAM,QAAQ,CAAC;YACtB,GAAG;YACH,QAAQ,EAAE,MAAM,CAAC,QAAQ;YACzB,SAAS,EAAE,MAAM,CAAC,SAAS;YAC3B,KAAK,EAAE,OAAO,CAAC,KAAc;YAC7B,GAAG;YACH,IAAI;YACJ,KAAK;YACL,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,sBAAsB,EAAE,OAAO,CAAC,sBAAsB;YACtD,mBAAmB,EAAE,OAAO,CAAC,mBAAmB;SACjD,CAAC,CAAC;IACL,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,KAAK,CAAC,cAAc,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAClE,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACjG,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;QACrB,GAAG,CAAC,kCAAkC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,MAAM,WAAW,CAAC,CAAC;IACtF,CAAC;SAAM,CAAC;QACN,GAAG,CACD,cAAc,aAAa,uBAAuB,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,MAAM,YAAY,CAChG,CAAC;IACJ,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;AAC1C,CAAC;AAED,MAAM,CAAC,MAAM,YAAY,GAAY;IACnC,IAAI,EAAE,OAAO;IACb,OAAO,EAAE,gDAAgD;IACzD,GAAG,EAAE,KAAK,EAAE,IAAI,EAAE,EAAE;QAClB,MAAM,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC;QAC5B,IAAI,CAAC,GAAG,IAAI,GAAG,KAAK,IAAI,IAAI,GAAG,KAAK,QAAQ,IAAI,GAAG,KAAK,MAAM,EAAE,CAAC;YAC/D,cAAc,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACtC,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACrB,CAAC;QACD,IAAI,GAAG,KAAK,KAAK,EAAE,CAAC;YAClB,OAAO,QAAQ,CAAC,IAAI,CAAC,CAAC;QACxB,CAAC;QACD,OAAO,CAAC,KAAK,CAAC,8BAA8B,GAAG,GAAG,CAAC,CAAC;QACpD,cAAc,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACxC,OAAO,CAAC,CAAC;IACX,CAAC;CACF,CAAC"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { Item, Source } from "../../schemas/index.js";
|
|
2
|
+
/**
|
|
3
|
+
* Shared parsing primitives for the `kind: html` (static) and `kind: html-js`
|
|
4
|
+
* (Playwright-rendered) adapters (ADR-0010 §D1).
|
|
5
|
+
*
|
|
6
|
+
* Both adapters apply the same `SourceSelectors` contract to a serialized HTML
|
|
7
|
+
* string — the only difference is how that string was acquired (raw HTTP body
|
|
8
|
+
* vs `page.content()` after JS execution). Extracting `parseHtmlDocument` and
|
|
9
|
+
* `contentHash` here keeps the selector semantics and dedup marker format in
|
|
10
|
+
* lockstep so a switch from `html` to `html-js` is transparent to downstream
|
|
11
|
+
* consumers (dedup, state file, watcher).
|
|
12
|
+
*/
|
|
13
|
+
/**
|
|
14
|
+
* Prefix that flags a `lastEtag` slot as carrying a content hash rather than
|
|
15
|
+
* an actual HTTP ETag. Both adapters reuse the `lastEtag` field so neither
|
|
16
|
+
* has to migrate `SourceState` (see `docs/design/source-html.md`).
|
|
17
|
+
*/
|
|
18
|
+
export declare const CONTENT_HASH_PREFIX = "sha256:";
|
|
19
|
+
/**
|
|
20
|
+
* Parse an HTML document into validated `Item[]` using the source's
|
|
21
|
+
* `selectors`. Both `kind: html` and `kind: html-js` go through here so the
|
|
22
|
+
* selector contract stays in one place.
|
|
23
|
+
*/
|
|
24
|
+
export declare function parseHtmlDocument(html: string, source: Source, fetchedAt: string): Item[];
|
|
25
|
+
/**
|
|
26
|
+
* Compute the sha256 of the raw response body, prefixed so callers can tell
|
|
27
|
+
* it apart from a real ETag inside `SourceState.lastEtag`.
|
|
28
|
+
*/
|
|
29
|
+
export declare function contentHash(body: string): string;
|
|
30
|
+
//# sourceMappingURL=_html-common.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"_html-common.d.ts","sourceRoot":"","sources":["../../../src/core/feeds/_html-common.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,EAAmB,MAAM,wBAAwB,CAAC;AAI5E;;;;;;;;;;GAUG;AAEH;;;;GAIG;AACH,eAAO,MAAM,mBAAmB,YAAY,CAAC;AAoJ7C;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,IAAI,EAAE,CAiBzF;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEhD"}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { parse as parseHtml } from "node-html-parser";
|
|
3
|
+
import { ItemSchema } from "../../schemas/index.js";
|
|
4
|
+
import { deriveItemId, deriveStableKey } from "./derive-id.js";
|
|
5
|
+
/**
|
|
6
|
+
* Shared parsing primitives for the `kind: html` (static) and `kind: html-js`
|
|
7
|
+
* (Playwright-rendered) adapters (ADR-0010 §D1).
|
|
8
|
+
*
|
|
9
|
+
* Both adapters apply the same `SourceSelectors` contract to a serialized HTML
|
|
10
|
+
* string — the only difference is how that string was acquired (raw HTTP body
|
|
11
|
+
* vs `page.content()` after JS execution). Extracting `parseHtmlDocument` and
|
|
12
|
+
* `contentHash` here keeps the selector semantics and dedup marker format in
|
|
13
|
+
* lockstep so a switch from `html` to `html-js` is transparent to downstream
|
|
14
|
+
* consumers (dedup, state file, watcher).
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* Prefix that flags a `lastEtag` slot as carrying a content hash rather than
|
|
18
|
+
* an actual HTTP ETag. Both adapters reuse the `lastEtag` field so neither
|
|
19
|
+
* has to migrate `SourceState` (see `docs/design/source-html.md`).
|
|
20
|
+
*/
|
|
21
|
+
export const CONTENT_HASH_PREFIX = "sha256:";
|
|
22
|
+
/** Attributes the parser checks before falling back to text content. */
|
|
23
|
+
const DATETIME_ATTRS = ["datetime", "content", "value"];
|
|
24
|
+
/**
 * Return the whitespace-trimmed text of an element.
 *
 * @param el Matched element, or `null` / `undefined` when the selector found
 *   nothing.
 * @returns The trimmed text, or `undefined` when there is no element, no
 *   `text` property, or the text is empty after trimming.
 */
function textOf(el) {
    // A missing element and blank text both collapse to `undefined`, so callers
    // only ever have to test for one "no value" marker.
    const trimmed = el ? el.text?.trim() : undefined;
    return trimmed || undefined;
}
|
|
35
|
+
/**
|
|
36
|
+
* Apply a CSS selector relative to `root` and return the first match.
|
|
37
|
+
* `node-html-parser` returns `null` instead of throwing for invalid input,
|
|
38
|
+
* which matches what callers want here (a missing field, not a hard error).
|
|
39
|
+
*/
|
|
40
|
+
function queryFirst(root, selector) {
|
|
41
|
+
return root.querySelector(selector);
|
|
42
|
+
}
|
|
43
|
+
/**
 * Extract the link value from the element matched by the `link` selector.
 *
 * The `href` attribute wins when it is present and non-blank. Otherwise the
 * element's trimmed text is used, which lets selectors that point at a
 * non-anchor element still produce a candidate. No URL validation happens
 * here.
 *
 * @param el Matched element, or `null` / `undefined` when nothing matched.
 * @returns The trimmed `href`, else the element's trimmed text, else
 *   `undefined`.
 */
function pickLink(el) {
    if (!el) {
        return undefined;
    }
    const href = el.getAttribute("href")?.trim();
    // A blank or missing href falls through to the text fallback.
    return href ? href : textOf(el);
}
|
|
59
|
+
/**
 * Pick the raw timestamp string for the element matched by `publishedAt`.
 *
 * Attributes listed in `DATETIME_ATTRS` are probed in order and the first
 * non-blank value wins; only when none of them yields a value do we fall
 * back to the element's visible text.
 *
 * @param el Matched element, or `null` / `undefined` when nothing matched.
 * @returns A trimmed candidate string, or `undefined` when nothing usable
 *   was found.
 */
function pickDatetime(el) {
    if (!el) {
        return undefined;
    }
    for (const attrName of DATETIME_ATTRS) {
        const candidate = el.getAttribute(attrName)?.trim();
        if (candidate) {
            return candidate;
        }
    }
    // No attribute carried a value — use the rendered text instead.
    return textOf(el);
}
|
|
77
|
+
/**
 * Convert a candidate timestamp into an ISO 8601 string.
 *
 * @param value Raw timestamp text; may be `undefined` or empty.
 * @returns `Date#toISOString()` output for inputs the `Date` constructor can
 *   parse, otherwise `undefined` (so the caller can still emit the item
 *   without a date).
 */
function toIsoDate(value) {
    // Missing input is treated exactly like an unparseable one.
    const millis = value ? new Date(value).getTime() : Number.NaN;
    return Number.isNaN(millis) ? undefined : new Date(millis).toISOString();
}
|
|
89
|
+
/** Collect the trimmed text of every match for `selector`. */
|
|
90
|
+
function collectTags(root, selector) {
|
|
91
|
+
if (!selector)
|
|
92
|
+
return undefined;
|
|
93
|
+
const tags = root
|
|
94
|
+
.querySelectorAll(selector)
|
|
95
|
+
.map((el) => el.text?.trim())
|
|
96
|
+
.filter((t) => !!t && t.length > 0);
|
|
97
|
+
return tags.length > 0 ? tags : undefined;
|
|
98
|
+
}
|
|
99
|
+
/**
 * Resolve a possibly-relative link against a base URL.
 *
 * @param raw Link text as scraped (absolute or relative).
 * @param base URL to resolve relative links against.
 * @returns The absolute URL string, or `raw` unchanged when the `URL`
 *   constructor rejects the input. The failure is deliberately swallowed so
 *   a bad link is handled by later validation instead of aborting the run.
 */
function resolveUrl(raw, base) {
    let resolved = raw;
    try {
        resolved = new URL(raw, base).toString();
    }
    catch {
        // Keep the unresolved value; the caller validates it afterwards.
    }
    return resolved;
}
|
|
116
|
+
/**
 * Turn one matched item element into a validated item.
 *
 * @param itemEl Element matched by `selectors.item`.
 * @param selectors Selector set; `title` and `link` are always queried,
 *   `summary`, `body`, `publishedAt` and `tags` only when configured.
 * @param source Source being scraped; `source.url` is the base for relative
 *   links and `source.id` becomes the item's `sourceId`.
 * @param fetchedAt Timestamp string recorded on the item as-is.
 * @returns The result of `validateItem`, or `null` when the title or link is
 *   missing.
 */
function parseItem(itemEl, selectors, source, fetchedAt) {
    // Optional text fields are only looked up when a selector is configured.
    const optionalText = (selector) => selector ? textOf(queryFirst(itemEl, selector)) : undefined;
    const title = textOf(queryFirst(itemEl, selectors.title));
    const linkRaw = pickLink(queryFirst(itemEl, selectors.link));
    if (!(title && linkRaw)) {
        return null;
    }
    const url = resolveUrl(linkRaw, source.url);
    const summary = optionalText(selectors.summary);
    const body = optionalText(selectors.body);
    const publishedAt = selectors.publishedAt
        ? toIsoDate(pickDatetime(queryFirst(itemEl, selectors.publishedAt)))
        : undefined;
    const tags = collectTags(itemEl, selectors.tags);
    const id = deriveItemId(title, deriveStableKey({ url, fallbackHashInputs: [title, publishedAt] }));
    // `raw` is a plain snapshot of the scraped strings (not the element
    // itself); per the original note it ends up serialized by the watcher, so
    // absent fields are left out entirely rather than stored as `undefined`.
    const raw = {
        title,
        link: linkRaw,
        ...(summary !== undefined ? { summary } : {}),
        ...(body !== undefined ? { body } : {}),
        ...(publishedAt !== undefined ? { publishedAt } : {}),
        ...(tags !== undefined ? { tags } : {}),
    };
    return validateItem({
        id,
        sourceId: source.id,
        title,
        url,
        summary,
        publishedAt,
        fetchedAt,
        raw,
    });
}
|
|
157
|
+
/**
 * Validate a candidate item against `ItemSchema`.
 *
 * @param candidate Plain object assembled by `parseItem`.
 * @returns The parsed item on success, or `null` on failure. Failures are
 *   dropped without logging on purpose (fail-soft).
 */
function validateItem(candidate) {
    const parsed = ItemSchema.safeParse(candidate);
    if (!parsed.success) {
        return null;
    }
    return parsed.data;
}
|
|
163
|
+
/**
 * Parse a serialized HTML document into validated items.
 *
 * @param html HTML text to parse.
 * @param source Source definition; must carry `selectors`.
 * @param fetchedAt Timestamp string stamped on every produced item.
 * @returns Items for every element matching `selectors.item`, in document
 *   order, with entries that `parseItem` rejected (`null`) removed.
 * @throws Error when the source has no `selectors`, or when the HTML parser
 *   throws (the parser's message is included in the new error).
 */
export function parseHtmlDocument(html, source, fetchedAt) {
    const selectors = source.selectors;
    if (!selectors) {
        throw new Error(`html adapter: source '${source.id}' has no selectors`);
    }
    let root;
    try {
        root = parseHtml(html);
    }
    catch (e) {
        const reason = e instanceof Error ? e.message : String(e);
        throw new Error(`html adapter: failed to parse HTML: ${reason}`);
    }
    const items = [];
    for (const el of root.querySelectorAll(selectors.item)) {
        const item = parseItem(el, selectors, source, fetchedAt);
        if (item !== null) {
            items.push(item);
        }
    }
    return items;
}
|
|
185
|
+
/**
 * Build the prefixed sha256 marker for a document body.
 *
 * @param body Text to hash.
 * @returns `CONTENT_HASH_PREFIX` followed by the lowercase hex sha256 digest
 *   of `body`; the prefix lets readers distinguish it from a real ETag.
 */
export function contentHash(body) {
    const digest = createHash("sha256").update(body).digest("hex");
    return `${CONTENT_HASH_PREFIX}${digest}`;
}
|
|
192
|
+
//# sourceMappingURL=_html-common.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"_html-common.js","sourceRoot":"","sources":["../../../src/core/feeds/_html-common.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAoB,KAAK,IAAI,SAAS,EAAE,MAAM,kBAAkB,CAAC;AAExE,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAE/D;;;;;;;;;;GAUG;AAEH;;;;GAIG;AACH,MAAM,CAAC,MAAM,mBAAmB,GAAG,SAAS,CAAC;AAE7C,wEAAwE;AACxE,MAAM,cAAc,GAAG,CAAC,UAAU,EAAE,SAAS,EAAE,OAAO,CAAU,CAAC;AAEjE;;;;GAIG;AACH,SAAS,MAAM,CAAC,EAAsB;IACpC,IAAI,CAAC,EAAE;QAAE,OAAO,SAAS,CAAC;IAC1B,MAAM,IAAI,GAAG,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;IAC7B,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC;AACjC,CAAC;AAED;;;;GAIG;AACH,SAAS,UAAU,CAAC,IAAiB,EAAE,QAAgB;IACrD,OAAO,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,CAAC;AACtC,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,QAAQ,CAAC,EAAsB;IACtC,IAAI,CAAC,EAAE;QAAE,OAAO,SAAS,CAAC;IAC1B,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;IACrC,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI,EAAE;QAAE,OAAO,IAAI,CAAC,IAAI,EAAE,CAAC;IAC5C,OAAO,MAAM,CAAC,EAAE,CAAC,CAAC;AACpB,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,YAAY,CAAC,EAAsB;IAC1C,IAAI,CAAC,EAAE;QAAE,OAAO,SAAS,CAAC;IAC1B,KAAK,MAAM,IAAI,IAAI,cAAc,EAAE,CAAC;QAClC,MAAM,KAAK,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QACpC,IAAI,KAAK,IAAI,KAAK,CAAC,IAAI,EAAE;YAAE,OAAO,KAAK,CAAC,IAAI,EAAE,CAAC;IACjD,CAAC;IACD,OAAO,MAAM,CAAC,EAAE,CAAC,CAAC;AACpB,CAAC;AAED;;;GAGG;AACH,SAAS,SAAS,CAAC,KAAyB;IAC1C,IAAI,CAAC,KAAK;QAAE,OAAO,SAAS,CAAC;IAC7B,MAAM,IAAI,GAAG,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC;IAC7B,IAAI,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,CAAC;QAAE,OAAO,SAAS,CAAC;IACnD,OAAO,IAAI,CAAC,WAAW,EAAE,CAAC;AAC5B,CAAC;AAED,8DAA8D;AAC9D,SAAS,WAAW,CAAC,IAAiB,EAAE,QAA4B;IAClE,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,MAAM,IAAI,GAAG,IAAI;SACd,gBAAgB,CAAC,QAAQ,CAAC;SAC1B,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;SAC5B,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACnD,OAAO,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC
,SAAS,CAAC;AAC5C,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,UAAU,CAAC,GAAW,EAAE,IAAY;IAC3C,IAAI,CAAC;QACH,OAAO,IAAI,GAAG,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,GAAG,CAAC;IACb,CAAC;AACH,CAAC;AAED,wEAAwE;AACxE,SAAS,SAAS,CAChB,MAAmB,EACnB,SAA0B,EAC1B,MAAc,EACd,SAAiB;IAEjB,MAAM,KAAK,GAAG,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;IAC1D,MAAM,OAAO,GAAG,QAAQ,CAAC,UAAU,CAAC,MAAM,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC;IAC7D,IAAI,CAAC,KAAK,IAAI,CAAC,OAAO;QAAE,OAAO,IAAI,CAAC;IACpC,MAAM,GAAG,GAAG,UAAU,CAAC,OAAO,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC;IAE5C,MAAM,OAAO,GAAG,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IAC9F,MAAM,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,MAAM,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;IACrF,MAAM,WAAW,GAAG,SAAS,CAAC,WAAW;QACvC,CAAC,CAAC,SAAS,CAAC,YAAY,CAAC,UAAU,CAAC,MAAM,EAAE,SAAS,CAAC,WAAW,CAAC,CAAC,CAAC;QACpE,CAAC,CAAC,SAAS,CAAC;IACd,MAAM,IAAI,GAAG,WAAW,CAAC,MAAM,EAAE,SAAS,CAAC,IAAI,CAAC,CAAC;IAEjD,MAAM,SAAS,GAAG,eAAe,CAAC;QAChC,GAAG;QACH,kBAAkB,EAAE,CAAC,KAAK,EAAE,WAAW,CAAC;KACzC,CAAC,CAAC;IACH,MAAM,EAAE,GAAG,YAAY,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;IAE1C,mEAAmE;IACnE,2EAA2E;IAC3E,mDAAmD;IACnD,MAAM,GAAG,GAA4B,EAAE,KAAK,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;IAC9D,IAAI,OAAO,KAAK,SAAS;QAAE,GAAG,CAAC,OAAO,GAAG,OAAO,CAAC;IACjD,IAAI,IAAI,KAAK,SAAS;QAAE,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC;IACxC,IAAI,WAAW,KAAK,SAAS;QAAE,GAAG,CAAC,WAAW,GAAG,WAAW,CAAC;IAC7D,IAAI,IAAI,KAAK,SAAS;QAAE,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC;IAExC,OAAO,YAAY,CAAC;QAClB,EAAE;QACF,QAAQ,EAAE,MAAM,CAAC,EAAE;QACnB,KAAK;QACL,GAAG;QACH,OAAO;QACP,WAAW;QACX,SAAS;QACT,GAAG;KACJ,CAAC,CAAC;AACL,CAAC;AAED,SAAS,YAAY,CAAC,SAAkC;IACtD,MAAM,MAAM,GAAG,UAAU,CAAC,SAAS,CAAC,SAAS,CAAC,CAAC;IAC/C,4EAA4E;IAC5E,+CAA+C;IAC/C,OAAO,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;AAC7C,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IAC/E,IAAI,CAAC,MAAM,CAAC,SAAS,E
AAE,CAAC;QACtB,MAAM,IAAI,KAAK,CAAC,yBAAyB,MAAM,CAAC,EAAE,oBAAoB,CAAC,CAAC;IAC1E,CAAC;IACD,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;IACnC,IAAI,IAAiB,CAAC;IACtB,IAAI,CAAC;QACH,IAAI,GAAG,SAAS,CAAC,IAAI,CAAC,CAAC;IACzB,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CACb,uCAAuC,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CACpF,CAAC;IACJ,CAAC;IACD,MAAM,OAAO,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IACtD,OAAO,OAAO;SACX,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,SAAS,CAAC,EAAE,EAAE,SAAS,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;SACxD,MAAM,CAAC,CAAC,CAAC,EAAa,EAAE,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC;AAC1C,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,OAAO,GAAG,mBAAmB,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC;AACpF,CAAC"}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import type { SourceJsOptions } from "../../schemas/index.js";
|
|
2
|
+
import type { FeedAdapter, FeedAdapterOptions } from "./types.js";
|
|
3
|
+
/**
|
|
4
|
+
* Minimal subset of the Playwright surface this adapter uses. Defined
|
|
5
|
+
* structurally so the `chromium` argument passed by tests does not need to
|
|
6
|
+
* pull in the full Playwright type tree (which is itself an optional peer
|
|
7
|
+
* dep and therefore not guaranteed to be installed in dev).
|
|
8
|
+
*/
|
|
9
|
+
export interface PlaywrightLike {
|
|
10
|
+
chromium: {
|
|
11
|
+
launch(options?: {
|
|
12
|
+
headless?: boolean;
|
|
13
|
+
}): Promise<PlaywrightBrowserLike>;
|
|
14
|
+
};
|
|
15
|
+
}
|
|
16
|
+
export interface PlaywrightBrowserLike {
|
|
17
|
+
newContext(options?: {
|
|
18
|
+
acceptDownloads?: boolean;
|
|
19
|
+
userAgent?: string;
|
|
20
|
+
}): Promise<PlaywrightContextLike>;
|
|
21
|
+
close(): Promise<void>;
|
|
22
|
+
}
|
|
23
|
+
export interface PlaywrightContextLike {
|
|
24
|
+
newPage(): Promise<PlaywrightPageLike>;
|
|
25
|
+
close(): Promise<void>;
|
|
26
|
+
}
|
|
27
|
+
export interface PlaywrightPageLike {
|
|
28
|
+
goto(url: string, options?: {
|
|
29
|
+
waitUntil?: SourceJsOptions["waitUntil"];
|
|
30
|
+
timeout?: number;
|
|
31
|
+
}): Promise<unknown>;
|
|
32
|
+
waitForSelector(selector: string, options?: {
|
|
33
|
+
timeout?: number;
|
|
34
|
+
}): Promise<unknown>;
|
|
35
|
+
content(): Promise<string>;
|
|
36
|
+
close(): Promise<void>;
|
|
37
|
+
}
|
|
38
|
+
/**
|
|
39
|
+
* Test-only extension to `FeedAdapterOptions` for the `html-js` adapter.
|
|
40
|
+
*
|
|
41
|
+
* Production callers leave `playwright` unset and the adapter dynamically
|
|
42
|
+
* imports it. Tests inject a fake module so they can exercise the adapter
|
|
43
|
+
* without spinning up real Chromium. The shape mirrors the subset above.
|
|
44
|
+
*/
|
|
45
|
+
export interface HtmlJsAdapterOptions extends FeedAdapterOptions {
|
|
46
|
+
/** Injected Playwright module (tests only). Production uses dynamic import. */
|
|
47
|
+
playwright?: PlaywrightLike;
|
|
48
|
+
}
|
|
49
|
+
export declare const htmlJsAdapter: FeedAdapter;
|
|
50
|
+
//# sourceMappingURL=html-js.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-js.d.ts","sourceRoot":"","sources":["../../../src/core/feeds/html-js.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAU,eAAe,EAAE,MAAM,wBAAwB,CAAC;AAEtE,OAAO,KAAK,EAAE,WAAW,EAAE,kBAAkB,EAAE,MAAM,YAAY,CAAC;AA+ClE;;;;;GAKG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE;QACR,MAAM,CAAC,OAAO,CAAC,EAAE;YAAE,QAAQ,CAAC,EAAE,OAAO,CAAA;SAAE,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAAC;KAC1E,CAAC;CACH;AAED,MAAM,WAAW,qBAAqB;IACpC,UAAU,CAAC,OAAO,CAAC,EAAE;QACnB,eAAe,CAAC,EAAE,OAAO,CAAC;QAC1B,SAAS,CAAC,EAAE,MAAM,CAAC;KACpB,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAAC;IACnC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAED,MAAM,WAAW,qBAAqB;IACpC,OAAO,IAAI,OAAO,CAAC,kBAAkB,CAAC,CAAC;IACvC,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAED,MAAM,WAAW,kBAAkB;IACjC,IAAI,CACF,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE;QAAE,SAAS,CAAC,EAAE,eAAe,CAAC,WAAW,CAAC,CAAC;QAAC,OAAO,CAAC,EAAE,MAAM,CAAA;KAAE,GACvE,OAAO,CAAC,OAAO,CAAC,CAAC;IACpB,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE;QAAE,OAAO,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;IACpF,OAAO,IAAI,OAAO,CAAC,MAAM,CAAC,CAAC;IAC3B,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACxB;AAED;;;;;;GAMG;AACH,MAAM,WAAW,oBAAqB,SAAQ,kBAAkB;IAC9D,+EAA+E;IAC/E,UAAU,CAAC,EAAE,cAAc,CAAC;CAC7B;AAqBD,eAAO,MAAM,aAAa,EAAE,WAyE3B,CAAC"}
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import { contentHash, parseHtmlDocument } from "./_html-common.js";
|
|
2
|
+
/**
|
|
3
|
+
* `kind: html-js` adapter — Playwright-rendered HTML scraping (ADR-0010).
|
|
4
|
+
*
|
|
5
|
+
* Same selector contract as `kind: html` (delegates to `parseHtmlDocument`),
|
|
6
|
+
* but acquires the document by driving headless Chromium so SPA / CSR pages
|
|
7
|
+
* (Next.js, Notion embeds, Algolia DocSearch, etc.) that ship empty initial
|
|
8
|
+
* HTML can still be scraped.
|
|
9
|
+
*
|
|
10
|
+
* ## Hardening (ADR-0010 §D5 — hardcoded, NOT user-configurable)
|
|
11
|
+
*
|
|
12
|
+
* | Policy | Value | Rationale |
|
|
13
|
+
* |---------------------|----------------------|----------------------------------------------------------|
|
|
14
|
+
* | `headless` | `true` | UI mode is CI-incompatible and an operator-UI risk. |
|
|
15
|
+
* | `acceptDownloads` | `false` | Block drive-by downloads (page JS-triggered file saves). |
|
|
16
|
+
* | context reuse | none — fresh each fetch | Prevent SW / IndexedDB / localStorage injection persistence and cross-source state mixing. |
|
|
17
|
+
* | default `timeout` | 30000ms | Cap OOM / infinite loops on pathological pages. |
|
|
18
|
+
* | `page.close()` | in `finally` | Prevent page leak / memory accumulation. |
|
|
19
|
+
* | viewport | Playwright default | Avoid bloating DOM with oversized viewports. |
|
|
20
|
+
*
|
|
21
|
+
* The above are intentionally NOT exposed through `SourceJsOptions`. Users
|
|
22
|
+
* may tune `waitFor` / `waitUntil` / `timeout` / `userAgent`, but the threat
|
|
23
|
+
* model assumes the policy floor above always holds.
|
|
24
|
+
*
|
|
25
|
+
* ## Optional peer dep
|
|
26
|
+
*
|
|
27
|
+
* Playwright is declared as an *optional* peer dependency (ADR-0010 §D3) so
|
|
28
|
+
* users who only run `kind: rss` / `kind: html` are not forced to install
|
|
29
|
+
* Chromium. The import is therefore `await import("playwright")` and resolves
|
|
30
|
+
* lazily on the first `html-js` fetch; missing-module errors are translated
|
|
31
|
+
* into a user-friendly install hint.
|
|
32
|
+
*/
|
|
33
|
+
/**
|
|
34
|
+
* Default per-step timeout in ms when `Source.js?.timeout` is omitted.
|
|
35
|
+
* Mirrors `SourceJsOptionsSchema`'s default so adapter-direct callers (not
|
|
36
|
+
* going through schema parse) still get the documented behavior.
|
|
37
|
+
*/
|
|
38
|
+
const DEFAULT_TIMEOUT_MS = 30_000;
|
|
39
|
+
/**
|
|
40
|
+
* Default Playwright `page.goto()` waitUntil mode. `networkidle` is the
|
|
41
|
+
* safest default for SPA / CSR pages where item data arrives via XHR after
|
|
42
|
+
* the document has loaded.
|
|
43
|
+
*/
|
|
44
|
+
const DEFAULT_WAIT_UNTIL = "networkidle";
|
|
45
|
+
/**
 * Dynamically import Playwright, translating a load failure into an error
 * that carries an install hint.
 *
 * @returns The imported `playwright` module namespace.
 * @throws Error whose message embeds the underlying failure message plus the
 *   install command. The original exception is attached as `cause` so its
 *   stack and error code are not lost when the hint is added.
 */
async function loadPlaywright() {
    try {
        // Bare specifier, imported lazily so the package is only required when
        // this function is actually called.
        return await import("playwright");
    }
    catch (e) {
        const message = e instanceof Error ? e.message : String(e);
        throw new Error(`html-js adapter: failed to load Playwright (${message}). ` +
            "Install it with: `npm i playwright && npx playwright install chromium`", 
        // Chain the original error; runtimes without `cause` support simply
        // ignore this second argument.
        { cause: e });
    }
}
|
|
62
|
+
/**
 * Launch headless Chromium, load `url`, wait for `waitFor` to appear, and
 * return the rendered document HTML.
 *
 * Page, context and browser are each closed in their own `finally`, innermost
 * first, so a `goto` / `waitForSelector` failure still releases all three.
 */
async function renderPageHtml(playwright, url, { waitUntil, timeout, waitFor, userAgent }) {
    // Hardening: headless is pinned explicitly rather than left to Playwright's
    // default.
    const browser = await playwright.chromium.launch({ headless: true });
    try {
        // Only override the user agent when the source configured one.
        const userAgentOverride = userAgent ? { userAgent } : {};
        // Hardening: a new context per fetch, with downloads refused.
        const context = await browser.newContext({
            acceptDownloads: false,
            ...userAgentOverride,
        });
        try {
            const page = await context.newPage();
            try {
                await page.goto(url, { waitUntil, timeout });
                await page.waitForSelector(waitFor, { timeout });
                return await page.content();
            }
            finally {
                await page.close();
            }
        }
        finally {
            await context.close();
        }
    }
    finally {
        await browser.close();
    }
}
/**
 * Feed adapter for `kind: "html-js"` sources: renders the page in Playwright
 * and parses the resulting HTML with the source's selectors.
 *
 * `fetch(source, options)`:
 * - throws when `source.selectors` is missing;
 * - uses `options.playwright` when supplied, otherwise loads Playwright;
 * - returns `{ items: [], notModified: true, state }` when the hash of the
 *   rendered HTML equals `options.state.lastEtag`, else `{ items, state }`.
 *   `state` always carries `lastFetchedAt` and the new hash in `lastEtag`.
 */
export const htmlJsAdapter = {
    kind: "html-js",
    fetch: async (source, options = {}) => {
        const selectors = source.selectors;
        if (!selectors) {
            throw new Error(`html-js adapter: source '${source.id}' has no selectors`);
        }
        const jsOptions = source.js;
        const renderOptions = {
            timeout: jsOptions?.timeout ?? DEFAULT_TIMEOUT_MS,
            waitUntil: jsOptions?.waitUntil ?? DEFAULT_WAIT_UNTIL,
            // Without an explicit `waitFor`, wait for the item selector itself.
            waitFor: jsOptions?.waitFor ?? selectors.item,
            userAgent: jsOptions?.userAgent,
        };
        const playwright = options.playwright ?? (await loadPlaywright());
        const priorState = options.state;
        // Timestamp is taken before the browser is launched.
        const fetchedAt = new Date().toISOString();
        const html = await renderPageHtml(playwright, source.url, renderOptions);
        // The content hash is stored in the `lastEtag` slot and is the dedup
        // signal: an unchanged hash means nothing new to parse.
        const bodyHash = contentHash(html);
        const state = {
            lastFetchedAt: fetchedAt,
            lastEtag: bodyHash,
        };
        if (priorState?.lastEtag === bodyHash) {
            return { items: [], notModified: true, state };
        }
        return {
            items: parseHtmlDocument(html, source, fetchedAt),
            state,
        };
    },
};
|
|
135
|
+
//# sourceMappingURL=html-js.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"html-js.js","sourceRoot":"","sources":["../../../src/core/feeds/html-js.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAGnE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH;;;;GAIG;AACH,MAAM,kBAAkB,GAAG,MAAM,CAAC;AAClC;;;;GAIG;AACH,MAAM,kBAAkB,GAAiC,aAAa,CAAC;AAiDvE;;;GAGG;AACH,KAAK,UAAU,cAAc;IAC3B,IAAI,CAAC;QACH,wEAAwE;QACxE,kEAAkE;QAClE,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,YAAY,CAAC,CAA8B,CAAC;QACtE,OAAO,GAAG,CAAC;IACb,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,OAAO,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;QAC3D,MAAM,IAAI,KAAK,CACb,+CAA+C,OAAO,KAAK;YACzD,wEAAwE,CAC3E,CAAC;IACJ,CAAC;AACH,CAAC;AAED,MAAM,CAAC,MAAM,aAAa,GAAgB;IACxC,IAAI,EAAE,SAAS;IACf,KAAK,EAAE,KAAK,EAAE,MAAc,EAAE,UAAgC,EAAE,EAAE,EAAE;QAClE,IAAI,CAAC,MAAM,CAAC,SAAS,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CAAC,4BAA4B,MAAM,CAAC,EAAE,oBAAoB,CAAC,CAAC;QAC7E,CAAC;QACD,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;QACnC,MAAM,EAAE,GAAG,MAAM,CAAC,EAAE,CAAC;QACrB,MAAM,OAAO,GAAG,EAAE,EAAE,OAAO,IAAI,kBAAkB,CAAC;QAClD,MAAM,SAAS,GAAG,EAAE,EAAE,SAAS,IAAI,kBAAkB,CAAC;QACtD,uEAAuE;QACvE,0EAA0E;QAC1E,MAAM,OAAO,GAAG,EAAE,EAAE,OAAO,IAAI,SAAS,CAAC,IAAI,CAAC;QAE9C,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,IAAI,CAAC,MAAM,cAAc,EAAE,CAAC,CAAC;QAClE,MAAM,QAAQ,GAAG,OAAO,CAAC,KAAK,CAAC;QAC/B,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;QAE3C,0EAA0E;QAC1E,gDAAgD;QAChD,MAAM,OAAO,GAAG,MAAM,UAAU,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC;QACrE,IAAI,IAAY,CAAC;QACjB,IAAI,CAAC;YACH,uEAAuE;YACvE,mEAAmE;YACnE,mEAAmE;YACnE,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,UAAU,CAAC;gBACvC,eAAe,EAAE,KAAK;gBACtB,GAAG,CAAC,EAAE,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,EAAE,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;aACtD,CAAC,CAAC;YACH,IAAI,CAAC;gBACH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,OAAO,EAAE,CAAC;gBACrC,IAAI,CAAC;oBACH,MAAM,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,OAAO,EAAE,CAAC,CAAC;oBACpD,MAAM,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,CAAC,CAAC;oBACjD,IAAI,GAAG,MAAM
,IAAI,CAAC,OAAO,EAAE,CAAC;gBAC9B,CAAC;wBAAS,CAAC;oBACT,mEAAmE;oBACnE,6DAA6D;oBAC7D,MAAM,IAAI,CAAC,KAAK,EAAE,CAAC;gBACrB,CAAC;YACH,CAAC;oBAAS,CAAC;gBACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;YACxB,CAAC;QACH,CAAC;gBAAS,CAAC;YACT,MAAM,OAAO,CAAC,KAAK,EAAE,CAAC;QACxB,CAAC;QAED,wEAAwE;QACxE,sEAAsE;QACtE,oEAAoE;QACpE,+BAA+B;QAC/B,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;QACnC,IAAI,QAAQ,EAAE,QAAQ,KAAK,QAAQ,EAAE,CAAC;YACpC,OAAO;gBACL,KAAK,EAAE,EAAE;gBACT,WAAW,EAAE,IAAI;gBACjB,KAAK,EAAE;oBACL,aAAa,EAAE,SAAS;oBACxB,QAAQ,EAAE,QAAQ;iBACnB;aACF,CAAC;QACJ,CAAC;QAED,MAAM,KAAK,GAAG,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;QACzD,OAAO;YACL,KAAK;YACL,KAAK,EAAE;gBACL,aAAa,EAAE,SAAS;gBACxB,QAAQ,EAAE,QAAQ;aACnB;SACF,CAAC;IACJ,CAAC;CACF,CAAC"}
|
|
@@ -1,10 +1,4 @@
|
|
|
1
|
-
import type { Item, Source } from "../../schemas/index.js";
|
|
2
1
|
import type { FeedAdapter } from "./types.js";
|
|
3
|
-
|
|
4
|
-
* Parse an HTML document into validated `Item[]` using the source's
|
|
5
|
-
* `selectors`. Exported so tests can drive the parser directly without
|
|
6
|
-
* needing a fake HTTP layer.
|
|
7
|
-
*/
|
|
8
|
-
export declare function parseHtmlDocument(html: string, source: Source, fetchedAt: string): Item[];
|
|
2
|
+
export { parseHtmlDocument } from "./_html-common.js";
|
|
9
3
|
export declare const htmlAdapter: FeedAdapter;
|
|
10
4
|
//# sourceMappingURL=html.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html.d.ts","sourceRoot":"","sources":["../../../src/core/feeds/html.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,
|
|
1
|
+
{"version":3,"file":"html.d.ts","sourceRoot":"","sources":["../../../src/core/feeds/html.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,WAAW,EAAiC,MAAM,YAAY,CAAC;AAK7E,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAyCtD,eAAO,MAAM,WAAW,EAAE,WAuDzB,CAAC"}
|