@neurowire/ingest 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +42 -1
- package/dist/index.js +155 -12
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
|
@@ -11,9 +11,29 @@ interface RawDocument {
|
|
|
11
11
|
url: string;
|
|
12
12
|
contentType: string;
|
|
13
13
|
body: string;
|
|
14
|
+
etag?: string;
|
|
15
|
+
lastModified?: string;
|
|
16
|
+
/** True when the body was served from the cache via a 304 Not Modified. */
|
|
17
|
+
notModified?: boolean;
|
|
14
18
|
}
|
|
19
|
+
/** A previously fetched response plus its validators, kept for conditional requests. */
|
|
20
|
+
interface CachedResponse {
|
|
21
|
+
url: string;
|
|
22
|
+
contentType: string;
|
|
23
|
+
body: string;
|
|
24
|
+
etag?: string;
|
|
25
|
+
lastModified?: string;
|
|
26
|
+
}
|
|
27
|
+
/** A store of cached responses, injected by the caller (the library keeps no global state). */
|
|
28
|
+
interface ConditionalCache {
|
|
29
|
+
get(url: string): CachedResponse | undefined;
|
|
30
|
+
set(url: string, value: CachedResponse): void;
|
|
31
|
+
}
|
|
32
|
+
/** A simple Map-backed ConditionalCache. */
|
|
33
|
+
declare function createMemoryCache(): ConditionalCache;
|
|
15
34
|
interface FetchOptions {
|
|
16
35
|
signal?: AbortSignal;
|
|
36
|
+
cache?: ConditionalCache;
|
|
17
37
|
}
|
|
18
38
|
/** Fetch a URL over HTTP(S), following redirects, returning the body and final URL. */
|
|
19
39
|
declare function fetchDocument(url: string, options?: FetchOptions): Promise<RawDocument>;
|
|
@@ -92,6 +112,8 @@ interface FetchFeedOptions {
|
|
|
92
112
|
signal?: AbortSignal;
|
|
93
113
|
/** Max number of feed-link redirects to follow (default 3). */
|
|
94
114
|
maxDepth?: number;
|
|
115
|
+
/** A conditional (ETag/Last-Modified) response cache, owned by the caller. */
|
|
116
|
+
cache?: ConditionalCache;
|
|
95
117
|
}
|
|
96
118
|
/** Fetch a URL (website, RSS, or Atom) and normalize it to a NeurowireFeed. */
|
|
97
119
|
declare function fetchFeed(url: string, options?: FetchFeedOptions): Promise<NeurowireFeed>;
|
|
@@ -102,6 +124,8 @@ interface FetchMeshOptions {
|
|
|
102
124
|
signal?: AbortSignal;
|
|
103
125
|
/** Keep only the newest N merged entries. */
|
|
104
126
|
limit?: number;
|
|
127
|
+
/** A conditional response cache shared by every mesh source. */
|
|
128
|
+
cache?: ConditionalCache;
|
|
105
129
|
}
|
|
106
130
|
/**
|
|
107
131
|
* Fetch every source in a mesh (in parallel) and merge them into one feed.
|
|
@@ -124,10 +148,27 @@ declare function discoverFeedLink($: CheerioAPI, base: string): string | undefin
|
|
|
124
148
|
*/
|
|
125
149
|
declare function autodetect($: CheerioAPI, ctx: ParseContext): NeurowireFeed | null;
|
|
126
150
|
|
|
151
|
+
/** A proposed tap plus a preview of what it extracts, so a user can author a tap without DOM-spelunking. */
|
|
152
|
+
interface TemplateProposal {
|
|
153
|
+
template: FeedTemplate;
|
|
154
|
+
matched: number;
|
|
155
|
+
sampleTitles: string[];
|
|
156
|
+
}
|
|
157
|
+
/**
|
|
158
|
+
* Inspect a feed-less HTML page and PROPOSE a FeedTemplate (CSS selectors) for it.
|
|
159
|
+
*
|
|
160
|
+
* Heuristic: find repeated item-like containers (sibling `article`/`li`/class-patterned
|
|
161
|
+
* `div`s that each hold a heading and an `<a href>`), pick the selector whose matched set
|
|
162
|
+
* is largest and consistent, then derive `title`/`link`/`date` selectors relative to the
|
|
163
|
+
* item. The candidate is validated by running `applyTemplate`: a proposal is returned only
|
|
164
|
+
* when it extracts at least one entry, otherwise `undefined`.
|
|
165
|
+
*/
|
|
166
|
+
declare function proposeTemplate(html: string, url: string): TemplateProposal | undefined;
|
|
167
|
+
|
|
127
168
|
/** Register a per-host template. Validated with zod; ignored if it has no `host`. */
|
|
128
169
|
declare function registerTemplate(template: FeedTemplate): void;
|
|
129
170
|
/** Look up a template for a URL's hostname. */
|
|
130
171
|
declare function findTemplate(url: string): FeedTemplate | undefined;
|
|
131
172
|
declare function listTemplates(): FeedTemplate[];
|
|
132
173
|
|
|
133
|
-
export { type FeedDraft, type FeedKind, type FeedTemplate, FeedTemplateSchema, type FetchFeedOptions, type FetchMeshOptions, type FetchOptions, type ParseContext, type RawDocument, applyTemplate, autodetect, detectKind, discoverFeedLink, fetchDocument, fetchFeed, fetchMesh, finalizeFeed, findTemplate, ingestDocument, listTemplates, normDate, parseAtom, parseFeedString, parseJsonFeed, parseRdf, parseRss, registerTemplate, resolveUrl, stripHtml };
|
|
174
|
+
export { type CachedResponse, type ConditionalCache, type FeedDraft, type FeedKind, type FeedTemplate, FeedTemplateSchema, type FetchFeedOptions, type FetchMeshOptions, type FetchOptions, type ParseContext, type RawDocument, type TemplateProposal, applyTemplate, autodetect, createMemoryCache, detectKind, discoverFeedLink, fetchDocument, fetchFeed, fetchMesh, finalizeFeed, findTemplate, ingestDocument, listTemplates, normDate, parseAtom, parseFeedString, parseJsonFeed, parseRdf, parseRss, proposeTemplate, registerTemplate, resolveUrl, stripHtml };
|
package/dist/index.js
CHANGED
|
@@ -16,6 +16,15 @@ function detectKind(contentType, body) {
|
|
|
16
16
|
}
|
|
17
17
|
|
|
18
18
|
// src/fetch.ts
|
|
19
|
+
function createMemoryCache() {
|
|
20
|
+
const store = /* @__PURE__ */ new Map();
|
|
21
|
+
return {
|
|
22
|
+
get: (url) => store.get(url),
|
|
23
|
+
set: (url, value) => {
|
|
24
|
+
store.set(url, value);
|
|
25
|
+
}
|
|
26
|
+
};
|
|
27
|
+
}
|
|
19
28
|
var USER_AGENT = "Neurowire/0.1 (+https://github.com/neurowire/neurowire)";
|
|
20
29
|
var ACCEPT = "application/atom+xml, application/rss+xml, application/feed+json, application/json;q=0.9, text/html;q=0.8, */*;q=0.5";
|
|
21
30
|
async function fetchDocument(url, options = {}) {
|
|
@@ -28,17 +37,37 @@ async function fetchDocument(url, options = {}) {
|
|
|
28
37
|
if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
|
|
29
38
|
throw new Error(`Unsupported protocol: ${parsed.protocol}`);
|
|
30
39
|
}
|
|
40
|
+
const headers = { "user-agent": USER_AGENT, accept: ACCEPT };
|
|
41
|
+
const cached = options.cache?.get(url);
|
|
42
|
+
if (cached?.etag) headers["if-none-match"] = cached.etag;
|
|
43
|
+
if (cached?.lastModified) headers["if-modified-since"] = cached.lastModified;
|
|
31
44
|
const res = await fetch(url, {
|
|
32
45
|
redirect: "follow",
|
|
33
46
|
signal: options.signal,
|
|
34
|
-
headers
|
|
47
|
+
headers
|
|
35
48
|
});
|
|
49
|
+
if (res.status === 304 && cached) {
|
|
50
|
+
return {
|
|
51
|
+
url: cached.url,
|
|
52
|
+
contentType: cached.contentType,
|
|
53
|
+
body: cached.body,
|
|
54
|
+
etag: cached.etag,
|
|
55
|
+
lastModified: cached.lastModified,
|
|
56
|
+
notModified: true
|
|
57
|
+
};
|
|
58
|
+
}
|
|
36
59
|
if (!res.ok) {
|
|
37
60
|
throw new Error(`Upstream responded ${res.status} ${res.statusText} for ${url}`);
|
|
38
61
|
}
|
|
39
62
|
const contentType = res.headers.get("content-type") ?? "";
|
|
40
63
|
const body = await res.text();
|
|
41
|
-
|
|
64
|
+
const etag = res.headers.get("etag") ?? void 0;
|
|
65
|
+
const lastModified = res.headers.get("last-modified") ?? void 0;
|
|
66
|
+
const finalUrl = res.url || url;
|
|
67
|
+
if (options.cache) {
|
|
68
|
+
options.cache.set(url, { url: finalUrl, contentType, body, etag, lastModified });
|
|
69
|
+
}
|
|
70
|
+
return { url: finalUrl, contentType, body, etag, lastModified };
|
|
42
71
|
}
|
|
43
72
|
|
|
44
73
|
// src/ingest.ts
|
|
@@ -96,7 +125,10 @@ function attr(node, name) {
|
|
|
96
125
|
}
|
|
97
126
|
|
|
98
127
|
// src/util.ts
|
|
99
|
-
import {
|
|
128
|
+
import {
|
|
129
|
+
GENERATOR,
|
|
130
|
+
stableId
|
|
131
|
+
} from "@neurowire/core";
|
|
100
132
|
function resolveUrl(href, base) {
|
|
101
133
|
try {
|
|
102
134
|
return new URL(href, base).toString();
|
|
@@ -124,13 +156,17 @@ function newestEntryDate(entries) {
|
|
|
124
156
|
}
|
|
125
157
|
return max === Number.NEGATIVE_INFINITY ? void 0 : new Date(max).toISOString();
|
|
126
158
|
}
|
|
159
|
+
function withStableId(entry) {
|
|
160
|
+
if (entry.id.trim()) return entry;
|
|
161
|
+
return { ...entry, id: stableId(entry.link, entry.title) };
|
|
162
|
+
}
|
|
127
163
|
function finalizeFeed(draft, ctx) {
|
|
128
164
|
const updated = normDate(draft.updated) ?? newestEntryDate(draft.entries) ?? (/* @__PURE__ */ new Date()).toISOString();
|
|
129
165
|
const feed = {
|
|
130
166
|
id: draft.id?.trim() || ctx.sourceUrl,
|
|
131
167
|
title: draft.title?.trim() || "Untitled",
|
|
132
168
|
updated,
|
|
133
|
-
entries: draft.entries,
|
|
169
|
+
entries: draft.entries.map(withStableId),
|
|
134
170
|
generator: { name: GENERATOR.name, version: GENERATOR.version }
|
|
135
171
|
};
|
|
136
172
|
if (draft.home) feed.home = resolveUrl(draft.home, ctx.sourceUrl);
|
|
@@ -209,7 +245,7 @@ function jsonLdEntry(node, ctx) {
|
|
|
209
245
|
const title = str(node.headline) ?? str(node.name);
|
|
210
246
|
if (!url || !title) return null;
|
|
211
247
|
const link = resolveUrl(url, ctx.sourceUrl);
|
|
212
|
-
const entry = { id:
|
|
248
|
+
const entry = { id: "", title, link };
|
|
213
249
|
const published = normDate(str(node.datePublished));
|
|
214
250
|
const updated = normDate(str(node.dateModified));
|
|
215
251
|
if (published) entry.published = published;
|
|
@@ -258,7 +294,7 @@ function fromSemantic($, ctx) {
|
|
|
258
294
|
const href = $a.attr("href") ?? $el.find("a[href]").first().attr("href");
|
|
259
295
|
if (!title || !href) return;
|
|
260
296
|
const link = resolveUrl(href, ctx.sourceUrl);
|
|
261
|
-
const entry = { id:
|
|
297
|
+
const entry = { id: "", title, link };
|
|
262
298
|
const $time = $el.find("time[datetime]").first();
|
|
263
299
|
const date = normDate($time.attr("datetime") ?? $el.find("time").first().text().trim());
|
|
264
300
|
if (date) entry.published = date;
|
|
@@ -322,7 +358,7 @@ function applyTemplate($, template, ctx) {
|
|
|
322
358
|
const href = $link.attr("href") ?? $link.find("a").first().attr("href");
|
|
323
359
|
if (!title2 || !href) return;
|
|
324
360
|
const link = resolveUrl(href, ctx.sourceUrl);
|
|
325
|
-
const entry = { id:
|
|
361
|
+
const entry = { id: "", title: title2, link };
|
|
326
362
|
if (template.date) {
|
|
327
363
|
const $date = $el.find(template.date).first();
|
|
328
364
|
const date = normDate($date.attr("datetime") ?? $date.text().trim());
|
|
@@ -403,7 +439,7 @@ function atomEntry(node, ctx) {
|
|
|
403
439
|
const href = pickLink(links, "alternate") ?? attr(links[0], "href") ?? text(get(node, "id")) ?? "";
|
|
404
440
|
const link = resolveUrl(href, ctx.sourceUrl);
|
|
405
441
|
const entry = {
|
|
406
|
-
id: text(get(node, "id")) ??
|
|
442
|
+
id: text(get(node, "id")) ?? "",
|
|
407
443
|
title: text(get(node, "title")) ?? "Untitled",
|
|
408
444
|
link
|
|
409
445
|
};
|
|
@@ -439,7 +475,7 @@ function rssItem(node, ctx) {
|
|
|
439
475
|
const guid = text(get(node, "guid"));
|
|
440
476
|
const link = resolveUrl(text(get(node, "link")) ?? guid ?? "", ctx.sourceUrl);
|
|
441
477
|
const entry = {
|
|
442
|
-
id: guid ??
|
|
478
|
+
id: guid ?? "",
|
|
443
479
|
title: text(get(node, "title")) ?? "Untitled",
|
|
444
480
|
link
|
|
445
481
|
};
|
|
@@ -497,7 +533,7 @@ function parseJsonFeed(raw, ctx) {
|
|
|
497
533
|
const entries = toArray(data.items).map((item) => {
|
|
498
534
|
const link = resolveUrl(item.url ?? item.external_url ?? "", ctx.sourceUrl);
|
|
499
535
|
const entry = {
|
|
500
|
-
id: item.id !== void 0 ? String(item.id) :
|
|
536
|
+
id: item.id !== void 0 ? String(item.id) : "",
|
|
501
537
|
title: item.title ?? "Untitled",
|
|
502
538
|
link
|
|
503
539
|
};
|
|
@@ -538,7 +574,7 @@ async function fetchFeed(url, options = {}) {
|
|
|
538
574
|
return ingest(url, options, 0);
|
|
539
575
|
}
|
|
540
576
|
async function ingest(url, options, depth) {
|
|
541
|
-
const doc = await fetchDocument(url, { signal: options.signal });
|
|
577
|
+
const doc = await fetchDocument(url, { signal: options.signal, cache: options.cache });
|
|
542
578
|
return ingestDocument(doc, options, depth);
|
|
543
579
|
}
|
|
544
580
|
async function ingestDocument(doc, options = {}, depth = 0) {
|
|
@@ -571,7 +607,7 @@ async function ingestDocument(doc, options = {}, depth = 0) {
|
|
|
571
607
|
// src/mesh.ts
|
|
572
608
|
import { mergeFeeds } from "@neurowire/core";
|
|
573
609
|
async function fetchMesh(mesh, options = {}) {
|
|
574
|
-
const fetchOptions = { signal: options.signal };
|
|
610
|
+
const fetchOptions = { signal: options.signal, cache: options.cache };
|
|
575
611
|
const results = await Promise.allSettled(
|
|
576
612
|
mesh.sources.map(
|
|
577
613
|
async (source) => ({
|
|
@@ -589,10 +625,116 @@ async function fetchMesh(mesh, options = {}) {
|
|
|
589
625
|
}
|
|
590
626
|
return mergeFeeds(mesh.name, parts, { limit: options.limit });
|
|
591
627
|
}
|
|
628
|
+
|
|
629
|
+
// src/html/propose.ts
|
|
630
|
+
import { load as load2 } from "cheerio";
|
|
631
|
+
var HEADING_SELECTOR = "h1, h2, h3, h4";
|
|
632
|
+
var TITLE_CANDIDATES = ["h2", "h3", "h1", "h4"];
|
|
633
|
+
function hostOf(url) {
|
|
634
|
+
try {
|
|
635
|
+
return new URL(url).hostname || void 0;
|
|
636
|
+
} catch {
|
|
637
|
+
return void 0;
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
function looksLikeItem($, el) {
|
|
641
|
+
const $el = $(el);
|
|
642
|
+
const hasLink = $el.is("a[href]") || $el.find("a[href]").length > 0;
|
|
643
|
+
const hasTitle = $el.find(HEADING_SELECTOR).length > 0 || $el.is("a[href]");
|
|
644
|
+
return hasLink && hasTitle;
|
|
645
|
+
}
|
|
646
|
+
function selectorFor($, el) {
|
|
647
|
+
const $el = $(el);
|
|
648
|
+
const tag = ($el.prop("tagName") ?? "div").toLowerCase();
|
|
649
|
+
const className = ($el.attr("class") ?? "").trim();
|
|
650
|
+
if (!className) return tag;
|
|
651
|
+
const first = className.split(/\s+/)[0];
|
|
652
|
+
return first ? `${tag}.${first}` : tag;
|
|
653
|
+
}
|
|
654
|
+
function candidateSelectors($) {
|
|
655
|
+
const counts = /* @__PURE__ */ new Map();
|
|
656
|
+
$("article, li, div, a[href]").each((_, el) => {
|
|
657
|
+
if (!looksLikeItem($, el)) return;
|
|
658
|
+
const selector = selectorFor($, el);
|
|
659
|
+
counts.set(selector, (counts.get(selector) ?? 0) + 1);
|
|
660
|
+
});
|
|
661
|
+
for (const base of ["article"]) {
|
|
662
|
+
const n = $(base).filter((_, el) => looksLikeItem($, el)).length;
|
|
663
|
+
if (n > 0) counts.set(base, Math.max(counts.get(base) ?? 0, n));
|
|
664
|
+
}
|
|
665
|
+
const anchorRooted = (selector) => selector === "a" || selector.startsWith("a.");
|
|
666
|
+
return [...counts.entries()].filter(([, n]) => n >= 2).sort((a, b) => b[1] - a[1] || Number(anchorRooted(a[0])) - Number(anchorRooted(b[0]))).map(([selector]) => selector);
|
|
667
|
+
}
|
|
668
|
+
function titleSelectorFor($, itemSelector) {
|
|
669
|
+
const $first = $(itemSelector).first();
|
|
670
|
+
if ($first.is("a[href]") && !$first.find(HEADING_SELECTOR).length) return "a";
|
|
671
|
+
for (const candidate of TITLE_CANDIDATES) {
|
|
672
|
+
if ($first.find(candidate).first().text().trim()) return candidate;
|
|
673
|
+
}
|
|
674
|
+
if ($first.find("a[href]").first().text().trim()) return "a";
|
|
675
|
+
return void 0;
|
|
676
|
+
}
|
|
677
|
+
function itemIsLink($, itemSelector) {
|
|
678
|
+
let total = 0;
|
|
679
|
+
let anchors = 0;
|
|
680
|
+
$(itemSelector).each((_, el) => {
|
|
681
|
+
total += 1;
|
|
682
|
+
if ($(el).is("a[href]")) anchors += 1;
|
|
683
|
+
});
|
|
684
|
+
return total > 0 && anchors === total;
|
|
685
|
+
}
|
|
686
|
+
function linkSelectorFor($, itemSelector, titleSelector) {
|
|
687
|
+
if (itemIsLink($, itemSelector)) return void 0;
|
|
688
|
+
const $first = $(itemSelector).first();
|
|
689
|
+
if ($first.find(`${titleSelector} a[href]`).length) return `${titleSelector} a`;
|
|
690
|
+
return "a";
|
|
691
|
+
}
|
|
692
|
+
function dateSelectorFor($, itemSelector) {
|
|
693
|
+
const $first = $(itemSelector).first();
|
|
694
|
+
if ($first.find("time").length) return "time";
|
|
695
|
+
if ($first.find("[datetime]").length) return "[datetime]";
|
|
696
|
+
return void 0;
|
|
697
|
+
}
|
|
698
|
+
function feedTitleOf($) {
|
|
699
|
+
const title = $("title").first().text().trim();
|
|
700
|
+
if (title) return title;
|
|
701
|
+
const h1 = $("h1").first().text().trim();
|
|
702
|
+
return h1 || void 0;
|
|
703
|
+
}
|
|
704
|
+
function proposeTemplate(html, url) {
|
|
705
|
+
const $ = load2(html);
|
|
706
|
+
const ctx = { sourceUrl: url };
|
|
707
|
+
let best;
|
|
708
|
+
for (const itemSelector of candidateSelectors($)) {
|
|
709
|
+
const titleSelector = titleSelectorFor($, itemSelector);
|
|
710
|
+
if (!titleSelector) continue;
|
|
711
|
+
const template = { item: itemSelector, title: titleSelector };
|
|
712
|
+
const host = hostOf(url);
|
|
713
|
+
if (host) template.host = host;
|
|
714
|
+
const feedTitle = feedTitleOf($);
|
|
715
|
+
if (feedTitle) template.feedTitle = feedTitle;
|
|
716
|
+
const link = linkSelectorFor($, itemSelector, titleSelector);
|
|
717
|
+
if (link) template.link = link;
|
|
718
|
+
const date = dateSelectorFor($, itemSelector);
|
|
719
|
+
if (date) template.date = date;
|
|
720
|
+
const feed = applyTemplate($, template, ctx);
|
|
721
|
+
const matched = feed.entries.length;
|
|
722
|
+
if (matched === 0) continue;
|
|
723
|
+
if (!best || matched > best.matched) {
|
|
724
|
+
best = {
|
|
725
|
+
template,
|
|
726
|
+
matched,
|
|
727
|
+
sampleTitles: feed.entries.slice(0, 5).map((e) => e.title)
|
|
728
|
+
};
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
return best;
|
|
732
|
+
}
|
|
592
733
|
export {
|
|
593
734
|
FeedTemplateSchema,
|
|
594
735
|
applyTemplate,
|
|
595
736
|
autodetect,
|
|
737
|
+
createMemoryCache,
|
|
596
738
|
detectKind,
|
|
597
739
|
discoverFeedLink,
|
|
598
740
|
fetchDocument,
|
|
@@ -608,6 +750,7 @@ export {
|
|
|
608
750
|
parseJsonFeed,
|
|
609
751
|
parseRdf,
|
|
610
752
|
parseRss,
|
|
753
|
+
proposeTemplate,
|
|
611
754
|
registerTemplate,
|
|
612
755
|
resolveUrl,
|
|
613
756
|
stripHtml
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@neurowire/ingest",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Fetch, detect, and parse RSS/Atom/RDF/JSON feeds and HTML pages into the Neurowire model.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|
|
@@ -48,7 +48,7 @@
|
|
|
48
48
|
"cheerio": "^1.0.0",
|
|
49
49
|
"fast-xml-parser": "^4.5.1",
|
|
50
50
|
"zod": "^3.24.1",
|
|
51
|
-
"@neurowire/core": "0.
|
|
51
|
+
"@neurowire/core": "0.5.0"
|
|
52
52
|
},
|
|
53
53
|
"devDependencies": {
|
|
54
54
|
"@types/node": "^22.10.5",
|