@neurowire/ingest 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -11,9 +11,29 @@ interface RawDocument {
11
11
  url: string;
12
12
  contentType: string;
13
13
  body: string;
14
+ etag?: string;
15
+ lastModified?: string;
16
+ /** True when the body was served from the cache via a 304 Not Modified. */
17
+ notModified?: boolean;
14
18
  }
19
+ /** A previously fetched response plus its validators, kept for conditional requests. */
20
+ interface CachedResponse {
21
+ url: string;
22
+ contentType: string;
23
+ body: string;
24
+ etag?: string;
25
+ lastModified?: string;
26
+ }
27
+ /** A store of cached responses, injected by the caller (the library keeps no global state). */
28
+ interface ConditionalCache {
29
+ get(url: string): CachedResponse | undefined;
30
+ set(url: string, value: CachedResponse): void;
31
+ }
32
+ /** A simple Map-backed ConditionalCache. */
33
+ declare function createMemoryCache(): ConditionalCache;
15
34
  interface FetchOptions {
16
35
  signal?: AbortSignal;
36
+ cache?: ConditionalCache;
17
37
  }
18
38
  /** Fetch a URL over HTTP(S), following redirects, returning the body and final URL. */
19
39
  declare function fetchDocument(url: string, options?: FetchOptions): Promise<RawDocument>;
@@ -92,6 +112,8 @@ interface FetchFeedOptions {
92
112
  signal?: AbortSignal;
93
113
  /** Max number of feed-link redirects to follow (default 3). */
94
114
  maxDepth?: number;
115
+ /** A conditional (ETag/Last-Modified) response cache, owned by the caller. */
116
+ cache?: ConditionalCache;
95
117
  }
96
118
  /** Fetch a URL (website, RSS, or Atom) and normalize it to a NeurowireFeed. */
97
119
  declare function fetchFeed(url: string, options?: FetchFeedOptions): Promise<NeurowireFeed>;
@@ -102,6 +124,8 @@ interface FetchMeshOptions {
102
124
  signal?: AbortSignal;
103
125
  /** Keep only the newest N merged entries. */
104
126
  limit?: number;
127
+ /** A conditional response cache shared by every mesh source. */
128
+ cache?: ConditionalCache;
105
129
  }
106
130
  /**
107
131
  * Fetch every source in a mesh (in parallel) and merge them into one feed.
@@ -124,10 +148,27 @@ declare function discoverFeedLink($: CheerioAPI, base: string): string | undefin
124
148
  */
125
149
  declare function autodetect($: CheerioAPI, ctx: ParseContext): NeurowireFeed | null;
126
150
 
151
+ /** A proposed tap plus a preview of what it extracts, so a user can author a tap without DOM-spelunking. */
152
+ interface TemplateProposal {
153
+ template: FeedTemplate;
154
+ matched: number;
155
+ sampleTitles: string[];
156
+ }
157
+ /**
158
+ * Inspect a feed-less HTML page and PROPOSE a FeedTemplate (CSS selectors) for it.
159
+ *
160
+ * Heuristic: find repeated item-like containers (sibling `article`/`li`/class-patterned
161
+ * `div`s that each hold a heading and an `<a href>`), pick the selector whose matched set
162
+ * is largest and consistent, then derive `title`/`link`/`date` selectors relative to the
163
+ * item. The candidate is validated by running `applyTemplate`: a proposal is returned only
164
+ * when it extracts at least one entry, otherwise `undefined`.
165
+ */
166
+ declare function proposeTemplate(html: string, url: string): TemplateProposal | undefined;
167
+
127
168
  /** Register a per-host template. Validated with zod; ignored if it has no `host`. */
128
169
  declare function registerTemplate(template: FeedTemplate): void;
129
170
  /** Look up a template for a URL's hostname. */
130
171
  declare function findTemplate(url: string): FeedTemplate | undefined;
131
172
  declare function listTemplates(): FeedTemplate[];
132
173
 
133
- export { type FeedDraft, type FeedKind, type FeedTemplate, FeedTemplateSchema, type FetchFeedOptions, type FetchMeshOptions, type FetchOptions, type ParseContext, type RawDocument, applyTemplate, autodetect, detectKind, discoverFeedLink, fetchDocument, fetchFeed, fetchMesh, finalizeFeed, findTemplate, ingestDocument, listTemplates, normDate, parseAtom, parseFeedString, parseJsonFeed, parseRdf, parseRss, registerTemplate, resolveUrl, stripHtml };
174
+ export { type CachedResponse, type ConditionalCache, type FeedDraft, type FeedKind, type FeedTemplate, FeedTemplateSchema, type FetchFeedOptions, type FetchMeshOptions, type FetchOptions, type ParseContext, type RawDocument, type TemplateProposal, applyTemplate, autodetect, createMemoryCache, detectKind, discoverFeedLink, fetchDocument, fetchFeed, fetchMesh, finalizeFeed, findTemplate, ingestDocument, listTemplates, normDate, parseAtom, parseFeedString, parseJsonFeed, parseRdf, parseRss, proposeTemplate, registerTemplate, resolveUrl, stripHtml };
package/dist/index.js CHANGED
@@ -16,6 +16,15 @@ function detectKind(contentType, body) {
16
16
  }
17
17
 
18
18
  // src/fetch.ts
19
+ function createMemoryCache() {
20
+ const store = /* @__PURE__ */ new Map();
21
+ return {
22
+ get: (url) => store.get(url),
23
+ set: (url, value) => {
24
+ store.set(url, value);
25
+ }
26
+ };
27
+ }
19
28
  var USER_AGENT = "Neurowire/0.1 (+https://github.com/neurowire/neurowire)";
20
29
  var ACCEPT = "application/atom+xml, application/rss+xml, application/feed+json, application/json;q=0.9, text/html;q=0.8, */*;q=0.5";
21
30
  async function fetchDocument(url, options = {}) {
@@ -28,17 +37,37 @@ async function fetchDocument(url, options = {}) {
28
37
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
29
38
  throw new Error(`Unsupported protocol: ${parsed.protocol}`);
30
39
  }
40
+ const headers = { "user-agent": USER_AGENT, accept: ACCEPT };
41
+ const cached = options.cache?.get(url);
42
+ if (cached?.etag) headers["if-none-match"] = cached.etag;
43
+ if (cached?.lastModified) headers["if-modified-since"] = cached.lastModified;
31
44
  const res = await fetch(url, {
32
45
  redirect: "follow",
33
46
  signal: options.signal,
34
- headers: { "user-agent": USER_AGENT, accept: ACCEPT }
47
+ headers
35
48
  });
49
+ if (res.status === 304 && cached) {
50
+ return {
51
+ url: cached.url,
52
+ contentType: cached.contentType,
53
+ body: cached.body,
54
+ etag: cached.etag,
55
+ lastModified: cached.lastModified,
56
+ notModified: true
57
+ };
58
+ }
36
59
  if (!res.ok) {
37
60
  throw new Error(`Upstream responded ${res.status} ${res.statusText} for ${url}`);
38
61
  }
39
62
  const contentType = res.headers.get("content-type") ?? "";
40
63
  const body = await res.text();
41
- return { url: res.url || url, contentType, body };
64
+ const etag = res.headers.get("etag") ?? void 0;
65
+ const lastModified = res.headers.get("last-modified") ?? void 0;
66
+ const finalUrl = res.url || url;
67
+ if (options.cache) {
68
+ options.cache.set(url, { url: finalUrl, contentType, body, etag, lastModified });
69
+ }
70
+ return { url: finalUrl, contentType, body, etag, lastModified };
42
71
  }
43
72
 
44
73
  // src/ingest.ts
@@ -96,7 +125,10 @@ function attr(node, name) {
96
125
  }
97
126
 
98
127
  // src/util.ts
99
- import { GENERATOR } from "@neurowire/core";
128
+ import {
129
+ GENERATOR,
130
+ stableId
131
+ } from "@neurowire/core";
100
132
  function resolveUrl(href, base) {
101
133
  try {
102
134
  return new URL(href, base).toString();
@@ -124,13 +156,17 @@ function newestEntryDate(entries) {
124
156
  }
125
157
  return max === Number.NEGATIVE_INFINITY ? void 0 : new Date(max).toISOString();
126
158
  }
159
+ function withStableId(entry) {
160
+ if (entry.id.trim()) return entry;
161
+ return { ...entry, id: stableId(entry.link, entry.title) };
162
+ }
127
163
  function finalizeFeed(draft, ctx) {
128
164
  const updated = normDate(draft.updated) ?? newestEntryDate(draft.entries) ?? (/* @__PURE__ */ new Date()).toISOString();
129
165
  const feed = {
130
166
  id: draft.id?.trim() || ctx.sourceUrl,
131
167
  title: draft.title?.trim() || "Untitled",
132
168
  updated,
133
- entries: draft.entries,
169
+ entries: draft.entries.map(withStableId),
134
170
  generator: { name: GENERATOR.name, version: GENERATOR.version }
135
171
  };
136
172
  if (draft.home) feed.home = resolveUrl(draft.home, ctx.sourceUrl);
@@ -209,7 +245,7 @@ function jsonLdEntry(node, ctx) {
209
245
  const title = str(node.headline) ?? str(node.name);
210
246
  if (!url || !title) return null;
211
247
  const link = resolveUrl(url, ctx.sourceUrl);
212
- const entry = { id: link, title, link };
248
+ const entry = { id: "", title, link };
213
249
  const published = normDate(str(node.datePublished));
214
250
  const updated = normDate(str(node.dateModified));
215
251
  if (published) entry.published = published;
@@ -258,7 +294,7 @@ function fromSemantic($, ctx) {
258
294
  const href = $a.attr("href") ?? $el.find("a[href]").first().attr("href");
259
295
  if (!title || !href) return;
260
296
  const link = resolveUrl(href, ctx.sourceUrl);
261
- const entry = { id: link, title, link };
297
+ const entry = { id: "", title, link };
262
298
  const $time = $el.find("time[datetime]").first();
263
299
  const date = normDate($time.attr("datetime") ?? $el.find("time").first().text().trim());
264
300
  if (date) entry.published = date;
@@ -322,7 +358,7 @@ function applyTemplate($, template, ctx) {
322
358
  const href = $link.attr("href") ?? $link.find("a").first().attr("href");
323
359
  if (!title2 || !href) return;
324
360
  const link = resolveUrl(href, ctx.sourceUrl);
325
- const entry = { id: link, title: title2, link };
361
+ const entry = { id: "", title: title2, link };
326
362
  if (template.date) {
327
363
  const $date = $el.find(template.date).first();
328
364
  const date = normDate($date.attr("datetime") ?? $date.text().trim());
@@ -403,7 +439,7 @@ function atomEntry(node, ctx) {
403
439
  const href = pickLink(links, "alternate") ?? attr(links[0], "href") ?? text(get(node, "id")) ?? "";
404
440
  const link = resolveUrl(href, ctx.sourceUrl);
405
441
  const entry = {
406
- id: text(get(node, "id")) ?? link,
442
+ id: text(get(node, "id")) ?? "",
407
443
  title: text(get(node, "title")) ?? "Untitled",
408
444
  link
409
445
  };
@@ -439,7 +475,7 @@ function rssItem(node, ctx) {
439
475
  const guid = text(get(node, "guid"));
440
476
  const link = resolveUrl(text(get(node, "link")) ?? guid ?? "", ctx.sourceUrl);
441
477
  const entry = {
442
- id: guid ?? link,
478
+ id: guid ?? "",
443
479
  title: text(get(node, "title")) ?? "Untitled",
444
480
  link
445
481
  };
@@ -497,7 +533,7 @@ function parseJsonFeed(raw, ctx) {
497
533
  const entries = toArray(data.items).map((item) => {
498
534
  const link = resolveUrl(item.url ?? item.external_url ?? "", ctx.sourceUrl);
499
535
  const entry = {
500
- id: item.id !== void 0 ? String(item.id) : link,
536
+ id: item.id !== void 0 ? String(item.id) : "",
501
537
  title: item.title ?? "Untitled",
502
538
  link
503
539
  };
@@ -538,7 +574,7 @@ async function fetchFeed(url, options = {}) {
538
574
  return ingest(url, options, 0);
539
575
  }
540
576
  async function ingest(url, options, depth) {
541
- const doc = await fetchDocument(url, { signal: options.signal });
577
+ const doc = await fetchDocument(url, { signal: options.signal, cache: options.cache });
542
578
  return ingestDocument(doc, options, depth);
543
579
  }
544
580
  async function ingestDocument(doc, options = {}, depth = 0) {
@@ -571,7 +607,7 @@ async function ingestDocument(doc, options = {}, depth = 0) {
571
607
  // src/mesh.ts
572
608
  import { mergeFeeds } from "@neurowire/core";
573
609
  async function fetchMesh(mesh, options = {}) {
574
- const fetchOptions = { signal: options.signal };
610
+ const fetchOptions = { signal: options.signal, cache: options.cache };
575
611
  const results = await Promise.allSettled(
576
612
  mesh.sources.map(
577
613
  async (source) => ({
@@ -589,10 +625,116 @@ async function fetchMesh(mesh, options = {}) {
589
625
  }
590
626
  return mergeFeeds(mesh.name, parts, { limit: options.limit });
591
627
  }
628
+
629
+ // src/html/propose.ts
630
+ import { load as load2 } from "cheerio";
631
+ var HEADING_SELECTOR = "h1, h2, h3, h4";
632
+ var TITLE_CANDIDATES = ["h2", "h3", "h1", "h4"];
633
+ function hostOf(url) {
634
+ try {
635
+ return new URL(url).hostname || void 0;
636
+ } catch {
637
+ return void 0;
638
+ }
639
+ }
640
+ function looksLikeItem($, el) {
641
+ const $el = $(el);
642
+ const hasLink = $el.is("a[href]") || $el.find("a[href]").length > 0;
643
+ const hasTitle = $el.find(HEADING_SELECTOR).length > 0 || $el.is("a[href]");
644
+ return hasLink && hasTitle;
645
+ }
646
+ function selectorFor($, el) {
647
+ const $el = $(el);
648
+ const tag = ($el.prop("tagName") ?? "div").toLowerCase();
649
+ const className = ($el.attr("class") ?? "").trim();
650
+ if (!className) return tag;
651
+ const first = className.split(/\s+/)[0];
652
+ return first ? `${tag}.${first}` : tag;
653
+ }
654
+ function candidateSelectors($) {
655
+ const counts = /* @__PURE__ */ new Map();
656
+ $("article, li, div, a[href]").each((_, el) => {
657
+ if (!looksLikeItem($, el)) return;
658
+ const selector = selectorFor($, el);
659
+ counts.set(selector, (counts.get(selector) ?? 0) + 1);
660
+ });
661
+ for (const base of ["article"]) {
662
+ const n = $(base).filter((_, el) => looksLikeItem($, el)).length;
663
+ if (n > 0) counts.set(base, Math.max(counts.get(base) ?? 0, n));
664
+ }
665
+ const anchorRooted = (selector) => selector === "a" || selector.startsWith("a.");
666
+ return [...counts.entries()].filter(([, n]) => n >= 2).sort((a, b) => b[1] - a[1] || Number(anchorRooted(a[0])) - Number(anchorRooted(b[0]))).map(([selector]) => selector);
667
+ }
668
+ function titleSelectorFor($, itemSelector) {
669
+ const $first = $(itemSelector).first();
670
+ if ($first.is("a[href]") && !$first.find(HEADING_SELECTOR).length) return "a";
671
+ for (const candidate of TITLE_CANDIDATES) {
672
+ if ($first.find(candidate).first().text().trim()) return candidate;
673
+ }
674
+ if ($first.find("a[href]").first().text().trim()) return "a";
675
+ return void 0;
676
+ }
677
+ function itemIsLink($, itemSelector) {
678
+ let total = 0;
679
+ let anchors = 0;
680
+ $(itemSelector).each((_, el) => {
681
+ total += 1;
682
+ if ($(el).is("a[href]")) anchors += 1;
683
+ });
684
+ return total > 0 && anchors === total;
685
+ }
686
+ function linkSelectorFor($, itemSelector, titleSelector) {
687
+ if (itemIsLink($, itemSelector)) return void 0;
688
+ const $first = $(itemSelector).first();
689
+ if ($first.find(`${titleSelector} a[href]`).length) return `${titleSelector} a`;
690
+ return "a";
691
+ }
692
+ function dateSelectorFor($, itemSelector) {
693
+ const $first = $(itemSelector).first();
694
+ if ($first.find("time").length) return "time";
695
+ if ($first.find("[datetime]").length) return "[datetime]";
696
+ return void 0;
697
+ }
698
+ function feedTitleOf($) {
699
+ const title = $("title").first().text().trim();
700
+ if (title) return title;
701
+ const h1 = $("h1").first().text().trim();
702
+ return h1 || void 0;
703
+ }
704
+ function proposeTemplate(html, url) {
705
+ const $ = load2(html);
706
+ const ctx = { sourceUrl: url };
707
+ let best;
708
+ for (const itemSelector of candidateSelectors($)) {
709
+ const titleSelector = titleSelectorFor($, itemSelector);
710
+ if (!titleSelector) continue;
711
+ const template = { item: itemSelector, title: titleSelector };
712
+ const host = hostOf(url);
713
+ if (host) template.host = host;
714
+ const feedTitle = feedTitleOf($);
715
+ if (feedTitle) template.feedTitle = feedTitle;
716
+ const link = linkSelectorFor($, itemSelector, titleSelector);
717
+ if (link) template.link = link;
718
+ const date = dateSelectorFor($, itemSelector);
719
+ if (date) template.date = date;
720
+ const feed = applyTemplate($, template, ctx);
721
+ const matched = feed.entries.length;
722
+ if (matched === 0) continue;
723
+ if (!best || matched > best.matched) {
724
+ best = {
725
+ template,
726
+ matched,
727
+ sampleTitles: feed.entries.slice(0, 5).map((e) => e.title)
728
+ };
729
+ }
730
+ }
731
+ return best;
732
+ }
592
733
  export {
593
734
  FeedTemplateSchema,
594
735
  applyTemplate,
595
736
  autodetect,
737
+ createMemoryCache,
596
738
  detectKind,
597
739
  discoverFeedLink,
598
740
  fetchDocument,
@@ -608,6 +750,7 @@ export {
608
750
  parseJsonFeed,
609
751
  parseRdf,
610
752
  parseRss,
753
+ proposeTemplate,
611
754
  registerTemplate,
612
755
  resolveUrl,
613
756
  stripHtml
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@neurowire/ingest",
3
- "version": "0.2.0",
3
+ "version": "0.4.0",
4
4
  "description": "Fetch, detect, and parse RSS/Atom/RDF/JSON feeds and HTML pages into the Neurowire model.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",
@@ -48,7 +48,7 @@
48
48
  "cheerio": "^1.0.0",
49
49
  "fast-xml-parser": "^4.5.1",
50
50
  "zod": "^3.24.1",
51
- "@neurowire/core": "0.3.0"
51
+ "@neurowire/core": "0.5.0"
52
52
  },
53
53
  "devDependencies": {
54
54
  "@types/node": "^22.10.5",