feedcanon 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -41,8 +41,8 @@ This is a simplified flow. For complete details, see [How It Works](https://feed
41
41
  1. Fetch the input URL and parse the feed to establish reference content.
42
42
  2. Extract the feed's declared self URL (if present).
43
43
  3. Validate the self URL by fetching and comparing content.
44
- 4. Generate URL variants ordered from cleanest to least clean.
45
- 5. Test variants in order—the first one serving identical content wins.
44
+ 4. Generate URL candidates ordered from cleanest to least clean.
45
+ 5. Test candidates in order—the first one serving identical content wins.
46
46
  6. Upgrade HTTP to HTTPS if both serve identical content.
47
47
 
48
48
  ### Customization
@@ -53,7 +53,7 @@ Feedcanon is designed to be flexible. Every major component can be replaced or e
53
53
  - **Database lookup** — use `existsFn` to check if a URL already exists in your database.
54
54
  - **Custom fetch** — use your own HTTP client (Axios, Got, Ky, etc.)
55
55
  - **Custom parser** — bring your own parser (Feedsmith by default).
56
- - **Custom tiers** — define your own URL normalization variants.
56
+ - **Custom tiers** — define your own URL normalization tiers.
57
57
  - **Custom platforms** — add handlers to normalize domain aliases (like FeedBurner).
58
58
 
59
59
  ## Quick Start
package/dist/defaults.cjs CHANGED
@@ -1,3 +1,4 @@
1
+ const require_utils = require('./utils.cjs');
1
2
  let feedsmith = require("feedsmith");
2
3
 
3
4
  //#region src/defaults.ts
@@ -183,15 +184,6 @@ const defaultFetch = async (url, options) => {
183
184
  status: response.status
184
185
  };
185
186
  };
186
- const trailingSlashPattern = /("(?:https?:\/\/|\/)[^"]+)\/([?"])/g;
187
- const neutralizeFeedUrls = (signature, url) => {
188
- try {
189
- const escapedHost = new URL("/", url).host.replace(/^www\./, "").replaceAll(".", "\\.");
190
- return signature.replace(new RegExp(`https?://(?:www\\.)?${escapedHost}(?=[/"])(/)?`, "g"), "/").replace(trailingSlashPattern, "$1$2");
191
- } catch {
192
- return signature;
193
- }
194
- };
195
187
  const retrieveSelfLink = (parsed) => {
196
188
  switch (parsed.format) {
197
189
  case "atom": return parsed.feed.links?.find((link) => link.rel === "self");
@@ -209,49 +201,48 @@ const defaultParser = {
209
201
  return parsed.format === "json" ? parsed.feed.feed_url : retrieveSelfLink(parsed)?.href;
210
202
  },
211
203
  getSignature: (parsed, url) => {
212
- if (parsed.format === "json") {
213
- const originalSelfUrl = parsed.feed.feed_url;
214
- parsed.feed.feed_url = void 0;
215
- const signature$1 = JSON.stringify(parsed.feed);
216
- parsed.feed.feed_url = originalSelfUrl;
217
- return neutralizeFeedUrls(signature$1, url);
218
- }
219
204
  let signature;
220
- let originalBuildDate;
221
- let originalPubDate;
222
- let originalLink;
223
- if (parsed.format === "rss") {
224
- originalBuildDate = parsed.feed.lastBuildDate;
225
- originalPubDate = parsed.feed.pubDate;
226
- originalLink = parsed.feed.link;
227
- parsed.feed.lastBuildDate = void 0;
228
- parsed.feed.pubDate = void 0;
229
- parsed.feed.link = void 0;
230
- } else if (parsed.format === "rdf") {
231
- originalLink = parsed.feed.link;
232
- parsed.feed.link = void 0;
233
- } else if (parsed.format === "atom") {
234
- originalBuildDate = parsed.feed.updated;
235
- parsed.feed.updated = void 0;
236
- }
237
- const link = retrieveSelfLink(parsed);
238
- if (!link) signature = JSON.stringify(parsed.feed);
239
- else {
240
- const originalSelfUrl = link.href;
241
- link.href = void 0;
242
- signature = JSON.stringify(parsed.feed);
243
- link.href = originalSelfUrl;
205
+ let contentUrl;
206
+ if (parsed.format === "json") {
207
+ contentUrl = parsed.feed.home_page_url;
208
+ signature = require_utils.createSignature(parsed.feed, ["feed_url"]);
209
+ } else {
210
+ const selfLink = retrieveSelfLink(parsed);
211
+ const savedSelfHref = selfLink?.href;
212
+ if (selfLink) selfLink.href = void 0;
213
+ if (parsed.format === "rss") {
214
+ contentUrl = parsed.feed.link;
215
+ signature = require_utils.createSignature(parsed.feed, [
216
+ "lastBuildDate",
217
+ "pubDate",
218
+ "link",
219
+ "generator"
220
+ ]);
221
+ } else if (parsed.format === "rdf") {
222
+ contentUrl = parsed.feed.link;
223
+ signature = require_utils.createSignature(parsed.feed, ["link"]);
224
+ } else signature = require_utils.createSignature(parsed.feed, ["updated", "generator"]);
225
+ if (selfLink) selfLink.href = savedSelfHref;
244
226
  }
245
- if (parsed.format === "rss") {
246
- parsed.feed.lastBuildDate = originalBuildDate;
247
- parsed.feed.pubDate = originalPubDate;
248
- parsed.feed.link = originalLink;
249
- } else if (parsed.format === "rdf") parsed.feed.link = originalLink;
250
- else if (parsed.format === "atom") parsed.feed.updated = originalBuildDate;
251
- return neutralizeFeedUrls(signature, url);
227
+ return require_utils.neutralizeUrls(signature, contentUrl ? [url, contentUrl] : [url]);
252
228
  }
253
229
  };
254
230
  const defaultTiers = [
231
+ {
232
+ stripProtocol: false,
233
+ stripAuthentication: false,
234
+ stripWww: true,
235
+ stripTrailingSlash: true,
236
+ stripRootSlash: true,
237
+ collapseSlashes: true,
238
+ stripHash: true,
239
+ sortQueryParams: false,
240
+ stripQuery: true,
241
+ stripEmptyQuery: true,
242
+ normalizeEncoding: true,
243
+ normalizeUnicode: true,
244
+ convertToPunycode: true
245
+ },
255
246
  {
256
247
  stripProtocol: false,
257
248
  stripAuthentication: false,
@@ -304,5 +295,4 @@ exports.defaultFetch = defaultFetch;
304
295
  exports.defaultNormalizeOptions = defaultNormalizeOptions;
305
296
  exports.defaultParser = defaultParser;
306
297
  exports.defaultStrippedParams = defaultStrippedParams;
307
- exports.defaultTiers = defaultTiers;
308
- exports.neutralizeFeedUrls = neutralizeFeedUrls;
298
+ exports.defaultTiers = defaultTiers;
@@ -4,8 +4,7 @@ import { DefaultParserResult, FetchFn, NormalizeOptions, ParserAdapter, Tier } f
4
4
  declare const defaultStrippedParams: string[];
5
5
  declare const defaultNormalizeOptions: NormalizeOptions;
6
6
  declare const defaultFetch: FetchFn;
7
- declare const neutralizeFeedUrls: (signature: string, url: string) => string;
8
7
  declare const defaultParser: ParserAdapter<DefaultParserResult>;
9
8
  declare const defaultTiers: Array<Tier>;
10
9
  //#endregion
11
- export { defaultFetch, defaultNormalizeOptions, defaultParser, defaultStrippedParams, defaultTiers, neutralizeFeedUrls };
10
+ export { defaultFetch, defaultNormalizeOptions, defaultParser, defaultStrippedParams, defaultTiers };
@@ -4,8 +4,7 @@ import { DefaultParserResult, FetchFn, NormalizeOptions, ParserAdapter, Tier } f
4
4
  declare const defaultStrippedParams: string[];
5
5
  declare const defaultNormalizeOptions: NormalizeOptions;
6
6
  declare const defaultFetch: FetchFn;
7
- declare const neutralizeFeedUrls: (signature: string, url: string) => string;
8
7
  declare const defaultParser: ParserAdapter<DefaultParserResult>;
9
8
  declare const defaultTiers: Array<Tier>;
10
9
  //#endregion
11
- export { defaultFetch, defaultNormalizeOptions, defaultParser, defaultStrippedParams, defaultTiers, neutralizeFeedUrls };
10
+ export { defaultFetch, defaultNormalizeOptions, defaultParser, defaultStrippedParams, defaultTiers };
package/dist/defaults.js CHANGED
@@ -1,3 +1,4 @@
1
+ import { createSignature, neutralizeUrls } from "./utils.js";
1
2
  import { parseFeed } from "feedsmith";
2
3
 
3
4
  //#region src/defaults.ts
@@ -183,15 +184,6 @@ const defaultFetch = async (url, options) => {
183
184
  status: response.status
184
185
  };
185
186
  };
186
- const trailingSlashPattern = /("(?:https?:\/\/|\/)[^"]+)\/([?"])/g;
187
- const neutralizeFeedUrls = (signature, url) => {
188
- try {
189
- const escapedHost = new URL("/", url).host.replace(/^www\./, "").replaceAll(".", "\\.");
190
- return signature.replace(new RegExp(`https?://(?:www\\.)?${escapedHost}(?=[/"])(/)?`, "g"), "/").replace(trailingSlashPattern, "$1$2");
191
- } catch {
192
- return signature;
193
- }
194
- };
195
187
  const retrieveSelfLink = (parsed) => {
196
188
  switch (parsed.format) {
197
189
  case "atom": return parsed.feed.links?.find((link) => link.rel === "self");
@@ -209,49 +201,48 @@ const defaultParser = {
209
201
  return parsed.format === "json" ? parsed.feed.feed_url : retrieveSelfLink(parsed)?.href;
210
202
  },
211
203
  getSignature: (parsed, url) => {
212
- if (parsed.format === "json") {
213
- const originalSelfUrl = parsed.feed.feed_url;
214
- parsed.feed.feed_url = void 0;
215
- const signature$1 = JSON.stringify(parsed.feed);
216
- parsed.feed.feed_url = originalSelfUrl;
217
- return neutralizeFeedUrls(signature$1, url);
218
- }
219
204
  let signature;
220
- let originalBuildDate;
221
- let originalPubDate;
222
- let originalLink;
223
- if (parsed.format === "rss") {
224
- originalBuildDate = parsed.feed.lastBuildDate;
225
- originalPubDate = parsed.feed.pubDate;
226
- originalLink = parsed.feed.link;
227
- parsed.feed.lastBuildDate = void 0;
228
- parsed.feed.pubDate = void 0;
229
- parsed.feed.link = void 0;
230
- } else if (parsed.format === "rdf") {
231
- originalLink = parsed.feed.link;
232
- parsed.feed.link = void 0;
233
- } else if (parsed.format === "atom") {
234
- originalBuildDate = parsed.feed.updated;
235
- parsed.feed.updated = void 0;
236
- }
237
- const link = retrieveSelfLink(parsed);
238
- if (!link) signature = JSON.stringify(parsed.feed);
239
- else {
240
- const originalSelfUrl = link.href;
241
- link.href = void 0;
242
- signature = JSON.stringify(parsed.feed);
243
- link.href = originalSelfUrl;
205
+ let contentUrl;
206
+ if (parsed.format === "json") {
207
+ contentUrl = parsed.feed.home_page_url;
208
+ signature = createSignature(parsed.feed, ["feed_url"]);
209
+ } else {
210
+ const selfLink = retrieveSelfLink(parsed);
211
+ const savedSelfHref = selfLink?.href;
212
+ if (selfLink) selfLink.href = void 0;
213
+ if (parsed.format === "rss") {
214
+ contentUrl = parsed.feed.link;
215
+ signature = createSignature(parsed.feed, [
216
+ "lastBuildDate",
217
+ "pubDate",
218
+ "link",
219
+ "generator"
220
+ ]);
221
+ } else if (parsed.format === "rdf") {
222
+ contentUrl = parsed.feed.link;
223
+ signature = createSignature(parsed.feed, ["link"]);
224
+ } else signature = createSignature(parsed.feed, ["updated", "generator"]);
225
+ if (selfLink) selfLink.href = savedSelfHref;
244
226
  }
245
- if (parsed.format === "rss") {
246
- parsed.feed.lastBuildDate = originalBuildDate;
247
- parsed.feed.pubDate = originalPubDate;
248
- parsed.feed.link = originalLink;
249
- } else if (parsed.format === "rdf") parsed.feed.link = originalLink;
250
- else if (parsed.format === "atom") parsed.feed.updated = originalBuildDate;
251
- return neutralizeFeedUrls(signature, url);
227
+ return neutralizeUrls(signature, contentUrl ? [url, contentUrl] : [url]);
252
228
  }
253
229
  };
254
230
  const defaultTiers = [
231
+ {
232
+ stripProtocol: false,
233
+ stripAuthentication: false,
234
+ stripWww: true,
235
+ stripTrailingSlash: true,
236
+ stripRootSlash: true,
237
+ collapseSlashes: true,
238
+ stripHash: true,
239
+ sortQueryParams: false,
240
+ stripQuery: true,
241
+ stripEmptyQuery: true,
242
+ normalizeEncoding: true,
243
+ normalizeUnicode: true,
244
+ convertToPunycode: true
245
+ },
255
246
  {
256
247
  stripProtocol: false,
257
248
  stripAuthentication: false,
@@ -300,4 +291,4 @@ const defaultTiers = [
300
291
  ];
301
292
 
302
293
  //#endregion
303
- export { defaultFetch, defaultNormalizeOptions, defaultParser, defaultStrippedParams, defaultTiers, neutralizeFeedUrls };
294
+ export { defaultFetch, defaultNormalizeOptions, defaultParser, defaultStrippedParams, defaultTiers };
package/dist/exports.cjs CHANGED
@@ -1,6 +1,7 @@
1
- const require_defaults = require('./defaults.cjs');
2
1
  const require_utils = require('./utils.cjs');
2
+ const require_defaults = require('./defaults.cjs');
3
3
  const require_index = require('./index.cjs');
4
+ const require_wordpress = require('./probes/wordpress.cjs');
4
5
  const require_blogger = require('./rewrites/blogger.cjs');
5
6
  const require_feedburner = require('./rewrites/feedburner.cjs');
6
7
 
@@ -15,4 +16,5 @@ exports.findCanonical = require_index.findCanonical;
15
16
  exports.fixMalformedProtocol = require_utils.fixMalformedProtocol;
16
17
  exports.normalizeUrl = require_utils.normalizeUrl;
17
18
  exports.resolveFeedProtocol = require_utils.resolveFeedProtocol;
18
- exports.resolveUrl = require_utils.resolveUrl;
19
+ exports.resolveUrl = require_utils.resolveUrl;
20
+ exports.wordpressProbe = require_wordpress.wordpressProbe;
@@ -1,7 +1,8 @@
1
- import { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Rewrite } from "./types.cjs";
1
+ import { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Probe, Rewrite, Tier } from "./types.cjs";
2
2
  import { defaultFetch, defaultParser, defaultStrippedParams, defaultTiers } from "./defaults.cjs";
3
3
  import { findCanonical } from "./index.cjs";
4
+ import { wordpressProbe } from "./probes/wordpress.cjs";
4
5
  import { bloggerRewrite } from "./rewrites/blogger.cjs";
5
6
  import { feedburnerRewrite } from "./rewrites/feedburner.cjs";
6
7
  import { addMissingProtocol, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl } from "./utils.cjs";
7
- export { type DefaultParserResult, type ExistsFn, type FetchFn, type FetchFnOptions, type FetchFnResponse, type FindCanonicalOptions, type NormalizeOptions, type OnExistsFn, type OnFetchFn, type OnMatchFn, type ParserAdapter, type Rewrite, addMissingProtocol, bloggerRewrite, defaultFetch, defaultParser, defaultStrippedParams, defaultTiers, feedburnerRewrite, findCanonical, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl };
8
+ export { type DefaultParserResult, type ExistsFn, type FetchFn, type FetchFnOptions, type FetchFnResponse, type FindCanonicalOptions, type NormalizeOptions, type OnExistsFn, type OnFetchFn, type OnMatchFn, type ParserAdapter, type Probe, type Rewrite, type Tier, addMissingProtocol, bloggerRewrite, defaultFetch, defaultParser, defaultStrippedParams, defaultTiers, feedburnerRewrite, findCanonical, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl, wordpressProbe };
package/dist/exports.d.ts CHANGED
@@ -1,7 +1,8 @@
1
- import { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Rewrite } from "./types.js";
1
+ import { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Probe, Rewrite, Tier } from "./types.js";
2
2
  import { defaultFetch, defaultParser, defaultStrippedParams, defaultTiers } from "./defaults.js";
3
3
  import { findCanonical } from "./index.js";
4
+ import { wordpressProbe } from "./probes/wordpress.js";
4
5
  import { bloggerRewrite } from "./rewrites/blogger.js";
5
6
  import { feedburnerRewrite } from "./rewrites/feedburner.js";
6
7
  import { addMissingProtocol, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl } from "./utils.js";
7
- export { type DefaultParserResult, type ExistsFn, type FetchFn, type FetchFnOptions, type FetchFnResponse, type FindCanonicalOptions, type NormalizeOptions, type OnExistsFn, type OnFetchFn, type OnMatchFn, type ParserAdapter, type Rewrite, addMissingProtocol, bloggerRewrite, defaultFetch, defaultParser, defaultStrippedParams, defaultTiers, feedburnerRewrite, findCanonical, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl };
8
+ export { type DefaultParserResult, type ExistsFn, type FetchFn, type FetchFnOptions, type FetchFnResponse, type FindCanonicalOptions, type NormalizeOptions, type OnExistsFn, type OnFetchFn, type OnMatchFn, type ParserAdapter, type Probe, type Rewrite, type Tier, addMissingProtocol, bloggerRewrite, defaultFetch, defaultParser, defaultStrippedParams, defaultTiers, feedburnerRewrite, findCanonical, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl, wordpressProbe };
package/dist/exports.js CHANGED
@@ -1,7 +1,8 @@
1
- import { defaultFetch, defaultParser, defaultStrippedParams, defaultTiers } from "./defaults.js";
2
1
  import { addMissingProtocol, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl } from "./utils.js";
2
+ import { defaultFetch, defaultParser, defaultStrippedParams, defaultTiers } from "./defaults.js";
3
3
  import { findCanonical } from "./index.js";
4
+ import { wordpressProbe } from "./probes/wordpress.js";
4
5
  import { bloggerRewrite } from "./rewrites/blogger.js";
5
6
  import { feedburnerRewrite } from "./rewrites/feedburner.js";
6
7
 
7
- export { addMissingProtocol, bloggerRewrite, defaultFetch, defaultParser, defaultStrippedParams, defaultTiers, feedburnerRewrite, findCanonical, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl };
8
+ export { addMissingProtocol, bloggerRewrite, defaultFetch, defaultParser, defaultStrippedParams, defaultTiers, feedburnerRewrite, findCanonical, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl, wordpressProbe };
package/dist/index.cjs CHANGED
@@ -1,15 +1,15 @@
1
- const require_defaults = require('./defaults.cjs');
2
1
  const require_utils = require('./utils.cjs');
2
+ const require_defaults = require('./defaults.cjs');
3
3
 
4
4
  //#region src/index.ts
5
5
  async function findCanonical(inputUrl, options) {
6
- const { parser = require_defaults.defaultParser, fetchFn = require_defaults.defaultFetch, existsFn, tiers = require_defaults.defaultTiers, rewrites, stripQueryParams = require_defaults.defaultStrippedParams, onFetch, onMatch, onExists } = options ?? {};
6
+ const { parser = require_defaults.defaultParser, fetchFn = require_defaults.defaultFetch, existsFn, tiers = require_defaults.defaultTiers, rewrites, probes, stripQueryParams = require_defaults.defaultStrippedParams, onFetch, onMatch, onExists } = options ?? {};
7
7
  const stripParams = (url) => {
8
- return stripQueryParams?.length ? require_utils.normalizeUrl(url, {
8
+ return require_utils.normalizeUrl(url, {
9
9
  stripQueryParams,
10
10
  sortQueryParams: true,
11
11
  stripEmptyQuery: true
12
- }) : url;
12
+ });
13
13
  };
14
14
  const resolveAndApplyRewrites = (url, baseUrl) => {
15
15
  const resolved = require_utils.resolveUrl(url, baseUrl);
@@ -73,7 +73,7 @@ async function findCanonical(inputUrl, options) {
73
73
  if (!await compareWithInitialResponse(response.body, response.url)) return;
74
74
  return response;
75
75
  };
76
- let variantSourceUrl = initialResponseUrl;
76
+ let candidateSourceUrl = initialResponseUrl;
77
77
  if (selfRequestUrl && selfRequestUrl !== initialResponseUrl) {
78
78
  const urlsToTry = [selfRequestUrl];
79
79
  if (selfRequestUrl.startsWith("https://")) urlsToTry.push(selfRequestUrl.replace("https://", "http://"));
@@ -86,42 +86,53 @@ async function findCanonical(inputUrl, options) {
86
86
  response,
87
87
  feed: initialResponseFeed
88
88
  });
89
- variantSourceUrl = resolveAndApplyRewrites(response.url) ?? initialResponseUrl;
90
- variantSourceUrl = stripParams(variantSourceUrl);
89
+ candidateSourceUrl = resolveAndApplyRewrites(response.url) ?? initialResponseUrl;
90
+ candidateSourceUrl = stripParams(candidateSourceUrl);
91
91
  break;
92
92
  }
93
93
  }
94
94
  }
95
- const variantUrls = new Set(tiers.map((tier) => resolveAndApplyRewrites(require_utils.normalizeUrl(variantSourceUrl, tier))).filter((variantUrl) => !!variantUrl));
96
- variantUrls.add(variantSourceUrl);
97
- let winningUrl = variantSourceUrl;
98
- for (const variantUrl of variantUrls) {
95
+ if (probes?.length) candidateSourceUrl = await require_utils.applyProbes(candidateSourceUrl, probes, async (candidateUrl) => {
96
+ const response = await fetchAndCompare(candidateUrl);
97
+ if (response) {
98
+ onMatch?.({
99
+ url: candidateUrl,
100
+ response,
101
+ feed: initialResponseFeed
102
+ });
103
+ return stripParams(resolveAndApplyRewrites(response.url) ?? candidateUrl);
104
+ }
105
+ });
106
+ const candidateUrls = new Set(tiers.map((tier) => resolveAndApplyRewrites(require_utils.normalizeUrl(candidateSourceUrl, tier))).filter((candidateUrl) => !!candidateUrl));
107
+ candidateUrls.add(candidateSourceUrl);
108
+ let winningUrl = candidateSourceUrl;
109
+ for (const candidateUrl of candidateUrls) {
99
110
  if (existsFn) {
100
- const data = await existsFn(variantUrl);
111
+ const data = await existsFn(candidateUrl);
101
112
  if (data !== void 0) {
102
113
  onExists?.({
103
- url: variantUrl,
114
+ url: candidateUrl,
104
115
  data
105
116
  });
106
- return variantUrl;
117
+ return candidateUrl;
107
118
  }
108
119
  }
109
- if (variantUrl === variantSourceUrl) continue;
110
- if (variantUrl === initialResponseUrl) {
120
+ if (candidateUrl === candidateSourceUrl) continue;
121
+ if (candidateUrl === initialResponseUrl) {
111
122
  winningUrl = initialResponseUrl;
112
123
  break;
113
124
  }
114
- const variantResponse = await fetchAndCompare(variantUrl);
115
- if (variantResponse) {
116
- let variantResponseUrl = resolveAndApplyRewrites(variantResponse.url);
117
- if (variantResponseUrl) variantResponseUrl = stripParams(variantResponseUrl);
118
- if (variantResponseUrl === variantSourceUrl || variantResponseUrl === initialResponseUrl) continue;
125
+ const candidateResponse = await fetchAndCompare(candidateUrl);
126
+ if (candidateResponse) {
127
+ let candidateResponseUrl = resolveAndApplyRewrites(candidateResponse.url);
128
+ if (candidateResponseUrl) candidateResponseUrl = stripParams(candidateResponseUrl);
129
+ if (candidateResponseUrl === candidateSourceUrl || candidateResponseUrl === initialResponseUrl) continue;
119
130
  onMatch?.({
120
- url: variantUrl,
121
- response: variantResponse,
131
+ url: candidateUrl,
132
+ response: candidateResponse,
122
133
  feed: initialResponseFeed
123
134
  });
124
- winningUrl = variantUrl;
135
+ winningUrl = candidateUrl;
125
136
  break;
126
137
  }
127
138
  }
package/dist/index.js CHANGED
@@ -1,15 +1,15 @@
1
+ import { applyProbes, applyRewrites, normalizeUrl, resolveUrl } from "./utils.js";
1
2
  import { defaultFetch, defaultParser, defaultStrippedParams, defaultTiers } from "./defaults.js";
2
- import { applyRewrites, normalizeUrl, resolveUrl } from "./utils.js";
3
3
 
4
4
  //#region src/index.ts
5
5
  async function findCanonical(inputUrl, options) {
6
- const { parser = defaultParser, fetchFn = defaultFetch, existsFn, tiers = defaultTiers, rewrites, stripQueryParams = defaultStrippedParams, onFetch, onMatch, onExists } = options ?? {};
6
+ const { parser = defaultParser, fetchFn = defaultFetch, existsFn, tiers = defaultTiers, rewrites, probes, stripQueryParams = defaultStrippedParams, onFetch, onMatch, onExists } = options ?? {};
7
7
  const stripParams = (url) => {
8
- return stripQueryParams?.length ? normalizeUrl(url, {
8
+ return normalizeUrl(url, {
9
9
  stripQueryParams,
10
10
  sortQueryParams: true,
11
11
  stripEmptyQuery: true
12
- }) : url;
12
+ });
13
13
  };
14
14
  const resolveAndApplyRewrites = (url, baseUrl) => {
15
15
  const resolved = resolveUrl(url, baseUrl);
@@ -73,7 +73,7 @@ async function findCanonical(inputUrl, options) {
73
73
  if (!await compareWithInitialResponse(response.body, response.url)) return;
74
74
  return response;
75
75
  };
76
- let variantSourceUrl = initialResponseUrl;
76
+ let candidateSourceUrl = initialResponseUrl;
77
77
  if (selfRequestUrl && selfRequestUrl !== initialResponseUrl) {
78
78
  const urlsToTry = [selfRequestUrl];
79
79
  if (selfRequestUrl.startsWith("https://")) urlsToTry.push(selfRequestUrl.replace("https://", "http://"));
@@ -86,42 +86,53 @@ async function findCanonical(inputUrl, options) {
86
86
  response,
87
87
  feed: initialResponseFeed
88
88
  });
89
- variantSourceUrl = resolveAndApplyRewrites(response.url) ?? initialResponseUrl;
90
- variantSourceUrl = stripParams(variantSourceUrl);
89
+ candidateSourceUrl = resolveAndApplyRewrites(response.url) ?? initialResponseUrl;
90
+ candidateSourceUrl = stripParams(candidateSourceUrl);
91
91
  break;
92
92
  }
93
93
  }
94
94
  }
95
- const variantUrls = new Set(tiers.map((tier) => resolveAndApplyRewrites(normalizeUrl(variantSourceUrl, tier))).filter((variantUrl) => !!variantUrl));
96
- variantUrls.add(variantSourceUrl);
97
- let winningUrl = variantSourceUrl;
98
- for (const variantUrl of variantUrls) {
95
+ if (probes?.length) candidateSourceUrl = await applyProbes(candidateSourceUrl, probes, async (candidateUrl) => {
96
+ const response = await fetchAndCompare(candidateUrl);
97
+ if (response) {
98
+ onMatch?.({
99
+ url: candidateUrl,
100
+ response,
101
+ feed: initialResponseFeed
102
+ });
103
+ return stripParams(resolveAndApplyRewrites(response.url) ?? candidateUrl);
104
+ }
105
+ });
106
+ const candidateUrls = new Set(tiers.map((tier) => resolveAndApplyRewrites(normalizeUrl(candidateSourceUrl, tier))).filter((candidateUrl) => !!candidateUrl));
107
+ candidateUrls.add(candidateSourceUrl);
108
+ let winningUrl = candidateSourceUrl;
109
+ for (const candidateUrl of candidateUrls) {
99
110
  if (existsFn) {
100
- const data = await existsFn(variantUrl);
111
+ const data = await existsFn(candidateUrl);
101
112
  if (data !== void 0) {
102
113
  onExists?.({
103
- url: variantUrl,
114
+ url: candidateUrl,
104
115
  data
105
116
  });
106
- return variantUrl;
117
+ return candidateUrl;
107
118
  }
108
119
  }
109
- if (variantUrl === variantSourceUrl) continue;
110
- if (variantUrl === initialResponseUrl) {
120
+ if (candidateUrl === candidateSourceUrl) continue;
121
+ if (candidateUrl === initialResponseUrl) {
111
122
  winningUrl = initialResponseUrl;
112
123
  break;
113
124
  }
114
- const variantResponse = await fetchAndCompare(variantUrl);
115
- if (variantResponse) {
116
- let variantResponseUrl = resolveAndApplyRewrites(variantResponse.url);
117
- if (variantResponseUrl) variantResponseUrl = stripParams(variantResponseUrl);
118
- if (variantResponseUrl === variantSourceUrl || variantResponseUrl === initialResponseUrl) continue;
125
+ const candidateResponse = await fetchAndCompare(candidateUrl);
126
+ if (candidateResponse) {
127
+ let candidateResponseUrl = resolveAndApplyRewrites(candidateResponse.url);
128
+ if (candidateResponseUrl) candidateResponseUrl = stripParams(candidateResponseUrl);
129
+ if (candidateResponseUrl === candidateSourceUrl || candidateResponseUrl === initialResponseUrl) continue;
119
130
  onMatch?.({
120
- url: variantUrl,
121
- response: variantResponse,
131
+ url: candidateUrl,
132
+ response: candidateResponse,
122
133
  feed: initialResponseFeed
123
134
  });
124
- winningUrl = variantUrl;
135
+ winningUrl = candidateUrl;
125
136
  break;
126
137
  }
127
138
  }
@@ -0,0 +1,49 @@
1
+
2
+ //#region src/probes/wordpress.ts
3
+ const feedTypes = [
4
+ "atom",
5
+ "rss2",
6
+ "rss",
7
+ "rdf"
8
+ ];
9
+ const wordpressProbe = {
10
+ match: (url) => {
11
+ const feed = url.searchParams.get("feed")?.toLowerCase();
12
+ if (!feed) return false;
13
+ const type = feed.startsWith("comments-") ? feed.slice(9) : feed;
14
+ return feedTypes.includes(type);
15
+ },
16
+ getCandidates: (url) => {
17
+ const feed = url.searchParams.get("feed")?.toLowerCase();
18
+ if (!feed) return [];
19
+ const candidates = [];
20
+ const isComment = feed.startsWith("comments-");
21
+ const type = isComment ? feed.slice(9) : feed;
22
+ if ((isComment ? /\/comments\/feed(\/|$)/ : /\/feed(\/|$)/).test(url.pathname)) {
23
+ const withoutSlash = new URL(url);
24
+ withoutSlash.pathname = url.pathname.replace(/\/$/, "");
25
+ withoutSlash.searchParams.delete("feed");
26
+ candidates.push(withoutSlash.href);
27
+ const withSlash$1 = new URL(url);
28
+ withSlash$1.pathname = url.pathname.replace(/\/?$/, "/");
29
+ withSlash$1.searchParams.delete("feed");
30
+ candidates.push(withSlash$1.href);
31
+ return candidates;
32
+ }
33
+ const basePath = url.pathname.replace(/\/$/, "");
34
+ const feedSegment = type === "atom" ? "/feed/atom" : "/feed";
35
+ const feedPath = isComment ? `/comments${feedSegment}` : feedSegment;
36
+ const primary = new URL(url);
37
+ primary.pathname = basePath + feedPath;
38
+ primary.searchParams.delete("feed");
39
+ candidates.push(primary.href);
40
+ const withSlash = new URL(url);
41
+ withSlash.pathname = `${basePath}${feedPath}/`;
42
+ withSlash.searchParams.delete("feed");
43
+ candidates.push(withSlash.href);
44
+ return candidates;
45
+ }
46
+ };
47
+
48
+ //#endregion
49
+ exports.wordpressProbe = wordpressProbe;
@@ -0,0 +1,6 @@
1
+ import { Probe } from "../types.cjs";
2
+
3
+ //#region src/probes/wordpress.d.ts
4
+ declare const wordpressProbe: Probe;
5
+ //#endregion
6
+ export { wordpressProbe };
@@ -0,0 +1,6 @@
1
+ import { Probe } from "../types.js";
2
+
3
+ //#region src/probes/wordpress.d.ts
4
+ declare const wordpressProbe: Probe;
5
+ //#endregion
6
+ export { wordpressProbe };
@@ -0,0 +1,48 @@
1
+ //#region src/probes/wordpress.ts
2
+ const feedTypes = [
3
+ "atom",
4
+ "rss2",
5
+ "rss",
6
+ "rdf"
7
+ ];
8
+ const wordpressProbe = {
9
+ match: (url) => {
10
+ const feed = url.searchParams.get("feed")?.toLowerCase();
11
+ if (!feed) return false;
12
+ const type = feed.startsWith("comments-") ? feed.slice(9) : feed;
13
+ return feedTypes.includes(type);
14
+ },
15
+ getCandidates: (url) => {
16
+ const feed = url.searchParams.get("feed")?.toLowerCase();
17
+ if (!feed) return [];
18
+ const candidates = [];
19
+ const isComment = feed.startsWith("comments-");
20
+ const type = isComment ? feed.slice(9) : feed;
21
+ if ((isComment ? /\/comments\/feed(\/|$)/ : /\/feed(\/|$)/).test(url.pathname)) {
22
+ const withoutSlash = new URL(url);
23
+ withoutSlash.pathname = url.pathname.replace(/\/$/, "");
24
+ withoutSlash.searchParams.delete("feed");
25
+ candidates.push(withoutSlash.href);
26
+ const withSlash$1 = new URL(url);
27
+ withSlash$1.pathname = url.pathname.replace(/\/?$/, "/");
28
+ withSlash$1.searchParams.delete("feed");
29
+ candidates.push(withSlash$1.href);
30
+ return candidates;
31
+ }
32
+ const basePath = url.pathname.replace(/\/$/, "");
33
+ const feedSegment = type === "atom" ? "/feed/atom" : "/feed";
34
+ const feedPath = isComment ? `/comments${feedSegment}` : feedSegment;
35
+ const primary = new URL(url);
36
+ primary.pathname = basePath + feedPath;
37
+ primary.searchParams.delete("feed");
38
+ candidates.push(primary.href);
39
+ const withSlash = new URL(url);
40
+ withSlash.pathname = `${basePath}${feedPath}/`;
41
+ withSlash.searchParams.delete("feed");
42
+ candidates.push(withSlash.href);
43
+ return candidates;
44
+ }
45
+ };
46
+
47
+ //#endregion
48
+ export { wordpressProbe };
package/dist/types.d.cts CHANGED
@@ -11,6 +11,10 @@ type Rewrite = {
11
11
  match: (url: URL) => boolean;
12
12
  rewrite: (url: URL) => URL;
13
13
  };
14
+ type Probe = {
15
+ match: (url: URL) => boolean;
16
+ getCandidates: (url: URL) => Array<string>;
17
+ };
14
18
  type NormalizeOptions = {
15
19
  stripProtocol?: boolean;
16
20
  stripAuthentication?: boolean;
@@ -45,8 +49,9 @@ type FindCanonicalOptions<TFeed = DefaultParserResult, TResponse extends FetchFn
45
49
  parser?: ParserAdapter<TFeed>;
46
50
  fetchFn?: FetchFn<TResponse>;
47
51
  existsFn?: ExistsFn<TExisting>;
48
- tiers?: Array<Tier>;
49
52
  rewrites?: Array<Rewrite>;
53
+ probes?: Array<Probe>;
54
+ tiers?: Array<Tier>;
50
55
  stripQueryParams?: Array<string>;
51
56
  onFetch?: OnFetchFn<TResponse>;
52
57
  onMatch?: OnMatchFn<TFeed, TResponse>;
@@ -65,4 +70,4 @@ type FetchFnResponse = {
65
70
  };
66
71
  type FetchFn<TResponse extends FetchFnResponse = FetchFnResponse> = (url: string, options?: FetchFnOptions) => Promise<TResponse>;
67
72
  //#endregion
68
- export { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Rewrite, Tier };
73
+ export { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Probe, Rewrite, Tier };
package/dist/types.d.ts CHANGED
@@ -11,6 +11,10 @@ type Rewrite = {
11
11
  match: (url: URL) => boolean;
12
12
  rewrite: (url: URL) => URL;
13
13
  };
14
/** A pluggable source of alternative URLs for inputs it recognises. */
type Probe = {
  /** Decides whether this probe applies to the given URL. */
  match: (url: URL) => boolean;
  /** Produces the candidate URL strings to try for the given URL. */
  getCandidates: (url: URL) => string[];
};
14
18
  type NormalizeOptions = {
15
19
  stripProtocol?: boolean;
16
20
  stripAuthentication?: boolean;
@@ -45,8 +49,9 @@ type FindCanonicalOptions<TFeed = DefaultParserResult, TResponse extends FetchFn
45
49
  parser?: ParserAdapter<TFeed>;
46
50
  fetchFn?: FetchFn<TResponse>;
47
51
  existsFn?: ExistsFn<TExisting>;
48
- tiers?: Array<Tier>;
49
52
  rewrites?: Array<Rewrite>;
53
+ probes?: Array<Probe>;
54
+ tiers?: Array<Tier>;
50
55
  stripQueryParams?: Array<string>;
51
56
  onFetch?: OnFetchFn<TResponse>;
52
57
  onMatch?: OnMatchFn<TFeed, TResponse>;
@@ -65,4 +70,4 @@ type FetchFnResponse = {
65
70
  };
66
71
  type FetchFn<TResponse extends FetchFnResponse = FetchFnResponse> = (url: string, options?: FetchFnOptions) => Promise<TResponse>;
67
72
  //#endregion
68
- export { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Rewrite, Tier };
73
+ export { DefaultParserResult, ExistsFn, FetchFn, FetchFnOptions, FetchFnResponse, FindCanonicalOptions, NormalizeOptions, OnExistsFn, OnFetchFn, OnMatchFn, ParserAdapter, Probe, Rewrite, Tier };
package/dist/utils.cjs CHANGED
@@ -155,11 +155,51 @@ const applyRewrites = (url, rewrites) => {
155
155
  return url;
156
156
  }
157
157
  };
158
/**
 * Runs the first probe that matches `url` and returns the first truthy result
 * `testCandidate` yields for one of that probe's candidates.
 *
 * Only the first matching probe is consulted; candidates are tested one at a
 * time, in order. If nothing succeeds, or anything throws along the way
 * (including parsing `url`), the original `url` is returned unchanged.
 *
 * @param url Input URL string.
 * @param probes Probes to consider, in priority order.
 * @param testCandidate Async callback receiving a candidate URL string;
 *   a truthy resolution is returned as the result.
 * @returns The first truthy `testCandidate` result, otherwise `url`.
 */
const applyProbes = async (url, probes, testCandidate) => {
	try {
		const target = new URL(url);
		// Pick the first probe that claims this URL; later probes are ignored.
		let selected;
		for (const probe of probes) {
			if (probe.match(target)) {
				selected = probe;
				break;
			}
		}
		if (selected !== undefined) {
			for (const candidate of selected.getCandidates(target)) {
				const outcome = await testCandidate(candidate);
				if (outcome) return outcome;
			}
		}
	} catch {
		// Best effort: any failure falls back to the original URL below.
	}
	return url;
};
174
/**
 * Serializes `object` with `JSON.stringify` as if the listed top-level
 * `fields` were absent.
 *
 * The fields are temporarily set to `undefined` on the object itself (no copy
 * is made) and put back afterwards. Restoration happens in a `finally`, so the
 * caller's object is left intact even when serialization throws (e.g. on a
 * circular reference), and fields that were not own properties beforehand are
 * removed again instead of lingering as `undefined`-valued keys.
 *
 * @param object Object to serialize; temporarily mutated during the call.
 * @param fields Top-level property names to exclude from the signature.
 * @returns The JSON string of `object` without the excluded fields.
 */
const createSignature = (object, fields) => {
	const hasOwn = Object.prototype.hasOwnProperty;
	// Remember both the value and whether the key was an own property at all.
	const saved = fields.map((key) => [key, hasOwn.call(object, key), object[key]]);
	try {
		for (const key of fields) object[key] = void 0;
		return JSON.stringify(object);
	} finally {
		for (const [key, wasOwn, value] of saved) {
			if (wasOwn) object[key] = value;
			else delete object[key];
		}
	}
};
181
// Matches a quoted absolute or root-relative URL whose last character before
// a `?` or the closing quote is a slash, so that slash can be dropped.
const trailingSlashPattern = /("(?:https?:\/\/|\/)[^"]+)\/([?"])/g;
/**
 * Rewrites absolute http(s) URLs in `text` that point at the host of any of
 * `urls` (with or without a leading `www.`) into root-relative form, then
 * strips trailing slashes from quoted URLs.
 *
 * Hosts are fully regex-escaped before being compiled into the pattern, so
 * hosts containing regex metacharacters (such as bracketed IPv6 literals like
 * `[::1]:8080`) are matched literally.
 *
 * @param text Text to process.
 * @param urls URLs whose hosts should be neutralized; entries that cannot be
 *   parsed or have an empty host are ignored.
 * @returns The processed text, or `text` unchanged when no usable host remains.
 */
const neutralizeUrls = (text, urls) => {
	const escapeHost = (url) => {
		try {
			return new URL("/", url).host.replace(/^www\./, "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
		} catch {
			return;
		}
	};
	const hosts = urls.map(escapeHost).filter(Boolean);
	if (hosts.length === 0) return text;
	const hostPattern = hosts.length === 1 ? hosts[0] : `(?:${hosts.join("|")})`;
	// The lookahead requires the host to be followed by `/` or `"`.
	const absolutePattern = new RegExp(`https?://(?:www\\.)?${hostPattern}(?=[/"])(/)?`, "g");
	return text.replace(absolutePattern, "/").replace(trailingSlashPattern, "$1$2");
};
158
195
 
159
196
  //#endregion
160
197
  exports.addMissingProtocol = addMissingProtocol;
198
+ exports.applyProbes = applyProbes;
161
199
  exports.applyRewrites = applyRewrites;
200
+ exports.createSignature = createSignature;
162
201
  exports.fixMalformedProtocol = fixMalformedProtocol;
202
+ exports.neutralizeUrls = neutralizeUrls;
163
203
  exports.normalizeUrl = normalizeUrl;
164
204
  exports.resolveFeedProtocol = resolveFeedProtocol;
165
205
  exports.resolveUrl = resolveUrl;
package/dist/utils.js CHANGED
@@ -155,6 +155,43 @@ const applyRewrites = (url, rewrites) => {
155
155
  return url;
156
156
  }
157
157
  };
158
/**
 * Runs the first probe that matches `url` and returns the first truthy result
 * `testCandidate` yields for one of that probe's candidates.
 *
 * Only the first matching probe is consulted; candidates are tested one at a
 * time, in order. If nothing succeeds, or anything throws along the way
 * (including parsing `url`), the original `url` is returned unchanged.
 *
 * @param url Input URL string.
 * @param probes Probes to consider, in priority order.
 * @param testCandidate Async callback receiving a candidate URL string;
 *   a truthy resolution is returned as the result.
 * @returns The first truthy `testCandidate` result, otherwise `url`.
 */
const applyProbes = async (url, probes, testCandidate) => {
	try {
		const target = new URL(url);
		// Pick the first probe that claims this URL; later probes are ignored.
		let selected;
		for (const probe of probes) {
			if (probe.match(target)) {
				selected = probe;
				break;
			}
		}
		if (selected !== undefined) {
			for (const candidate of selected.getCandidates(target)) {
				const outcome = await testCandidate(candidate);
				if (outcome) return outcome;
			}
		}
	} catch {
		// Best effort: any failure falls back to the original URL below.
	}
	return url;
};
174
/**
 * Serializes `object` with `JSON.stringify` as if the listed top-level
 * `fields` were absent.
 *
 * The fields are temporarily set to `undefined` on the object itself (no copy
 * is made) and put back afterwards. Restoration happens in a `finally`, so the
 * caller's object is left intact even when serialization throws (e.g. on a
 * circular reference), and fields that were not own properties beforehand are
 * removed again instead of lingering as `undefined`-valued keys.
 *
 * @param object Object to serialize; temporarily mutated during the call.
 * @param fields Top-level property names to exclude from the signature.
 * @returns The JSON string of `object` without the excluded fields.
 */
const createSignature = (object, fields) => {
	const hasOwn = Object.prototype.hasOwnProperty;
	// Remember both the value and whether the key was an own property at all.
	const saved = fields.map((key) => [key, hasOwn.call(object, key), object[key]]);
	try {
		for (const key of fields) object[key] = void 0;
		return JSON.stringify(object);
	} finally {
		for (const [key, wasOwn, value] of saved) {
			if (wasOwn) object[key] = value;
			else delete object[key];
		}
	}
};
181
// Matches a quoted absolute or root-relative URL whose last character before
// a `?` or the closing quote is a slash, so that slash can be dropped.
const trailingSlashPattern = /("(?:https?:\/\/|\/)[^"]+)\/([?"])/g;
/**
 * Rewrites absolute http(s) URLs in `text` that point at the host of any of
 * `urls` (with or without a leading `www.`) into root-relative form, then
 * strips trailing slashes from quoted URLs.
 *
 * Hosts are fully regex-escaped before being compiled into the pattern, so
 * hosts containing regex metacharacters (such as bracketed IPv6 literals like
 * `[::1]:8080`) are matched literally.
 *
 * @param text Text to process.
 * @param urls URLs whose hosts should be neutralized; entries that cannot be
 *   parsed or have an empty host are ignored.
 * @returns The processed text, or `text` unchanged when no usable host remains.
 */
const neutralizeUrls = (text, urls) => {
	const escapeHost = (url) => {
		try {
			return new URL("/", url).host.replace(/^www\./, "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
		} catch {
			return;
		}
	};
	const hosts = urls.map(escapeHost).filter(Boolean);
	if (hosts.length === 0) return text;
	const hostPattern = hosts.length === 1 ? hosts[0] : `(?:${hosts.join("|")})`;
	// The lookahead requires the host to be followed by `/` or `"`.
	const absolutePattern = new RegExp(`https?://(?:www\\.)?${hostPattern}(?=[/"])(/)?`, "g");
	return text.replace(absolutePattern, "/").replace(trailingSlashPattern, "$1$2");
};
158
195
 
159
196
  //#endregion
160
- export { addMissingProtocol, applyRewrites, fixMalformedProtocol, normalizeUrl, resolveFeedProtocol, resolveUrl };
197
+ export { addMissingProtocol, applyProbes, applyRewrites, createSignature, fixMalformedProtocol, neutralizeUrls, normalizeUrl, resolveFeedProtocol, resolveUrl };
package/package.json CHANGED
@@ -63,5 +63,5 @@
63
63
  "tsdown": "^0.18.4",
64
64
  "vitepress": "^1.6.4"
65
65
  },
66
- "version": "1.2.2"
66
+ "version": "1.3.0"
67
67
  }