@ariesfish/feedloom 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -38,7 +38,11 @@ function profileFromTomlRule(name, rule) {
38
38
  },
39
39
  metadata: {
40
40
  fixedAuthor: rule.metadata?.fixed_author,
41
- titleSuffixPatterns: rule.metadata?.strip_title_regexes
41
+ titleSuffixPatterns: rule.metadata?.strip_title_regexes,
42
+ authorSelectors: rule.metadata?.author_selectors,
43
+ authorMetaNames: rule.metadata?.author_meta_names,
44
+ authorMetaItemprops: rule.metadata?.author_meta_itemprops,
45
+ authorMetaProperties: rule.metadata?.author_meta_properties
42
46
  }
43
47
  };
44
48
  }
@@ -921,6 +925,22 @@ function removeTrailingSiblings(element, removals, reason) {
921
925
  sibling = next;
922
926
  }
923
927
  }
928
/**
 * Chooses the node at which content truncation should start for a matched
 * marker element.
 *
 * Walks from `element` up towards `root`, remembering the highest ancestor
 * (or `element` itself) that has a previous element sibling. The direct child
 * of `root` on that path wins if it has a previous element sibling; otherwise
 * the remembered node is used, falling back to `element`.
 *
 * NOTE(review): `best` is overwritten on every level, so a marker nested in a
 * container that itself has an earlier sibling selects the whole container,
 * including content that precedes the marker inside it — confirm intended.
 *
 * @param {Node} root - Container that bounds the search.
 * @param {Element} element - Element whose text matched a cut pattern.
 * @returns {Element} The node from which truncation should begin; `element`
 *   itself when it is `root` or is not located under `root`.
 */
function truncationCutPoint(root, element) {
  let current = element;
  let best = element;
  while (current.parentElement && current.parentElement !== root) {
    if (current.previousElementSibling) {
      best = current;
    }
    current = current.parentElement;
  }
  // The walk can run out of ancestors without ever meeting `root` (element is
  // `root` itself or lives outside it). Never hand back a node outside `root`:
  // the caller removes the cut point together with its trailing siblings.
  // `parentNode` (not `parentElement`) keeps non-element roots such as
  // fragments working, since their children have a null `parentElement`.
  if (current.parentNode !== root) {
    return element;
  }
  return current.previousElementSibling ? current : best;
}
939
/**
 * Truncates content at the cut point derived from a matched element.
 *
 * Resolves the cut point with `truncationCutPoint`, passes it to
 * `removeTrailingSiblings`, and then passes the cut point itself to
 * `removeElement` under the "site-profile:content-pattern" label.
 *
 * @param {Node} root - Container bounding the cut-point search.
 * @param {Element} element - Element whose text matched a cut pattern.
 * @param {*} removals - Removal record forwarded to both helpers
 *   (presumably an accumulator of removed nodes — verify against helpers).
 * @param {string} reason - Forwarded to both helpers; the caller passes the
 *   matching profile's name.
 */
function truncateFromElement(root, element, removals, reason) {
  const boundary = truncationCutPoint(root, element);
  // Order matters: handle the siblings after the boundary first, then the
  // boundary node itself.
  removeTrailingSiblings(boundary, removals, reason);
  removeElement(removals, "site-profile:content-pattern", reason, boundary);
}
924
944
  function compileProfileRegexes(profiles, key) {
925
945
  return profiles.flatMap(
926
946
  (profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
@@ -952,8 +972,7 @@ function removeByTextPatterns(root, profiles, removals) {
952
972
  }
953
973
  const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
954
974
  if (cut) {
955
- removeTrailingSiblings(element, removals, cut.profile);
956
- removeElement(removals, "site-profile:content-pattern", cut.profile, element);
975
+ truncateFromElement(root, element, removals, cut.profile);
957
976
  return;
958
977
  }
959
978
  const exactProfile = dropExact.get(text);
@@ -1045,7 +1064,28 @@ function jsonLdValue(document2, keys) {
1045
1064
  }
1046
1065
  return void 0;
1047
1066
  }
1048
- function toMetadata(result, document2) {
1067
/**
 * Resolves an author name from profile-supplied hints.
 *
 * Profiles are consulted in order. Within a profile, `authorSelectors` are
 * tried first (the matched element's text with whitespace runs collapsed),
 * then `<meta>` tags matched by `name`, `itemprop` and `property` attributes
 * (in that order), reading their `content` attribute. The first non-empty
 * value wins.
 *
 * @param {Document} document2 - Document to query.
 * @param {Array<{metadata?: object}>} [profiles] - Active site profiles;
 *   a missing value is treated as "no profiles".
 * @returns {string|undefined} The author, or `undefined` when nothing matched.
 */
function profileAuthorFromDocument(document2, profiles) {
  // Tolerate callers that do not pass profiles at all.
  for (const profile of profiles ?? []) {
    const metadata = profile.metadata;
    if (!metadata) continue;
    for (const selector of metadata.authorSelectors ?? []) {
      const author = document2.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
      if (author) return author;
    }
    const metaLookups = [
      ...(metadata.authorMetaNames ?? []).map((value) => ({ attr: "name", value })),
      ...(metadata.authorMetaItemprops ?? []).map((value) => ({ attr: "itemprop", value })),
      ...(metadata.authorMetaProperties ?? []).map((value) => ({ attr: "property", value }))
    ];
    for (const { attr, value } of metaLookups) {
      // Inside a quoted CSS string both `"` and `\` are special; escaping only
      // the quote would let a backslash in the value start a CSS escape.
      const escaped = value.replace(/["\\]/g, "\\$&");
      const author = document2.querySelector(`meta[${attr}="${escaped}"]`)?.getAttribute("content")?.trim();
      if (author) return author;
    }
  }
  return void 0;
}
1088
+ function toMetadata(result, document2, profiles) {
1049
1089
  return {
1050
1090
  title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
1051
1091
  description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
@@ -1054,7 +1094,7 @@ function toMetadata(result, document2) {
1054
1094
  image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
1055
1095
  language: result.language || document2.documentElement.getAttribute("lang") || void 0,
1056
1096
  published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
1057
- author: result.author || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
1097
+ author: result.author || profileAuthorFromDocument(document2, profiles) || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
1058
1098
  site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
1059
1099
  schemaOrgData: result.schemaOrgData,
1060
1100
  wordCount: result.wordCount,
@@ -1102,7 +1142,7 @@ var HtmlCleaner = class {
1102
1142
  standardize: this.options.standardize
1103
1143
  });
1104
1144
  const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
1105
- const metadata = toMetadata(result, document2);
1145
+ const metadata = toMetadata(result, document2, activeProfiles);
1106
1146
  applyMetadataProfiles(metadata, activeProfiles);
1107
1147
  const content = serializeProfiledContent(result.content, postProfiles, removals);
1108
1148
  return {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ariesfish/feedloom",
3
- "version": "0.1.1",
3
+ "version": "0.1.2",
4
4
  "type": "module",
5
5
  "author": "ariesfish",
6
6
  "license": "MIT",
@@ -22,6 +22,8 @@ npx -y @ariesfish/feedloom <inputs...> [options]
22
22
 
23
23
  ## Common usage
24
24
 
25
+ Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir $HOME/.agents/skills/feedloom/site-rules`; do not omit available site rules.
26
+
25
27
  ```bash
26
28
  npx -y @ariesfish/feedloom "https://example.com/article"
27
29
  npx -y @ariesfish/feedloom urls.txt
@@ -56,20 +58,20 @@ Use the least expensive mode that works:
56
58
  - `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
57
59
  - `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
58
60
  - `--headful`: show the browser window for debugging login, popups, or dynamic loading.
59
- - `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example `skills/feedloom/site-rules/` reference folder.
61
+ - `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `$HOME/.agents/skills/feedloom/site-rules/` reference folder.
60
62
  - `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
61
63
 
62
64
  Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
63
65
 
64
66
  ## Private site rules
65
67
 
66
- Site-specific TOML rules are intentionally optional and should not be assumed to be bundled with the package. If the user keeps private rules next to this skill, pass that directory explicitly:
68
+ Site-specific TOML rules are not bundled with the package, but they must be used whenever they are present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
67
69
 
68
70
  ```bash
69
- npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir skills/feedloom/site-rules
71
+ npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
70
72
  ```
71
73
 
72
- Treat rule files in `skills/feedloom/site-rules/` as local reference material: use them only when present and relevant.
74
+ Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
73
75
 
74
76
  ## Output
75
77