@ariesfish/feedloom 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +46 -6
- package/package.json +1 -1
- package/skills/feedloom/SKILL.md +6 -4
package/dist/cli.js
CHANGED
|
@@ -38,7 +38,11 @@ function profileFromTomlRule(name, rule) {
|
|
|
38
38
|
},
|
|
39
39
|
metadata: {
|
|
40
40
|
fixedAuthor: rule.metadata?.fixed_author,
|
|
41
|
-
titleSuffixPatterns: rule.metadata?.strip_title_regexes
|
|
41
|
+
titleSuffixPatterns: rule.metadata?.strip_title_regexes,
|
|
42
|
+
authorSelectors: rule.metadata?.author_selectors,
|
|
43
|
+
authorMetaNames: rule.metadata?.author_meta_names,
|
|
44
|
+
authorMetaItemprops: rule.metadata?.author_meta_itemprops,
|
|
45
|
+
authorMetaProperties: rule.metadata?.author_meta_properties
|
|
42
46
|
}
|
|
43
47
|
};
|
|
44
48
|
}
|
|
@@ -921,6 +925,22 @@ function removeTrailingSiblings(element, removals, reason) {
|
|
|
921
925
|
sibling = next;
|
|
922
926
|
}
|
|
923
927
|
}
|
|
928
|
+
/**
 * Choose the node at which a truncation should start for a matched element.
 *
 * Climbs from `element` towards `root`, stopping at the ancestor whose parent
 * is `root` (or at the topmost node when `root` is never reached). While
 * climbing, it remembers the highest visited node below that stopping point
 * that has a preceding element sibling.
 *
 * @param {Element} root - Container that bounds the upward walk.
 * @param {Element} element - Element that matched a cut pattern.
 * @returns {Element} The stopping ancestor when it has a preceding element
 *   sibling; otherwise the remembered node, which defaults to `element`.
 */
function truncationCutPoint(root, element) {
  let node = element;
  let candidate = element;
  for (let parent = node.parentElement; parent && parent !== root; parent = node.parentElement) {
    // Only a node with something before it is worth remembering as a cut point.
    if (node.previousElementSibling) {
      candidate = node;
    }
    node = parent;
  }
  if (node.previousElementSibling) {
    return node;
  }
  return candidate;
}
|
|
939
|
+
/**
 * Truncate content starting at the cut point derived from a matched element.
 *
 * Resolves the cut point with `truncationCutPoint`, hands it to
 * `removeTrailingSiblings`, and then passes the cut point itself to
 * `removeElement` under the "site-profile:content-pattern" label.
 *
 * @param {Element} root - Container that bounds the cut-point search.
 * @param {Element} element - Element whose text matched a cut pattern.
 * @param {*} removals - Removal collector forwarded to the helpers
 *   (its shape is defined elsewhere in the file).
 * @param {string} reason - Reason forwarded to the helpers; the visible
 *   caller passes the matching profile's name.
 */
function truncateFromElement(root, element, removals, reason) {
  const start = truncationCutPoint(root, element);
  // Siblings are handled first, then the cut point itself.
  removeTrailingSiblings(start, removals, reason);
  removeElement(removals, "site-profile:content-pattern", reason, start);
}
|
|
924
944
|
function compileProfileRegexes(profiles, key) {
|
|
925
945
|
return profiles.flatMap(
|
|
926
946
|
(profile) => (profile.removals?.[key] ?? []).map((pattern) => ({ profile: profile.name, regex: new RegExp(pattern, "i") }))
|
|
@@ -952,8 +972,7 @@ function removeByTextPatterns(root, profiles, removals) {
|
|
|
952
972
|
}
|
|
953
973
|
const cut = text.length <= 240 ? cutContains.find((entry) => text.includes(entry.marker)) ?? cutRegexes.find((entry) => entry.regex.test(text)) : void 0;
|
|
954
974
|
if (cut) {
|
|
955
|
-
|
|
956
|
-
removeElement(removals, "site-profile:content-pattern", cut.profile, element);
|
|
975
|
+
truncateFromElement(root, element, removals, cut.profile);
|
|
957
976
|
return;
|
|
958
977
|
}
|
|
959
978
|
const exactProfile = dropExact.get(text);
|
|
@@ -1045,7 +1064,28 @@ function jsonLdValue(document2, keys) {
|
|
|
1045
1064
|
}
|
|
1046
1065
|
return void 0;
|
|
1047
1066
|
}
|
|
1048
|
-
function toMetadata(result, document2) {
|
|
1067
|
+
/**
 * Resolve an author from the document using hints supplied by site profiles.
 *
 * Profiles are tried in order. For each profile with `metadata`:
 *   1. every `authorSelectors` entry is queried and the first non-empty,
 *      whitespace-collapsed text content wins;
 *   2. otherwise `<meta>` tags are probed by `name` (`authorMetaNames`),
 *      `itemprop` (`authorMetaItemprops`) and `property`
 *      (`authorMetaProperties`), in that order, and the first non-empty
 *      trimmed `content` attribute wins.
 *
 * @param {Document} document2 - Document to query; only `querySelector` is used.
 * @param {Iterable<object>} profiles - Active profiles; entries without
 *   `metadata` are skipped.
 * @returns {string | undefined} The first author found, or `undefined`.
 */
function profileAuthorFromDocument(document2, profiles) {
  for (const profile of profiles) {
    const metadata = profile.metadata;
    if (!metadata) continue;
    for (const selector of metadata.authorSelectors ?? []) {
      const author = document2.querySelector(selector)?.textContent?.replace(/\s+/g, " ").trim();
      if (author) return author;
    }
    const metaLookups = [
      ...(metadata.authorMetaNames ?? []).map((value) => ({ attr: "name", value })),
      ...(metadata.authorMetaItemprops ?? []).map((value) => ({ attr: "itemprop", value })),
      ...(metadata.authorMetaProperties ?? []).map((value) => ({ attr: "property", value }))
    ];
    for (const { attr, value } of metaLookups) {
      // Inside a double-quoted CSS string both `"` and `\` are special.
      // Escaping only the quote would let a backslash in the configured
      // value be read as a CSS escape and match the wrong attribute.
      const escaped = value.replace(/["\\]/g, "\\$&");
      const author = document2.querySelector(`meta[${attr}="${escaped}"]`)?.getAttribute("content")?.trim();
      if (author) return author;
    }
  }
  return void 0;
}
|
|
1088
|
+
function toMetadata(result, document2, profiles) {
|
|
1049
1089
|
return {
|
|
1050
1090
|
title: result.title || firstMetaContent(document2, ["og:title", "twitter:title"]) || document2.querySelector("title")?.textContent?.trim() || void 0,
|
|
1051
1091
|
description: result.description || firstMetaContent(document2, ["description", "og:description", "twitter:description"]),
|
|
@@ -1054,7 +1094,7 @@ function toMetadata(result, document2) {
|
|
|
1054
1094
|
image: result.image || firstMetaContent(document2, ["og:image", "twitter:image"]),
|
|
1055
1095
|
language: result.language || document2.documentElement.getAttribute("lang") || void 0,
|
|
1056
1096
|
published: result.published || firstMetaContent(document2, ["article:published_time", "date", "datePublished", "pubdate", "publishdate"]) || jsonLdValue(document2, ["datePublished", "dateCreated"]),
|
|
1057
|
-
author: result.author || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
|
|
1097
|
+
author: result.author || profileAuthorFromDocument(document2, profiles) || firstMetaContent(document2, ["author", "article:author", "twitter:creator"]) || jsonLdValue(document2, ["author", "creator"]),
|
|
1058
1098
|
site: result.site || firstMetaContent(document2, ["og:site_name", "application-name"]),
|
|
1059
1099
|
schemaOrgData: result.schemaOrgData,
|
|
1060
1100
|
wordCount: result.wordCount,
|
|
@@ -1102,7 +1142,7 @@ var HtmlCleaner = class {
|
|
|
1102
1142
|
standardize: this.options.standardize
|
|
1103
1143
|
});
|
|
1104
1144
|
const result = parser2.parseAsync ? await parser2.parseAsync() : parser2.parse();
|
|
1105
|
-
const metadata = toMetadata(result, document2);
|
|
1145
|
+
const metadata = toMetadata(result, document2, activeProfiles);
|
|
1106
1146
|
applyMetadataProfiles(metadata, activeProfiles);
|
|
1107
1147
|
const content = serializeProfiledContent(result.content, postProfiles, removals);
|
|
1108
1148
|
return {
|
package/package.json
CHANGED
package/skills/feedloom/SKILL.md
CHANGED
|
@@ -22,6 +22,8 @@ npx -y @ariesfish/feedloom <inputs...> [options]
|
|
|
22
22
|
|
|
23
23
|
## Common usage
|
|
24
24
|
|
|
25
|
+
Before running Feedloom, check whether this skill directory has a `site-rules/` directory. If it exists, always pass it with `--site-rules-dir $HOME/.agents/skills/feedloom/site-rules`; do not omit available site rules.
|
|
26
|
+
|
|
25
27
|
```bash
|
|
26
28
|
npx -y @ariesfish/feedloom "https://example.com/article"
|
|
27
29
|
npx -y @ariesfish/feedloom urls.txt
|
|
@@ -56,20 +58,20 @@ Use the least expensive mode that works:
|
|
|
56
58
|
- `--wait-ms <ms>`, `--wait-selector <selector>`, `--scroll-to-bottom`: give dynamic pages time or actions to reveal article content.
|
|
57
59
|
- `--click-selector <selector...>`: click dismiss/expand selectors before extracting HTML.
|
|
58
60
|
- `--headful`: show the browser window for debugging login, popups, or dynamic loading.
|
|
59
|
-
- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example
|
|
61
|
+
- `--site-rules-dir <dir>`: load optional private TOML extraction/cleaning rules from a local directory, for example the `$HOME/.agents/skills/feedloom/site-rules/` reference folder.
|
|
60
62
|
- `--solve-cloudflare`, `--proxy <server>`, `--dns-over-https`: use only when stealth fetching needs them.
|
|
61
63
|
|
|
62
64
|
Run `npx -y @ariesfish/feedloom --help` for the complete option list. Do not invent unsupported options.
|
|
63
65
|
|
|
64
66
|
## Private site rules
|
|
65
67
|
|
|
66
|
-
Site-specific TOML rules are
|
|
68
|
+
Site-specific TOML rules are optional in the package, but mandatory to use when present next to this skill. Always check for `$HOME/.agents/skills/feedloom/site-rules/` before clipping. If that directory exists, pass it explicitly on every Feedloom command using the `$HOME`-prefixed path:
|
|
67
69
|
|
|
68
70
|
```bash
|
|
69
|
-
npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir skills/feedloom/site-rules
|
|
71
|
+
npx -y @ariesfish/feedloom "https://example.com/article" --site-rules-dir $HOME/.agents/skills/feedloom/site-rules
|
|
70
72
|
```
|
|
71
73
|
|
|
72
|
-
Treat rule files in
|
|
74
|
+
Treat rule files in `$HOME/.agents/skills/feedloom/site-rules/` as local reference material and use them whenever available; never skip an existing site-rules directory unless the user explicitly asks not to use it.
|
|
73
75
|
|
|
74
76
|
## Output
|
|
75
77
|
|