@heart-of-gold/toolkit 0.1.44 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@heart-of-gold/toolkit",
3
- "version": "0.1.44",
3
+ "version": "0.1.45",
4
4
  "type": "module",
5
5
  "description": "Cross-platform installer for Heart of Gold skills — works with Codex, OpenCode, Pi, Claude Code, and more",
6
6
  "bin": {
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guide",
3
- "version": "0.3.0",
3
+ "version": "0.3.1",
4
4
  "description": "The Hitchhiker's Guide — content creation suite with automated pipeline, daily briefs, and blog writing",
5
5
  "author": {
6
6
  "name": "ondrej-svec",
@@ -0,0 +1,163 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ fetch-anthropic-news.py — Fetch Anthropic news posts from sitemap.xml as signals.
4
+
5
+ Anthropic publishes no RSS feed, but anthropic.com/sitemap.xml lists every
6
+ /news/ post with a <lastmod> timestamp. We parse that, filter by freshness,
7
+ fetch each recent page for its <title> + meta description, and emit signals
8
+ in the same schema as fetch-rss.py.
9
+
10
+ Usage:
11
+ python3 fetch-anthropic-news.py --config <path-to-config.yaml>
12
+
13
+ Config (under sources.anthropic_news):
14
+ enabled: true
15
+ freshness_hours: 168
16
+ max_items: 5
17
+ """
18
+ import argparse
19
+ import json
20
+ import re
21
+ import sys
22
+ import urllib.error
23
+ import urllib.parse
24
+ import urllib.request
25
+ from datetime import datetime, timedelta, timezone
26
+ from xml.etree import ElementTree as ET
27
+
28
+ import yaml
29
+
30
# Where the list of Anthropic pages comes from, and which paths count as news.
SITEMAP_URL = "https://www.anthropic.com/sitemap.xml"
PATH_PREFIX = "/news/"
# Sent as the User-Agent header on every request.
USER_AGENT = "heart-of-gold-pipeline/1.0"
# Per-request timeout in seconds (passed to urllib.request.urlopen).
FETCH_TIMEOUT = 10
# ElementTree spells namespaced tags as "{namespace}tag".
SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

# Group 1 is the text between <title> and </title>.
TITLE_RE = re.compile(r"<title[^>]*>([^<]*)</title>", re.IGNORECASE)

# Description templates, in priority order: name= before content=,
# content= before name=, then the Open Graph fallback. "{q}" is replaced by
# the quote character that delimits the content attribute.
_META_DESC_TEMPLATES = (
    r'<meta[^>]+name=["\']description["\'][^>]*content={q}([^{q}]*){q}',
    r'<meta[^>]+content={q}([^{q}]*){q}[^>]*name=["\']description["\']',
    r'<meta[^>]+property=["\']og:description["\'][^>]*content={q}([^{q}]*){q}',
)

# Each template is compiled once per quote style so the captured value only
# ends at the quote character that opened it. A single character class that
# excludes both quote kinds would cut content="Anthropic's ..." short at the
# apostrophe. Group 1 is always the attribute value.
META_DESC_RES = [
    re.compile(template.format(q=quote), re.IGNORECASE)
    for template in _META_DESC_TEMPLATES
    for quote in ('"', "'")
]
51
+
52
+
53
def fetch_url(url, timeout=FETCH_TIMEOUT):
    """Download *url* and return the response body as text.

    The request carries the pipeline's User-Agent header. The body is decoded
    as UTF-8, with undecodable bytes replaced rather than raising.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds handed to ``urllib.request.urlopen``.

    Returns:
        The decoded response body.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", USER_AGENT)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")
57
+
58
+
59
def parse_sitemap_urls(xml_text, path_prefix, cutoff):
    """Return the sitemap entries under *path_prefix* modified at or after *cutoff*.

    Entries are skipped when they have no <loc>, when their path is outside
    *path_prefix* (or is exactly the prefix, i.e. the index page), or when
    their <lastmod> is missing or not parseable as ISO 8601.

    Timestamps without a UTC offset (for example the date-only form
    ``2024-05-01``) are treated as UTC. Without that, comparing them with a
    timezone-aware cutoff raises ``TypeError``.

    Args:
        xml_text: Sitemap document as a string.
        path_prefix: URL path prefix an entry must start with.
        cutoff: Oldest acceptable ``lastmod``; a naive value is taken as UTC.

    Returns:
        A list of ``(loc, lastmod)`` tuples in document order, where
        ``lastmod`` is a timezone-aware ``datetime``.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    # Normalize the cutoff too, so a naive cutoff keeps working now that
    # every parsed timestamp is made timezone-aware.
    if cutoff.tzinfo is None:
        cutoff = cutoff.replace(tzinfo=timezone.utc)

    root = ET.fromstring(xml_text)
    entries = []
    for url_elem in root.findall(f"{SITEMAP_NS}url"):
        loc = (url_elem.findtext(f"{SITEMAP_NS}loc") or "").strip()
        lastmod = (url_elem.findtext(f"{SITEMAP_NS}lastmod") or "").strip()
        if not loc:
            continue
        path = urllib.parse.urlparse(loc).path
        if not path.startswith(path_prefix) or path == path_prefix:
            continue
        if not lastmod:
            continue
        try:
            # A trailing "Z" is rewritten as an explicit offset before parsing.
            dt = datetime.fromisoformat(lastmod.replace("Z", "+00:00"))
        except ValueError:
            continue
        if dt.tzinfo is None:
            # Date-only or offset-less timestamps parse as naive datetimes.
            dt = dt.replace(tzinfo=timezone.utc)
        if dt < cutoff:
            continue
        entries.append((loc, dt))
    return entries
80
+
81
+
82
def extract_meta(html):
    """Extract the page title and description from raw HTML.

    The title is the first match of ``TITLE_RE``. The description is taken
    from the first pattern in ``META_DESC_RES`` that matches. HTML character
    references (``&amp;``, ``&#x27;`` and so on) are decoded so the returned
    text is plain, and surrounding whitespace is stripped.

    Args:
        html: Page source as a string.

    Returns:
        A ``(title, description)`` tuple; either element is ``""`` when the
        corresponding tag is not found.
    """
    # The parameter name shadows the stdlib ``html`` module, so import only
    # the function needed under its own local name.
    from html import unescape

    title = ""
    m = TITLE_RE.search(html)
    if m:
        title = unescape(m.group(1)).strip()
    description = ""
    for pattern in META_DESC_RES:
        m = pattern.search(html)
        if m:
            description = unescape(m.group(1)).strip()
            break
    return title, description
94
+
95
+
96
def slug_to_title(url):
    """Derive a fallback title from the last path segment of *url*.

    Trailing slashes are ignored, hyphens become spaces, and the result is
    title-cased with ``str.title()``.
    """
    trimmed_path = urllib.parse.urlparse(url).path.rstrip("/")
    _, _, last_segment = trimmed_path.rpartition("/")
    spaced = last_segment.replace("-", " ")
    return spaced.title()
99
+
100
+
101
def main():
    """Command-line entry point: print recent Anthropic news signals as JSON.

    Reads the YAML file given by ``--config`` and looks up
    ``sources.anthropic_news``. When the source is disabled, or the sitemap
    cannot be fetched or parsed, an empty JSON list is printed and the
    process exits with status 0. An unreadable config exits with status 1.
    Otherwise the newest ``max_items`` entries within ``freshness_hours`` are
    fetched and printed as a JSON list of signal dicts.

    A page that cannot be fetched still yields a signal, with a title derived
    from its URL slug and an empty description.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    parser = argparse.ArgumentParser(description="Fetch Anthropic news signals from sitemap")
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    try:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(args.config, encoding="utf-8") as f:
            config = yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading config: {e}", file=sys.stderr)
        sys.exit(1)

    # An empty YAML document loads as None; treat that (and any other
    # non-mapping top level) as "nothing configured" rather than crashing.
    if not isinstance(config, dict):
        config = {}

    src = (config.get("sources") or {}).get("anthropic_news") or {}
    if not src.get("enabled", False):
        print("[]")
        sys.exit(0)

    freshness_hours = src.get("freshness_hours", 168)
    max_items = src.get("max_items", 5)

    cutoff = datetime.now(timezone.utc) - timedelta(hours=freshness_hours)

    # URLError and TimeoutError are both OSError subclasses, so this is a
    # superset of what was caught before. It also covers socket-level
    # failures (connection reset, socket.timeout on older Pythons) and
    # truncated responses (http.client.IncompleteRead).
    fetch_errors = (OSError, http.client.HTTPException)

    try:
        sitemap_xml = fetch_url(SITEMAP_URL)
    except fetch_errors as e:
        print(f"Error fetching sitemap: {e}", file=sys.stderr)
        print("[]")
        sys.exit(0)

    try:
        entries = parse_sitemap_urls(sitemap_xml, PATH_PREFIX, cutoff)
    except ET.ParseError as e:
        print(f"Error parsing sitemap: {e}", file=sys.stderr)
        print("[]")
        sys.exit(0)

    # Newest first, then keep only the configured number of items.
    entries.sort(key=lambda x: x[1], reverse=True)
    entries = entries[:max_items]

    signals = []
    for url, lastmod in entries:
        try:
            html = fetch_url(url)
        except fetch_errors as e:
            print(f"Warn: could not fetch {url}: {e}", file=sys.stderr)
            title, description = "", ""
        else:
            title, description = extract_meta(html)

        signals.append({
            "source": "rss",
            "title": title or slug_to_title(url),
            "url": url,
            "content": description,
            "published_at": lastmod.isoformat(),
            "metadata": {"source_name": "anthropic-news"},
        })

    print(json.dumps(signals, indent=2))
    sys.exit(0)


if __name__ == "__main__":
    main()
@@ -172,6 +172,12 @@ if [[ "$EVENTS_ENABLED" == "true" ]]; then
172
172
  run_source "events" "python3 '$SCRIPTS_DIR/fetch-events.py' --config '$CONFIG_PATH'"
173
173
  fi
174
174
 
175
+ # 5. Anthropic news (sitemap-driven)
176
+ ANTHROPIC_NEWS_ENABLED=$(echo "$CONFIG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin); print('true' if d['sources'].get('anthropic_news',{}).get('enabled',False) else 'false')")
177
+ if [[ "$ANTHROPIC_NEWS_ENABLED" == "true" ]]; then
178
+ run_source "anthropic_news" "python3 '$SCRIPTS_DIR/fetch-anthropic-news.py' --config '$CONFIG_PATH'"
179
+ fi
180
+
175
181
  # ── Combine and normalize ─────────────────────────────────────────────────────
176
182
 
177
183
  echo " · Combining signals..." >&2
@@ -183,7 +189,7 @@ sys.path.insert(0, '$SCRIPTS_DIR')
183
189
  from pipeline_utils import combine_signals, validate_signal, next_pipeline_path, normalize_score
184
190
 
185
191
  # Read all source outputs
186
- source_files = ['rss', 'hn', 'gmail', 'events']
192
+ source_files = ['rss', 'hn', 'gmail', 'events', 'anthropic_news']
187
193
  all_signals = []
188
194
  for name in source_files:
189
195
  path = '$WORK_DIR/' + name + '.json'