@heart-of-gold/toolkit 0.1.44 → 0.1.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
fetch-anthropic-news.py — Fetch Anthropic news posts from sitemap.xml as signals.
|
|
4
|
+
|
|
5
|
+
Anthropic publishes no RSS feed, but anthropic.com/sitemap.xml lists every
|
|
6
|
+
/news/ post with a <lastmod> timestamp. We parse that, filter by freshness,
|
|
7
|
+
fetch each recent page for its <title> + meta description, and emit signals
|
|
8
|
+
in the same schema as fetch-rss.py.
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
python3 fetch-anthropic-news.py --config <path-to-config.yaml>
|
|
12
|
+
|
|
13
|
+
Config (under sources.anthropic_news):
|
|
14
|
+
enabled: true
|
|
15
|
+
freshness_hours: 168
|
|
16
|
+
max_items: 5
|
|
17
|
+
"""
|
|
18
|
+
import argparse
|
|
19
|
+
import json
|
|
20
|
+
import re
|
|
21
|
+
import sys
|
|
22
|
+
import urllib.error
|
|
23
|
+
import urllib.parse
|
|
24
|
+
import urllib.request
|
|
25
|
+
from datetime import datetime, timedelta, timezone
|
|
26
|
+
from xml.etree import ElementTree as ET
|
|
27
|
+
|
|
28
|
+
import yaml
|
|
29
|
+
|
|
30
|
+
# Where the list of posts comes from, and which URL paths count as news posts.
SITEMAP_URL = "https://www.anthropic.com/sitemap.xml"
PATH_PREFIX = "/news/"

# Sent as the User-Agent header on every request; timeout is in seconds.
USER_AGENT = "heart-of-gold-pipeline/1.0"
FETCH_TIMEOUT = 10

# ElementTree spells namespaced tags as "{namespace}tag".
SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

# Group 1 is the raw text between <title> and </title>.
TITLE_RE = re.compile(r"<title[^>]*>([^<]*)</title>", re.IGNORECASE)

# Description patterns, tried in order; group 1 is always the attribute value.
# Each template is compiled once per quote character so that a value delimited
# by double quotes may contain apostrophes (content="Anthropic's ...") and
# vice versa, instead of being cut off at the first quote of either kind.
META_DESC_RES = [
    re.compile(template.format(q=quote), re.IGNORECASE)
    for template in (
        # <meta name="description" ... content="...">
        r'<meta[^>]+name=["\']description["\'][^>]*content={q}([^{q}]*){q}',
        # <meta content="..." ... name="description">
        r'<meta[^>]+content={q}([^{q}]*){q}[^>]*name=["\']description["\']',
        # <meta property="og:description" ... content="...">
        r'<meta[^>]+property=["\']og:description["\'][^>]*content={q}([^{q}]*){q}',
    )
    for quote in ('"', "'")
]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def fetch_url(url, timeout=FETCH_TIMEOUT):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request; sent with the pipeline's User-Agent header.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The body decoded with the charset declared in the response's
        Content-Type header, or UTF-8 when none is declared or the declared
        name is unknown. Undecodable bytes are replaced rather than raising.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``).
    """
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        charset = resp.headers.get_content_charset() or "utf-8"
        body = resp.read()
    try:
        return body.decode(charset, errors="replace")
    except LookupError:
        # The server advertised a codec Python does not know; fall back.
        return body.decode("utf-8", errors="replace")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def parse_sitemap_urls(xml_text, path_prefix, cutoff):
    """Collect sitemap entries under *path_prefix* modified at or after *cutoff*.

    Args:
        xml_text: Sitemap document as a string.
        path_prefix: URL path prefix to keep (e.g. ``"/news/"``). The prefix
            path itself (the index page) is excluded.
        cutoff: Timezone-aware ``datetime``; entries older than this are dropped.

    Returns:
        A list of ``(loc, lastmod)`` tuples in document order, where
        ``lastmod`` is a timezone-aware ``datetime``.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml_text* is not well-formed XML.
    """
    root = ET.fromstring(xml_text)
    entries = []
    for url_elem in root.findall(f"{SITEMAP_NS}url"):
        loc = (url_elem.findtext(f"{SITEMAP_NS}loc") or "").strip()
        lastmod = (url_elem.findtext(f"{SITEMAP_NS}lastmod") or "").strip()
        if not loc or not lastmod:
            continue

        path = urllib.parse.urlparse(loc).path
        if not path.startswith(path_prefix) or path == path_prefix:
            continue

        try:
            # Older Pythons reject a trailing "Z"; spell it as an offset.
            dt = datetime.fromisoformat(lastmod.replace("Z", "+00:00"))
        except ValueError:
            continue

        # Sitemaps may carry date-only or offset-less timestamps, which parse
        # as naive datetimes. Comparing those with the aware cutoff raises
        # TypeError, so interpret them as UTC.
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)

        if dt < cutoff:
            continue
        entries.append((loc, dt))
    return entries
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def extract_meta(html):
    """Extract the page title and description from an HTML document.

    Args:
        html: Page source as a string.

    Returns:
        A ``(title, description)`` tuple of stripped strings with HTML
        entities decoded. ``title`` is the text of the first ``<title>``
        element; ``description`` is the value captured by the first pattern
        in ``META_DESC_RES`` that matches. Either is ``""`` when not found.
    """
    # Imported by name because the parameter shadows the ``html`` module.
    from html import unescape

    title = ""
    m = TITLE_RE.search(html)
    if m:
        # Markup escapes "&" and quotes ("&amp;", "&#x27;"); decode them so
        # the emitted signal carries readable text.
        title = unescape(m.group(1)).strip()

    description = ""
    for pattern in META_DESC_RES:
        m = pattern.search(html)
        if m:
            description = unescape(m.group(1)).strip()
            break
    return title, description
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def slug_to_title(url):
    """Derive a human-readable title from the last path segment of *url*.

    The segment is percent-decoded, hyphens become spaces, and the result is
    title-cased. A trailing slash on the path is ignored.

    Args:
        url: Absolute or relative URL.

    Returns:
        The title-cased slug, or ``""`` when the URL has no path segment.
    """
    path = urllib.parse.urlparse(url).path
    slug = path.rstrip("/").rsplit("/", 1)[-1]
    # Slugs with non-ASCII characters arrive percent-encoded ("caf%C3%A9").
    slug = urllib.parse.unquote(slug)
    return slug.replace("-", " ").title()
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def main():
    """Command-line entry point: print recent Anthropic news signals as JSON.

    Reads the YAML file given by ``--config``, looks up
    ``sources.anthropic_news``, and, when that source is enabled, fetches the
    sitemap, keeps the newest ``max_items`` posts modified within the last
    ``freshness_hours`` hours, and fetches each post for its title and
    description.

    Output is a JSON array on stdout; diagnostics go to stderr. The process
    exits with status 1 only when the config cannot be read; a disabled
    source or a sitemap fetch/parse failure prints ``[]`` and exits 0.
    """
    import http.client  # provides HTTPException, which is not an OSError

    parser = argparse.ArgumentParser(description="Fetch Anthropic news signals from sitemap")
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    try:
        with open(args.config, encoding="utf-8") as f:
            # An empty YAML file loads as None; treat it as an empty mapping.
            config = yaml.safe_load(f) or {}
    except Exception as e:
        print(f"Error reading config: {e}", file=sys.stderr)
        sys.exit(1)
    if not isinstance(config, dict):
        print("Error reading config: top-level YAML value must be a mapping", file=sys.stderr)
        sys.exit(1)

    sources = config.get("sources") or {}
    src = (sources.get("anthropic_news") if isinstance(sources, dict) else None) or {}
    if not isinstance(src, dict) or not src.get("enabled", False):
        print("[]")
        sys.exit(0)

    # A key that is present but null or malformed falls back to the default
    # instead of crashing timedelta() or silently disabling the item limit.
    freshness_hours = src.get("freshness_hours")
    try:
        freshness_hours = 168 if freshness_hours is None else float(freshness_hours)
    except (TypeError, ValueError):
        print("Warn: invalid freshness_hours; using 168", file=sys.stderr)
        freshness_hours = 168

    max_items = src.get("max_items")
    try:
        max_items = 5 if max_items is None else max(0, int(max_items))
    except (TypeError, ValueError):
        print("Warn: invalid max_items; using 5", file=sys.stderr)
        max_items = 5

    cutoff = datetime.now(timezone.utc) - timedelta(hours=freshness_hours)

    # URLError and TimeoutError are both OSError subclasses, so this tuple
    # covers them plus connection resets and truncated HTTP responses.
    fetch_errors = (OSError, http.client.HTTPException)

    try:
        sitemap_xml = fetch_url(SITEMAP_URL)
    except fetch_errors as e:
        print(f"Error fetching sitemap: {e}", file=sys.stderr)
        print("[]")
        sys.exit(0)

    try:
        entries = parse_sitemap_urls(sitemap_xml, PATH_PREFIX, cutoff)
    except ET.ParseError as e:
        print(f"Error parsing sitemap: {e}", file=sys.stderr)
        print("[]")
        sys.exit(0)

    # Newest first, then cap how many pages get fetched.
    entries.sort(key=lambda x: x[1], reverse=True)
    entries = entries[:max_items]

    signals = []
    for url, lastmod in entries:
        try:
            html = fetch_url(url)
            title, description = extract_meta(html)
        except fetch_errors + (ValueError,) as e:
            # ValueError: urllib rejects a malformed <loc> (e.g. no scheme).
            # One bad page must not discard the remaining signals.
            print(f"Warn: could not fetch {url}: {e}", file=sys.stderr)
            title, description = "", ""

        signals.append({
            # NOTE(review): tagged "rss" rather than a dedicated source name,
            # presumably to match the fetch-rss.py schema — confirm.
            "source": "rss",
            "title": title or slug_to_title(url),
            "url": url,
            "content": description,
            "published_at": lastmod.isoformat(),
            "metadata": {"source_name": "anthropic-news"},
        })

    print(json.dumps(signals, indent=2))
    sys.exit(0)


if __name__ == "__main__":
    main()
|
|
@@ -172,6 +172,12 @@ if [[ "$EVENTS_ENABLED" == "true" ]]; then
|
|
|
172
172
|
run_source "events" "python3 '$SCRIPTS_DIR/fetch-events.py' --config '$CONFIG_PATH'"
|
|
173
173
|
fi
|
|
174
174
|
|
|
175
|
+
# 5. Anthropic news (sitemap-driven)
# A missing or null `sources` / `anthropic_news` section counts as disabled,
# matching the lookup fetch-anthropic-news.py performs on the same config.
ANTHROPIC_NEWS_ENABLED=$(echo "$CONFIG_JSON" | python3 -c "import json,sys; d=json.load(sys.stdin) or {}; src=(d.get('sources') or {}).get('anthropic_news') or {}; print('true' if src.get('enabled',False) else 'false')")
if [[ "$ANTHROPIC_NEWS_ENABLED" == "true" ]]; then
  run_source "anthropic_news" "python3 '$SCRIPTS_DIR/fetch-anthropic-news.py' --config '$CONFIG_PATH'"
fi
|
|
180
|
+
|
|
175
181
|
# ── Combine and normalize ─────────────────────────────────────────────────────
|
|
176
182
|
|
|
177
183
|
echo " · Combining signals..." >&2
|
|
@@ -183,7 +189,7 @@ sys.path.insert(0, '$SCRIPTS_DIR')
|
|
|
183
189
|
from pipeline_utils import combine_signals, validate_signal, next_pipeline_path, normalize_score
|
|
184
190
|
|
|
185
191
|
# Read all source outputs
|
|
186
|
-
source_files = ['rss', 'hn', 'gmail', 'events']
|
|
192
|
+
source_files = ['rss', 'hn', 'gmail', 'events', 'anthropic_news']
|
|
187
193
|
all_signals = []
|
|
188
194
|
for name in source_files:
|
|
189
195
|
path = '$WORK_DIR/' + name + '.json'
|