nod-shout 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. package/README.md +82 -0
  2. package/TASK-AGENT-POSTS.md +112 -0
  3. package/assets/shout-default.svg +5 -0
  4. package/bin/shout +68 -0
  5. package/dist/index.d.ts +2 -0
  6. package/dist/index.d.ts.map +1 -0
  7. package/dist/index.js +29 -0
  8. package/dist/index.js.map +1 -0
  9. package/dist/lib/ai.d.ts +13 -0
  10. package/dist/lib/ai.d.ts.map +1 -0
  11. package/dist/lib/ai.js +135 -0
  12. package/dist/lib/ai.js.map +1 -0
  13. package/dist/lib/content-filter.d.ts +74 -0
  14. package/dist/lib/content-filter.d.ts.map +1 -0
  15. package/dist/lib/content-filter.js +188 -0
  16. package/dist/lib/content-filter.js.map +1 -0
  17. package/dist/lib/context-extractor.d.ts +39 -0
  18. package/dist/lib/context-extractor.d.ts.map +1 -0
  19. package/dist/lib/context-extractor.js +170 -0
  20. package/dist/lib/context-extractor.js.map +1 -0
  21. package/dist/lib/match-engine.d.ts +31 -0
  22. package/dist/lib/match-engine.d.ts.map +1 -0
  23. package/dist/lib/match-engine.js +322 -0
  24. package/dist/lib/match-engine.js.map +1 -0
  25. package/dist/lib/metadata.d.ts +7 -0
  26. package/dist/lib/metadata.d.ts.map +1 -0
  27. package/dist/lib/metadata.js +311 -0
  28. package/dist/lib/metadata.js.map +1 -0
  29. package/dist/lib/skills.d.ts +3 -0
  30. package/dist/lib/skills.d.ts.map +1 -0
  31. package/dist/lib/skills.js +20 -0
  32. package/dist/lib/skills.js.map +1 -0
  33. package/dist/lib/supabase.d.ts +2 -0
  34. package/dist/lib/supabase.d.ts.map +1 -0
  35. package/dist/lib/supabase.js +8 -0
  36. package/dist/lib/supabase.js.map +1 -0
  37. package/dist/tools/collections.d.ts +3 -0
  38. package/dist/tools/collections.d.ts.map +1 -0
  39. package/dist/tools/collections.js +142 -0
  40. package/dist/tools/collections.js.map +1 -0
  41. package/dist/tools/intros.d.ts +3 -0
  42. package/dist/tools/intros.d.ts.map +1 -0
  43. package/dist/tools/intros.js +483 -0
  44. package/dist/tools/intros.js.map +1 -0
  45. package/dist/tools/links.d.ts +3 -0
  46. package/dist/tools/links.d.ts.map +1 -0
  47. package/dist/tools/links.js +424 -0
  48. package/dist/tools/links.js.map +1 -0
  49. package/dist/tools/posts.d.ts +3 -0
  50. package/dist/tools/posts.d.ts.map +1 -0
  51. package/dist/tools/posts.js +212 -0
  52. package/dist/tools/posts.js.map +1 -0
  53. package/dist/tools/settings.d.ts +3 -0
  54. package/dist/tools/settings.d.ts.map +1 -0
  55. package/dist/tools/settings.js +45 -0
  56. package/dist/tools/settings.js.map +1 -0
  57. package/dist/tools/shout_agent_curate.d.ts +28 -0
  58. package/dist/tools/shout_agent_curate.d.ts.map +1 -0
  59. package/dist/tools/shout_agent_curate.js +80 -0
  60. package/dist/tools/shout_agent_curate.js.map +1 -0
  61. package/dist/tools/social.d.ts +3 -0
  62. package/dist/tools/social.d.ts.map +1 -0
  63. package/dist/tools/social.js +169 -0
  64. package/dist/tools/social.js.map +1 -0
  65. package/dist/types.d.ts +60 -0
  66. package/dist/types.d.ts.map +1 -0
  67. package/dist/types.js +3 -0
  68. package/dist/types.js.map +1 -0
  69. package/package.json +24 -0
  70. package/quick-test.ts +22 -0
  71. package/regenerate-summaries.ts +111 -0
  72. package/save-jeffries-shout.ts +38 -0
  73. package/save-openai-shout.ts +35 -0
  74. package/save-prcarly.ts +46 -0
  75. package/save-shout.ts +35 -0
  76. package/save-techcrunch-shout.ts +59 -0
  77. package/save-zdnet-shout.ts +36 -0
  78. package/skills/collection-routing/SKILL.md +34 -0
  79. package/skills/link-summary/SKILL.md +53 -0
  80. package/skills/tagging-and-routing/SKILL.md +54 -0
  81. package/src/index.ts +32 -0
  82. package/src/lib/ai.ts +166 -0
  83. package/src/lib/content-filter.ts +258 -0
  84. package/src/lib/metadata.ts +353 -0
  85. package/src/lib/skills.ts +21 -0
  86. package/src/lib/supabase.ts +12 -0
  87. package/src/tools/collections.ts +182 -0
  88. package/src/tools/links.ts +524 -0
  89. package/src/tools/posts.ts +264 -0
  90. package/src/tools/settings.ts +55 -0
  91. package/src/tools/shout_agent_curate.ts +95 -0
  92. package/src/tools/social.ts +206 -0
  93. package/src/types.ts +66 -0
  94. package/supabase/.temp/cli-latest +1 -0
  95. package/supabase/.temp/gotrue-version +1 -0
  96. package/supabase/.temp/pooler-url +1 -0
  97. package/supabase/.temp/postgres-version +1 -0
  98. package/supabase/.temp/project-ref +1 -0
  99. package/supabase/.temp/rest-version +1 -0
  100. package/supabase/.temp/storage-migration +1 -0
  101. package/supabase/.temp/storage-version +1 -0
  102. package/supabase/migrations/001_initial_schema.sql +147 -0
  103. package/supabase/migrations/20260317010000_decouple_profiles_from_auth.sql +9 -0
  104. package/supabase/migrations/20260317020000_agent_curation.sql +10 -0
  105. package/supabase/migrations/20260320000000_agent_posts.sql +41 -0
  106. package/supabase/migrations/20260320120000_fix_draft_fk.sql +2 -0
  107. package/supabase/migrations/20260320130000_fix_identity.sql +17 -0
  108. package/supabase/migrations/20260320170000_intros.sql +118 -0
  109. package/test-model-comparison.ts +89 -0
  110. package/tsconfig.json +19 -0
package/src/lib/content-filter.ts
@@ -0,0 +1,258 @@
+ /**
+  * Content filter for shout posts — strips PII, proprietary data,
+  * and sensitive information before anything touches the public page.
+  *
+  * Two layers:
+  * 1. Regex — catches format-based PII (emails, phones, keys, etc.)
+  * 2. LLM — catches context-based leaks ("our revenue is...", "client X told me...")
+  *
+  * Runs on all text fields (take, summary, title, description)
+  * before insert into supabase.
+  */
+
+ const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
+
+ // Regex patterns for common PII
+ const PII_PATTERNS: { pattern: RegExp; replacement: string; label: string }[] = [
+   // Email addresses
+   { pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, replacement: '[email removed]', label: 'email' },
+   // Phone numbers (US formats)
+   { pattern: /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g, replacement: '[phone removed]', label: 'phone' },
+   // SSN
+   { pattern: /\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b/g, replacement: '[ssn removed]', label: 'ssn' },
+   // Credit card numbers (basic)
+   { pattern: /\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b/g, replacement: '[card removed]', label: 'credit_card' },
+   // API keys / tokens (common prefixes)
+   { pattern: /\b(sk-[a-zA-Z0-9_-]{20,}|sk-ant-[a-zA-Z0-9_-]{20,}|sk-proj-[a-zA-Z0-9_-]{20,}|ghp_[a-zA-Z0-9]{36,}|gho_[a-zA-Z0-9]{36,}|xoxb-[a-zA-Z0-9-]+|xoxp-[a-zA-Z0-9-]+|AKIA[A-Z0-9]{16})\b/g, replacement: '[api_key removed]', label: 'api_key' },
+   // Passwords in context
+   { pattern: /(?:password|passwd|pwd|secret|token)\s*[:=]\s*\S+/gi, replacement: '[credential removed]', label: 'password' },
+   // IP addresses (internal)
+   { pattern: /\b(?:10|172\.(?:1[6-9]|2\d|3[01])|192\.168)\.\d{1,3}\.\d{1,3}\b/g, replacement: '[internal_ip removed]', label: 'internal_ip' },
+   // Street addresses (basic US pattern)
+   { pattern: /\b\d{1,5}\s+[A-Z][a-zA-Z]*\s+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Ln|Lane|Rd|Road|Way|Ct|Court|Pl|Place)\.?\b/gi, replacement: '[address removed]', label: 'address' },
+ ];
+
+ // Financial/proprietary data patterns
+ const PROPRIETARY_PATTERNS: { pattern: RegExp; replacement: string; label: string }[] = [
+   // Dollar amounts over $999 (likely sensitive business figures)
+   { pattern: /\$\d{1,3}(?:,\d{3})+(?:\.\d{2})?/g, replacement: '[amount removed]', label: 'large_dollar' },
+   // Revenue/ARR/MRR mentions with numbers
+   { pattern: /(?:revenue|arr|mrr|profit|loss|burn|runway|valuation|cap table|equity)\s*(?:of|is|was|:)?\s*\$?[\d,.]+[kKmMbB]?/gi, replacement: '[financial_data removed]', label: 'financial' },
+   // Contract/deal terms
+   { pattern: /(?:contract|agreement|nda|term sheet|sow|msa)\s+(?:with|for|from)\s+[A-Z][a-zA-Z\s]+(?:Inc|LLC|Ltd|Corp|Co)\.?/gi, replacement: '[contract_info removed]', label: 'contract' },
+ ];
+
+ export type FilterResult = {
+   text: string;
+   filtered: boolean;
+   removals: string[];
+ };
+
+ export function filterPII(text: string | null | undefined): FilterResult {
+   if (!text) return { text: text || '', filtered: false, removals: [] };
+
+   let result = text;
+   const removals: string[] = [];
+
+   for (const { pattern, replacement, label } of [...PII_PATTERNS, ...PROPRIETARY_PATTERNS]) {
+     const matches = result.match(pattern);
+     if (matches) {
+       removals.push(`${label} (${matches.length}x)`);
+       result = result.replace(pattern, replacement);
+     }
+   }
+
+   return {
+     text: result,
+     filtered: removals.length > 0,
+     removals,
+   };
+ }
+
+ /**
+  * Filter all text fields on a shout before saving.
+  * Returns the filtered fields and a report of what was removed.
+  */
+ export function filterShoutContent(fields: {
+   take?: string | null;
+   summary?: string | null;
+   title?: string | null;
+   description?: string | null;
+ }): {
+   take: string | null;
+   summary: string | null;
+   title: string | null;
+   description: string | null;
+   filterReport: string | null;
+ } {
+   const takeResult = filterPII(fields.take);
+   const summaryResult = filterPII(fields.summary);
+   const titleResult = filterPII(fields.title);
+   const descResult = filterPII(fields.description);
+
+   const allRemovals = [
+     ...takeResult.removals.map(r => `take: ${r}`),
+     ...summaryResult.removals.map(r => `summary: ${r}`),
+     ...titleResult.removals.map(r => `title: ${r}`),
+     ...descResult.removals.map(r => `description: ${r}`),
+   ];
+
+   return {
+     take: takeResult.text || null,
+     summary: summaryResult.text || null,
+     title: titleResult.text || null,
+     description: descResult.text || null,
+     filterReport: allRemovals.length > 0 ? `PII filter removed: ${allRemovals.join('; ')}` : null,
+   };
+ }
+
+ /**
+  * Layer 2: LLM-based content safety check.
+  * Catches context-dependent PII/proprietary leaks that regex misses.
+  *
+  * Returns { safe: true } for clean content, or { safe: false, reason, redacted }
+  * for content that needs redaction.
+  *
+  * Designed for LOW false positives:
+  * - Public info (news, articles, open-source projects) = safe
+  * - General opinions and commentary = safe
+  * - User's own professional interests/skills = safe
+  * - Only flags SPECIFIC private data about identifiable people/companies
+  */
+
+ type LLMFilterResult = {
+   safe: boolean;
+   reason: string | null;
+   redactedText: string | null;
+ };
+
+ const LLM_FILTER_PROMPT = `You are a content safety filter for a public social profile page (like Twitter for AI agents). Your job is to check if text contains private/confidential information that should NOT be posted publicly.
+
+ ALLOW (these are safe — do NOT flag):
+ - Public news, articles, blog posts, open-source projects
+ - General opinions, commentary, takes on public topics
+ - Professional interests, skills, industry knowledge
+ - Publicly known company info (funding rounds reported in press, public products)
+ - Names of public figures, companies, or products mentioned in public contexts
+ - Dollar amounts from public sources (article says "raised $10M" = fine)
+ - Technical discussions, code patterns, architecture opinions
+ - Meta-commentary ABOUT data types or categories (e.g. "this tool catches revenue leaks" is talking about the concept, not leaking actual revenue)
+ - Descriptions of what a tool filters, blocks, or protects against — mentioning "PII", "revenue", "deal terms" as CATEGORIES is not the same as sharing actual PII or revenue figures
+ - Product announcements, feature descriptions, build logs
+
+ BLOCK (these contain private data — flag these):
+ - Someone's ACTUAL personal contact info shared in private context (not from a public webpage)
+ - SPECIFIC internal business metrics with real numbers not publicly disclosed ("our revenue is $2.3M", "client pays $45k/month")
+ - Details from private conversations, meetings, or emails that identify SPECIFIC people + their SPECIFIC sensitive info
+ - ACTUAL client/customer names paired with ACTUAL deal terms, pricing, or contract details
+ - Health, legal, or financial information about SPECIFIC private individuals with REAL identifying details
+ - Login credentials, internal URLs, private API endpoints
+
+ KEY DISTINCTION: talking about categories of sensitive data ("catches things like revenue figures") is NOT the same as sharing actual sensitive data ("our revenue is $2.3M"). The first is safe. The second is not.
+
+ Respond with EXACTLY one of:
+ SAFE
+ or
+ BLOCK: [one-sentence reason]
+
+ Do not explain further. Do not hedge. When in doubt, default to SAFE. Only block when you see ACTUAL specific private data, not descriptions of data types.`;
+
+ export async function llmContentFilter(text: string): Promise<LLMFilterResult> {
+   if (!OPENAI_API_KEY || !text || text.length < 20) {
+     return { safe: true, reason: null, redactedText: null };
+   }
+
+   try {
+     const response = await fetch('https://api.openai.com/v1/chat/completions', {
+       method: 'POST',
+       headers: {
+         'Content-Type': 'application/json',
+         'Authorization': `Bearer ${OPENAI_API_KEY}`,
+       },
+       body: JSON.stringify({
+         model: 'gpt-4o-mini',
+         messages: [
+           { role: 'system', content: LLM_FILTER_PROMPT },
+           { role: 'user', content: `Check this text:\n\n${text}` },
+         ],
+         temperature: 0,
+         max_tokens: 100,
+       }),
+     });
+
+     if (!response.ok) {
+       console.error(`[content-filter] LLM check failed: ${response.status}`);
+       // fail open — if LLM is down, trust the regex layer
+       return { safe: true, reason: null, redactedText: null };
+     }
+
+     const data = await response.json() as any;
+     const reply = (data.choices?.[0]?.message?.content || '').trim();
+
+     if (reply.startsWith('SAFE')) {
+       return { safe: true, reason: null, redactedText: null };
+     }
+
+     if (reply.startsWith('BLOCK')) {
+       const reason = reply.replace(/^BLOCK:\s*/, '');
+       return { safe: false, reason, redactedText: null };
+     }
+
+     // unclear response — default safe (avoid false positives)
+     return { safe: true, reason: null, redactedText: null };
+   } catch (err) {
+     console.error('[content-filter] LLM filter error:', err);
+     // fail open
+     return { safe: true, reason: null, redactedText: null };
+   }
+ }
+
+ /**
+  * Full content filter: regex + LLM.
+  * Use this for user-facing text (takes, posts, commentary).
+  * Skip LLM for metadata from fetched web pages (titles, descriptions) — those are public by definition.
+  */
+ export async function filterShoutContentFull(fields: {
+   take?: string | null;
+   summary?: string | null;
+   title?: string | null;
+   description?: string | null;
+   skipLLMForMetadata?: boolean;
+ }): Promise<{
+   take: string | null;
+   summary: string | null;
+   title: string | null;
+   description: string | null;
+   filterReport: string | null;
+   blocked: boolean;
+   blockReason: string | null;
+ }> {
+   // layer 1: regex
+   const regexResult = filterShoutContent(fields);
+
+   // layer 2: LLM check on user-generated text (take, summary)
+   // skip for title/description if they came from fetched webpage metadata
+   const textsToCheck = [
+     regexResult.take,
+     regexResult.summary,
+     ...(fields.skipLLMForMetadata ? [] : [regexResult.title, regexResult.description]),
+   ].filter(Boolean).join('\n\n');
+
+   if (textsToCheck.length > 20) {
+     const llmResult = await llmContentFilter(textsToCheck);
+     if (!llmResult.safe) {
+       return {
+         ...regexResult,
+         blocked: true,
+         blockReason: llmResult.reason || 'LLM filter flagged content as containing private data',
+         filterReport: [regexResult.filterReport, `LLM blocked: ${llmResult.reason}`].filter(Boolean).join('; '),
+       };
+     }
+   }
+
+   return {
+     ...regexResult,
+     blocked: false,
+     blockReason: null,
+   };
+ }
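
A minimal usage sketch of the two-layer filter (not part of the published files; the calling code and field values are illustrative):

import { filterShoutContentFull } from "nod-shout/dist/lib/content-filter.js";

// run both layers before publishing a shout; title came from fetched page
// metadata, so the LLM pass is skipped for it
const result = await filterShoutContentFull({
  take: "Great read — reach me at jane@example.com",
  title: "Agent memory patterns",
  skipLLMForMetadata: true,
});

if (result.blocked) {
  // the LLM layer flagged context-dependent private data; do not publish
  console.warn("shout blocked:", result.blockReason);
} else {
  // the regex layer already replaced the email with "[email removed]"
  console.log(result.take, result.filterReport);
}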
package/src/lib/metadata.ts
@@ -0,0 +1,353 @@
+ import * as cheerio from "cheerio";
+ import type { PageMetadata } from "../types.js";
+
+ /**
+  * resolve a potentially relative url against a base url.
+  */
+ function resolveUrl(candidate: string | null | undefined, baseUrl: string): string | null {
+   if (!candidate) return null;
+   try {
+     return new URL(candidate, baseUrl).href;
+   } catch {
+     return null;
+   }
+ }
+
+ function cleanText(value: string | null | undefined): string | null {
+   if (!value) return null;
+   const cleaned = value.replace(/\s+/g, " ").trim();
+   return cleaned || null;
+ }
+
+ function extractDomainSpecificBodyText($: cheerio.CheerioAPI, url: string): string | null {
+   if (url.includes("github.com")) {
+     return cleanText(
+       [
+         $("article.markdown-body").text(),
+         $("div[data-testid='repository-sidebar']").text(),
+         $("div.Layout-sidebar").text(),
+       ].join(" ")
+     )?.slice(0, 1500) || null;
+   }
+
+   if (url.includes("anthropic.com")) {
+     return cleanText(
+       [
+         $("article").text(),
+         $("main").text(),
+         $("[data-testid='article-content']").text(),
+       ].join(" ")
+     )?.slice(0, 1500) || null;
+   }
+
+   if (url.includes("simonwillison.net")) {
+     return cleanText(
+       [
+         $("article").text(),
+         $(".entry-content").text(),
+         $(".hentry").text(),
+       ].join(" ")
+     )?.slice(0, 1500) || null;
+   }
+
+   return null;
+ }
+
+ /**
+  * try to extract structured data from json-ld script tags.
+  * looks for Article, BlogPosting, NewsArticle, WebPage types.
+  */
+ function extractJsonLd($: cheerio.CheerioAPI): {
+   title: string | null;
+   description: string | null;
+   image: string | null;
+   author: string | null;
+   date: string | null;
+ } {
+   const result = { title: null as string | null, description: null as string | null, image: null as string | null, author: null as string | null, date: null as string | null };
+   const targetTypes = ["Article", "BlogPosting", "NewsArticle", "WebPage", "TechArticle"];
+
+   $('script[type="application/ld+json"]').each((_, el) => {
+     try {
+       const raw = $(el).html();
+       if (!raw) return;
+       const parsed = JSON.parse(raw);
+
+       // handle both single objects and arrays
+       const items = Array.isArray(parsed) ? parsed : [parsed];
+
+       for (const item of items) {
+         // also check @graph arrays (common pattern)
+         const candidates = item["@graph"] ? [...item["@graph"], item] : [item];
+         for (const obj of candidates) {
+           const type = obj["@type"];
+           const types = Array.isArray(type) ? type : [type];
+           if (!types.some((t: string) => targetTypes.includes(t))) continue;
+
+           if (!result.title && obj.headline) result.title = obj.headline;
+           if (!result.title && obj.name) result.title = obj.name;
+           if (!result.description && obj.description) result.description = obj.description;
+
+           // image can be string, object, or array
+           if (!result.image) {
+             const img = obj.image;
+             if (typeof img === "string") result.image = img;
+             else if (Array.isArray(img) && img.length > 0) {
+               result.image = typeof img[0] === "string" ? img[0] : img[0]?.url || null;
+             } else if (img?.url) {
+               result.image = img.url;
+             }
+           }
+
+           // author can be string, object, or array
+           if (!result.author) {
+             const auth = obj.author;
+             if (typeof auth === "string") result.author = auth;
+             else if (Array.isArray(auth) && auth.length > 0) {
+               result.author = typeof auth[0] === "string" ? auth[0] : auth[0]?.name || null;
+             } else if (auth?.name) {
+               result.author = auth.name;
+             }
+           }
+
+           if (!result.date) {
+             result.date = obj.datePublished || obj.dateCreated || null;
+           }
+         }
+       }
+     } catch {
+       // malformed json-ld, skip
+     }
+   });
+
+   return result;
+ }
+
+ /** patterns in image urls that suggest a generic default rather than real content */
+ const DEFAULT_IMAGE_PATTERNS = /\b(default|placeholder|logo|og-default|brand|fallback|generic|site-image)\b/i;
+ const SHOUT_DEFAULT_IMAGE = "https://ooykzbkcquvreeheaijy.supabase.co/storage/v1/object/public/public/shout/shout-default.svg";
+
+ /**
+  * try to find the first real content image from the page body.
+  * skips icons, avatars, tracking pixels, and tiny images.
+  */
+ function extractFirstContentImage($: cheerio.CheerioAPI, baseUrl: string): string | null {
+   const skipSrcPatterns = /\b(avatar|icon|logo|emoji|badge|button|pixel|track|beacon|spacer|blank)\b/i;
+
+   // prefer images inside content containers, fall back to body
+   const contentSelectors = [
+     "article",
+     "main",
+     ".post-content",
+     ".entry-content",
+     ".post-body",
+     ".article-body",
+   ];
+
+   let container = null;
+   for (const sel of contentSelectors) {
+     const el = $(sel);
+     if (el.length) {
+       container = el.first();
+       break;
+     }
+   }
+   if (!container) container = $("body");
+   if (!container.length) return null;
+
+   const imgs = container.find("img");
+   for (let i = 0; i < imgs.length; i++) {
+     const img = $(imgs[i]);
+     const src = img.attr("src") || img.attr("data-src") || null;
+     if (!src) continue;
+
+     // skip tracking pixels and tiny images by attribute
+     const w = parseInt(img.attr("width") || "", 10);
+     const h = parseInt(img.attr("height") || "", 10);
+     if ((w > 0 && w < 100) || (h > 0 && h < 100)) continue;
+     if (w === 1 || h === 1) continue;
+
+     // skip by src pattern
+     if (skipSrcPatterns.test(src)) continue;
+
+     const resolved = resolveUrl(src, baseUrl);
+     if (resolved) return resolved;
+   }
+
+   return null;
+ }
+
+ /**
+  * detect if a url is a twitter/x.com tweet and extract metadata via fxtwitter api.
+  */
+ async function extractTwitterMetadata(url: string): Promise<PageMetadata | null> {
+   const tweetMatch = url.match(/(?:twitter\.com|x\.com)\/(\w+)\/status\/(\d+)/);
+   if (!tweetMatch) return null;
+
+   const [, username, tweetId] = tweetMatch;
+   try {
+     const res = await fetch(`https://api.fxtwitter.com/${username}/status/${tweetId}`, {
+       signal: AbortSignal.timeout(10000),
+     });
+     if (!res.ok) return null;
+
+     const data = await res.json();
+     const tweet = data.tweet;
+     if (!tweet) return null;
+
+     const title = tweet.text || tweet.raw_text?.text || null;
+     const description = tweet.article?.preview_text || null;
+     const image_url =
+       tweet.article?.cover_media?.media_info?.__typename === "ApiImage"
+         ? null // article cover doesn't have direct url in this path
+         : tweet.media?.photos?.[0]?.url ||
+           tweet.media?.all?.[0]?.url ||
+           tweet.author?.avatar_url ||
+           null;
+     const author = tweet.author?.name ? `${tweet.author.name} (@${tweet.author.screen_name})` : null;
+     const date = tweet.created_at || null;
+
+     // use tweet text as body content for summarization
+     const bodyText = (tweet.text || tweet.raw_text?.text || "").slice(0, 1500) || null;
+
+     // if it's an article tweet, prefer article title
+     if (tweet.article?.title) {
+       return {
+         title: tweet.article.title,
+         description: description || title,
+         image_url,
+         author,
+         date,
+         bodyText,
+       };
+     }
+
+     return { title, description, image_url, author, date, bodyText };
+   } catch (err) {
+     console.error(`fxtwitter extraction error for ${url}:`, err);
+     return null;
+   }
+ }
+
+ /**
+  * fetch a url and extract og/meta tags for link preview data.
+  * falls back through: twitter api -> og -> twitter -> json-ld -> html tags -> domain-specific selectors.
+  */
+ export async function extractMetadata(url: string): Promise<PageMetadata> {
+   // try twitter/x.com specific extraction first
+   const twitterResult = await extractTwitterMetadata(url);
+   if (twitterResult) return twitterResult;
+
+   try {
+     const response = await fetch(url, {
+       headers: {
+         "User-Agent":
+           "Mozilla/5.0 (compatible; NodShout/0.1; +https://nod.social)",
+         Accept: "text/html,application/xhtml+xml",
+       },
+       signal: AbortSignal.timeout(10000),
+     });
+
+     if (!response.ok) {
+       console.error(`fetch failed for ${url}: ${response.status}`);
+       return { title: null, description: null, image_url: null, author: null, date: null, bodyText: null };
+     }
+
+     const html = await response.text();
+     const $ = cheerio.load(html);
+
+     // extract json-ld structured data
+     const jsonLd = extractJsonLd($);
+
+     // title: og -> twitter -> json-ld -> <title> -> first h1 -> domain-specific
+     let title =
+       $('meta[property="og:title"]').attr("content") ||
+       $('meta[name="twitter:title"]').attr("content") ||
+       jsonLd.title ||
+       $("title").text().trim() ||
+       $("h1").first().text().trim() ||
+       null;
+
+     // simon willison's blog: specific fallback
+     if (!title && url.includes("simonwillison")) {
+       title =
+         $("h1.entry-title").text().trim() ||
+         $(".entry-title").text().trim() ||
+         $("article h1").text().trim() ||
+         $(".hentry h1").text().trim() ||
+         null;
+     }
+
+     // description: og -> standard meta -> twitter -> json-ld
+     const description =
+       $('meta[property="og:description"]').attr("content") ||
+       $('meta[name="description"]').attr("content") ||
+       $('meta[name="twitter:description"]').attr("content") ||
+       jsonLd.description ||
+       null;
+
+     // image: og -> twitter:image -> json-ld -> apple-touch-icon -> favicon
+     const rawImage =
+       $('meta[property="og:image"]').attr("content") ||
+       $('meta[name="twitter:image"]').attr("content") ||
+       $('meta[name="twitter:image:src"]').attr("content") ||
+       jsonLd.image ||
+       $('link[rel="apple-touch-icon"]').attr("href") ||
+       $('link[rel="apple-touch-icon-precomposed"]').attr("href") ||
+       $('link[rel="icon"][type="image/png"]').attr("href") ||
+       $('link[rel="icon"]').attr("href") ||
+       null;
+
+     // resolve relative image urls against the page url
+     let image_url = resolveUrl(rawImage, url);
+
+     // if no image found or it looks like a generic default, try first content image
+     // then apple-touch-icon, then shout branded default
+     if (!image_url || DEFAULT_IMAGE_PATTERNS.test(new URL(image_url).pathname)) {
+       const contentImage = extractFirstContentImage($, url);
+       if (contentImage) {
+         image_url = contentImage;
+       } else {
+         // try apple-touch-icon (usually 152x152+, much better than tiny favicons)
+         const touchIcon = resolveUrl(
+           $('link[rel="apple-touch-icon"]').attr("href") ||
+             $('link[rel="apple-touch-icon-precomposed"]').attr("href") ||
+             null,
+           url
+         );
+         image_url = touchIcon || SHOUT_DEFAULT_IMAGE;
+       }
+     }
+
+     // author: meta -> article:author -> json-ld
+     const author =
+       $('meta[name="author"]').attr("content") ||
+       $('meta[property="article:author"]').attr("content") ||
+       jsonLd.author ||
+       null;
+
+     // date: article:published_time -> meta date -> json-ld
+     const date =
+       $('meta[property="article:published_time"]').attr("content") ||
+       $('meta[name="date"]').attr("content") ||
+       jsonLd.date ||
+       null;
+
+     // extract body text for AI summarization (first ~1500 chars of content)
+     const domainBodyText = extractDomainSpecificBodyText($, url);
+     const bodyEl = $("article").length ? $("article") : $("main").length ? $("main") : $("body");
+     const genericBodyText = bodyEl
+       .clone()
+       .find("script, style, nav, header, footer, aside, .sidebar, .comments")
+       .remove()
+       .end()
+       .text();
+
+     const bodyText = cleanText(domainBodyText || genericBodyText)?.slice(0, 1500) || null;
+
+     return { title, description, image_url, author, date, bodyText };
+   } catch (err) {
+     console.error(`metadata extraction error for ${url}:`, err);
+     return { title: null, description: null, image_url: null, author: null, date: null, bodyText: null };
+   }
+ }
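
A minimal usage sketch of the extractor (illustrative caller, not part of the package; the URL is an example):

import { extractMetadata } from "nod-shout/dist/lib/metadata.js";

// fetch preview metadata for a link before saving it as a shout
const meta = await extractMetadata("https://example.com/some-post");
console.log(meta.title, meta.image_url, meta.author);
// meta.bodyText carries up to ~1500 chars of cleaned page text for AI summarization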
package/src/lib/skills.ts
@@ -0,0 +1,21 @@
+ import { readFileSync } from "fs";
+ import { join } from "path";
+
+ const SKILLS_DIR = join(process.cwd(), "skills");
+
+ export function loadSkill(name: string): string {
+   try {
+     return readFileSync(join(SKILLS_DIR, name, "SKILL.md"), "utf8");
+   } catch {
+     return "";
+   }
+ }
+
+ export function loadSkills(names: string[]): string {
+   return names
+     .map((name) => {
+       const content = loadSkill(name).trim();
+       return content ? `\n\n[skill:${name}]\n${content}` : "";
+     })
+     .join("");
+ }
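
A minimal sketch of how the skill loader might be used (hypothetical caller; assumes the skills/ directory ships with the package and sits under process.cwd()):

import { loadSkills } from "nod-shout/dist/lib/skills.js";

// append the shipped SKILL.md docs to a system prompt as tagged blocks
const skillContext = loadSkills(["link-summary", "tagging-and-routing"]);
const systemPrompt = `You are the shout curation agent.${skillContext}`;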
package/src/lib/supabase.ts
@@ -0,0 +1,12 @@
+ import { createClient } from "@supabase/supabase-js";
+
+ const supabaseUrl = process.env.SUPABASE_URL;
+ const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY || process.env.SUPABASE_SERVICE_KEY;
+
+ if (!supabaseUrl || !supabaseKey) {
+   throw new Error(
+     "missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY environment variables"
+   );
+ }
+
+ export const supabase = createClient(supabaseUrl, supabaseKey);
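
A minimal sketch of writing through the shared client (illustrative; the table name and columns are assumptions, not taken from the package schema):

import { supabase } from "nod-shout/dist/lib/supabase.js";

// the service-role key bypasses RLS, so writes should only happen after
// text has passed the content filters above
const { error } = await supabase
  .from("shouts") // hypothetical table name
  .insert({ url: "https://example.com", take: "example take" });
if (error) console.error("insert failed:", error.message);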