nod-shout 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -0
- package/TASK-AGENT-POSTS.md +112 -0
- package/assets/shout-default.svg +5 -0
- package/bin/shout +68 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +29 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/ai.d.ts +13 -0
- package/dist/lib/ai.d.ts.map +1 -0
- package/dist/lib/ai.js +135 -0
- package/dist/lib/ai.js.map +1 -0
- package/dist/lib/content-filter.d.ts +74 -0
- package/dist/lib/content-filter.d.ts.map +1 -0
- package/dist/lib/content-filter.js +188 -0
- package/dist/lib/content-filter.js.map +1 -0
- package/dist/lib/context-extractor.d.ts +39 -0
- package/dist/lib/context-extractor.d.ts.map +1 -0
- package/dist/lib/context-extractor.js +170 -0
- package/dist/lib/context-extractor.js.map +1 -0
- package/dist/lib/match-engine.d.ts +31 -0
- package/dist/lib/match-engine.d.ts.map +1 -0
- package/dist/lib/match-engine.js +322 -0
- package/dist/lib/match-engine.js.map +1 -0
- package/dist/lib/metadata.d.ts +7 -0
- package/dist/lib/metadata.d.ts.map +1 -0
- package/dist/lib/metadata.js +311 -0
- package/dist/lib/metadata.js.map +1 -0
- package/dist/lib/skills.d.ts +3 -0
- package/dist/lib/skills.d.ts.map +1 -0
- package/dist/lib/skills.js +20 -0
- package/dist/lib/skills.js.map +1 -0
- package/dist/lib/supabase.d.ts +2 -0
- package/dist/lib/supabase.d.ts.map +1 -0
- package/dist/lib/supabase.js +8 -0
- package/dist/lib/supabase.js.map +1 -0
- package/dist/tools/collections.d.ts +3 -0
- package/dist/tools/collections.d.ts.map +1 -0
- package/dist/tools/collections.js +142 -0
- package/dist/tools/collections.js.map +1 -0
- package/dist/tools/intros.d.ts +3 -0
- package/dist/tools/intros.d.ts.map +1 -0
- package/dist/tools/intros.js +483 -0
- package/dist/tools/intros.js.map +1 -0
- package/dist/tools/links.d.ts +3 -0
- package/dist/tools/links.d.ts.map +1 -0
- package/dist/tools/links.js +424 -0
- package/dist/tools/links.js.map +1 -0
- package/dist/tools/posts.d.ts +3 -0
- package/dist/tools/posts.d.ts.map +1 -0
- package/dist/tools/posts.js +212 -0
- package/dist/tools/posts.js.map +1 -0
- package/dist/tools/settings.d.ts +3 -0
- package/dist/tools/settings.d.ts.map +1 -0
- package/dist/tools/settings.js +45 -0
- package/dist/tools/settings.js.map +1 -0
- package/dist/tools/shout_agent_curate.d.ts +28 -0
- package/dist/tools/shout_agent_curate.d.ts.map +1 -0
- package/dist/tools/shout_agent_curate.js +80 -0
- package/dist/tools/shout_agent_curate.js.map +1 -0
- package/dist/tools/social.d.ts +3 -0
- package/dist/tools/social.d.ts.map +1 -0
- package/dist/tools/social.js +169 -0
- package/dist/tools/social.js.map +1 -0
- package/dist/types.d.ts +60 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +24 -0
- package/quick-test.ts +22 -0
- package/regenerate-summaries.ts +111 -0
- package/save-jeffries-shout.ts +38 -0
- package/save-openai-shout.ts +35 -0
- package/save-prcarly.ts +46 -0
- package/save-shout.ts +35 -0
- package/save-techcrunch-shout.ts +59 -0
- package/save-zdnet-shout.ts +36 -0
- package/skills/collection-routing/SKILL.md +34 -0
- package/skills/link-summary/SKILL.md +53 -0
- package/skills/tagging-and-routing/SKILL.md +54 -0
- package/src/index.ts +32 -0
- package/src/lib/ai.ts +166 -0
- package/src/lib/content-filter.ts +258 -0
- package/src/lib/metadata.ts +353 -0
- package/src/lib/skills.ts +21 -0
- package/src/lib/supabase.ts +12 -0
- package/src/tools/collections.ts +182 -0
- package/src/tools/links.ts +524 -0
- package/src/tools/posts.ts +264 -0
- package/src/tools/settings.ts +55 -0
- package/src/tools/shout_agent_curate.ts +95 -0
- package/src/tools/social.ts +206 -0
- package/src/types.ts +66 -0
- package/supabase/.temp/cli-latest +1 -0
- package/supabase/.temp/gotrue-version +1 -0
- package/supabase/.temp/pooler-url +1 -0
- package/supabase/.temp/postgres-version +1 -0
- package/supabase/.temp/project-ref +1 -0
- package/supabase/.temp/rest-version +1 -0
- package/supabase/.temp/storage-migration +1 -0
- package/supabase/.temp/storage-version +1 -0
- package/supabase/migrations/001_initial_schema.sql +147 -0
- package/supabase/migrations/20260317010000_decouple_profiles_from_auth.sql +9 -0
- package/supabase/migrations/20260317020000_agent_curation.sql +10 -0
- package/supabase/migrations/20260320000000_agent_posts.sql +41 -0
- package/supabase/migrations/20260320120000_fix_draft_fk.sql +2 -0
- package/supabase/migrations/20260320130000_fix_identity.sql +17 -0
- package/supabase/migrations/20260320170000_intros.sql +118 -0
- package/test-model-comparison.ts +89 -0
- package/tsconfig.json +19 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content filter for shout posts — strips PII, proprietary data,
|
|
3
|
+
* and sensitive information before anything touches the public page.
|
|
4
|
+
*
|
|
5
|
+
* Two layers:
|
|
6
|
+
* 1. Regex — catches format-based PII (emails, phones, keys, etc.)
|
|
7
|
+
* 2. LLM — catches context-based leaks ("our revenue is...", "client X told me...")
|
|
8
|
+
*
|
|
9
|
+
* Runs on all text fields (take, summary, title, description)
|
|
10
|
+
* before insert into supabase.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
|
|
14
|
+
|
|
15
|
+
// Regex patterns for common PII
|
|
16
|
+
const PII_PATTERNS: { pattern: RegExp; replacement: string; label: string }[] = [
|
|
17
|
+
// Email addresses
|
|
18
|
+
{ pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, replacement: '[email removed]', label: 'email' },
|
|
19
|
+
// Phone numbers (US formats)
|
|
20
|
+
{ pattern: /(\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}/g, replacement: '[phone removed]', label: 'phone' },
|
|
21
|
+
// SSN
|
|
22
|
+
{ pattern: /\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b/g, replacement: '[ssn removed]', label: 'ssn' },
|
|
23
|
+
// Credit card numbers (basic)
|
|
24
|
+
{ pattern: /\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b/g, replacement: '[card removed]', label: 'credit_card' },
|
|
25
|
+
// API keys / tokens (common prefixes)
|
|
26
|
+
{ pattern: /\b(sk-[a-zA-Z0-9_-]{20,}|sk-ant-[a-zA-Z0-9_-]{20,}|sk-proj-[a-zA-Z0-9_-]{20,}|ghp_[a-zA-Z0-9]{36,}|gho_[a-zA-Z0-9]{36,}|xoxb-[a-zA-Z0-9-]+|xoxp-[a-zA-Z0-9-]+|AKIA[A-Z0-9]{16})\b/g, replacement: '[api_key removed]', label: 'api_key' },
|
|
27
|
+
// Passwords in context
|
|
28
|
+
{ pattern: /(?:password|passwd|pwd|secret|token)\s*[:=]\s*\S+/gi, replacement: '[credential removed]', label: 'password' },
|
|
29
|
+
// IP addresses (internal)
|
|
30
|
+
{ pattern: /\b(?:10|172\.(?:1[6-9]|2\d|3[01])|192\.168)\.\d{1,3}\.\d{1,3}\b/g, replacement: '[internal_ip removed]', label: 'internal_ip' },
|
|
31
|
+
// Street addresses (basic US pattern)
|
|
32
|
+
{ pattern: /\b\d{1,5}\s+[A-Z][a-zA-Z]*\s+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Ln|Lane|Rd|Road|Way|Ct|Court|Pl|Place)\.?\b/gi, replacement: '[address removed]', label: 'address' },
|
|
33
|
+
];
|
|
34
|
+
|
|
35
|
+
// Financial/proprietary data patterns
|
|
36
|
+
const PROPRIETARY_PATTERNS: { pattern: RegExp; replacement: string; label: string }[] = [
|
|
37
|
+
// Dollar amounts over $999 (likely sensitive business figures)
|
|
38
|
+
{ pattern: /\$\d{1,3}(?:,\d{3})+(?:\.\d{2})?/g, replacement: '[amount removed]', label: 'large_dollar' },
|
|
39
|
+
// Revenue/ARR/MRR mentions with numbers
|
|
40
|
+
{ pattern: /(?:revenue|arr|mrr|profit|loss|burn|runway|valuation|cap table|equity)\s*(?:of|is|was|:)?\s*\$?[\d,.]+[kKmMbB]?/gi, replacement: '[financial_data removed]', label: 'financial' },
|
|
41
|
+
// Contract/deal terms
|
|
42
|
+
{ pattern: /(?:contract|agreement|nda|term sheet|sow|msa)\s+(?:with|for|from)\s+[A-Z][a-zA-Z\s]+(?:Inc|LLC|Ltd|Corp|Co)\.?/gi, replacement: '[contract_info removed]', label: 'contract' },
|
|
43
|
+
];
|
|
44
|
+
|
|
45
|
+
export type FilterResult = {
|
|
46
|
+
text: string;
|
|
47
|
+
filtered: boolean;
|
|
48
|
+
removals: string[];
|
|
49
|
+
};
|
|
50
|
+
|
|
51
|
+
export function filterPII(text: string | null | undefined): FilterResult {
|
|
52
|
+
if (!text) return { text: text || '', filtered: false, removals: [] };
|
|
53
|
+
|
|
54
|
+
let result = text;
|
|
55
|
+
const removals: string[] = [];
|
|
56
|
+
|
|
57
|
+
for (const { pattern, replacement, label } of [...PII_PATTERNS, ...PROPRIETARY_PATTERNS]) {
|
|
58
|
+
const matches = result.match(pattern);
|
|
59
|
+
if (matches) {
|
|
60
|
+
removals.push(`${label} (${matches.length}x)`);
|
|
61
|
+
result = result.replace(pattern, replacement);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
return {
|
|
66
|
+
text: result,
|
|
67
|
+
filtered: removals.length > 0,
|
|
68
|
+
removals,
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Filter all text fields on a shout before saving.
|
|
74
|
+
* Returns the filtered fields and a report of what was removed.
|
|
75
|
+
*/
|
|
76
|
+
export function filterShoutContent(fields: {
|
|
77
|
+
take?: string | null;
|
|
78
|
+
summary?: string | null;
|
|
79
|
+
title?: string | null;
|
|
80
|
+
description?: string | null;
|
|
81
|
+
}): {
|
|
82
|
+
take: string | null;
|
|
83
|
+
summary: string | null;
|
|
84
|
+
title: string | null;
|
|
85
|
+
description: string | null;
|
|
86
|
+
filterReport: string | null;
|
|
87
|
+
} {
|
|
88
|
+
const takeResult = filterPII(fields.take);
|
|
89
|
+
const summaryResult = filterPII(fields.summary);
|
|
90
|
+
const titleResult = filterPII(fields.title);
|
|
91
|
+
const descResult = filterPII(fields.description);
|
|
92
|
+
|
|
93
|
+
const allRemovals = [
|
|
94
|
+
...takeResult.removals.map(r => `take: ${r}`),
|
|
95
|
+
...summaryResult.removals.map(r => `summary: ${r}`),
|
|
96
|
+
...titleResult.removals.map(r => `title: ${r}`),
|
|
97
|
+
...descResult.removals.map(r => `description: ${r}`),
|
|
98
|
+
];
|
|
99
|
+
|
|
100
|
+
return {
|
|
101
|
+
take: takeResult.text || null,
|
|
102
|
+
summary: summaryResult.text || null,
|
|
103
|
+
title: titleResult.text || null,
|
|
104
|
+
description: descResult.text || null,
|
|
105
|
+
filterReport: allRemovals.length > 0 ? `PII filter removed: ${allRemovals.join('; ')}` : null,
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
 * Layer 2: LLM-based content safety check.
 * Catches context-dependent PII/proprietary leaks that regex misses.
 *
 * Returns { safe: true } for clean content, or { safe: false, reason, redacted }
 * for content that needs redaction.
 *
 * Designed for LOW false positives:
 * - Public info (news, articles, open-source projects) = safe
 * - General opinions and commentary = safe
 * - User's own professional interests/skills = safe
 * - Only flags SPECIFIC private data about identifiable people/companies
 */

// Verdict from the LLM pass. redactedText is reserved for future use — the
// current implementation never populates it (callers block, not redact).
type LLMFilterResult = {
  safe: boolean;
  reason: string | null;
  redactedText: string | null;
};

// System prompt for the safety check. The model must answer with exactly
// "SAFE" or "BLOCK: <reason>" — llmContentFilter parses only those prefixes
// and treats anything else as safe (fail-open). Do not edit casually: the
// ALLOW/BLOCK examples are tuned to keep false positives low.
const LLM_FILTER_PROMPT = `You are a content safety filter for a public social profile page (like Twitter for AI agents). Your job is to check if text contains private/confidential information that should NOT be posted publicly.

ALLOW (these are safe — do NOT flag):
- Public news, articles, blog posts, open-source projects
- General opinions, commentary, takes on public topics
- Professional interests, skills, industry knowledge
- Publicly known company info (funding rounds reported in press, public products)
- Names of public figures, companies, or products mentioned in public contexts
- Dollar amounts from public sources (article says "raised $10M" = fine)
- Technical discussions, code patterns, architecture opinions
- Meta-commentary ABOUT data types or categories (e.g. "this tool catches revenue leaks" is talking about the concept, not leaking actual revenue)
- Descriptions of what a tool filters, blocks, or protects against — mentioning "PII", "revenue", "deal terms" as CATEGORIES is not the same as sharing actual PII or revenue figures
- Product announcements, feature descriptions, build logs

BLOCK (these contain private data — flag these):
- Someone's ACTUAL personal contact info shared in private context (not from a public webpage)
- SPECIFIC internal business metrics with real numbers not publicly disclosed ("our revenue is $2.3M", "client pays $45k/month")
- Details from private conversations, meetings, or emails that identify SPECIFIC people + their SPECIFIC sensitive info
- ACTUAL client/customer names paired with ACTUAL deal terms, pricing, or contract details
- Health, legal, or financial information about SPECIFIC private individuals with REAL identifying details
- Login credentials, internal URLs, private API endpoints

KEY DISTINCTION: talking about categories of sensitive data ("catches things like revenue figures") is NOT the same as sharing actual sensitive data ("our revenue is $2.3M"). The first is safe. The second is not.

Respond with EXACTLY one of:
SAFE
or
BLOCK: [one-sentence reason]

Do not explain further. Do not hedge. When in doubt, default to SAFE. Only block when you see ACTUAL specific private data, not descriptions of data types.`;
|
|
159
|
+
|
|
160
|
+
export async function llmContentFilter(text: string): Promise<LLMFilterResult> {
|
|
161
|
+
if (!OPENAI_API_KEY || !text || text.length < 20) {
|
|
162
|
+
return { safe: true, reason: null, redactedText: null };
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
try {
|
|
166
|
+
const response = await fetch('https://api.openai.com/v1/chat/completions', {
|
|
167
|
+
method: 'POST',
|
|
168
|
+
headers: {
|
|
169
|
+
'Content-Type': 'application/json',
|
|
170
|
+
'Authorization': `Bearer ${OPENAI_API_KEY}`,
|
|
171
|
+
},
|
|
172
|
+
body: JSON.stringify({
|
|
173
|
+
model: 'gpt-4o-mini',
|
|
174
|
+
messages: [
|
|
175
|
+
{ role: 'system', content: LLM_FILTER_PROMPT },
|
|
176
|
+
{ role: 'user', content: `Check this text:\n\n${text}` },
|
|
177
|
+
],
|
|
178
|
+
temperature: 0,
|
|
179
|
+
max_tokens: 100,
|
|
180
|
+
}),
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
if (!response.ok) {
|
|
184
|
+
console.error(`[content-filter] LLM check failed: ${response.status}`);
|
|
185
|
+
// fail open — if LLM is down, trust the regex layer
|
|
186
|
+
return { safe: true, reason: null, redactedText: null };
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
const data = await response.json() as any;
|
|
190
|
+
const reply = (data.choices?.[0]?.message?.content || '').trim();
|
|
191
|
+
|
|
192
|
+
if (reply.startsWith('SAFE')) {
|
|
193
|
+
return { safe: true, reason: null, redactedText: null };
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
if (reply.startsWith('BLOCK')) {
|
|
197
|
+
const reason = reply.replace(/^BLOCK:\s*/, '');
|
|
198
|
+
return { safe: false, reason, redactedText: null };
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// unclear response — default safe (avoid false positives)
|
|
202
|
+
return { safe: true, reason: null, redactedText: null };
|
|
203
|
+
} catch (err) {
|
|
204
|
+
console.error('[content-filter] LLM filter error:', err);
|
|
205
|
+
// fail open
|
|
206
|
+
return { safe: true, reason: null, redactedText: null };
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
/**
|
|
211
|
+
* Full content filter: regex + LLM.
|
|
212
|
+
* Use this for user-facing text (takes, posts, commentary).
|
|
213
|
+
* Skip LLM for metadata from fetched web pages (titles, descriptions) — those are public by definition.
|
|
214
|
+
*/
|
|
215
|
+
export async function filterShoutContentFull(fields: {
|
|
216
|
+
take?: string | null;
|
|
217
|
+
summary?: string | null;
|
|
218
|
+
title?: string | null;
|
|
219
|
+
description?: string | null;
|
|
220
|
+
skipLLMForMetadata?: boolean;
|
|
221
|
+
}): Promise<{
|
|
222
|
+
take: string | null;
|
|
223
|
+
summary: string | null;
|
|
224
|
+
title: string | null;
|
|
225
|
+
description: string | null;
|
|
226
|
+
filterReport: string | null;
|
|
227
|
+
blocked: boolean;
|
|
228
|
+
blockReason: string | null;
|
|
229
|
+
}> {
|
|
230
|
+
// layer 1: regex
|
|
231
|
+
const regexResult = filterShoutContent(fields);
|
|
232
|
+
|
|
233
|
+
// layer 2: LLM check on user-generated text (take, summary)
|
|
234
|
+
// skip for title/description if they came from fetched webpage metadata
|
|
235
|
+
const textsToCheck = [
|
|
236
|
+
regexResult.take,
|
|
237
|
+
regexResult.summary,
|
|
238
|
+
...(fields.skipLLMForMetadata ? [] : [regexResult.title, regexResult.description]),
|
|
239
|
+
].filter(Boolean).join('\n\n');
|
|
240
|
+
|
|
241
|
+
if (textsToCheck.length > 20) {
|
|
242
|
+
const llmResult = await llmContentFilter(textsToCheck);
|
|
243
|
+
if (!llmResult.safe) {
|
|
244
|
+
return {
|
|
245
|
+
...regexResult,
|
|
246
|
+
blocked: true,
|
|
247
|
+
blockReason: llmResult.reason || 'LLM filter flagged content as containing private data',
|
|
248
|
+
filterReport: [regexResult.filterReport, `LLM blocked: ${llmResult.reason}`].filter(Boolean).join('; '),
|
|
249
|
+
};
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
return {
|
|
254
|
+
...regexResult,
|
|
255
|
+
blocked: false,
|
|
256
|
+
blockReason: null,
|
|
257
|
+
};
|
|
258
|
+
}
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import type { PageMetadata } from "../types.js";
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* resolve a potentially relative url against a base url.
|
|
6
|
+
*/
|
|
7
|
+
function resolveUrl(candidate: string | null | undefined, baseUrl: string): string | null {
|
|
8
|
+
if (!candidate) return null;
|
|
9
|
+
try {
|
|
10
|
+
return new URL(candidate, baseUrl).href;
|
|
11
|
+
} catch {
|
|
12
|
+
return null;
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
function cleanText(value: string | null | undefined): string | null {
|
|
17
|
+
if (!value) return null;
|
|
18
|
+
const cleaned = value.replace(/\s+/g, " ").trim();
|
|
19
|
+
return cleaned || null;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function extractDomainSpecificBodyText($: cheerio.CheerioAPI, url: string): string | null {
|
|
23
|
+
if (url.includes("github.com")) {
|
|
24
|
+
return cleanText(
|
|
25
|
+
[
|
|
26
|
+
$("article.markdown-body").text(),
|
|
27
|
+
$("div[data-testid='repository-sidebar']").text(),
|
|
28
|
+
$("div.Layout-sidebar").text(),
|
|
29
|
+
].join(" ")
|
|
30
|
+
)?.slice(0, 1500) || null;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
if (url.includes("anthropic.com")) {
|
|
34
|
+
return cleanText(
|
|
35
|
+
[
|
|
36
|
+
$("article").text(),
|
|
37
|
+
$("main").text(),
|
|
38
|
+
$("[data-testid='article-content']").text(),
|
|
39
|
+
].join(" ")
|
|
40
|
+
)?.slice(0, 1500) || null;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
if (url.includes("simonwillison.net")) {
|
|
44
|
+
return cleanText(
|
|
45
|
+
[
|
|
46
|
+
$("article").text(),
|
|
47
|
+
$(".entry-content").text(),
|
|
48
|
+
$(".hentry").text(),
|
|
49
|
+
].join(" ")
|
|
50
|
+
)?.slice(0, 1500) || null;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
return null;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
 * Try to extract structured data from JSON-LD <script> tags.
 * Looks for Article, BlogPosting, NewsArticle, WebPage, TechArticle types.
 * First-match-wins per field: once a field is set, later script tags and
 * later @graph entries cannot overwrite it. Malformed JSON-LD is skipped.
 */
function extractJsonLd($: cheerio.CheerioAPI): {
  title: string | null;
  description: string | null;
  image: string | null;
  author: string | null;
  date: string | null;
} {
  // inline 'as' casts give each field the string|null type while keeping a literal initializer
  const result = { title: null as string | null, description: null as string | null, image: null as string | null, author: null as string | null, date: null as string | null };
  const targetTypes = ["Article", "BlogPosting", "NewsArticle", "WebPage", "TechArticle"];

  $('script[type="application/ld+json"]').each((_, el) => {
    try {
      const raw = $(el).html();
      if (!raw) return;
      const parsed = JSON.parse(raw);

      // handle both single objects and arrays
      const items = Array.isArray(parsed) ? parsed : [parsed];

      for (const item of items) {
        // also check @graph arrays (common pattern); the wrapper itself is
        // kept as a candidate in case it carries the fields directly
        const candidates = item["@graph"] ? [...item["@graph"], item] : [item];
        for (const obj of candidates) {
          // @type may itself be a single string or an array of strings
          const type = obj["@type"];
          const types = Array.isArray(type) ? type : [type];
          if (!types.some((t: string) => targetTypes.includes(t))) continue;

          // headline is preferred over name for the title
          if (!result.title && obj.headline) result.title = obj.headline;
          if (!result.title && obj.name) result.title = obj.name;
          if (!result.description && obj.description) result.description = obj.description;

          // image can be string, object, or array
          if (!result.image) {
            const img = obj.image;
            if (typeof img === "string") result.image = img;
            else if (Array.isArray(img) && img.length > 0) {
              // array entries are either url strings or ImageObject-like objects
              result.image = typeof img[0] === "string" ? img[0] : img[0]?.url || null;
            } else if (img?.url) {
              result.image = img.url;
            }
          }

          // author can be string, object, or array
          if (!result.author) {
            const auth = obj.author;
            if (typeof auth === "string") result.author = auth;
            else if (Array.isArray(auth) && auth.length > 0) {
              result.author = typeof auth[0] === "string" ? auth[0] : auth[0]?.name || null;
            } else if (auth?.name) {
              result.author = auth.name;
            }
          }

          if (!result.date) {
            result.date = obj.datePublished || obj.dateCreated || null;
          }
        }
      }
    } catch {
      // malformed json-ld, skip this script tag and keep scanning
    }
  });

  return result;
}
|
|
125
|
+
|
|
126
|
+
/** patterns in image urls that suggest a generic default rather than real content */
const DEFAULT_IMAGE_PATTERNS = /\b(default|placeholder|logo|og-default|brand|fallback|generic|site-image)\b/i;
// branded fallback image served from supabase storage, used when no usable
// image can be found anywhere on the page
const SHOUT_DEFAULT_IMAGE = "https://ooykzbkcquvreeheaijy.supabase.co/storage/v1/object/public/public/shout/shout-default.svg";
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* try to find the first real content image from the page body.
|
|
132
|
+
* skips icons, avatars, tracking pixels, and tiny images.
|
|
133
|
+
*/
|
|
134
|
+
function extractFirstContentImage($: cheerio.CheerioAPI, baseUrl: string): string | null {
|
|
135
|
+
const skipSrcPatterns = /\b(avatar|icon|logo|emoji|badge|button|pixel|track|beacon|spacer|blank)\b/i;
|
|
136
|
+
|
|
137
|
+
// prefer images inside content containers, fall back to body
|
|
138
|
+
const contentSelectors = [
|
|
139
|
+
"article",
|
|
140
|
+
"main",
|
|
141
|
+
".post-content",
|
|
142
|
+
".entry-content",
|
|
143
|
+
".post-body",
|
|
144
|
+
".article-body",
|
|
145
|
+
];
|
|
146
|
+
|
|
147
|
+
let container = null;
|
|
148
|
+
for (const sel of contentSelectors) {
|
|
149
|
+
const el = $(sel);
|
|
150
|
+
if (el.length) {
|
|
151
|
+
container = el.first();
|
|
152
|
+
break;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
if (!container) container = $("body");
|
|
156
|
+
if (!container.length) return null;
|
|
157
|
+
|
|
158
|
+
const imgs = container.find("img");
|
|
159
|
+
for (let i = 0; i < imgs.length; i++) {
|
|
160
|
+
const img = $(imgs[i]);
|
|
161
|
+
const src = img.attr("src") || img.attr("data-src") || null;
|
|
162
|
+
if (!src) continue;
|
|
163
|
+
|
|
164
|
+
// skip tracking pixels and tiny images by attribute
|
|
165
|
+
const w = parseInt(img.attr("width") || "", 10);
|
|
166
|
+
const h = parseInt(img.attr("height") || "", 10);
|
|
167
|
+
if ((w > 0 && w < 100) || (h > 0 && h < 100)) continue;
|
|
168
|
+
if (w === 1 || h === 1) continue;
|
|
169
|
+
|
|
170
|
+
// skip by src pattern
|
|
171
|
+
if (skipSrcPatterns.test(src)) continue;
|
|
172
|
+
|
|
173
|
+
const resolved = resolveUrl(src, baseUrl);
|
|
174
|
+
if (resolved) return resolved;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
return null;
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
/**
 * Detect if a url is a twitter/x.com tweet and extract metadata via the
 * fxtwitter api (no auth needed). Returns null for non-tweet urls and on any
 * fetch/parse failure so the caller can fall through to generic extraction.
 */
async function extractTwitterMetadata(url: string): Promise<PageMetadata | null> {
  // matches both twitter.com and x.com status urls; captures username + tweet id
  const tweetMatch = url.match(/(?:twitter\.com|x\.com)\/(\w+)\/status\/(\d+)/);
  if (!tweetMatch) return null;

  const [, username, tweetId] = tweetMatch;
  try {
    const res = await fetch(`https://api.fxtwitter.com/${username}/status/${tweetId}`, {
      signal: AbortSignal.timeout(10000),
    });
    if (!res.ok) return null;

    // NOTE(review): fxtwitter response shape is untyped here; the optional
    // chains below assume its documented tweet/media/article structure
    const data = await res.json();
    const tweet = data.tweet;
    if (!tweet) return null;

    const title = tweet.text || tweet.raw_text?.text || null;
    const description = tweet.article?.preview_text || null;
    const image_url =
      tweet.article?.cover_media?.media_info?.__typename === "ApiImage"
        ? null // article cover doesn't have direct url in this path
        : tweet.media?.photos?.[0]?.url ||
          tweet.media?.all?.[0]?.url ||
          tweet.author?.avatar_url ||
          null;
    const author = tweet.author?.name ? `${tweet.author.name} (@${tweet.author.screen_name})` : null;
    const date = tweet.created_at || null;

    // use tweet text as body content for summarization (capped like the
    // generic extractor's 1500-char excerpt)
    const bodyText = (tweet.text || tweet.raw_text?.text || "").slice(0, 1500) || null;

    // if it's an article tweet, prefer article title; the plain tweet text
    // then serves as the description fallback
    if (tweet.article?.title) {
      return {
        title: tweet.article.title,
        description: description || title,
        image_url,
        author,
        date,
        bodyText,
      };
    }

    return { title, description, image_url, author, date, bodyText };
  } catch (err) {
    console.error(`fxtwitter extraction error for ${url}:`, err);
    return null;
  }
}
|
|
231
|
+
|
|
232
|
+
/**
 * fetch a url and extract og/meta tags for link preview data.
 * falls back through: twitter api -> og -> twitter -> json-ld -> html tags -> domain-specific selectors.
 * Never throws: on fetch failure or any error an all-null PageMetadata is returned.
 */
export async function extractMetadata(url: string): Promise<PageMetadata> {
  // try twitter/x.com specific extraction first
  const twitterResult = await extractTwitterMetadata(url);
  if (twitterResult) return twitterResult;

  try {
    const response = await fetch(url, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (compatible; NodShout/0.1; +https://nod.social)",
        Accept: "text/html,application/xhtml+xml",
      },
      signal: AbortSignal.timeout(10000), // don't hang on slow hosts
    });

    if (!response.ok) {
      console.error(`fetch failed for ${url}: ${response.status}`);
      return { title: null, description: null, image_url: null, author: null, date: null, bodyText: null };
    }

    const html = await response.text();
    const $ = cheerio.load(html);

    // extract json-ld structured data once; reused for several fields below
    const jsonLd = extractJsonLd($);

    // title: og -> twitter -> json-ld -> <title> -> first h1 -> domain-specific
    let title =
      $('meta[property="og:title"]').attr("content") ||
      $('meta[name="twitter:title"]').attr("content") ||
      jsonLd.title ||
      $("title").text().trim() ||
      $("h1").first().text().trim() ||
      null;

    // simon willison's blog: specific fallback
    if (!title && url.includes("simonwillison")) {
      title =
        $("h1.entry-title").text().trim() ||
        $(".entry-title").text().trim() ||
        $("article h1").text().trim() ||
        $(".hentry h1").text().trim() ||
        null;
    }

    // description: og -> standard meta -> twitter -> json-ld
    const description =
      $('meta[property="og:description"]').attr("content") ||
      $('meta[name="description"]').attr("content") ||
      $('meta[name="twitter:description"]').attr("content") ||
      jsonLd.description ||
      null;

    // image: og -> twitter:image -> json-ld -> apple-touch-icon -> favicon
    const rawImage =
      $('meta[property="og:image"]').attr("content") ||
      $('meta[name="twitter:image"]').attr("content") ||
      $('meta[name="twitter:image:src"]').attr("content") ||
      jsonLd.image ||
      $('link[rel="apple-touch-icon"]').attr("href") ||
      $('link[rel="apple-touch-icon-precomposed"]').attr("href") ||
      $('link[rel="icon"][type="image/png"]').attr("href") ||
      $('link[rel="icon"]').attr("href") ||
      null;

    // resolve relative image urls against the page url
    let image_url = resolveUrl(rawImage, url);

    // if no image found or it looks like a generic default, try first content image
    // then apple-touch-icon, then shout branded default.
    // note: image_url is a valid absolute URL here (resolveUrl guarantees it),
    // so new URL() below cannot throw.
    if (!image_url || DEFAULT_IMAGE_PATTERNS.test(new URL(image_url).pathname)) {
      const contentImage = extractFirstContentImage($, url);
      if (contentImage) {
        image_url = contentImage;
      } else {
        // try apple-touch-icon (usually 152x152+, much better than tiny favicons)
        const touchIcon = resolveUrl(
          $('link[rel="apple-touch-icon"]').attr("href") ||
            $('link[rel="apple-touch-icon-precomposed"]').attr("href") ||
            null,
          url
        );
        image_url = touchIcon || SHOUT_DEFAULT_IMAGE;
      }
    }

    // author: meta -> article:author -> json-ld
    const author =
      $('meta[name="author"]').attr("content") ||
      $('meta[property="article:author"]').attr("content") ||
      jsonLd.author ||
      null;

    // date: article:published_time -> meta date -> json-ld
    // NOTE(review): value is passed through as-is — format varies by site, so
    // consumers should parse defensively
    const date =
      $('meta[property="article:published_time"]').attr("content") ||
      $('meta[name="date"]').attr("content") ||
      jsonLd.date ||
      null;

    // extract body text for AI summarization (first ~1500 chars of content);
    // domain-specific selectors win over the generic article/main/body scrape
    const domainBodyText = extractDomainSpecificBodyText($, url);
    const bodyEl = $("article").length ? $("article") : $("main").length ? $("main") : $("body");
    // clone before stripping chrome elements so the parsed DOM stays intact
    const genericBodyText = bodyEl
      .clone()
      .find("script, style, nav, header, footer, aside, .sidebar, .comments")
      .remove()
      .end()
      .text();

    const bodyText = cleanText(domainBodyText || genericBodyText)?.slice(0, 1500) || null;

    return { title, description, image_url, author, date, bodyText };
  } catch (err) {
    console.error(`metadata extraction error for ${url}:`, err);
    return { title: null, description: null, image_url: null, author: null, date: null, bodyText: null };
  }
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { readFileSync } from "fs";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
|
|
4
|
+
const SKILLS_DIR = join(process.cwd(), "skills");
|
|
5
|
+
|
|
6
|
+
export function loadSkill(name: string): string {
|
|
7
|
+
try {
|
|
8
|
+
return readFileSync(join(SKILLS_DIR, name, "SKILL.md"), "utf8");
|
|
9
|
+
} catch {
|
|
10
|
+
return "";
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
export function loadSkills(names: string[]): string {
|
|
15
|
+
return names
|
|
16
|
+
.map((name) => {
|
|
17
|
+
const content = loadSkill(name).trim();
|
|
18
|
+
return content ? `\n\n[skill:${name}]\n${content}` : "";
|
|
19
|
+
})
|
|
20
|
+
.join("");
|
|
21
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { createClient } from "@supabase/supabase-js";
|
|
2
|
+
|
|
3
|
+
// connection settings come from the environment; the key falls back to the
// older SUPABASE_SERVICE_KEY name for backward compatibility
// NOTE(review): this is a service-role key, which presumably bypasses row
// level security — keep this module server-side only; verify before exposing
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseKey = process.env.SUPABASE_SERVICE_ROLE_KEY || process.env.SUPABASE_SERVICE_KEY;

// fail fast at import time rather than on the first query
if (!supabaseUrl || !supabaseKey) {
  throw new Error(
    "missing SUPABASE_URL or SUPABASE_SERVICE_ROLE_KEY environment variables"
  );
}

// shared client used by every tool in this package
export const supabase = createClient(supabaseUrl, supabaseKey);
|