@morphika/andami 0.5.11 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/app/robots.ts CHANGED
@@ -1,54 +1,73 @@
- import type { MetadataRoute } from "next";
- import { getSiteConfig } from "../lib/config";
-
- const cfg = getSiteConfig();
-
- /**
-  * robots.txt — Controls crawler access and rate.
-  *
-  * Crawl-delay (seconds between requests) is honoured by Bing, Yandex, Baidu
-  * and most well-behaved bots. Googlebot ignores it but respects the rate
-  * configured in Search Console. The 10-second delay drastically reduces
-  * serverless CPU usage from bot traffic on Hobby-tier hosting.
-  *
-  * Aggressive AI scrapers (GPTBot, CCBot, etc.) are blocked entirely.
-  */
- export default function robots(): MetadataRoute.Robots {
-   return {
-     rules: [
-       // Block known AI scrapers / aggressive bots
-       {
-         userAgent: "GPTBot",
-         disallow: ["/"],
-       },
-       {
-         userAgent: "CCBot",
-         disallow: ["/"],
-       },
-       {
-         userAgent: "anthropic-ai",
-         disallow: ["/"],
-       },
-       {
-         userAgent: "ClaudeBot",
-         disallow: ["/"],
-       },
-       {
-         userAgent: "Bytespider",
-         disallow: ["/"],
-       },
-       {
-         userAgent: "PetalBot",
-         disallow: ["/"],
-       },
-       // Default: allow with crawl delay
-       {
-         userAgent: "*",
-         allow: "/",
-         disallow: ["/admin/", "/studio/", "/api/admin/", "/api/"],
-         crawlDelay: 10,
-       },
-     ],
-     sitemap: `${cfg.domain}/sitemap.xml`,
-   };
- }
+ import type { MetadataRoute } from "next";
+ import { getSiteConfig } from "../lib/config";
+
+ const cfg = getSiteConfig();
+
+ /**
+  * robots.txt — Controls crawler access and rate.
+  *
+  * Two-tier AI bot strategy:
+  *
+  * 1. TRAINING bots (block) crawl pages to train LLMs on the content.
+  *    Blocking them protects IP from being absorbed into model weights.
+  *    Examples: GPTBot, CCBot, Google-Extended, ClaudeBot, anthropic-ai.
+  *
+  * 2. CITATION / ON-DEMAND bots (allow) — fetch URLs in real time when a
+  *    user asks an AI assistant a question. Blocking them means we lose
+  *    citation opportunities in ChatGPT, Perplexity, Claude responses.
+  *    Examples: ChatGPT-User, Perplexity-User, Claude-User, OAI-SearchBot.
+  *
+  * The distinction matters because we want LLMs to *cite* us, not *learn
+  * from* us. Each bot announces its own User-Agent string, so per-UA rules
+  * give granular control over both groups.
+  *
+  * Crawl-delay (seconds between requests) is honoured by Bing, Yandex,
+  * Baidu and most well-behaved bots. Googlebot ignores it but respects the
+  * rate configured in Search Console.
+  *
+  * Last reviewed: 2026-05-15.
+  */
+ export default function robots(): MetadataRoute.Robots {
+   return {
+     rules: [
+       // ─────────────────────────────────────────────
+       // TRAINING BOTS — explicitly blocked
+       // ─────────────────────────────────────────────
+       { userAgent: "GPTBot", disallow: ["/"] }, // OpenAI training crawler
+       { userAgent: "CCBot", disallow: ["/"] }, // Common Crawl (feeds most LLMs)
+       { userAgent: "Google-Extended", disallow: ["/"] }, // Google's AI training (separate from Googlebot)
+       { userAgent: "anthropic-ai", disallow: ["/"] }, // Older Anthropic training UA
+       { userAgent: "ClaudeBot", disallow: ["/"] }, // Anthropic Claude training
+       { userAgent: "FacebookBot", disallow: ["/"] }, // Meta/LLaMA training
+       { userAgent: "Applebot-Extended", disallow: ["/"] }, // Apple Intelligence training (separate from Applebot which indexes for Siri/Spotlight — that one is allowed implicitly)
+       { userAgent: "Bytespider", disallow: ["/"] }, // ByteDance / TikTok training crawler
+       { userAgent: "PetalBot", disallow: ["/"] }, // Huawei / Petal Search aggressive crawler
+
+       // ─────────────────────────────────────────────
+       // CITATION / ON-DEMAND BOTS — explicitly allowed
+       // (they're allowed by default anyway, but we list
+       // them to make the policy explicit and to override
+       // any future changes to the catch-all rule.)
+       // ─────────────────────────────────────────────
+       { userAgent: "ChatGPT-User", allow: "/" }, // ChatGPT user-triggered fetches
+       { userAgent: "OAI-SearchBot", allow: "/" }, // OpenAI search index
+       { userAgent: "PerplexityBot", allow: "/" }, // Perplexity index
+       { userAgent: "Perplexity-User", allow: "/" }, // Perplexity user-triggered fetches
+       { userAgent: "Claude-User", allow: "/" }, // Claude user-triggered fetches
+       { userAgent: "Claude-Web", allow: "/" }, // Claude web search
+       { userAgent: "Google-CloudVertexBot", allow: "/" }, // Vertex AI on-demand fetch
+       { userAgent: "YouBot", allow: "/" }, // You.com AI search
+
+       // ─────────────────────────────────────────────
+       // DEFAULT — Googlebot, Bingbot, and everyone else
+       // ─────────────────────────────────────────────
+       {
+         userAgent: "*",
+         allow: "/",
+         disallow: ["/admin/", "/studio/", "/api/admin/", "/api/"],
+         crawlDelay: 10,
+       },
+     ],
+     sitemap: `${cfg.domain}/sitemap.xml`,
+   };
+ }
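
For reference, a rough sketch of the robots.txt this route now serializes, assuming cfg.domain resolves to the placeholder https://example.com. Next.js emits one User-Agent group per rule; only three groups are shown here, and the remaining per-bot entries follow the same pattern:

User-Agent: GPTBot
Disallow: /

User-Agent: ChatGPT-User
Allow: /

User-Agent: *
Allow: /
Disallow: /admin/
Disallow: /studio/
Disallow: /api/admin/
Disallow: /api/
Crawl-delay: 10

Sitemap: https://example.com/sitemap.xml

Because a crawler that matches an explicit group follows only that group, the allowed citation bots are not bound by the catch-all's disallow list or crawl delay; that is standard robots.txt matching behaviour, not something specific to this package.
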
package/app/sitemap.ts CHANGED
@@ -1,48 +1,85 @@
- import type { MetadataRoute } from "next";
- import { client } from "../lib/sanity/client";
- import { allPageSlugsQuery, allProjectSlugsQuery } from "../lib/sanity/queries";
- import { getSiteConfig } from "../lib/config";
-
- const cfg = getSiteConfig();
-
- export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
-   const baseUrl = cfg.domain;
-
-   // Fetch all published slugs
-   const [pageSlugs, projectSlugs] = await Promise.all([
-     client.fetch<string[]>(allPageSlugsQuery).catch(() => [] as string[]),
-     client.fetch<string[]>(allProjectSlugsQuery).catch(() => [] as string[]),
-   ]);
-
-   // Homepage
-   const routes: MetadataRoute.Sitemap = [
-     {
-       url: baseUrl,
-       lastModified: new Date(),
-       changeFrequency: "weekly",
-       priority: 1,
-     },
-   ];
-
-   // Dynamic pages (About, Contact, Archive, etc.)
-   for (const slug of pageSlugs) {
-     routes.push({
-       url: `${baseUrl}/${slug}`,
-       lastModified: new Date(),
-       changeFrequency: "monthly",
-       priority: 0.8,
-     });
-   }
-
-   // Project pages
-   for (const slug of projectSlugs) {
-     routes.push({
-       url: `${baseUrl}/work/${slug}`,
-       lastModified: new Date(),
-       changeFrequency: "monthly",
-       priority: 0.7,
-     });
-   }
-
-   return routes;
- }
+ import type { MetadataRoute } from "next";
+ import { client } from "../lib/sanity/client";
+ import { allPagesForSitemapQuery, allProjectsForSitemapQuery } from "../lib/sanity/queries";
+ import { getSiteConfig } from "../lib/config";
+ import { assetUrl } from "../lib/assets";
+ import { toAbsoluteUrl } from "../lib/seo/site-settings";
+
+ const cfg = getSiteConfig();
+
+ interface PageSitemapEntry {
+   slug: string;
+   updatedAt: string;
+ }
+
+ interface ProjectSitemapEntry extends PageSitemapEntry {
+   thumbnail_path?: string;
+   title?: string;
+   description?: string;
+   published_at?: string;
+ }
+
+ /**
+  * sitemap.xml — Dynamic generation from Sanity content.
+  *
+  * Each entry's <lastmod> uses Sanity's _updatedAt field (not the build time),
+  * which is the more useful SEO signal: Google ignores lastmod values it finds
+  * consistently inaccurate, such as every URL sharing one build timestamp.
+  *
+  * Project entries include their thumbnail in the `images` array — Next.js
+  * emits this as <image:image> nodes per the Google Image Sitemap protocol,
+  * allowing Google Images to index project thumbnails alongside the page URL.
+  *
+  * Pages/projects with metadata.noindex == true are excluded at the GROQ
+  * level (see lib/sanity/queries.ts).
+  */
+ export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
+   const baseUrl = cfg.domain;
+
+   const [pages, projects] = await Promise.all([
+     client.fetch<PageSitemapEntry[]>(allPagesForSitemapQuery).catch(() => [] as PageSitemapEntry[]),
+     client.fetch<ProjectSitemapEntry[]>(allProjectsForSitemapQuery).catch(() => [] as ProjectSitemapEntry[]),
+   ]);
+
+   // Homepage — lastModified derived from the most recent page or project update
+   const allTimestamps = [
+     ...pages.map((p) => p.updatedAt),
+     ...projects.map((p) => p.updatedAt),
+   ].filter(Boolean);
+   const homeLastMod = allTimestamps.length > 0
+     ? new Date(Math.max(...allTimestamps.map((t) => new Date(t).getTime())))
+     : new Date();
+
+   const routes: MetadataRoute.Sitemap = [
+     {
+       url: baseUrl,
+       lastModified: homeLastMod,
+       changeFrequency: "weekly",
+       priority: 1,
+     },
+   ];
+
+   for (const { slug, updatedAt } of pages) {
+     routes.push({
+       url: `${baseUrl}/${slug}`,
+       lastModified: updatedAt ? new Date(updatedAt) : new Date(),
+       changeFrequency: "monthly",
+       priority: 0.8,
+     });
+   }
+
+   for (const { slug, updatedAt, thumbnail_path } of projects) {
+     const thumbnailUrl = thumbnail_path
+       ? toAbsoluteUrl(assetUrl(thumbnail_path), baseUrl)
+       : undefined;
+     routes.push({
+       url: `${baseUrl}/work/${slug}`,
+       lastModified: updatedAt ? new Date(updatedAt) : new Date(),
+       changeFrequency: "monthly",
+       priority: 0.7,
+       ...(thumbnailUrl && { images: [thumbnailUrl] }),
+     });
+   }
+
+   return routes;
+ }
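
The sitemap queries are imported from lib/sanity/queries.ts, which is not part of this diff. As a minimal sketch of the shape the code above expects, allProjectsForSitemapQuery could look roughly like the following; the document type, field names, and the groq tag from next-sanity are assumptions for illustration, not taken from this package:

import { groq } from "next-sanity";

// Hypothetical sketch only; the real query lives in lib/sanity/queries.ts.
// It must return { slug, updatedAt, thumbnail_path?, ... } objects and filter
// out documents flagged metadata.noindex, per the comment in sitemap.ts above.
export const allProjectsForSitemapQuery = groq`
  *[_type == "project" && defined(slug.current) && metadata.noindex != true]{
    "slug": slug.current,
    "updatedAt": _updatedAt,
    "thumbnail_path": thumbnail.asset->path,
    title,
    description,
    "published_at": publishedAt
  }
`;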