@docsector/docsector-reader 4.5.3 → 4.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -27,7 +27,7 @@ Transform Markdown content into beautiful, navigable documentation sites — wit
27
27
  - 🤖 **Open in ChatGPT / Claude** — One-click links to open the current page directly in ChatGPT or Claude for Q&A
28
28
  - 🤖 **LLM Bot Detection** — Automatically serves raw Markdown to known AI crawlers (GPTBot, ClaudeBot, PerplexityBot, Cloudflare-AI-Search, GrokBot, and others)
29
29
  - 🗺️ **Sitemap Generation** — Automatic `sitemap.xml` generation at build time with root-relative URLs by default and absolute URLs when `siteUrl` is configured
30
- - 🤖 **AI-Friendly robots.txt** — Scaffold includes a `robots.txt` explicitly allowing 24 AI crawlers (GPTBot, ClaudeBot, PerplexityBot, Cloudflare-AI-Search, GrokBot, etc.) and advertises `Sitemap: /sitemap.xml`
30
+ - 🤖 **AI-Friendly robots.txt** — Scaffold includes a `robots.txt` explicitly allowing 24 AI crawlers (GPTBot, ClaudeBot, PerplexityBot, Cloudflare-AI-Search, GrokBot, etc.), and the build appends `Sitemap: /sitemap.xml` at the end for crawler discovery
31
31
  - 🧭 **Content Signals** — Optional `Content-Signal` directive for declaring AI usage policy (`ai-train`, `search`, `ai-input`) in `robots.txt`
32
32
  - 🧩 **Agent Skills Discovery Index** — Optional `/.well-known/agent-skills/index.json` with RFC v0.2.0 schema and SHA-256 digests
33
33
  - ✍️ **Docsector Authoring Skill** — Publishable `SKILL.md` that teaches agents Docsector blocks, page patterns, MCP lookup, and WebMCP tools
@@ -352,8 +352,8 @@ export default {
352
352
  Use Cloudflare AI Search as the first provider path:
353
353
 
354
354
  - Create an AI Search instance in Cloudflare.
355
- - Build and deploy the Docsector site first; build output always publishes `/sitemap.xml` and adds `Sitemap: /sitemap.xml` to `robots.txt` for crawler discovery.
356
- - Use a Website data source. For the cleanest retrieval, point its specific sitemap to `/ai-search-sitemap.xml`; otherwise the crawler can discover `/sitemap.xml` from `robots.txt`.
355
+ - Build and deploy the Docsector site first; build output always publishes `/sitemap.xml` and appends `Sitemap: /sitemap.xml` to the end of `robots.txt` for crawler discovery.
356
+ - Use a Website data source. For the cleanest retrieval, point its specific sitemap to `/ai-search-sitemap.xml`. Docsector keeps that Markdown-focused sitemap available for explicit AI Search configuration, but does not auto-announce it from `robots.txt` so Cloudflare does not index the same content twice alongside `/sitemap.xml`.
357
357
  - Add metadata fields such as title, path, locale, book, version, and subpage if you want filtering later.
358
358
  - Set `AI_SEARCH_INSTANCE_NAME` as a Cloudflare Pages environment variable or local `.dev.vars` entry.
359
359
  - Bind the instance to Pages as `AI_SEARCH` when available, or set encrypted Pages secrets for `CLOUDFLARE_ACCOUNT_ID` and `CLOUDFLARE_API_TOKEN` with AI Search run access.
@@ -368,7 +368,7 @@ When enabled, `docsector build` can generate:
368
368
  | `functions/assistant.js` | Cloudflare Pages Function for browser assistant requests |
369
369
  | `dist/spa/sitemap.xml` | Default crawler sitemap advertised from `robots.txt` |
370
370
  | `dist/spa/robots.txt` | Crawler policy with `Sitemap: /sitemap.xml` |
371
- | `dist/spa/ai-search-sitemap.xml` | Markdown-focused sitemap for AI Search crawling |
371
+ | `dist/spa/ai-search-sitemap.xml` | Markdown-focused sitemap for explicit AI Search Website data source configuration |
372
372
  | `dist/spa/.well-known/ai-search/manifest.json` | Source metadata for indexed documentation pages |
373
373
  | `dist/spa/_routes.json` | Routes the internal assistant endpoint to the Pages Function |
374
374
 
@@ -619,7 +619,7 @@ Notes:
619
619
  - `aiTrain`, `search`, and `aiInput` accept `yes` / `no` (or booleans).
620
620
  - Default scope is only `User-agent: *`.
621
621
  - Build patch is idempotent: repeated builds do not duplicate `Content-Signal` lines.
622
- - Build also keeps `Sitemap: /sitemap.xml` discoverable in `robots.txt` so crawlers can find the generated sitemap automatically.
622
+ - Build also keeps `Sitemap: /sitemap.xml` discoverable at the end of `robots.txt` so crawlers can find the generated sitemap automatically.
623
623
 
624
624
  ### Validate
625
625
 
package/bin/docsector.js CHANGED
@@ -24,7 +24,7 @@ const packageRoot = resolve(__dirname, '..')
24
24
  const args = process.argv.slice(2)
25
25
  const command = args[0]
26
26
 
27
- const VERSION = '4.5.3'
27
+ const VERSION = '4.5.4'
28
28
  const AUTHORING_SKILL_NAME = 'docsector-documentation-authoring'
29
29
  const AUTHORING_SKILL_DESCRIPTION = 'Author Docsector documentation with Markdown, custom blocks, MCP, and WebMCP.'
30
30
  const AUTHORING_SKILL_PUBLIC_PATH = `/.well-known/agent-skills/${AUTHORING_SKILL_NAME}/SKILL.md`
@@ -157,6 +157,32 @@ export default {
157
157
  // sitemap.xml is still generated with root-relative URLs when omitted.
158
158
  // siteUrl: 'https://docs.example.com',
159
159
 
160
+ // @ Home page source (optional)
161
+ // Use a remote README.md as homepage content at build-time.
162
+ // Falls back to local src/pages/Homepage.{lang}.md on fetch failure by default.
163
+ // homePage: {
164
+ // source: 'remote-readme', // 'local' | 'remote-readme'
165
+ // remoteReadmeUrl: 'https://raw.githubusercontent.com/your-org/your-repo/main/README.md',
166
+ // timeoutMs: 8000,
167
+ // fallbackToLocal: true
168
+ // },
169
+
170
+ // --- Language configs ---
171
+
172
+ // @ Languages
173
+ languages: [
174
+ {
175
+ image: '/images/flags/united-states-of-america.png',
176
+ label: 'English (US)',
177
+ value: 'en-US'
178
+ }
179
+ ],
180
+
181
+ // @ Default language
182
+ defaultLanguage: 'en-US',
183
+
184
+ // --- AI configs ---
185
+
160
186
  // @ MCP (Model Context Protocol)
161
187
  // Uncomment to enable an MCP server at /mcp for AI assistant integration.
162
188
  // Requires Cloudflare Pages Functions (or compatible serverless platform).
@@ -197,16 +223,6 @@ export default {
197
223
  // }
198
224
  // },
199
225
 
200
- // @ Home page source (optional)
201
- // Use a remote README.md as homepage content at build-time.
202
- // Falls back to local src/pages/Homepage.{lang}.md on fetch failure by default.
203
- // homePage: {
204
- // source: 'remote-readme', // 'local' | 'remote-readme'
205
- // remoteReadmeUrl: 'https://raw.githubusercontent.com/your-org/your-repo/main/README.md',
206
- // timeoutMs: 8000,
207
- // fallbackToLocal: true
208
- // },
209
-
210
226
  // @ Homepage Link headers for agent discovery (optional)
211
227
  // linkHeaders: {
212
228
  // enabled: true,
@@ -270,19 +286,7 @@ export default {
270
286
  // url: '${AUTHORING_SKILL_PUBLIC_PATH}'
271
287
  // }
272
288
  // ]
273
- // },
274
-
275
- // @ Languages
276
- languages: [
277
- {
278
- image: '/images/flags/united-states-of-america.png',
279
- label: 'English (US)',
280
- value: 'en-US'
281
- }
282
- ],
283
-
284
- // @ Default language
285
- defaultLanguage: 'en-US'
289
+ // }
286
290
  }
287
291
  `
288
292
 
@@ -638,7 +642,6 @@ const TEMPLATE_ROBOTS_TXT = `\
638
642
  User-agent: *
639
643
  Allow: /
640
644
  Content-Signal: ai-train=yes, search=yes, ai-input=yes
641
- Sitemap: /sitemap.xml
642
645
 
643
646
  # Explicitly allow AI crawlers
644
647
  # OpenAI
@@ -793,7 +796,7 @@ npm run build
793
796
  \`\`\`
794
797
 
795
798
  The optimized SPA output will be in \`dist/spa/\`.
796
- Docsector also generates \`dist/spa/sitemap.xml\` and keeps \`robots.txt\` discoverable with \`Sitemap: /sitemap.xml\`. Set \`siteUrl\` in \`docsector.config.js\` when you want absolute sitemap URLs.
799
+ Docsector also generates \`dist/spa/sitemap.xml\` and appends \`Sitemap: /sitemap.xml\` to the end of \`dist/spa/robots.txt\` during build. Set \`siteUrl\` in \`docsector.config.js\` when you want absolute sitemap URLs.
797
800
  `
798
801
 
799
802
  // =============================================================================
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@docsector/docsector-reader",
3
- "version": "4.5.3",
3
+ "version": "4.5.4",
4
4
  "description": "A documentation rendering engine built with Vue 3, Quasar v2 and Vite. Transform Markdown into beautiful, navigable documentation sites.",
5
5
  "productName": "Docsector Reader",
6
6
  "author": "Rodrigo de Araujo Vieira",
package/public/robots.txt CHANGED
@@ -1,6 +1,5 @@
1
1
  User-agent: *
2
2
  Allow: /
3
- Sitemap: /sitemap.xml
4
3
 
5
4
  User-agent: Cloudflare-AI-Search
6
5
  Allow: /
@@ -251,7 +251,7 @@ function buildSystemPrompt (body, currentPageMarkdown = '') {
251
251
  const lines = [
252
252
  'You are Docsector Assistant, a concise documentation assistant.',
253
253
  'Answer using the indexed documentation context. If the answer is not in the docs, say so clearly.',
254
- 'Prefer short, actionable answers and cite the relevant source chunks when available.'
254
+ 'Prefer short, actionable answers.'
255
255
  ]
256
256
 
257
257
  if (locale) lines.push(`User locale: ${locale}.`)
package/src/index.js CHANGED
@@ -179,7 +179,7 @@ export function createDocsector (config = {}) {
179
179
  },
180
180
 
181
181
  contentSignals: {
182
- enabled: false,
182
+ enabled: true,
183
183
  aiTrain: 'yes',
184
184
  search: 'yes',
185
185
  aiInput: 'yes',
@@ -189,7 +189,7 @@ export function createDocsector (config = {}) {
189
189
  },
190
190
 
191
191
  agentSkills: {
192
- enabled: false,
192
+ enabled: true,
193
193
  path: '/.well-known/agent-skills/index.json',
194
194
  schema: 'https://schemas.agentskills.io/discovery/0.2.0/schema.json',
195
195
  skills: [],
@@ -47,7 +47,7 @@ For cleaner retrieval, point the specific sitemap setting to:
47
47
  https://docs.example.com/ai-search-sitemap.xml
48
48
  ```
49
49
 
50
- The AI Search sitemap points to Markdown URLs, which are cleaner for retrieval than rendered SPA HTML. The manifest at `/.well-known/ai-search/manifest.json` lists titles, routes, locales, books, versions, and subpages for the same source set.
50
+ The AI Search sitemap points to Markdown URLs, which are cleaner for retrieval than rendered SPA HTML. Docsector keeps it available for explicit Cloudflare configuration, but does not auto-advertise it from `robots.txt` to avoid duplicate indexing alongside `/sitemap.xml`. The manifest at `/.well-known/ai-search/manifest.json` lists titles, routes, locales, books, versions, and subpages for the same source set.
51
51
 
52
52
  ## Runtime Endpoint
53
53
 
@@ -47,7 +47,7 @@ Para uma recuperação mais limpa, aponte a configuração de sitemap específic
47
47
  https://docs.example.com/ai-search-sitemap.xml
48
48
  ```
49
49
 
50
- O sitemap do AI Search aponta para URLs Markdown, que são mais limpas para recuperação do que HTML renderizado pela SPA. O manifest em `/.well-known/ai-search/manifest.json` lista títulos, rotas, locales, books, versões e subpáginas do mesmo conjunto de fontes.
50
+ O sitemap do AI Search aponta para URLs Markdown, que são mais limpas para recuperação do que HTML renderizado pela SPA. O Docsector mantém esse arquivo disponível para configuração explícita no Cloudflare, mas não o anuncia automaticamente em `robots.txt`, para evitar indexação duplicada junto com `/sitemap.xml`. O manifest em `/.well-known/ai-search/manifest.json` lista títulos, rotas, locales, books, versões e subpáginas do mesmo conjunto de fontes.
51
51
 
52
52
  ## Endpoint Runtime
53
53
 
@@ -1958,6 +1958,16 @@ function collectStandardSitemapEntries ({ pagesDir, pageEntries = [], defaultLan
1958
1958
  return entries
1959
1959
  }
1960
1960
 
1961
/**
 * List the sitemap paths that should be advertised through `Sitemap:` lines
 * in `robots.txt`.
 *
 * This helper only ever returns the standard `/sitemap.xml` path; it adds no
 * other sitemap path.
 *
 * @param {object} [options] - Optional settings. `undefined` or `null` fall back to the defaults.
 * @param {boolean} [options.sitemapEnabled=true] - Whether the standard sitemap is enabled.
 * @returns {string[]} A new array: `['/sitemap.xml']` when enabled, otherwise `[]`.
 */
export function getAdvertisedRobotsSitemapPaths (options = {}) {
  // A parameter default only covers `undefined`; `?? {}` also keeps an explicit
  // `null` argument from throwing while destructuring.
  const { sitemapEnabled = true } = options ?? {}

  return sitemapEnabled ? ['/sitemap.xml'] : []
}
1970
+
1961
1971
  /**
1962
1972
  * Create a Vite plugin that generates static `.md` files at build time.
1963
1973
  *
@@ -2461,9 +2471,7 @@ export async function onRequest (context) {
2461
2471
  }
2462
2472
  }
2463
2473
 
2464
- const robotsSitemapPaths = []
2465
- if (sitemapEnabled) robotsSitemapPaths.push('/sitemap.xml')
2466
- if (aiSearchSitemapGenerated) robotsSitemapPaths.push('/ai-search-sitemap.xml')
2474
+ const robotsSitemapPaths = getAdvertisedRobotsSitemapPaths({ sitemapEnabled })
2467
2475
 
2468
2476
  if (robotsSitemapPaths.length > 0) {
2469
2477
  const robotsPath = resolve(distDir, 'robots.txt')
package/src/sitemap.js CHANGED
@@ -76,28 +76,37 @@ export function appendSitemapsToRobots (robotsContent, { sitemaps = [], siteUrl
76
76
  ? robotsContent
77
77
  : 'User-agent: *\nAllow: /\n'
78
78
 
79
- const existingIdentities = new Set(
80
- input
81
- .replace(/\r\n/g, '\n')
82
- .split('\n')
83
- .map(line => line.match(/^\s*Sitemap\s*:\s*(.+?)\s*$/i)?.[1])
84
- .filter(Boolean)
85
- .map(normalizeSitemapIdentity)
86
- )
87
-
88
- const addedIdentities = new Set()
89
- const sitemapLines = (Array.isArray(sitemaps) ? sitemaps : [sitemaps])
79
+ const bodyLines = []
80
+ const existingSitemaps = []
81
+
82
+ for (const line of input.replace(/\r\n/g, '\n').split('\n')) {
83
+ const sitemap = line.match(/^\s*Sitemap\s*:\s*(.+?)\s*$/i)?.[1]
84
+ if (sitemap) {
85
+ existingSitemaps.push(sitemap)
86
+ continue
87
+ }
88
+
89
+ bodyLines.push(line)
90
+ }
91
+
92
+ const seenIdentities = new Set()
93
+ const normalizedSitemaps = [
94
+ ...(Array.isArray(sitemaps) ? sitemaps : [sitemaps]),
95
+ ...existingSitemaps
96
+ ]
90
97
  .filter(Boolean)
91
98
  .map(sitemap => resolveSitemapUrl(sitemap, siteUrl))
92
99
  .filter(sitemap => {
93
100
  const identity = normalizeSitemapIdentity(sitemap)
94
- if (existingIdentities.has(identity) || addedIdentities.has(identity)) return false
95
- addedIdentities.add(identity)
101
+ if (seenIdentities.has(identity)) return false
102
+ seenIdentities.add(identity)
96
103
  return true
97
104
  })
98
- .map(sitemap => `Sitemap: ${sitemap}`)
99
105
 
100
- if (sitemapLines.length === 0) return input
106
+ if (normalizedSitemaps.length === 0) return input
107
+
108
+ const body = bodyLines.join('\n').replace(/\s+$/g, '')
109
+ const sitemapLines = normalizedSitemaps.map(sitemap => `Sitemap: ${sitemap}`)
101
110
 
102
- return `${input.replace(/\s+$/g, '')}\n${sitemapLines.join('\n')}\n`
111
+ return `${body}\n\n${sitemapLines.join('\n')}\n`
103
112
  }