mulmoclaude 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/bin/mulmoclaude.js +202 -0
- package/bin/prepare-dist.js +93 -0
- package/client/assets/chunk-vKJrgz-R-C_I3GbVV.js +1 -0
- package/client/assets/html2canvas-Cx501zZr-BF5dYYkY.js +5 -0
- package/client/assets/index-D8rhwXLq.js +4906 -0
- package/client/assets/index-KNLBjwuh.css +1 -0
- package/client/assets/index.es-D4YyL_Dg-BfRHLTZV.js +5 -0
- package/client/assets/material-icons-Dr0goTwe.woff +0 -0
- package/client/assets/material-icons-kAwBdRge.woff2 +0 -0
- package/client/assets/material-icons-outlined-BpWbwl2n.woff +0 -0
- package/client/assets/material-icons-outlined-DZhiGvEA.woff2 +0 -0
- package/client/assets/material-icons-round-BDlwx-sv.woff +0 -0
- package/client/assets/material-icons-round-DrirKXBx.woff2 +0 -0
- package/client/assets/material-icons-sharp-CH1KkVu7.woff +0 -0
- package/client/assets/material-icons-sharp-gidztirS.woff2 +0 -0
- package/client/assets/material-icons-two-tone-B7wz7mED.woff +0 -0
- package/client/assets/material-icons-two-tone-DuNIpaEj.woff2 +0 -0
- package/client/assets/mulmo_bw-ERmkSv0a.png +0 -0
- package/client/assets/purify.es-Fx1Nqyry-PeS5RUhs.js +2 -0
- package/client/assets/typeof-DBp4T-Ny-BC0P-2DM.js +1 -0
- package/client/index.html +28 -0
- package/package.json +66 -0
- package/server/agent/attachmentConverter.ts +270 -0
- package/server/agent/config.ts +414 -0
- package/server/agent/index.ts +260 -0
- package/server/agent/mcp-server.ts +412 -0
- package/server/agent/mcp-tools/index.ts +63 -0
- package/server/agent/mcp-tools/x.ts +188 -0
- package/server/agent/plugin-names.ts +75 -0
- package/server/agent/prompt.ts +349 -0
- package/server/agent/resumeFailover.ts +129 -0
- package/server/agent/sandboxMounts.ts +329 -0
- package/server/agent/stream.ts +194 -0
- package/server/api/auth/bearerAuth.ts +61 -0
- package/server/api/auth/token.ts +98 -0
- package/server/api/csrfGuard.ts +85 -0
- package/server/api/routes/agent.ts +478 -0
- package/server/api/routes/chart.ts +98 -0
- package/server/api/routes/chat-index.ts +46 -0
- package/server/api/routes/config.ts +258 -0
- package/server/api/routes/dispatchResponse.ts +79 -0
- package/server/api/routes/files.ts +812 -0
- package/server/api/routes/html.ts +101 -0
- package/server/api/routes/image.ts +169 -0
- package/server/api/routes/mulmo-script.ts +712 -0
- package/server/api/routes/mulmoScriptValidate.ts +101 -0
- package/server/api/routes/notifications.ts +69 -0
- package/server/api/routes/pdf.ts +163 -0
- package/server/api/routes/plugins.ts +276 -0
- package/server/api/routes/presentHtml.ts +48 -0
- package/server/api/routes/roles.ts +125 -0
- package/server/api/routes/scheduler.ts +153 -0
- package/server/api/routes/schedulerHandlers.ts +151 -0
- package/server/api/routes/schedulerTasks.ts +163 -0
- package/server/api/routes/sessions.ts +294 -0
- package/server/api/routes/sessionsCursor.ts +59 -0
- package/server/api/routes/skills.ts +195 -0
- package/server/api/routes/sources.ts +540 -0
- package/server/api/routes/todos.ts +263 -0
- package/server/api/routes/todosColumnsHandlers.ts +347 -0
- package/server/api/routes/todosHandlers.ts +274 -0
- package/server/api/routes/todosItemsHandlers.ts +386 -0
- package/server/api/routes/wiki/pageIndex.ts +53 -0
- package/server/api/routes/wiki.ts +363 -0
- package/server/api/sandboxStatus.ts +64 -0
- package/server/events/notifications.ts +160 -0
- package/server/events/pub-sub/index.ts +45 -0
- package/server/events/relay-client.ts +288 -0
- package/server/events/scheduler-adapter.ts +302 -0
- package/server/events/session-store/index.ts +492 -0
- package/server/events/task-manager/index.ts +181 -0
- package/server/index.ts +572 -0
- package/server/system/config.ts +243 -0
- package/server/system/credentials.ts +220 -0
- package/server/system/docker.ts +97 -0
- package/server/system/env.ts +109 -0
- package/server/system/logger/config.ts +112 -0
- package/server/system/logger/formatters.ts +40 -0
- package/server/system/logger/index.ts +53 -0
- package/server/system/logger/rotation.ts +37 -0
- package/server/system/logger/sinks.ts +101 -0
- package/server/system/logger/types.ts +29 -0
- package/server/utils/date.ts +57 -0
- package/server/utils/errors.ts +7 -0
- package/server/utils/fetch.ts +27 -0
- package/server/utils/files/atomic.ts +125 -0
- package/server/utils/files/html-io.ts +20 -0
- package/server/utils/files/image-store.ts +66 -0
- package/server/utils/files/index.ts +45 -0
- package/server/utils/files/journal-io.ts +213 -0
- package/server/utils/files/json.ts +69 -0
- package/server/utils/files/markdown-store.ts +33 -0
- package/server/utils/files/naming.ts +50 -0
- package/server/utils/files/reference-dirs-io.ts +45 -0
- package/server/utils/files/roles-io.ts +45 -0
- package/server/utils/files/safe.ts +106 -0
- package/server/utils/files/scheduler-io.ts +20 -0
- package/server/utils/files/scheduler-overrides-io.ts +64 -0
- package/server/utils/files/session-io.ts +136 -0
- package/server/utils/files/spreadsheet-store.ts +63 -0
- package/server/utils/files/todos-io.ts +29 -0
- package/server/utils/files/user-tasks-io.ts +25 -0
- package/server/utils/files/workspace-io.ts +221 -0
- package/server/utils/gemini.ts +59 -0
- package/server/utils/gitignore.ts +69 -0
- package/server/utils/http.ts +15 -0
- package/server/utils/httpError.ts +61 -0
- package/server/utils/id.ts +16 -0
- package/server/utils/json.ts +83 -0
- package/server/utils/logBackgroundError.ts +22 -0
- package/server/utils/markdown.ts +82 -0
- package/server/utils/request.ts +29 -0
- package/server/utils/slug.ts +50 -0
- package/server/utils/spawn.ts +62 -0
- package/server/utils/time.ts +34 -0
- package/server/utils/types.ts +47 -0
- package/server/workspace/chat-index/index.ts +153 -0
- package/server/workspace/chat-index/indexer.ts +209 -0
- package/server/workspace/chat-index/paths.ts +34 -0
- package/server/workspace/chat-index/summarizer.ts +247 -0
- package/server/workspace/chat-index/types.ts +38 -0
- package/server/workspace/custom-dirs.ts +220 -0
- package/server/workspace/helps/business.md +104 -0
- package/server/workspace/helps/github.md +23 -0
- package/server/workspace/helps/index.md +60 -0
- package/server/workspace/helps/mulmoscript.md +249 -0
- package/server/workspace/helps/sandbox.md +90 -0
- package/server/workspace/helps/spreadsheet.md +43 -0
- package/server/workspace/helps/telegram.md +135 -0
- package/server/workspace/helps/wiki.md +131 -0
- package/server/workspace/journal/archivist.ts +386 -0
- package/server/workspace/journal/dailyPass.ts +743 -0
- package/server/workspace/journal/diff.ts +71 -0
- package/server/workspace/journal/index.ts +185 -0
- package/server/workspace/journal/indexFile.ts +136 -0
- package/server/workspace/journal/linkRewrite.ts +4 -0
- package/server/workspace/journal/memoryExtractor.ts +130 -0
- package/server/workspace/journal/optimizationPass.ts +160 -0
- package/server/workspace/journal/paths.ts +76 -0
- package/server/workspace/journal/state.ts +125 -0
- package/server/workspace/paths.ts +158 -0
- package/server/workspace/reference-dirs.ts +252 -0
- package/server/workspace/roles.ts +37 -0
- package/server/workspace/skills/discovery.ts +125 -0
- package/server/workspace/skills/index.ts +10 -0
- package/server/workspace/skills/parser.ts +144 -0
- package/server/workspace/skills/paths.ts +41 -0
- package/server/workspace/skills/scheduler.ts +149 -0
- package/server/workspace/skills/types.ts +30 -0
- package/server/workspace/skills/user-tasks.ts +257 -0
- package/server/workspace/skills/writer.ts +189 -0
- package/server/workspace/sources/arxivDiscovery.ts +182 -0
- package/server/workspace/sources/classifier.ts +268 -0
- package/server/workspace/sources/fetchers/arxiv.ts +170 -0
- package/server/workspace/sources/fetchers/github.ts +106 -0
- package/server/workspace/sources/fetchers/githubIssues.ts +208 -0
- package/server/workspace/sources/fetchers/githubReleases.ts +186 -0
- package/server/workspace/sources/fetchers/index.ts +71 -0
- package/server/workspace/sources/fetchers/registerAll.ts +15 -0
- package/server/workspace/sources/fetchers/rss.ts +141 -0
- package/server/workspace/sources/fetchers/rssParser.ts +295 -0
- package/server/workspace/sources/httpFetcher.ts +230 -0
- package/server/workspace/sources/interests.ts +120 -0
- package/server/workspace/sources/paths.ts +110 -0
- package/server/workspace/sources/pipeline/dedup.ts +60 -0
- package/server/workspace/sources/pipeline/fetch.ts +136 -0
- package/server/workspace/sources/pipeline/index.ts +249 -0
- package/server/workspace/sources/pipeline/notify.ts +72 -0
- package/server/workspace/sources/pipeline/plan.ts +66 -0
- package/server/workspace/sources/pipeline/summarize.ts +189 -0
- package/server/workspace/sources/pipeline/write.ts +185 -0
- package/server/workspace/sources/rateLimiter.ts +148 -0
- package/server/workspace/sources/registry.ts +326 -0
- package/server/workspace/sources/robots.ts +271 -0
- package/server/workspace/sources/sourceState.ts +135 -0
- package/server/workspace/sources/taxonomy.ts +74 -0
- package/server/workspace/sources/types.ts +144 -0
- package/server/workspace/sources/urls.ts +112 -0
- package/server/workspace/tool-trace/classify.ts +114 -0
- package/server/workspace/tool-trace/index.ts +250 -0
- package/server/workspace/tool-trace/writeSearch.ts +98 -0
- package/server/workspace/wiki-backlinks/index.ts +107 -0
- package/server/workspace/wiki-backlinks/sessionBacklinks.ts +144 -0
- package/server/workspace/workspace.ts +66 -0
- package/src/App.vue +720 -0
- package/src/assets/mulmo_bw.png +0 -0
- package/src/components/CanvasViewToggle.vue +27 -0
- package/src/components/ChatAttachmentPreview.vue +45 -0
- package/src/components/ChatImagePreview.vue +17 -0
- package/src/components/ChatInput.vue +208 -0
- package/src/components/FileContentHeader.vue +49 -0
- package/src/components/FileContentRenderer.vue +162 -0
- package/src/components/FileTree.vue +115 -0
- package/src/components/FileTreePane.vue +85 -0
- package/src/components/FilesView.vue +206 -0
- package/src/components/LockStatusPopup.vue +111 -0
- package/src/components/NotificationBell.vue +131 -0
- package/src/components/NotificationToast.vue +72 -0
- package/src/components/PluginLauncher.vue +138 -0
- package/src/components/RightSidebar.vue +113 -0
- package/src/components/RoleSelector.vue +64 -0
- package/src/components/SessionHistoryPanel.vue +176 -0
- package/src/components/SessionTabBar.vue +81 -0
- package/src/components/SettingsMcpTab.vue +350 -0
- package/src/components/SettingsModal.vue +275 -0
- package/src/components/SettingsReferenceDirsTab.vue +173 -0
- package/src/components/SettingsWorkspaceDirsTab.vue +174 -0
- package/src/components/SidebarHeader.vue +69 -0
- package/src/components/StackView.vue +360 -0
- package/src/components/SuggestionsPanel.vue +65 -0
- package/src/components/TodoExplorer.vue +358 -0
- package/src/components/ToolResultsPanel.vue +77 -0
- package/src/components/todo/TodoAddDialog.vue +131 -0
- package/src/components/todo/TodoEditDialog.vue +47 -0
- package/src/components/todo/TodoEditPanel.vue +113 -0
- package/src/components/todo/TodoKanbanView.vue +249 -0
- package/src/components/todo/TodoListView.vue +79 -0
- package/src/components/todo/TodoTableView.vue +177 -0
- package/src/composables/useActiveSession.ts +40 -0
- package/src/composables/useAppApi.ts +45 -0
- package/src/composables/useCanvasViewMode.ts +121 -0
- package/src/composables/useChatScroll.ts +47 -0
- package/src/composables/useClickOutside.ts +26 -0
- package/src/composables/useClipboardCopy.ts +44 -0
- package/src/composables/useContentDisplay.ts +52 -0
- package/src/composables/useDebugBeat.ts +23 -0
- package/src/composables/useDynamicFavicon.ts +115 -0
- package/src/composables/useEventListeners.ts +42 -0
- package/src/composables/useExpandedDirs.ts +64 -0
- package/src/composables/useFaviconState.ts +30 -0
- package/src/composables/useFileSelection.ts +115 -0
- package/src/composables/useFileSortMode.ts +24 -0
- package/src/composables/useFileTree.ts +85 -0
- package/src/composables/useFreshPluginData.ts +89 -0
- package/src/composables/useHealth.ts +38 -0
- package/src/composables/useImeAwareEnter.ts +57 -0
- package/src/composables/useKeyNavigation.ts +60 -0
- package/src/composables/useMarkdownLinkHandler.ts +46 -0
- package/src/composables/useMarkdownMode.ts +17 -0
- package/src/composables/useMcpTools.ts +71 -0
- package/src/composables/useMergedSessions.ts +27 -0
- package/src/composables/useNotifications.ts +90 -0
- package/src/composables/usePdfDownload.ts +60 -0
- package/src/composables/usePendingCalls.ts +77 -0
- package/src/composables/usePubSub.ts +85 -0
- package/src/composables/useRightSidebar.ts +23 -0
- package/src/composables/useRoles.ts +34 -0
- package/src/composables/useSandboxStatus.ts +67 -0
- package/src/composables/useSelectedResult.ts +49 -0
- package/src/composables/useSessionDerived.ts +51 -0
- package/src/composables/useSessionHistory.ts +81 -0
- package/src/composables/useSessionSync.ts +57 -0
- package/src/composables/useViewLayout.ts +55 -0
- package/src/config/apiRoutes.ts +173 -0
- package/src/config/pubsubChannels.ts +45 -0
- package/src/config/roles.ts +335 -0
- package/src/config/schedulerActions.ts +25 -0
- package/src/config/toolNames.ts +71 -0
- package/src/config/workspacePaths.ts +24 -0
- package/src/index.css +107 -0
- package/src/main.ts +25 -0
- package/src/plugins/canvas/Preview.vue +13 -0
- package/src/plugins/canvas/View.vue +333 -0
- package/src/plugins/canvas/definition.ts +38 -0
- package/src/plugins/canvas/index.ts +36 -0
- package/src/plugins/chart/Preview.vue +49 -0
- package/src/plugins/chart/View.vue +143 -0
- package/src/plugins/chart/definition.ts +58 -0
- package/src/plugins/chart/index.ts +52 -0
- package/src/plugins/editImage/Preview.vue +13 -0
- package/src/plugins/editImage/View.vue +13 -0
- package/src/plugins/editImage/definition.ts +27 -0
- package/src/plugins/editImage/index.ts +36 -0
- package/src/plugins/generateImage/Preview.vue +13 -0
- package/src/plugins/generateImage/View.vue +33 -0
- package/src/plugins/generateImage/definition.ts +32 -0
- package/src/plugins/generateImage/index.ts +56 -0
- package/src/plugins/manageRoles/Preview.vue +49 -0
- package/src/plugins/manageRoles/View.vue +525 -0
- package/src/plugins/manageRoles/definition.ts +43 -0
- package/src/plugins/manageRoles/index.ts +47 -0
- package/src/plugins/manageSkills/Preview.vue +21 -0
- package/src/plugins/manageSkills/View.vue +321 -0
- package/src/plugins/manageSkills/definition.ts +49 -0
- package/src/plugins/manageSkills/index.ts +49 -0
- package/src/plugins/manageSource/Preview.vue +33 -0
- package/src/plugins/manageSource/View.vue +697 -0
- package/src/plugins/manageSource/definition.ts +63 -0
- package/src/plugins/manageSource/index.ts +66 -0
- package/src/plugins/markdown/Preview.vue +77 -0
- package/src/plugins/markdown/View.vue +476 -0
- package/src/plugins/markdown/definition.ts +50 -0
- package/src/plugins/markdown/index.ts +36 -0
- package/src/plugins/presentHtml/Preview.vue +25 -0
- package/src/plugins/presentHtml/View.vue +52 -0
- package/src/plugins/presentHtml/definition.ts +27 -0
- package/src/plugins/presentHtml/helpers.ts +72 -0
- package/src/plugins/presentHtml/index.ts +41 -0
- package/src/plugins/presentMulmoScript/Preview.vue +23 -0
- package/src/plugins/presentMulmoScript/View.vue +1166 -0
- package/src/plugins/presentMulmoScript/definition.ts +95 -0
- package/src/plugins/presentMulmoScript/helpers.ts +162 -0
- package/src/plugins/presentMulmoScript/index.ts +40 -0
- package/src/plugins/scheduler/Preview.vue +67 -0
- package/src/plugins/scheduler/TasksTab.vue +205 -0
- package/src/plugins/scheduler/View.vue +565 -0
- package/src/plugins/scheduler/definition.ts +57 -0
- package/src/plugins/scheduler/index.ts +45 -0
- package/src/plugins/scheduler/viewModes.ts +26 -0
- package/src/plugins/spreadsheet/Preview.vue +29 -0
- package/src/plugins/spreadsheet/View.vue +997 -0
- package/src/plugins/spreadsheet/cellHighlights.ts +79 -0
- package/src/plugins/spreadsheet/definition.ts +121 -0
- package/src/plugins/spreadsheet/engine/calculator.ts +459 -0
- package/src/plugins/spreadsheet/engine/cellBuilder.ts +81 -0
- package/src/plugins/spreadsheet/engine/date-parser.ts +220 -0
- package/src/plugins/spreadsheet/engine/date-utils.ts +56 -0
- package/src/plugins/spreadsheet/engine/engine.ts +176 -0
- package/src/plugins/spreadsheet/engine/evaluator.ts +390 -0
- package/src/plugins/spreadsheet/engine/formatter.ts +172 -0
- package/src/plugins/spreadsheet/engine/formulaRefs.ts +101 -0
- package/src/plugins/spreadsheet/engine/functions/date.ts +299 -0
- package/src/plugins/spreadsheet/engine/functions/financial.ts +387 -0
- package/src/plugins/spreadsheet/engine/functions/index.ts +16 -0
- package/src/plugins/spreadsheet/engine/functions/logical.ts +262 -0
- package/src/plugins/spreadsheet/engine/functions/lookup.ts +400 -0
- package/src/plugins/spreadsheet/engine/functions/mathematical.ts +297 -0
- package/src/plugins/spreadsheet/engine/functions/statistical.ts +338 -0
- package/src/plugins/spreadsheet/engine/functions/text.ts +389 -0
- package/src/plugins/spreadsheet/engine/index.ts +27 -0
- package/src/plugins/spreadsheet/engine/jsonCellLocator.ts +111 -0
- package/src/plugins/spreadsheet/engine/parser.ts +143 -0
- package/src/plugins/spreadsheet/engine/registry.ts +150 -0
- package/src/plugins/spreadsheet/engine/responseDecoder.ts +67 -0
- package/src/plugins/spreadsheet/engine/types.ts +64 -0
- package/src/plugins/spreadsheet/index.ts +36 -0
- package/src/plugins/textResponse/Preview.vue +94 -0
- package/src/plugins/textResponse/View.vue +503 -0
- package/src/plugins/textResponse/definition.ts +34 -0
- package/src/plugins/textResponse/index.ts +27 -0
- package/src/plugins/textResponse/plugin.ts +29 -0
- package/src/plugins/textResponse/samples.ts +97 -0
- package/src/plugins/textResponse/types.ts +11 -0
- package/src/plugins/todo/Preview.vue +63 -0
- package/src/plugins/todo/View.vue +364 -0
- package/src/plugins/todo/composables/useTodos.ts +177 -0
- package/src/plugins/todo/definition.ts +45 -0
- package/src/plugins/todo/index.ts +61 -0
- package/src/plugins/todo/labels.ts +163 -0
- package/src/plugins/todo/priority.ts +98 -0
- package/src/plugins/todo/viewModes.ts +19 -0
- package/src/plugins/ui-image/ImagePreview.vue +23 -0
- package/src/plugins/ui-image/ImageView.vue +34 -0
- package/src/plugins/ui-image/index.ts +3 -0
- package/src/plugins/ui-image/types.ts +4 -0
- package/src/plugins/wiki/Preview.vue +65 -0
- package/src/plugins/wiki/View.vue +342 -0
- package/src/plugins/wiki/definition.ts +25 -0
- package/src/plugins/wiki/helpers.ts +59 -0
- package/src/plugins/wiki/index.ts +52 -0
- package/src/router/guards.ts +61 -0
- package/src/router/index.ts +50 -0
- package/src/tools/index.ts +52 -0
- package/src/tools/types.ts +27 -0
- package/src/types/events.ts +16 -0
- package/src/types/fileTree.ts +13 -0
- package/src/types/notification.ts +67 -0
- package/src/types/session.ts +116 -0
- package/src/types/sse.ts +90 -0
- package/src/types/toolCallHistory.ts +13 -0
- package/src/utils/agent/eventDispatch.ts +74 -0
- package/src/utils/agent/request.ts +55 -0
- package/src/utils/agent/toolCalls.ts +62 -0
- package/src/utils/api.ts +218 -0
- package/src/utils/canvas/viewMode.ts +46 -0
- package/src/utils/dom/authTokenMeta.ts +20 -0
- package/src/utils/dom/clickOutside.ts +11 -0
- package/src/utils/dom/externalLink.ts +57 -0
- package/src/utils/dom/scrollable.ts +24 -0
- package/src/utils/errors.ts +11 -0
- package/src/utils/files/expandedDirs.ts +25 -0
- package/src/utils/files/filename.ts +12 -0
- package/src/utils/files/sortChildren.ts +20 -0
- package/src/utils/filesPreview/schedulerPreview.ts +38 -0
- package/src/utils/filesPreview/todoPreview.ts +40 -0
- package/src/utils/format/date.ts +85 -0
- package/src/utils/format/frontmatter.ts +80 -0
- package/src/utils/format/jsonSyntax.ts +109 -0
- package/src/utils/html/previewCsp.ts +65 -0
- package/src/utils/image/resolve.ts +8 -0
- package/src/utils/image/rewriteMarkdownImageRefs.ts +182 -0
- package/src/utils/markdown/extractFirstH1.ts +39 -0
- package/src/utils/notification/dispatch.ts +22 -0
- package/src/utils/path/relativeLink.ts +130 -0
- package/src/utils/role/icon.ts +20 -0
- package/src/utils/role/merge.ts +10 -0
- package/src/utils/role/plugins.ts +12 -0
- package/src/utils/session/mergeSessions.ts +103 -0
- package/src/utils/session/seedRoleDefault.ts +35 -0
- package/src/utils/session/sessionEntries.ts +121 -0
- package/src/utils/session/sessionFactory.ts +22 -0
- package/src/utils/session/sessionHelpers.ts +99 -0
- package/src/utils/tools/dedup.ts +17 -0
- package/src/utils/tools/mcp.ts +33 -0
- package/src/utils/tools/pendingCalls.ts +16 -0
- package/src/utils/tools/result.ts +40 -0
- package/src/utils/types.ts +44 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
// Read / write source registry files under `workspace/sources/`.
|
|
2
|
+
//
|
|
3
|
+
// On-disk format — one markdown file per source:
|
|
4
|
+
//
|
|
5
|
+
// ---
|
|
6
|
+
// slug: hn-front-page
|
|
7
|
+
// title: Hacker News front page
|
|
8
|
+
// url: https://news.ycombinator.com/rss
|
|
9
|
+
// fetcher_kind: rss
|
|
10
|
+
// schedule: daily
|
|
11
|
+
// categories: [tech-news, general, english]
|
|
12
|
+
// max_items_per_fetch: 30
|
|
13
|
+
// added_at: 2026-04-13T09:00:00Z
|
|
14
|
+
// <fetcher-specific params as flat key: value>
|
|
15
|
+
// ---
|
|
16
|
+
//
|
|
17
|
+
// # Notes
|
|
18
|
+
//
|
|
19
|
+
// Free-form markdown body. Claude reads it for context when
|
|
20
|
+
// summarizing.
|
|
21
|
+
//
|
|
22
|
+
// Parser policy:
|
|
23
|
+
//
|
|
24
|
+
// - Flat YAML only. Nested mappings are not supported by design —
|
|
25
|
+
// the frontmatter is hand-edited by humans and the LLM, both of
|
|
26
|
+
// which routinely get nesting wrong. Fetcher params are flat
|
|
27
|
+
// strings (e.g. `github_repo: foo/bar`) so the fetcher itself
|
|
28
|
+
// interprets them.
|
|
29
|
+
// - Unknown frontmatter keys are preserved as opaque strings in
|
|
30
|
+
// `fetcherParams`, so future fetchers can add fields without
|
|
31
|
+
// round-trip data loss.
|
|
32
|
+
// - Missing required fields → the loader returns `null` and logs
|
|
33
|
+
// a warning; the caller skips that source rather than crashing
|
|
34
|
+
// the pass.
|
|
35
|
+
//
|
|
36
|
+
// The writer preserves the body text verbatim so re-saving a file
|
|
37
|
+
// doesn't rewrite the user's notes.
|
|
38
|
+
|
|
39
|
+
import fsp from "node:fs/promises";
|
|
40
|
+
import { isFetcherKind, isSourceSchedule, type Source, type FetcherParams, type FetcherKind, type SourceSchedule } from "./types.js";
|
|
41
|
+
import { normalizeCategories } from "./taxonomy.js";
|
|
42
|
+
import type { CategorySlug } from "./taxonomy.js";
|
|
43
|
+
import { writeFileAtomic } from "../../utils/files/index.js";
|
|
44
|
+
import { sourceFilePath, sourcesRoot } from "./paths.js";
|
|
45
|
+
import { isValidSlug } from "../../utils/slug.js";
|
|
46
|
+
import { isNonEmptyString } from "../../utils/types.js";
|
|
47
|
+
import { log } from "../../system/logger/index.js";
|
|
48
|
+
|
|
49
|
+
// --- Frontmatter parsing ------------------------------------------------
|
|
50
|
+
|
|
51
|
+
// Fields we recognize as first-class on every source. Anything else
|
|
52
|
+
// in the frontmatter ends up in `fetcherParams` so a fetcher kind
|
|
53
|
+
// that needs extra config can read it without us adding yet
|
|
54
|
+
// another typed field for every new fetcher.
|
|
55
|
+
// Frontmatter keys that buildSource maps onto typed Source properties;
// every other key is treated as a fetcher-specific parameter.
const RESERVED_KEYS = new Set([
  "slug",
  "title",
  "url",
  "fetcher_kind",
  "schedule",
  "categories",
  "max_items_per_fetch",
  "added_at",
]);
|
|
56
|
+
|
|
57
|
+
// Opening fence: the text must begin with `---` immediately followed by a
// line break (LF or CRLF). Anchored at the start of the input.
const FRONTMATTER_OPEN = /^---\r?\n/;
|
|
58
|
+
// Closing fence: a line break, `---`, optional trailing whitespace, then a
// line break or end of input. Because `\s*` also matches line breaks, blank
// lines directly after the fence can be absorbed into the match rather than
// being left at the start of the body.
const FRONTMATTER_CLOSE = /\r?\n---\s*(\r?\n|$)/;
|
|
59
|
+
|
|
60
|
+
/** Result of splitting a source file into its frontmatter and body. */
interface ParsedFrontmatter {
  /**
   * Frontmatter entries keyed by name. A value is a string for a plain
   * scalar and a string array for an inline `[a, b]` list.
   */
  fields: Map<string, string | string[]>;
  /** Everything that follows the closing `---` fence. */
  body: string;
}
|
|
64
|
+
|
|
65
|
+
// Extract YAML frontmatter + body. Returns null when the file has
|
|
66
|
+
// no frontmatter at all — that's an error condition for source
|
|
67
|
+
// files (we always write frontmatter), not a degraded mode.
|
|
68
|
+
export function parseSourceFile(raw: string): ParsedFrontmatter | null {
  // The opening fence must be the very first thing in the file.
  const opening = FRONTMATTER_OPEN.exec(raw);
  if (opening === null) return null;

  // Everything after the opening fence: frontmatter, closing fence, body.
  const remainder = raw.slice(opening[0].length);
  const closing = FRONTMATTER_CLOSE.exec(remainder);
  if (closing === null) return null;

  const frontmatterText = remainder.slice(0, closing.index);
  const bodyStart = closing.index + closing[0].length;
  return {
    fields: parseFields(frontmatterText),
    body: remainder.slice(bodyStart),
  };
}
|
|
77
|
+
|
|
78
|
+
/**
 * Turn the frontmatter text into a key → value map, one entry per line.
 * Lines that parseLine rejects are skipped; when a key repeats, the last
 * occurrence wins.
 */
function parseFields(fmText: string): Map<string, string | string[]> {
  const result = new Map<string, string | string[]>();
  fmText.split(/\r?\n/).forEach((rawLine) => {
    const entry = parseLine(rawLine);
    if (entry !== null) {
      result.set(entry.key, entry.value);
    }
  });
  return result;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Parse one `key: value` frontmatter line.
 *
 * Returns null for blank lines, `#` comment lines, lines without a colon,
 * and lines whose key is empty. The key/value split happens at the first
 * colon, so values may themselves contain colons (e.g. URLs).
 */
function parseLine(line: string): { key: string; value: string | string[] } | null {
  const stripped = line.trim();
  if (stripped === "" || stripped.startsWith("#")) return null;

  const separatorAt = line.indexOf(":");
  if (separatorAt < 1) return null;

  const key = line.slice(0, separatorAt).trim();
  if (key.length === 0) return null;

  const valueText = line.slice(separatorAt + 1).trim();
  return { key, value: parseValue(valueText) };
}
|
|
96
|
+
|
|
97
|
+
/**
 * Parse the text to the right of `key:`.
 *
 * - Empty text yields the empty string.
 * - `[a, b, c]` yields a string array; items are trimmed, unquoted, and
 *   empty items are dropped.
 * - Anything else yields a single unquoted string.
 */
function parseValue(raw: string): string | string[] {
  if (!raw) return "";
  const arrayMatch = /^\[(.*)\]$/.exec(raw);
  if (arrayMatch) {
    return splitFlowItems(arrayMatch[1])
      .map((s) => unquote(s.trim()))
      .filter((s) => s.length > 0);
  }
  return unquote(raw);
}

/**
 * Split the inside of an inline list on commas, ignoring commas that sit
 * inside a quoted item. The writer double-quotes any item containing a
 * comma, so a plain `split(",")` would tear such items apart on re-read.
 *
 * A quote character only opens a quoted item when it is the first
 * non-blank character of that item, so apostrophes inside bare words
 * (`it's`) keep their old meaning. If a quote is never closed, the input
 * is malformed and the plain comma split is used instead.
 */
function splitFlowItems(inner: string): string[] {
  const items: string[] = [];
  let current = "";
  let quote: string | null = null;
  for (let i = 0; i < inner.length; i++) {
    const ch = inner.charAt(i);
    if (quote === null) {
      if (ch === ",") {
        items.push(current);
        current = "";
        continue;
      }
      if ((ch === '"' || ch === "'") && current.trim() === "") quote = ch;
      current += ch;
      continue;
    }
    current += ch;
    if (quote === '"' && ch === "\\" && i + 1 < inner.length) {
      // Backslash escape inside double quotes: keep the next char as-is.
      i += 1;
      current += inner.charAt(i);
    } else if (quote === "'" && ch === "'" && inner.charAt(i + 1) === "'") {
      // Doubled single quote is an escaped apostrophe, not a terminator.
      i += 1;
      current += inner.charAt(i);
    } else if (ch === quote) {
      quote = null;
    }
  }
  if (quote !== null) return inner.split(",");
  items.push(current);
  return items;
}

/**
 * Strip one layer of quoting from a scalar.
 *
 * Double-quoted text is decoded with JSON.parse, which reverses the
 * backslash escapes the writer emits; if that fails the outer quotes are
 * simply removed. Single-quoted text follows YAML's rule that `''` stands
 * for one apostrophe. Unquoted text is returned unchanged.
 */
function unquote(s: string): string {
  if (s.length < 2) return s;
  if (s.startsWith('"') && s.endsWith('"')) {
    try {
      const decoded: unknown = JSON.parse(s);
      if (typeof decoded === "string") return decoded;
    } catch {
      // Malformed escape sequence: fall through to the plain strip below.
    }
    return s.slice(1, -1);
  }
  if (s.startsWith("'") && s.endsWith("'")) {
    return s.slice(1, -1).replace(/''/g, "'");
  }
  return s;
}
|
|
127
|
+
|
|
128
|
+
// --- Source validation / construction -----------------------------------
|
|
129
|
+
|
|
130
|
+
/**
 * Look up `key` and return its value only when isNonEmptyString accepts
 * it; otherwise (missing key, list value, rejected string) return null.
 */
function stringField(fields: Map<string, string | string[]>, key: string): string | null {
  const candidate = fields.get(key);
  if (!isNonEmptyString(candidate)) {
    return null;
  }
  return candidate;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Read `key` as a positive whole number.
 *
 * @param fields - parsed frontmatter entries
 * @param key - entry to read
 * @param defaultValue - returned when the entry is missing, is a list,
 *   is not numeric, or does not round down to at least 1
 * @returns the value rounded down to an integer, or `defaultValue`
 */
function numberField(fields: Map<string, string | string[]>, key: string, defaultValue: number): number {
  const v = fields.get(key);
  if (typeof v !== "string") return defaultValue;
  // Round down first, then validate: checking positivity before flooring
  // would let fractions such as 0.5 through and return 0.
  const n = Math.floor(Number(v));
  return Number.isFinite(n) && n >= 1 ? n : defaultValue;
}
|
|
141
|
+
|
|
142
|
+
// Default per-fetch cap. Fetchers treat it as a hint — if the
|
|
143
|
+
// upstream API returns fewer items naturally the fetcher MAY
|
|
144
|
+
// return fewer, but must NEVER return more than this.
|
|
145
|
+
// Passed by buildSource as the fallback for `max_items_per_fetch`.
export const DEFAULT_MAX_ITEMS_PER_FETCH = 30;
|
|
146
|
+
|
|
147
|
+
// Construct a Source from parsed frontmatter fields. Returns null
|
|
148
|
+
// on required-field validation failure. The `body` arg is inlined
|
|
149
|
+
// into the Source as `notes`.
|
|
150
|
+
export function buildSource(fields: Map<string, string | string[]>, body: string): Source | null {
  // --- Required fields: any failure rejects the whole source. ---
  const slug = stringField(fields, "slug");
  if (!slug || !isValidSlug(slug)) return null;

  const title = stringField(fields, "title");
  const url = stringField(fields, "url");
  if (!title || !url) return null;

  const kindCandidate = stringField(fields, "fetcher_kind");
  if (!isFetcherKind(kindCandidate)) return null;
  const fetcherKind: FetcherKind = kindCandidate;

  const scheduleCandidate = stringField(fields, "schedule");
  if (!isSourceSchedule(scheduleCandidate)) return null;
  const schedule: SourceSchedule = scheduleCandidate;

  // --- Optional fields with fallbacks. ---
  const categories: CategorySlug[] = normalizeCategories(fields.get("categories"));
  const maxItemsPerFetch = numberField(fields, "max_items_per_fetch", DEFAULT_MAX_ITEMS_PER_FETCH);
  // A missing timestamp falls back to the Unix epoch.
  const addedAt = stringField(fields, "added_at") ?? new Date(0).toISOString();

  // --- Everything not reserved becomes a fetcher parameter. ---
  // Only scalar strings are carried over; list values are dropped.
  const fetcherParams: FetcherParams = {};
  fields.forEach((value, key) => {
    if (!RESERVED_KEYS.has(key) && typeof value === "string") {
      fetcherParams[key] = value;
    }
  });

  return {
    slug,
    title,
    url,
    fetcherKind,
    fetcherParams,
    schedule,
    categories,
    maxItemsPerFetch,
    addedAt,
    notes: body,
  };
}
|
|
197
|
+
|
|
198
|
+
// --- Serialization ------------------------------------------------------
|
|
199
|
+
|
|
200
|
+
// Escape a scalar for use as a YAML value. Very conservative —
|
|
201
|
+
// wraps in double-quotes whenever the value contains any character
|
|
202
|
+
// that could be mis-parsed. Idempotent-safe: a round-trip through
|
|
203
|
+
// parseValue → yamlScalar preserves the semantic string.
|
|
204
|
+
/**
 * Render a string as a single-line frontmatter value.
 *
 * The value is emitted bare only when that is unambiguous. It is wrapped
 * in double quotes when it is empty, contains a YAML indicator character,
 * has leading/trailing whitespace, looks like a boolean/null/number,
 * starts with a quote character, or contains a control character such as
 * a line break.
 *
 * Quoting uses JSON.stringify, so backslashes, double quotes and control
 * characters are escaped and the result always stays on one line; the
 * reader reverses this with JSON.parse.
 *
 * @param value - the semantic string to write
 * @returns the text to place after `key: `
 */
function yamlScalar(value: string): string {
  const needsQuote =
    value === "" ||
    /[:#[\]{},&*?|<>=!%@`]/.test(value) ||
    /^\s|\s$/.test(value) ||
    /^(true|false|null|~|yes|no|on|off)$/i.test(value) ||
    /^[+-]?[\d.]/.test(value) ||
    // A bare value that starts with a quote would be unquoted on re-read.
    /^["']/.test(value) ||
    // A raw line break or other control char would break the line format.
    /[\u0000-\u001f\u007f]/.test(value);
  return needsQuote ? JSON.stringify(value) : value;
}
|
|
220
|
+
|
|
221
|
+
/** Render strings as an inline list: `[a, b, c]`, each item passed through yamlScalar. */
function yamlList(values: readonly string[]): string {
  const rendered = values.map((item) => yamlScalar(item));
  return "[" + rendered.join(", ") + "]";
}
|
|
224
|
+
|
|
225
|
+
// Serialize a Source back to the canonical markdown-with-
|
|
226
|
+
// frontmatter shape. Reserved-key ordering is stable (nice for
|
|
227
|
+
// diffs) and fetcher-specific params come after in alphabetical
|
|
228
|
+
// order.
|
|
229
|
+
export function serializeSource(source: Source): string {
  // Reserved keys, always in this fixed order.
  const reserved = [
    `slug: ${yamlScalar(source.slug)}`,
    `title: ${yamlScalar(source.title)}`,
    `url: ${yamlScalar(source.url)}`,
    `fetcher_kind: ${yamlScalar(source.fetcherKind)}`,
    `schedule: ${yamlScalar(source.schedule)}`,
    `categories: ${yamlList(source.categories)}`,
    `max_items_per_fetch: ${String(source.maxItemsPerFetch)}`,
    `added_at: ${yamlScalar(source.addedAt)}`,
  ];

  // Fetcher-specific params follow, sorted by key.
  const extras = Object.keys(source.fetcherParams)
    .sort()
    .map((key) => `${key}: ${yamlScalar(source.fetcherParams[key])}`);

  // Non-empty notes are written as-is, gaining a trailing newline only
  // if they lack one; empty notes contribute an empty final segment.
  let tail = "";
  if (source.notes.length > 0) {
    tail = source.notes.endsWith("\n") ? source.notes : `${source.notes}\n`;
  }

  return ["---", ...reserved, ...extras, "---", "", tail].join("\n");
}
|
|
256
|
+
|
|
257
|
+
// --- Filesystem I/O -----------------------------------------------------
|
|
258
|
+
|
|
259
|
+
// Load a single source file identified by `slug`.
//
// Resolves to null (and never rejects for these cases) when the slug
// is invalid, the file cannot be read, the file does not parse, the
// parsed fields do not build into a Source, or the slug stored in the
// frontmatter differs from the slug used to locate the file. The
// last check guards against a file that was renamed without its
// header being updated (or the reverse).
export async function readSource(workspaceRoot: string, slug: string): Promise<Source | null> {
  if (!isValidSlug(slug)) return null;

  let text: string;
  try {
    const filePath = sourceFilePath(workspaceRoot, slug);
    text = await fsp.readFile(filePath, "utf-8");
  } catch {
    // Missing or unreadable file: callers just skip null entries.
    return null;
  }

  const parsed = parseSourceFile(text);
  if (!parsed) return null;

  const loaded = buildSource(parsed.fields, parsed.body);
  if (!loaded) return loaded;
  return loaded.slug === slug ? loaded : null;
}
|
|
280
|
+
|
|
281
|
+
// Load every source in the registry directory.
//
// Only `*.md` entries whose name does not start with `_` are
// considered (the underscore prefix marks meta files and the
// `_state/` subdirectory). An entry that fails to load is logged and
// skipped so one bad file cannot take down the whole listing. An
// unreadable directory yields an empty list. The result is sorted by
// slug so callers get a stable order.
export async function listSources(workspaceRoot: string): Promise<Source[]> {
  const root = sourcesRoot(workspaceRoot);
  let names: string[];
  try {
    names = await fsp.readdir(root);
  } catch {
    return [];
  }

  const candidates = names.filter((fileName) => !fileName.startsWith("_") && fileName.endsWith(".md"));
  const loaded: Source[] = [];
  for (const fileName of candidates) {
    const slug = fileName.slice(0, fileName.length - ".md".length);
    const source = await readSource(workspaceRoot, slug);
    if (!source) {
      log.warn("sources", "failed to load source, skipping", { slug });
      continue;
    }
    loaded.push(source);
  }

  loaded.sort((left, right) => {
    if (left.slug < right.slug) return -1;
    if (left.slug > right.slug) return 1;
    return 0;
  });
  return loaded;
}
|
|
308
|
+
|
|
309
|
+
// Persist a source to its registry file.
//
// Throws when the source's slug is not a valid slug. The actual write
// is delegated to writeFileAtomic; per the original note here, that
// stages the content in a sibling `.tmp` file and renames it into
// place, so a crash mid-write cannot leave a half-written source file.
export async function writeSource(workspaceRoot: string, source: Source): Promise<void> {
  const { slug } = source;
  if (!isValidSlug(slug)) {
    throw new Error(`[sources] invalid slug: ${slug}`);
  }
  const target = sourceFilePath(workspaceRoot, slug);
  const contents = serializeSource(source);
  await writeFileAtomic(target, contents);
}
|
|
317
|
+
|
|
318
|
+
// Remove the registry file for `slug`. Resolves to true when the file
// was unlinked, and to false when the slug is invalid or the unlink
// failed for any reason (the failure is deliberately not surfaced).
export async function deleteSource(workspaceRoot: string, slug: string): Promise<boolean> {
  if (!isValidSlug(slug)) return false;
  let removed = false;
  try {
    const target = sourceFilePath(workspaceRoot, slug);
    await fsp.unlink(target);
    removed = true;
  } catch {
    // Best-effort delete: report failure through the return value.
  }
  return removed;
}
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
// robots.txt parser + rule evaluator.
|
|
2
|
+
//
|
|
3
|
+
// Phase-1 fetchers call `isAllowedByRobots(robots, userAgent, path)` (with `robots` produced by `parseRobots`)
|
|
4
|
+
// before GET-ing a URL on a host we haven't fetched from recently.
|
|
5
|
+
// The robots text itself comes from a 24h cache populated elsewhere —
|
|
6
|
+
// this module only deals with parsing and rule evaluation.
|
|
7
|
+
//
|
|
8
|
+
// Supported directives:
|
|
9
|
+
// User-agent: <name> — group selector
|
|
10
|
+
// Disallow: <path> — path-prefix block (empty value == allow all)
|
|
11
|
+
// Allow: <path> — path-prefix exception
|
|
12
|
+
// Crawl-delay: <secs> — minimum seconds between fetches for this UA
|
|
13
|
+
//
|
|
14
|
+
// Deliberately NOT supported:
|
|
15
|
+
// Sitemap: — irrelevant for fetchers
|
|
16
|
+
// Request-rate: — rarely used in the wild
|
|
17
|
+
// Visit-time: — same
|
|
18
|
+
// Host: — Yandex-only extension
|
|
19
|
+
//
|
|
20
|
+
// Matching semantics follow the de-facto Google robots.txt rules
|
|
21
|
+
// (draft IETF "robotstxt-00"): longest-prefix wins between Allow
|
|
22
|
+
// and Disallow; `*` in paths is treated as a wildcard; `$` at
|
|
23
|
+
// end-of-path anchors to end-of-URL.
|
|
24
|
+
//
|
|
25
|
+
// Pure — no I/O, no network, fully testable.
|
|
26
|
+
|
|
27
|
+
// One robots.txt group: the agents it is scoped to plus the rules and
// crawl delay that apply to them.
export interface RobotsGroup {
  // User-agent names this group applies to, stored lowercased. The
  // entry "*" means "any agent not matched by a more-specific group".
  userAgents: Array<string>;
  // Allow / Disallow rules in the order they appeared.
  rules: Array<RobotsRule>;
  // Value of the group's Crawl-delay directive in seconds, or null
  // when the group does not specify one.
  crawlDelaySec: number | null;
}
|
|
34
|
+
|
|
35
|
+
// A single Allow or Disallow line.
export interface RobotsRule {
  // Which directive produced this rule.
  kind: "disallow" | "allow";
  // The path pattern exactly as written after the directive name;
  // `*` and `$` wildcards are kept as-is.
  pattern: string;
}
|
|
41
|
+
|
|
42
|
+
// Result of parsing a robots.txt body: its groups in file order.
export interface ParsedRobots {
  groups: Array<RobotsGroup>;
}
|
|
45
|
+
|
|
46
|
+
// Parse a robots.txt body into structured groups.
//
// Parsing is fully lenient: lines that are blank after comment
// stripping, lines that are not `name: value` directives, and
// directives this module does not know are all skipped. Empty input
// therefore yields an empty group list, which downstream code treats
// as "everything allowed".
//
// Group boundaries are tracked in a small state object: a run of
// consecutive User-agent lines shares one group, and the per-directive
// helpers decide when a later User-agent line starts a new group.
export function parseRobots(text: string): ParsedRobots {
  const state: ParseState = { groups: [], current: null, collectingAgents: false };
  text.split(/\r?\n/).forEach((rawLine) => {
    const line = stripComment(rawLine).trim();
    if (line.length === 0) return;
    const directive = parseDirective(line);
    if (directive === null) return;
    applyDirective(state, directive.name, directive.value);
  });
  return { groups: state.groups };
}
|
|
72
|
+
|
|
73
|
+
// Mutable parser state threaded through the directive helpers.
interface ParseState {
  // Every group created so far, in file order.
  groups: Array<RobotsGroup>;
  // Group that rules are currently appended to; null until the first
  // User-agent line has been seen.
  current: RobotsGroup | null;
  // True while consecutive User-agent lines are still extending
  // `current`.
  collectingAgents: boolean;
}
|
|
78
|
+
|
|
79
|
+
// Route one parsed directive into the parser state.
//
// `name` is the lowercased directive name and `value` its trimmed
// value. User-agent lines are handed to applyUserAgent. Group-member
// directives (Allow, Disallow, Crawl-delay) end the "collecting
// agents" window and are applied to the current group; if no
// User-agent has been seen yet they are dropped, because a rule
// without a User-agent scope is meaningless.
//
// Any other directive (Sitemap, Host, ...) is ignored entirely. In
// particular it must NOT end the collecting window: RFC 9309 §2.2.4
// requires that other records do not interfere with group parsing
// ("a Sitemaps record MUST NOT terminate a group"). Otherwise
//   User-agent: a / Sitemap: ... / User-agent: b / Disallow: /
// would leave agent `a` with an empty (allow-all) group.
function applyDirective(state: ParseState, name: string, value: string): void {
  if (name === "user-agent") {
    applyUserAgent(state, value);
    return;
  }
  const isGroupMember = name === "allow" || name === "disallow" || name === "crawl-delay";
  if (!isGroupMember) return;
  state.collectingAgents = false;
  if (!state.current) return;
  applyRule(state.current, name, value);
}
|
|
92
|
+
|
|
93
|
+
// Handle a User-agent line. While the parser is still collecting
// agents for an existing group, the name is added to that group;
// otherwise a fresh group is created, registered, and made current.
// Names are stored lowercased.
function applyUserAgent(state: ParseState, value: string): void {
  let group = state.collectingAgents ? state.current : null;
  if (group === null) {
    group = { userAgents: [], rules: [], crawlDelaySec: null };
    state.current = group;
    state.groups.push(group);
    state.collectingAgents = true;
  }
  group.userAgents.push(value.toLowerCase());
}
|
|
101
|
+
|
|
102
|
+
// Apply one group-member directive to `group`.
//
//   allow / disallow — appended to the rule list with the raw pattern
//                      (an empty pattern is kept; the matcher treats
//                      it as "matches nothing").
//   crawl-delay      — stored when the value is a finite,
//                      non-negative number.
//
// A blank Crawl-delay value is ignored: `Number("")` is 0, so without
// the guard an empty directive would be recorded as an explicit
// zero-second delay instead of "not specified". Any other directive
// name is ignored.
function applyRule(group: RobotsGroup, name: string, value: string): void {
  switch (name) {
    case "disallow":
    case "allow":
      group.rules.push({ kind: name, pattern: value });
      return;
    case "crawl-delay": {
      if (value.trim() === "") return;
      const seconds = Number(value);
      if (Number.isFinite(seconds) && seconds >= 0) group.crawlDelaySec = seconds;
      return;
    }
    default:
      return;
  }
}
|
|
113
|
+
|
|
114
|
+
// Drop everything from the first `#` onwards; a line without `#` is
// returned unchanged.
function stripComment(line: string): string {
  const cut = line.indexOf("#");
  if (cut < 0) return line;
  return line.substring(0, cut);
}
|
|
118
|
+
|
|
119
|
+
// Split a `name: value` line at its first colon. The name is trimmed
// and lowercased, the value is trimmed. Returns null when the line
// has no colon or starts with one (no directive name).
function parseDirective(line: string): { name: string; value: string } | null {
  const sep = line.indexOf(":");
  if (sep < 1) return null;
  return {
    name: line.substring(0, sep).trim().toLowerCase(),
    value: line.substring(sep + 1).trim(),
  };
}
|
|
126
|
+
|
|
127
|
+
// Choose the rule set that applies to `userAgent`.
//
// Groups are sorted into three tiers and the first non-empty tier
// wins:
//   1. groups listing the agent exactly (case-insensitive);
//   2. groups whose listed name is a prefix of the agent, keeping only
//      those tied for the longest prefix;
//   3. groups listing `*`.
// When no tier has a group the result is null, which callers treat as
// "everything allowed".
//
// All groups in the winning tier are merged into one rule set before
// any Allow/Disallow decision is made. Using only the first match
// would ignore a Disallow placed in a later duplicate group and could
// let a fetcher hit a path the site explicitly blocked.
export function selectGroup(robots: ParsedRobots, userAgent: string): RobotsGroup | null {
  const agent = userAgent.toLowerCase();
  const exactGroups: RobotsGroup[] = [];
  const wildcardGroups: RobotsGroup[] = [];
  let prefixGroups: RobotsGroup[] = [];
  let longestPrefix = -1;

  for (const candidate of robots.groups) {
    const match = scoreGroupAgainstAgent(candidate, agent);
    switch (match.kind) {
      case "exact":
        exactGroups.push(match.group);
        break;
      case "star":
        wildcardGroups.push(match.group);
        break;
      case "prefix":
        if (match.score > longestPrefix) {
          // Strictly longer prefix: discard the shorter matches.
          longestPrefix = match.score;
          prefixGroups = [match.group];
        } else if (match.score === longestPrefix) {
          prefixGroups.push(match.group);
        }
        break;
      default:
        break;
    }
  }

  for (const tier of [exactGroups, prefixGroups, wildcardGroups]) {
    if (tier.length > 0) return mergeGroups(tier);
  }
  return null;
}
|
|
164
|
+
|
|
165
|
+
// Merge several equally-applicable groups into one.
//
// A single group is returned as-is (same object). Otherwise the
// result concatenates the groups' user agents and rules, preserving
// per-group order, and takes the LARGEST crawl delay that any group
// specifies (null when none does). The largest delay is the
// conservative choice: waiting that long satisfies every group's
// Crawl-delay, whereas the smallest would violate the stricter ones.
function mergeGroups(groups: readonly RobotsGroup[]): RobotsGroup {
  if (groups.length === 1) return groups[0];
  const rules: RobotsRule[] = [];
  const userAgents: string[] = [];
  let crawlDelaySec: number | null = null;
  for (const g of groups) {
    rules.push(...g.rules);
    userAgents.push(...g.userAgents);
    if (g.crawlDelaySec !== null) {
      crawlDelaySec = crawlDelaySec === null ? g.crawlDelaySec : Math.max(crawlDelaySec, g.crawlDelaySec);
    }
  }
  return { userAgents, rules, crawlDelaySec };
}
|
|
182
|
+
|
|
183
|
+
// How one group relates to a given user agent:
//   exact  — the group lists the agent verbatim;
//   prefix — a listed name is a prefix of the agent; `score` is the
//            length of the longest such name;
//   star   — the group only applies through its `*` entry;
//   none   — the group does not apply.
type AgentMatch =
  | { kind: "none" }
  | { kind: "star"; group: RobotsGroup }
  | { kind: "prefix"; group: RobotsGroup; score: number }
  | { kind: "exact"; group: RobotsGroup };
|
|
188
|
+
|
|
189
|
+
// Classify how `group` applies to the (already lowercased) agent `ua`.
//
// Returns "exact" as soon as a listed name equals `ua`; otherwise
// "prefix" with the length of the longest listed name that `ua`
// starts with; otherwise "star" when the group lists `*`; otherwise
// "none".
//
// Empty listed names (from a bare `User-agent:` line) are skipped.
// Every string starts with "", so without the skip such a group would
// count as a zero-length prefix match for every agent and outrank the
// real `*` groups.
function scoreGroupAgainstAgent(group: RobotsGroup, ua: string): AgentMatch {
  let bestPrefix = -1;
  let hasStar = false;
  for (const listed of group.userAgents) {
    if (listed === "") continue;
    if (listed === "*") {
      hasStar = true;
      continue;
    }
    if (listed === ua) return { kind: "exact", group };
    if (ua.startsWith(listed) && listed.length > bestPrefix) {
      bestPrefix = listed.length;
    }
  }
  if (bestPrefix >= 0) return { kind: "prefix", group, score: bestPrefix };
  if (hasStar) return { kind: "star", group };
  return { kind: "none" };
}
|
|
206
|
+
|
|
207
|
+
// Report whether `path` (URL path plus query, e.g. "/a/b?c=d") may be
// fetched by `userAgent` under the parsed robots rules. Returns true
// when allowed and false when disallowed.
//
// When no group applies to the agent, or no rule in the applicable
// group matches the path, the path is allowed. Otherwise the longest
// matching rule decides, and a tie between Allow and Disallow goes to
// Allow (the more permissive outcome).
export function isAllowedByRobots(robots: ParsedRobots, userAgent: string, path: string): boolean {
  const applicable = selectGroup(robots, userAgent);
  if (applicable === null) return true;
  const scores = scoreRules(applicable, path);
  const nothingMatched = scores.bestAllow < 0 && scores.bestDisallow < 0;
  return nothingMatched || scores.bestAllow >= scores.bestDisallow;
}
|
|
222
|
+
|
|
223
|
+
// Find the strongest matching Allow and Disallow rule for `path`.
//
// Each rule is scored by matchesPattern; the highest score per rule
// kind is returned, with -1 in a slot when no rule of that kind
// matched. Empty patterns (a `Disallow:` with no value) never match,
// so they leave both slots untouched and the caller falls through to
// its allow-all default.
function scoreRules(group: RobotsGroup, path: string): { bestAllow: number; bestDisallow: number } {
  const best = { allow: -1, disallow: -1 };
  for (const { kind, pattern } of group.rules) {
    const score = matchesPattern(pattern, path);
    if (score < 0) continue;
    if (score > best[kind]) best[kind] = score;
  }
  return { bestAllow: best.allow, bestDisallow: best.disallow };
}
|
|
242
|
+
|
|
243
|
+
// Match a robots.txt path pattern against `path`.
//
// Returns the pattern's length (wildcards counted as one character
// each) when the pattern matches, for longest-match arbitration, or
// -1 when it does not. An empty pattern never matches, so a
// `Disallow:` with no value does not block everything. Wildcards:
//
//   `*` — matches any sequence of characters (including none)
//   `$` — only as the last character: anchors the match to end-of-path
//
// Ranking by raw pattern length is not perfect for patterns with
// several `*`, but is good enough for real-world robots.txt files.
//
// The pattern comes from a remote site, so it is matched with a
// linear greedy scan rather than compiled to a regular expression: a
// regex with one `.*` per wildcard backtracks catastrophically on
// inputs such as `/*a*a*a…$`. The scan splits the pattern at `*`,
// requires the first chunk as a prefix, finds each middle chunk at its
// leftmost position after the previous one, and places the last chunk
// either anywhere after that (unanchored) or at the very end (`$`).
// Leftmost placement leaves the most room for later chunks, so the
// scan succeeds whenever any valid placement exists.
export function matchesPattern(pattern: string, path: string): number {
  if (pattern === "") return -1;
  const endAnchored = pattern.endsWith("$");
  const hasWildcard = pattern.includes("*");
  // Fast path: no wildcards means literal prefix match.
  if (!hasWildcard && !endAnchored) {
    return path.startsWith(pattern) ? pattern.length : -1;
  }

  const body = endAnchored ? pattern.slice(0, -1) : pattern;
  const chunks = body.split("*");
  const lastIndex = chunks.length - 1;
  const first = chunks[0] ?? "";
  const last = chunks[lastIndex] ?? "";

  if (!path.startsWith(first)) return -1;
  let pos = first.length;

  if (lastIndex === 0) {
    // No `*` at all, so the pattern is `literal$`: exact match only.
    return path.length === pos ? pattern.length : -1;
  }

  for (let i = 1; i < lastIndex; i++) {
    const chunk = chunks[i] ?? "";
    const found = path.indexOf(chunk, pos);
    if (found === -1) return -1;
    pos = found + chunk.length;
  }

  if (endAnchored) {
    // The final chunk must sit at the end without overlapping what
    // earlier chunks already consumed.
    const start = path.length - last.length;
    return start >= pos && path.endsWith(last) ? pattern.length : -1;
  }
  return path.indexOf(last, pos) !== -1 ? pattern.length : -1;
}
|