mulmoclaude 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -0
- package/bin/mulmoclaude.js +202 -0
- package/bin/prepare-dist.js +93 -0
- package/client/assets/chunk-vKJrgz-R-C_I3GbVV.js +1 -0
- package/client/assets/html2canvas-Cx501zZr-BF5dYYkY.js +5 -0
- package/client/assets/index-D8rhwXLq.js +4906 -0
- package/client/assets/index-KNLBjwuh.css +1 -0
- package/client/assets/index.es-D4YyL_Dg-BfRHLTZV.js +5 -0
- package/client/assets/material-icons-Dr0goTwe.woff +0 -0
- package/client/assets/material-icons-kAwBdRge.woff2 +0 -0
- package/client/assets/material-icons-outlined-BpWbwl2n.woff +0 -0
- package/client/assets/material-icons-outlined-DZhiGvEA.woff2 +0 -0
- package/client/assets/material-icons-round-BDlwx-sv.woff +0 -0
- package/client/assets/material-icons-round-DrirKXBx.woff2 +0 -0
- package/client/assets/material-icons-sharp-CH1KkVu7.woff +0 -0
- package/client/assets/material-icons-sharp-gidztirS.woff2 +0 -0
- package/client/assets/material-icons-two-tone-B7wz7mED.woff +0 -0
- package/client/assets/material-icons-two-tone-DuNIpaEj.woff2 +0 -0
- package/client/assets/mulmo_bw-ERmkSv0a.png +0 -0
- package/client/assets/purify.es-Fx1Nqyry-PeS5RUhs.js +2 -0
- package/client/assets/typeof-DBp4T-Ny-BC0P-2DM.js +1 -0
- package/client/index.html +28 -0
- package/package.json +66 -0
- package/server/agent/attachmentConverter.ts +270 -0
- package/server/agent/config.ts +414 -0
- package/server/agent/index.ts +260 -0
- package/server/agent/mcp-server.ts +412 -0
- package/server/agent/mcp-tools/index.ts +63 -0
- package/server/agent/mcp-tools/x.ts +188 -0
- package/server/agent/plugin-names.ts +75 -0
- package/server/agent/prompt.ts +349 -0
- package/server/agent/resumeFailover.ts +129 -0
- package/server/agent/sandboxMounts.ts +329 -0
- package/server/agent/stream.ts +194 -0
- package/server/api/auth/bearerAuth.ts +61 -0
- package/server/api/auth/token.ts +98 -0
- package/server/api/csrfGuard.ts +85 -0
- package/server/api/routes/agent.ts +478 -0
- package/server/api/routes/chart.ts +98 -0
- package/server/api/routes/chat-index.ts +46 -0
- package/server/api/routes/config.ts +258 -0
- package/server/api/routes/dispatchResponse.ts +79 -0
- package/server/api/routes/files.ts +812 -0
- package/server/api/routes/html.ts +101 -0
- package/server/api/routes/image.ts +169 -0
- package/server/api/routes/mulmo-script.ts +712 -0
- package/server/api/routes/mulmoScriptValidate.ts +101 -0
- package/server/api/routes/notifications.ts +69 -0
- package/server/api/routes/pdf.ts +163 -0
- package/server/api/routes/plugins.ts +276 -0
- package/server/api/routes/presentHtml.ts +48 -0
- package/server/api/routes/roles.ts +125 -0
- package/server/api/routes/scheduler.ts +153 -0
- package/server/api/routes/schedulerHandlers.ts +151 -0
- package/server/api/routes/schedulerTasks.ts +163 -0
- package/server/api/routes/sessions.ts +294 -0
- package/server/api/routes/sessionsCursor.ts +59 -0
- package/server/api/routes/skills.ts +195 -0
- package/server/api/routes/sources.ts +540 -0
- package/server/api/routes/todos.ts +263 -0
- package/server/api/routes/todosColumnsHandlers.ts +347 -0
- package/server/api/routes/todosHandlers.ts +274 -0
- package/server/api/routes/todosItemsHandlers.ts +386 -0
- package/server/api/routes/wiki/pageIndex.ts +53 -0
- package/server/api/routes/wiki.ts +363 -0
- package/server/api/sandboxStatus.ts +64 -0
- package/server/events/notifications.ts +160 -0
- package/server/events/pub-sub/index.ts +45 -0
- package/server/events/relay-client.ts +288 -0
- package/server/events/scheduler-adapter.ts +302 -0
- package/server/events/session-store/index.ts +492 -0
- package/server/events/task-manager/index.ts +181 -0
- package/server/index.ts +572 -0
- package/server/system/config.ts +243 -0
- package/server/system/credentials.ts +220 -0
- package/server/system/docker.ts +97 -0
- package/server/system/env.ts +109 -0
- package/server/system/logger/config.ts +112 -0
- package/server/system/logger/formatters.ts +40 -0
- package/server/system/logger/index.ts +53 -0
- package/server/system/logger/rotation.ts +37 -0
- package/server/system/logger/sinks.ts +101 -0
- package/server/system/logger/types.ts +29 -0
- package/server/utils/date.ts +57 -0
- package/server/utils/errors.ts +7 -0
- package/server/utils/fetch.ts +27 -0
- package/server/utils/files/atomic.ts +125 -0
- package/server/utils/files/html-io.ts +20 -0
- package/server/utils/files/image-store.ts +66 -0
- package/server/utils/files/index.ts +45 -0
- package/server/utils/files/journal-io.ts +213 -0
- package/server/utils/files/json.ts +69 -0
- package/server/utils/files/markdown-store.ts +33 -0
- package/server/utils/files/naming.ts +50 -0
- package/server/utils/files/reference-dirs-io.ts +45 -0
- package/server/utils/files/roles-io.ts +45 -0
- package/server/utils/files/safe.ts +106 -0
- package/server/utils/files/scheduler-io.ts +20 -0
- package/server/utils/files/scheduler-overrides-io.ts +64 -0
- package/server/utils/files/session-io.ts +136 -0
- package/server/utils/files/spreadsheet-store.ts +63 -0
- package/server/utils/files/todos-io.ts +29 -0
- package/server/utils/files/user-tasks-io.ts +25 -0
- package/server/utils/files/workspace-io.ts +221 -0
- package/server/utils/gemini.ts +59 -0
- package/server/utils/gitignore.ts +69 -0
- package/server/utils/http.ts +15 -0
- package/server/utils/httpError.ts +61 -0
- package/server/utils/id.ts +16 -0
- package/server/utils/json.ts +83 -0
- package/server/utils/logBackgroundError.ts +22 -0
- package/server/utils/markdown.ts +82 -0
- package/server/utils/request.ts +29 -0
- package/server/utils/slug.ts +50 -0
- package/server/utils/spawn.ts +62 -0
- package/server/utils/time.ts +34 -0
- package/server/utils/types.ts +47 -0
- package/server/workspace/chat-index/index.ts +153 -0
- package/server/workspace/chat-index/indexer.ts +209 -0
- package/server/workspace/chat-index/paths.ts +34 -0
- package/server/workspace/chat-index/summarizer.ts +247 -0
- package/server/workspace/chat-index/types.ts +38 -0
- package/server/workspace/custom-dirs.ts +220 -0
- package/server/workspace/helps/business.md +104 -0
- package/server/workspace/helps/github.md +23 -0
- package/server/workspace/helps/index.md +60 -0
- package/server/workspace/helps/mulmoscript.md +249 -0
- package/server/workspace/helps/sandbox.md +90 -0
- package/server/workspace/helps/spreadsheet.md +43 -0
- package/server/workspace/helps/telegram.md +135 -0
- package/server/workspace/helps/wiki.md +131 -0
- package/server/workspace/journal/archivist.ts +386 -0
- package/server/workspace/journal/dailyPass.ts +743 -0
- package/server/workspace/journal/diff.ts +71 -0
- package/server/workspace/journal/index.ts +185 -0
- package/server/workspace/journal/indexFile.ts +136 -0
- package/server/workspace/journal/linkRewrite.ts +4 -0
- package/server/workspace/journal/memoryExtractor.ts +130 -0
- package/server/workspace/journal/optimizationPass.ts +160 -0
- package/server/workspace/journal/paths.ts +76 -0
- package/server/workspace/journal/state.ts +125 -0
- package/server/workspace/paths.ts +158 -0
- package/server/workspace/reference-dirs.ts +252 -0
- package/server/workspace/roles.ts +37 -0
- package/server/workspace/skills/discovery.ts +125 -0
- package/server/workspace/skills/index.ts +10 -0
- package/server/workspace/skills/parser.ts +144 -0
- package/server/workspace/skills/paths.ts +41 -0
- package/server/workspace/skills/scheduler.ts +149 -0
- package/server/workspace/skills/types.ts +30 -0
- package/server/workspace/skills/user-tasks.ts +257 -0
- package/server/workspace/skills/writer.ts +189 -0
- package/server/workspace/sources/arxivDiscovery.ts +182 -0
- package/server/workspace/sources/classifier.ts +268 -0
- package/server/workspace/sources/fetchers/arxiv.ts +170 -0
- package/server/workspace/sources/fetchers/github.ts +106 -0
- package/server/workspace/sources/fetchers/githubIssues.ts +208 -0
- package/server/workspace/sources/fetchers/githubReleases.ts +186 -0
- package/server/workspace/sources/fetchers/index.ts +71 -0
- package/server/workspace/sources/fetchers/registerAll.ts +15 -0
- package/server/workspace/sources/fetchers/rss.ts +141 -0
- package/server/workspace/sources/fetchers/rssParser.ts +295 -0
- package/server/workspace/sources/httpFetcher.ts +230 -0
- package/server/workspace/sources/interests.ts +120 -0
- package/server/workspace/sources/paths.ts +110 -0
- package/server/workspace/sources/pipeline/dedup.ts +60 -0
- package/server/workspace/sources/pipeline/fetch.ts +136 -0
- package/server/workspace/sources/pipeline/index.ts +249 -0
- package/server/workspace/sources/pipeline/notify.ts +72 -0
- package/server/workspace/sources/pipeline/plan.ts +66 -0
- package/server/workspace/sources/pipeline/summarize.ts +189 -0
- package/server/workspace/sources/pipeline/write.ts +185 -0
- package/server/workspace/sources/rateLimiter.ts +148 -0
- package/server/workspace/sources/registry.ts +326 -0
- package/server/workspace/sources/robots.ts +271 -0
- package/server/workspace/sources/sourceState.ts +135 -0
- package/server/workspace/sources/taxonomy.ts +74 -0
- package/server/workspace/sources/types.ts +144 -0
- package/server/workspace/sources/urls.ts +112 -0
- package/server/workspace/tool-trace/classify.ts +114 -0
- package/server/workspace/tool-trace/index.ts +250 -0
- package/server/workspace/tool-trace/writeSearch.ts +98 -0
- package/server/workspace/wiki-backlinks/index.ts +107 -0
- package/server/workspace/wiki-backlinks/sessionBacklinks.ts +144 -0
- package/server/workspace/workspace.ts +66 -0
- package/src/App.vue +720 -0
- package/src/assets/mulmo_bw.png +0 -0
- package/src/components/CanvasViewToggle.vue +27 -0
- package/src/components/ChatAttachmentPreview.vue +45 -0
- package/src/components/ChatImagePreview.vue +17 -0
- package/src/components/ChatInput.vue +208 -0
- package/src/components/FileContentHeader.vue +49 -0
- package/src/components/FileContentRenderer.vue +162 -0
- package/src/components/FileTree.vue +115 -0
- package/src/components/FileTreePane.vue +85 -0
- package/src/components/FilesView.vue +206 -0
- package/src/components/LockStatusPopup.vue +111 -0
- package/src/components/NotificationBell.vue +131 -0
- package/src/components/NotificationToast.vue +72 -0
- package/src/components/PluginLauncher.vue +138 -0
- package/src/components/RightSidebar.vue +113 -0
- package/src/components/RoleSelector.vue +64 -0
- package/src/components/SessionHistoryPanel.vue +176 -0
- package/src/components/SessionTabBar.vue +81 -0
- package/src/components/SettingsMcpTab.vue +350 -0
- package/src/components/SettingsModal.vue +275 -0
- package/src/components/SettingsReferenceDirsTab.vue +173 -0
- package/src/components/SettingsWorkspaceDirsTab.vue +174 -0
- package/src/components/SidebarHeader.vue +69 -0
- package/src/components/StackView.vue +360 -0
- package/src/components/SuggestionsPanel.vue +65 -0
- package/src/components/TodoExplorer.vue +358 -0
- package/src/components/ToolResultsPanel.vue +77 -0
- package/src/components/todo/TodoAddDialog.vue +131 -0
- package/src/components/todo/TodoEditDialog.vue +47 -0
- package/src/components/todo/TodoEditPanel.vue +113 -0
- package/src/components/todo/TodoKanbanView.vue +249 -0
- package/src/components/todo/TodoListView.vue +79 -0
- package/src/components/todo/TodoTableView.vue +177 -0
- package/src/composables/useActiveSession.ts +40 -0
- package/src/composables/useAppApi.ts +45 -0
- package/src/composables/useCanvasViewMode.ts +121 -0
- package/src/composables/useChatScroll.ts +47 -0
- package/src/composables/useClickOutside.ts +26 -0
- package/src/composables/useClipboardCopy.ts +44 -0
- package/src/composables/useContentDisplay.ts +52 -0
- package/src/composables/useDebugBeat.ts +23 -0
- package/src/composables/useDynamicFavicon.ts +115 -0
- package/src/composables/useEventListeners.ts +42 -0
- package/src/composables/useExpandedDirs.ts +64 -0
- package/src/composables/useFaviconState.ts +30 -0
- package/src/composables/useFileSelection.ts +115 -0
- package/src/composables/useFileSortMode.ts +24 -0
- package/src/composables/useFileTree.ts +85 -0
- package/src/composables/useFreshPluginData.ts +89 -0
- package/src/composables/useHealth.ts +38 -0
- package/src/composables/useImeAwareEnter.ts +57 -0
- package/src/composables/useKeyNavigation.ts +60 -0
- package/src/composables/useMarkdownLinkHandler.ts +46 -0
- package/src/composables/useMarkdownMode.ts +17 -0
- package/src/composables/useMcpTools.ts +71 -0
- package/src/composables/useMergedSessions.ts +27 -0
- package/src/composables/useNotifications.ts +90 -0
- package/src/composables/usePdfDownload.ts +60 -0
- package/src/composables/usePendingCalls.ts +77 -0
- package/src/composables/usePubSub.ts +85 -0
- package/src/composables/useRightSidebar.ts +23 -0
- package/src/composables/useRoles.ts +34 -0
- package/src/composables/useSandboxStatus.ts +67 -0
- package/src/composables/useSelectedResult.ts +49 -0
- package/src/composables/useSessionDerived.ts +51 -0
- package/src/composables/useSessionHistory.ts +81 -0
- package/src/composables/useSessionSync.ts +57 -0
- package/src/composables/useViewLayout.ts +55 -0
- package/src/config/apiRoutes.ts +173 -0
- package/src/config/pubsubChannels.ts +45 -0
- package/src/config/roles.ts +335 -0
- package/src/config/schedulerActions.ts +25 -0
- package/src/config/toolNames.ts +71 -0
- package/src/config/workspacePaths.ts +24 -0
- package/src/index.css +107 -0
- package/src/main.ts +25 -0
- package/src/plugins/canvas/Preview.vue +13 -0
- package/src/plugins/canvas/View.vue +333 -0
- package/src/plugins/canvas/definition.ts +38 -0
- package/src/plugins/canvas/index.ts +36 -0
- package/src/plugins/chart/Preview.vue +49 -0
- package/src/plugins/chart/View.vue +143 -0
- package/src/plugins/chart/definition.ts +58 -0
- package/src/plugins/chart/index.ts +52 -0
- package/src/plugins/editImage/Preview.vue +13 -0
- package/src/plugins/editImage/View.vue +13 -0
- package/src/plugins/editImage/definition.ts +27 -0
- package/src/plugins/editImage/index.ts +36 -0
- package/src/plugins/generateImage/Preview.vue +13 -0
- package/src/plugins/generateImage/View.vue +33 -0
- package/src/plugins/generateImage/definition.ts +32 -0
- package/src/plugins/generateImage/index.ts +56 -0
- package/src/plugins/manageRoles/Preview.vue +49 -0
- package/src/plugins/manageRoles/View.vue +525 -0
- package/src/plugins/manageRoles/definition.ts +43 -0
- package/src/plugins/manageRoles/index.ts +47 -0
- package/src/plugins/manageSkills/Preview.vue +21 -0
- package/src/plugins/manageSkills/View.vue +321 -0
- package/src/plugins/manageSkills/definition.ts +49 -0
- package/src/plugins/manageSkills/index.ts +49 -0
- package/src/plugins/manageSource/Preview.vue +33 -0
- package/src/plugins/manageSource/View.vue +697 -0
- package/src/plugins/manageSource/definition.ts +63 -0
- package/src/plugins/manageSource/index.ts +66 -0
- package/src/plugins/markdown/Preview.vue +77 -0
- package/src/plugins/markdown/View.vue +476 -0
- package/src/plugins/markdown/definition.ts +50 -0
- package/src/plugins/markdown/index.ts +36 -0
- package/src/plugins/presentHtml/Preview.vue +25 -0
- package/src/plugins/presentHtml/View.vue +52 -0
- package/src/plugins/presentHtml/definition.ts +27 -0
- package/src/plugins/presentHtml/helpers.ts +72 -0
- package/src/plugins/presentHtml/index.ts +41 -0
- package/src/plugins/presentMulmoScript/Preview.vue +23 -0
- package/src/plugins/presentMulmoScript/View.vue +1166 -0
- package/src/plugins/presentMulmoScript/definition.ts +95 -0
- package/src/plugins/presentMulmoScript/helpers.ts +162 -0
- package/src/plugins/presentMulmoScript/index.ts +40 -0
- package/src/plugins/scheduler/Preview.vue +67 -0
- package/src/plugins/scheduler/TasksTab.vue +205 -0
- package/src/plugins/scheduler/View.vue +565 -0
- package/src/plugins/scheduler/definition.ts +57 -0
- package/src/plugins/scheduler/index.ts +45 -0
- package/src/plugins/scheduler/viewModes.ts +26 -0
- package/src/plugins/spreadsheet/Preview.vue +29 -0
- package/src/plugins/spreadsheet/View.vue +997 -0
- package/src/plugins/spreadsheet/cellHighlights.ts +79 -0
- package/src/plugins/spreadsheet/definition.ts +121 -0
- package/src/plugins/spreadsheet/engine/calculator.ts +459 -0
- package/src/plugins/spreadsheet/engine/cellBuilder.ts +81 -0
- package/src/plugins/spreadsheet/engine/date-parser.ts +220 -0
- package/src/plugins/spreadsheet/engine/date-utils.ts +56 -0
- package/src/plugins/spreadsheet/engine/engine.ts +176 -0
- package/src/plugins/spreadsheet/engine/evaluator.ts +390 -0
- package/src/plugins/spreadsheet/engine/formatter.ts +172 -0
- package/src/plugins/spreadsheet/engine/formulaRefs.ts +101 -0
- package/src/plugins/spreadsheet/engine/functions/date.ts +299 -0
- package/src/plugins/spreadsheet/engine/functions/financial.ts +387 -0
- package/src/plugins/spreadsheet/engine/functions/index.ts +16 -0
- package/src/plugins/spreadsheet/engine/functions/logical.ts +262 -0
- package/src/plugins/spreadsheet/engine/functions/lookup.ts +400 -0
- package/src/plugins/spreadsheet/engine/functions/mathematical.ts +297 -0
- package/src/plugins/spreadsheet/engine/functions/statistical.ts +338 -0
- package/src/plugins/spreadsheet/engine/functions/text.ts +389 -0
- package/src/plugins/spreadsheet/engine/index.ts +27 -0
- package/src/plugins/spreadsheet/engine/jsonCellLocator.ts +111 -0
- package/src/plugins/spreadsheet/engine/parser.ts +143 -0
- package/src/plugins/spreadsheet/engine/registry.ts +150 -0
- package/src/plugins/spreadsheet/engine/responseDecoder.ts +67 -0
- package/src/plugins/spreadsheet/engine/types.ts +64 -0
- package/src/plugins/spreadsheet/index.ts +36 -0
- package/src/plugins/textResponse/Preview.vue +94 -0
- package/src/plugins/textResponse/View.vue +503 -0
- package/src/plugins/textResponse/definition.ts +34 -0
- package/src/plugins/textResponse/index.ts +27 -0
- package/src/plugins/textResponse/plugin.ts +29 -0
- package/src/plugins/textResponse/samples.ts +97 -0
- package/src/plugins/textResponse/types.ts +11 -0
- package/src/plugins/todo/Preview.vue +63 -0
- package/src/plugins/todo/View.vue +364 -0
- package/src/plugins/todo/composables/useTodos.ts +177 -0
- package/src/plugins/todo/definition.ts +45 -0
- package/src/plugins/todo/index.ts +61 -0
- package/src/plugins/todo/labels.ts +163 -0
- package/src/plugins/todo/priority.ts +98 -0
- package/src/plugins/todo/viewModes.ts +19 -0
- package/src/plugins/ui-image/ImagePreview.vue +23 -0
- package/src/plugins/ui-image/ImageView.vue +34 -0
- package/src/plugins/ui-image/index.ts +3 -0
- package/src/plugins/ui-image/types.ts +4 -0
- package/src/plugins/wiki/Preview.vue +65 -0
- package/src/plugins/wiki/View.vue +342 -0
- package/src/plugins/wiki/definition.ts +25 -0
- package/src/plugins/wiki/helpers.ts +59 -0
- package/src/plugins/wiki/index.ts +52 -0
- package/src/router/guards.ts +61 -0
- package/src/router/index.ts +50 -0
- package/src/tools/index.ts +52 -0
- package/src/tools/types.ts +27 -0
- package/src/types/events.ts +16 -0
- package/src/types/fileTree.ts +13 -0
- package/src/types/notification.ts +67 -0
- package/src/types/session.ts +116 -0
- package/src/types/sse.ts +90 -0
- package/src/types/toolCallHistory.ts +13 -0
- package/src/utils/agent/eventDispatch.ts +74 -0
- package/src/utils/agent/request.ts +55 -0
- package/src/utils/agent/toolCalls.ts +62 -0
- package/src/utils/api.ts +218 -0
- package/src/utils/canvas/viewMode.ts +46 -0
- package/src/utils/dom/authTokenMeta.ts +20 -0
- package/src/utils/dom/clickOutside.ts +11 -0
- package/src/utils/dom/externalLink.ts +57 -0
- package/src/utils/dom/scrollable.ts +24 -0
- package/src/utils/errors.ts +11 -0
- package/src/utils/files/expandedDirs.ts +25 -0
- package/src/utils/files/filename.ts +12 -0
- package/src/utils/files/sortChildren.ts +20 -0
- package/src/utils/filesPreview/schedulerPreview.ts +38 -0
- package/src/utils/filesPreview/todoPreview.ts +40 -0
- package/src/utils/format/date.ts +85 -0
- package/src/utils/format/frontmatter.ts +80 -0
- package/src/utils/format/jsonSyntax.ts +109 -0
- package/src/utils/html/previewCsp.ts +65 -0
- package/src/utils/image/resolve.ts +8 -0
- package/src/utils/image/rewriteMarkdownImageRefs.ts +182 -0
- package/src/utils/markdown/extractFirstH1.ts +39 -0
- package/src/utils/notification/dispatch.ts +22 -0
- package/src/utils/path/relativeLink.ts +130 -0
- package/src/utils/role/icon.ts +20 -0
- package/src/utils/role/merge.ts +10 -0
- package/src/utils/role/plugins.ts +12 -0
- package/src/utils/session/mergeSessions.ts +103 -0
- package/src/utils/session/seedRoleDefault.ts +35 -0
- package/src/utils/session/sessionEntries.ts +121 -0
- package/src/utils/session/sessionFactory.ts +22 -0
- package/src/utils/session/sessionHelpers.ts +99 -0
- package/src/utils/tools/dedup.ts +17 -0
- package/src/utils/tools/mcp.ts +33 -0
- package/src/utils/tools/pendingCalls.ts +16 -0
- package/src/utils/tools/result.ts +40 -0
- package/src/utils/types.ts +44 -0
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
// Pure RSS 2.0 + Atom 1.0 parser.
|
|
2
|
+
//
|
|
3
|
+
// Uses `fast-xml-parser` for the XML-decoding plumbing (CDATA,
|
|
4
|
+
// entity decoding, namespaces) and layers a feed-specific
|
|
5
|
+
// normalization on top. The output shape is format-agnostic:
|
|
6
|
+
// both RSS and Atom resolve into the same `ParsedFeedItem[]`.
|
|
7
|
+
//
|
|
8
|
+
// Pure — no I/O. Unit-testable with fixture strings.
|
|
9
|
+
|
|
10
|
+
import { XMLParser } from "fast-xml-parser";
|
|
11
|
+
import { isNonEmptyString, isRecord } from "../../../utils/types.js";
|
|
12
|
+
|
|
13
|
+
/**
 * One normalized feed entry. RSS items and Atom entries both resolve
 * into this shape, so consumers never branch on the feed format.
 */
export interface ParsedFeedItem {
  /**
   * Best-effort identity taken from the feed itself: `<guid>` for
   * RSS, `<id>` for Atom, falling back to the item link when neither
   * is present. Informational only — cross-source dedup is done by
   * the caller via `stableItemId(normalizedUrl)`.
   */
  feedId: string | null;
  /** Item title. Items without a title are dropped by the parser. */
  title: string;
  /** Item URL, or null when the feed did not supply a usable one. */
  link: string | null;
  /**
   * Publication timestamp. Sourced from RSS `<pubDate>` (RFC-822-ish)
   * or Atom `<published>` / `<updated>` (RFC 3339). Normalized to a
   * JavaScript-parseable ISO string when possible; otherwise passed
   * through verbatim for the consumer to handle.
   */
  publishedAt: string | null;
  /**
   * Short description: RSS `<description>` or Atom `<summary>`, with
   * the full content used as a fallback. May contain HTML — the
   * pipeline's summarizer step flattens it.
   */
  summary: string | null;
  /**
   * Full HTML/text body when the feed provides one separately from
   * the summary; null otherwise.
   */
  content: string | null;
}
|
|
35
|
+
|
|
36
|
+
/** Format-agnostic result of parsing one feed document. */
export interface ParsedFeed {
  /**
   * Which parser branch produced this result. Callers rarely care,
   * but it is useful in logs.
   */
  kind: "rss" | "atom";
  /** Feed-level title, or null when none could be read. */
  title: string | null;
  /** Successfully parsed entries. */
  items: ParsedFeedItem[];
}
|
|
43
|
+
|
|
44
|
+
// Tags that must always come back as arrays. Feeds emit either a
// single <item>/<entry>/<link> or several; forcing the array shape
// lets the code below treat both cases identically.
const ALWAYS_ARRAY_TAGS: ReadonlySet<string> = new Set(["item", "entry", "link"]);

// Shared parser instance, built once at module load. Options not
// listed here keep the library defaults.
//   - ignoreAttributes=false + "@_" prefix: attributes are kept and
//     surface as `@_name` keys (Atom's <link href=...> needs this)
//   - cdataPropName="#cdata": CDATA bodies land under a predictable
//     key, coalesced with plain text by the string helpers below
//   - parseTagValue / parseAttributeValue=false: every value stays a
//     string, so odd "dates" like "2026/13/45" are never coerced
//   - trimValues=true: surrounding whitespace is stripped
const xml = new XMLParser({
  attributeNamePrefix: "@_",
  cdataPropName: "#cdata",
  ignoreAttributes: false,
  isArray: (tagName) => ALWAYS_ARRAY_TAGS.has(tagName),
  parseAttributeValue: false,
  parseTagValue: false,
  trimValues: true,
});
|
|
64
|
+
|
|
65
|
+
// Entry point: turn a raw feed document into a `ParsedFeed`.
//
// Dispatches on the root element: <rss> → RSS 2.0, <feed> → Atom,
// <rdf:RDF> (or a bare <RDF>) → RSS 1.0.
//
// Returns null when the body is empty/whitespace, the XML parser
// throws, the parse result is not an object, or no recognized root
// element is present. The pipeline treats null like "zero new
// items" — logged + skipped.
export function parseFeed(body: string): ParsedFeed | null {
  const source = stripBom(body);
  if (source.trim().length === 0) return null;

  let doc: unknown;
  try {
    doc = xml.parse(source);
  } catch {
    // Unparseable XML is reported as "not a feed" rather than thrown.
    return null;
  }
  if (!isRecord(doc)) return null;

  if (isRecord(doc.rss)) return parseRss(doc.rss);
  if (isRecord(doc.feed)) return parseAtom(doc.feed);

  // fast-xml-parser keeps the namespace prefix on keys by default,
  // so look for the prefixed root first and the bare one second.
  const rdfRoot = doc["rdf:RDF"] ?? doc.RDF;
  return isRecord(rdfRoot) ? parseRss10(rdfRoot) : null;
}
|
|
89
|
+
|
|
90
|
+
// --- RSS 2.0 ------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
// Normalize an RSS 2.0 document (the value under the <rss> root).
// Returns null when there is no <channel> object. Entries that are
// not objects, or that `parseRssItem` rejects, are left out.
function parseRss(rss: Record<string, unknown>): ParsedFeed | null {
  const { channel } = rss;
  if (!isRecord(channel)) return null;

  const candidates: unknown[] = Array.isArray(channel.item) ? channel.item : [];
  const items = candidates.flatMap((candidate): ParsedFeedItem[] => {
    if (!isRecord(candidate)) return [];
    const item = parseRssItem(candidate);
    return item ? [item] : [];
  });

  return { kind: "rss", title: readString(channel.title), items };
}
|
|
108
|
+
|
|
109
|
+
// Normalize one RSS <item> (used for both RSS 2.0 and RSS 1.0/RDF
// items). Returns null for items without a title.
function parseRssItem(raw: Record<string, unknown>): ParsedFeedItem | null {
  const title = readString(raw.title);
  // Title-less items are dropped; bail out before doing more work.
  if (!title) return null;

  // <guid> may be a plain string or an object carrying attributes
  // (e.g. isPermaLink); readString copes with both.
  const guid = readString(raw.guid);
  const link = readString(raw.link);

  // RSS 2.0 dates live in <pubDate>. RSS 1.0 (RDF) has no <pubDate>;
  // its items carry the Dublin Core <dc:date> element instead, and
  // some RSS 2.0 feeds use it too. fast-xml-parser keeps the
  // namespace prefix on the key, hence the `dc:date` lookup. Without
  // this fallback such items would have no publication date.
  const rawDate = readString(raw.pubDate) ?? readString(raw["dc:date"]);
  const publishedAt = normalizeDate(rawDate);

  // <content:encoded> is the full HTML body some feeds emit next to
  // <description>. Read the prefixed key (the common case) and a bare
  // `encoded` key as a fallback for a stripped namespace.
  const content = readString(raw["content:encoded"]) ?? readString(raw.encoded);
  // Content-only feeds would otherwise have a null summary; the
  // summarizer drops `content`, so reuse it as the summary here.
  const summary = readString(raw.description) ?? content;

  return {
    feedId: guid ?? link ?? null,
    title,
    link,
    publishedAt,
    summary,
    content,
  };
}
|
|
141
|
+
|
|
142
|
+
// --- RSS 1.0 (RDF) ------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
// Normalize an RSS 1.0 (RDF) document. Here the <item> elements sit
// directly under the <rdf:RDF> root instead of inside <channel>;
// each item is handed to the same `parseRssItem` used for RSS 2.0.
// The feed title is read from <channel> when one is present.
function parseRss10(rdf: Record<string, unknown>): ParsedFeed | null {
  const candidates: unknown[] = Array.isArray(rdf.item) ? rdf.item : [];
  const items = candidates.flatMap((candidate): ParsedFeedItem[] => {
    if (!isRecord(candidate)) return [];
    const item = parseRssItem(candidate);
    return item ? [item] : [];
  });

  const { channel } = rdf;
  const title = isRecord(channel) ? readString(channel.title) : null;
  return { kind: "rss", title, items };
}
|
|
161
|
+
|
|
162
|
+
// --- Atom 1.0 -----------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
// Normalize an Atom document (the value under the <feed> root).
// Entries that are not objects, or that `parseAtomEntry` rejects,
// are left out of the result.
function parseAtom(feed: Record<string, unknown>): ParsedFeed | null {
  const candidates: unknown[] = Array.isArray(feed.entry) ? feed.entry : [];
  const items = candidates.flatMap((candidate): ParsedFeedItem[] => {
    if (!isRecord(candidate)) return [];
    const item = parseAtomEntry(candidate);
    return item ? [item] : [];
  });

  return { kind: "atom", title: readString(feed.title), items };
}
|
|
178
|
+
|
|
179
|
+
// Normalize one Atom <entry>. Returns null for entries without a
// title.
function parseAtomEntry(raw: Record<string, unknown>): ParsedFeedItem | null {
  const title = readString(raw.title);
  const id = readString(raw.id);
  const link = resolveAtomLink(raw.link);

  // Prefer <published>; fall back to <updated>.
  const timestamp = readString(raw.published) ?? readString(raw.updated);
  let publishedAt: string | null = null;
  if (timestamp) {
    publishedAt = normalizeDate(timestamp);
  }

  // As with RSS: content-only entries (e.g. GitHub-generated feeds)
  // reuse the content as their summary so they do not reach the
  // summary step title-only.
  const content = readString(raw.content);
  const summary = readString(raw.summary) ?? content;

  if (!title) return null;
  return {
    feedId: id ?? link ?? null,
    title,
    link,
    publishedAt,
    summary,
    content,
  };
}
|
|
200
|
+
|
|
201
|
+
// Pick the best URL out of an Atom entry's <link> value.
//
// Atom <link> shows up in three shapes in the wild:
//   1. `<link>https://x.com/</link>` — plain string body (rare but
//      real, e.g. hand-written atom feeds)
//   2. `<link href="..." rel="alternate"/>` — attribute-bearing
//      element, the spec-canonical form
//   3. several `<link>` elements with different `rel` values, a mix
//      of the above
//
// The parser is configured with `isArray` for "link", so the value
// normally arrives as an array holding plain strings (form 1) and/or
// objects with `@_href` (forms 2/3). A non-array value is wrapped in
// a one-element array so every shape takes the same path — in
// particular an empty string resolves to null instead of leaking
// through as "".
//
// Preference: the first rel="alternate" candidate wins (canonical
// web URL); otherwise the first candidate with a usable href/string
// value is returned; otherwise null.
function resolveAtomLink(raw: unknown): string | null {
  const candidates: unknown[] = Array.isArray(raw) ? raw : [raw];
  let fallback: string | null = null;
  for (const candidate of candidates) {
    const outcome = classifyAtomLinkCandidate(candidate);
    if (outcome.kind === "alternate") return outcome.href;
    // Remember only the first usable non-alternate link.
    if (outcome.kind === "fallback") fallback ??= outcome.href;
  }
  return fallback;
}
|
|
227
|
+
|
|
228
|
+
// Result of classifying a single Atom `<link>` candidate:
//   - "alternate": preferred link, carries the href
//   - "fallback":  usable link, but not the preferred one
//   - "skip":      nothing usable in this candidate
type AtomLinkOutcome =
  | { kind: "alternate"; href: string }
  | { kind: "fallback"; href: string }
  | { kind: "skip" };
|
|
229
|
+
|
|
230
|
+
// Classify one entry from Atom's `<link>` list. The entry may be a
// plain string or an object carrying `@_href` / `@_rel` attributes.
// Reports whether it is a preferred ("alternate") link, a usable
// fallback, or something we cannot use at all.
function classifyAtomLinkCandidate(candidate: unknown): AtomLinkOutcome {
  // Form 1: bare `<link>url</link>`. No attributes → fallback only.
  if (isNonEmptyString(candidate)) return { kind: "fallback", href: candidate };

  // Anything that is neither a string nor a record carries no href.
  if (!isRecord(candidate)) return { kind: "skip" };

  const href = readString(candidate["@_href"]);
  if (!href) return { kind: "skip" };

  // A missing `rel` is treated exactly like rel="alternate".
  const rel = readString(candidate["@_rel"]);
  const isAlternate = rel === null || rel === "alternate";
  return isAlternate ? { kind: "alternate", href } : { kind: "fallback", href };
}
|
|
248
|
+
|
|
249
|
+
// --- helpers ------------------------------------------------------------
|
|
250
|
+
|
|
251
|
+
// isRecord moved to server/utils/types.ts
|
|
252
|
+
|
|
253
|
+
// Pull a string out of a parsed-XML value, which may be:
//   - a plain string
//   - a record with `#text` (tag with attributes + body text)
//   - a record with `#cdata` (CDATA-wrapped body)
//   - an array (the first entry that yields a string wins)
// Returns null when nothing plausibly textual is found, including
// for a string that `isNonEmptyString` rejects.
function readString(value: unknown): string | null {
  if (typeof value === "string") {
    return isNonEmptyString(value) ? value : null;
  }
  // Record check intentionally runs before the array check.
  if (isRecord(value)) {
    return readStringFromRecord(value);
  }
  if (Array.isArray(value)) {
    return readStringFromArray(value);
  }
  return null;
}
|
|
266
|
+
|
|
267
|
+
// Read the textual body of a parsed element record: `#text` is
// preferred, `#cdata` is the fallback. Returns null when neither key
// holds a non-empty string.
function readStringFromRecord(record: Record<string, unknown>): string | null {
  for (const key of ["#text", "#cdata"] as const) {
    const body = record[key];
    if (isNonEmptyString(body)) return body;
  }
  return null;
}
|
|
274
|
+
|
|
275
|
+
// Walk the array in order and return the first entry that
// `readString` can turn into a string; null when none can.
function readStringFromArray(array: readonly unknown[]): string | null {
  for (let index = 0; index < array.length; index++) {
    const text = readString(array[index]);
    if (text !== null) {
      return text;
    }
  }
  return null;
}
|
|
282
|
+
|
|
283
|
+
// Drop a single leading byte-order mark (U+FEFF) if present;
// otherwise return the text untouched.
function stripBom(text: string): string {
  if (text.startsWith("\uFEFF")) {
    return text.slice(1);
  }
  return text;
}
|
|
286
|
+
|
|
287
|
+
// Normalize a date string to ISO 8601 when `Date.parse` understands
// it; otherwise hand back the original text unchanged. Never throws:
// an odd but non-empty date is more useful downstream than a null.
// Null or empty input yields null.
function normalizeDate(raw: string | null): string | null {
  if (raw === null || raw === "") {
    return null;
  }
  const epochMs = Date.parse(raw);
  return Number.isFinite(epochMs) ? new Date(epochMs).toISOString() : raw;
}
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
// Etiquette-respecting HTTP fetcher for server-side source
|
|
2
|
+
// fetchers (RSS, GitHub API, arXiv, etc.).
|
|
3
|
+
//
|
|
4
|
+
// Wraps `fetch` with four things phase-1 fetchers would otherwise
|
|
5
|
+
// all have to reimplement:
|
|
6
|
+
//
|
|
7
|
+
// 1. User-Agent: `MulmoClaude-SourceBot/1.0 (+<repo url>)` on every
|
|
8
|
+
// request, so site operators can identify and contact.
|
|
9
|
+
// 2. robots.txt check: before fetching `<scheme>://<host>/<path>`,
|
|
10
|
+
// read the cached robots.txt for `<host>` and consult
|
|
11
|
+
// `isAllowedByRobots`. Disallowed paths are rejected at the
|
|
12
|
+
// library boundary — fetcher sees `RobotsDisallowedError` and
|
|
13
|
+
// can log / skip.
|
|
14
|
+
// 3. Per-host rate limit: HostRateLimiter serializes same-host
|
|
15
|
+
// requests with a `Crawl-delay`-aware minimum gap.
|
|
16
|
+
// 4. Timeout: each request gets a finite AbortController so a
|
|
17
|
+
// hung server can't wedge the daily pipeline.
|
|
18
|
+
//
|
|
19
|
+
// The robots.txt cache itself is NOT owned by this module — it's
|
|
20
|
+
// a pluggable `RobotsProvider` so tests can stub it and the real
|
|
21
|
+
// cache (filesystem-backed, 24h TTL) lives elsewhere.
|
|
22
|
+
//
|
|
23
|
+
// Every moving part has an injectable dep so tests can drive the
|
|
24
|
+
// whole flow without network or disk.
|
|
25
|
+
|
|
26
|
+
import { DEFAULT_MIN_DELAY_MS, HostRateLimiter, defaultRateLimiterDeps, type RateLimiterDeps } from "./rateLimiter.js";
|
|
27
|
+
import { isAllowedByRobots, parseRobots } from "./robots.js";
|
|
28
|
+
import { ONE_SECOND_MS } from "../../utils/time.js";
|
|
29
|
+
|
|
30
|
+
// The User-Agent value sent on every fetch. Identifies us clearly
|
|
31
|
+
// enough for a site operator to find the project and contact us.
|
|
32
|
+
// Update the URL if the repo ever moves.
|
|
33
|
+
export const USER_AGENT = "MulmoClaude-SourceBot/1.0 (+https://github.com/receptron/mulmoclaude)";
|
|
34
|
+
|
|
35
|
+
// Per-request wall-clock cap. Fetchers can still cancel earlier
|
|
36
|
+
// via a passed-in AbortSignal; this is the outer safety net so a
|
|
37
|
+
// hung server never holds a rate-limit slot forever.
|
|
38
|
+
export const DEFAULT_FETCH_TIMEOUT_MS = 30 * ONE_SECOND_MS;
|
|
39
|
+
|
|
40
|
+
/**
 * Raised when fetching a URL would violate the target host's
 * robots.txt policy for our User-Agent. It is a dedicated class so
 * fetchers can tell a disallowed source apart from a generic HTTP
 * error.
 */
export class RobotsDisallowedError extends Error {
  /** The URL that robots.txt forbids. */
  readonly url: string;

  constructor(url: string) {
    super(`[sources] robots.txt disallows ${url} for our User-Agent`);
    this.name = "RobotsDisallowedError";
    this.url = url;
  }
}
|
|
51
|
+
|
|
52
|
+
// How the fetcher gets robots.txt for a given host. The real
|
|
53
|
+
// implementation (phase-2-ish) will read from
|
|
54
|
+
// `workspace/sources/_state/robots/<host>.txt` with a 24h TTL,
|
|
55
|
+
// falling back to an HTTP GET. Tests inject an in-memory map.
|
|
56
|
+
//
|
|
57
|
+
// Returns null to signal "no robots.txt found" (or "404-equivalent")
|
|
58
|
+
// which the evaluator treats as permissive — the usual convention.
|
|
59
|
+
export type RobotsProvider = (host: string) => Promise<string | null>;
|
|
60
|
+
|
|
61
|
+
// Cap the number of redirect hops one fetchPolite call will follow.
|
|
62
|
+
// Deliberately stricter than the common browser default (Firefox
// and Chrome allow 20); 5 is the historical RFC 2068 recommendation
// that RFC 7231 §6.4 still mentions. Rejects with
// `RedirectLimitError` once the cap is exceeded.
|
|
64
|
+
export const MAX_REDIRECTS = 5;
|
|
65
|
+
|
|
66
|
+
/**
 * Raised when a redirect chain runs past MAX_REDIRECTS. A dedicated
 * class lets callers tell "site is mis-configured" apart from
 * "network error" and log accordingly.
 */
export class RedirectLimitError extends Error {
  /** URL the chain started from. */
  readonly startUrl: string;
  /** Last URL recorded when the chain was abandoned. */
  readonly lastUrl: string;

  constructor(startUrl: string, lastUrl: string) {
    super(`[sources] too many redirects (>${MAX_REDIRECTS}) starting from ${startUrl}`);
    this.name = "RedirectLimitError";
    this.startUrl = startUrl;
    this.lastUrl = lastUrl;
  }
}
|
|
79
|
+
|
|
80
|
+
/** Injectable collaborators and settings for `fetchPolite`. */
export interface HttpFetcherDeps {
  /**
   * HTTP client. Defaults to global `fetch`; tests inject a
   * response-map function.
   */
  fetchImpl: typeof fetch;
  /** robots.txt source. See RobotsProvider. */
  robots: RobotsProvider;
  /**
   * Shared rate limiter, so fetchers going through the same
   * HttpFetcher instance all serialize per-host together.
   */
  rateLimiter: HostRateLimiter;
  /**
   * Rate-limiter clock / sleep — usually passed through from
   * `rateLimiter`'s own deps when constructing both.
   */
  rateLimiterDeps: RateLimiterDeps;
  /**
   * Per-host crawl-delay override, looked up on every request.
   * Returning null makes the fetcher fall back to
   * `DEFAULT_MIN_DELAY_MS`. Normally implemented by caching the
   * robots.txt `Crawl-delay` value per host.
   */
  crawlDelayMs: (host: string) => number | null;
  /**
   * Optional extra abort signal the caller can use to cancel a fetch
   * early (e.g. pipeline shutdown). Either this signal or the
   * internal timeout aborts the request, whichever fires first.
   */
  externalSignal?: AbortSignal;
  /** Request timeout. DEFAULT_FETCH_TIMEOUT_MS unless overridden. */
  timeoutMs: number;
  /**
   * For tests: invoked once per hop with the URL right before
   * `fetchImpl` runs. Production passes a no-op.
   */
  onWillFetch: (url: string) => void;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Build production defaults for `fetchPolite`.
 *
 * @param robots robots.txt provider (required; there is no default).
 * @param rateLimiter shared limiter; a fresh `HostRateLimiter` built
 *   from `rateLimiterDeps` is created when omitted.
 * @param rateLimiterDeps clock / sleep deps for the limiter.
 * @returns deps using the global `fetch`, no crawl-delay override,
 *   the default timeout, and a no-op `onWillFetch`.
 */
export function defaultHttpFetcherDeps(
  robots: RobotsProvider,
  rateLimiter?: HostRateLimiter,
  rateLimiterDeps: RateLimiterDeps = defaultRateLimiterDeps(),
): HttpFetcherDeps {
  // Bind so the global fetch keeps its expected `this`.
  const fetchImpl = globalThis.fetch.bind(globalThis);
  const limiter = rateLimiter ?? new HostRateLimiter(rateLimiterDeps);
  const deps: HttpFetcherDeps = {
    fetchImpl,
    robots,
    rateLimiter: limiter,
    rateLimiterDeps,
    crawlDelayMs: () => null,
    timeoutMs: DEFAULT_FETCH_TIMEOUT_MS,
    onWillFetch: () => {},
  };
  return deps;
}
|
|
123
|
+
|
|
124
|
+
// Fetch a URL politely. Resolves with the Response object (caller
// decides whether to `.text()` / `.json()`); rejects with
// `RobotsDisallowedError` when robots says no, `RedirectLimitError`
// when the redirect chain exceeds MAX_REDIRECTS, a `DOMException`
// (or the external signal's abort reason) when the external signal
// or the internal timeout fires, or whatever `fetchImpl` throws
// otherwise.
//
// Redirects are followed manually so every hop goes through the
// same robots-check + per-host rate-limit. Auto-follow (`fetch`'s
// default) would let a 302 to another host or another path bypass
// those checks entirely — a silent politeness violation.
//
// Intermediate 3xx responses are never handed to the caller, so
// their bodies are cancelled before the next hop; an unread body can
// otherwise keep the underlying connection pinned.
export async function fetchPolite(rawUrl: string, deps: HttpFetcherDeps): Promise<Response> {
  let currentUrl = rawUrl;
  // hop 0 is the initial request; up to MAX_REDIRECTS follow-ups.
  for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
    const response = await fetchSingleHop(currentUrl, deps);
    const nextUrl = redirectTarget(response, currentUrl);
    if (nextUrl === null) return response;
    await discardRedirectBody(response);
    currentUrl = nextUrl;
  }
  throw new RedirectLimitError(rawUrl, currentUrl);
}

// Best-effort release of a redirect response's body. Failures are
// ignored: the response is being thrown away either way.
async function discardRedirectBody(response: Response): Promise<void> {
  try {
    await response.body?.cancel();
  } catch {
    // Nothing useful to do — the body is unwanted regardless.
  }
}
|
|
145
|
+
|
|
146
|
+
// One hop of a polite fetch: validate the scheme, consult
// robots.txt, then perform the request under the per-host rate
// limit. Returns whatever Response the server produced — including
// a 3xx with a Location header, which `fetchPolite` inspects and may
// follow by calling back in.
async function fetchSingleHop(rawUrl: string, deps: HttpFetcherDeps): Promise<Response> {
  const target = new URL(rawUrl);

  // Only http(s) may reach the fetch — file://, data:, mailto: are
  // never legitimate source URLs and robots.txt doesn't cover them.
  const isHttp = target.protocol === "http:" || target.protocol === "https:";
  if (!isHttp) {
    throw new Error(`[sources] fetchPolite: refusing non-http(s) URL ${rawUrl}`);
  }

  const host = target.host.toLowerCase();

  // A null robots.txt means "none found" and is treated as permissive.
  const robotsText = await deps.robots(host);
  if (robotsText !== null) {
    const rules = parseRobots(robotsText);
    const allowed = isAllowedByRobots(rules, USER_AGENT, target.pathname + target.search);
    if (!allowed) {
      throw new RobotsDisallowedError(rawUrl);
    }
  }

  const minDelay = deps.crawlDelayMs(host) ?? DEFAULT_MIN_DELAY_MS;
  return deps.rateLimiter.run(host, () => fetchWithTimeout(rawUrl, deps), minDelay);
}
|
|
172
|
+
|
|
173
|
+
// Work out the absolute URL to hop to next, or null when the
// response is not a redirect we should follow. A 3xx without a
// Location header is treated as terminal — the server sent something
// unusual and surfacing the response is safer than guessing.
function redirectTarget(response: Response, currentUrl: string): string | null {
  const { status } = response;
  // 304 Not Modified is a cache response, not a redirect.
  const isFollowable = status >= 300 && status < 400 && status !== 304;
  if (!isFollowable) return null;

  const location = response.headers.get("location");
  if (location === null || location === "") return null;

  let resolved: URL;
  try {
    // Location may be relative; resolve it against the current hop.
    resolved = new URL(location, currentUrl);
  } catch {
    // Malformed Location — give up so the caller sees the 3xx
    // rather than a URL-parse exception.
    return null;
  }
  return resolved.toString();
}
|
|
192
|
+
|
|
193
|
+
// Run one `fetchImpl` call guarded by an internal timeout and,
// when supplied, the caller's abort signal. Every call owns a
// private AbortController so a slow request cannot affect the next
// caller. The timer and the abort listener are released as soon as
// `fetchImpl` settles.
async function fetchWithTimeout(rawUrl: string, deps: HttpFetcherDeps): Promise<Response> {
  const controller = new AbortController();
  const timer = setTimeout(() => {
    controller.abort(new DOMException(`[sources] fetch timed out after ${deps.timeoutMs}ms`, "TimeoutError"));
  }, deps.timeoutMs);

  // Cleanup steps, run in order once the fetch settles.
  const cleanups: Array<() => void> = [() => clearTimeout(timer)];

  const external = deps.externalSignal;
  if (external) {
    const abortReason = (): unknown => (external as AbortSignal & { reason?: unknown }).reason;
    if (external.aborted) {
      // Already cancelled: never start the request, never leak the timer.
      clearTimeout(timer);
      throw abortReason() ?? new DOMException("Aborted", "AbortError");
    }
    // Forward a later external abort (and its reason) to our controller.
    const forwardAbort = (): void => {
      controller.abort(abortReason());
    };
    external.addEventListener("abort", forwardAbort, { once: true });
    cleanups.push(() => external.removeEventListener("abort", forwardAbort));
  }

  deps.onWillFetch(rawUrl);
  try {
    return await deps.fetchImpl(rawUrl, {
      headers: { "User-Agent": USER_AGENT },
      signal: controller.signal,
      // Redirects are followed manually in `fetchPolite` so each
      // hop re-runs robots.txt + per-host rate-limit. Auto-follow
      // would skip those checks on the second hop.
      redirect: "manual",
    });
  } finally {
    for (const cleanup of cleanups) cleanup();
  }
}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
// User interest profile for news notification filtering (#466).
|
|
2
|
+
//
|
|
3
|
+
// Loaded from `config/interests.json`. Claude populates this file
|
|
4
|
+
// during conversation when it detects user interest in a topic.
|
|
5
|
+
// The pipeline's notify phase uses it to score and filter articles.
|
|
6
|
+
|
|
7
|
+
import fs from "fs";
|
|
8
|
+
import path from "path";
|
|
9
|
+
import { workspacePath } from "../paths.js";
|
|
10
|
+
import { log } from "../../system/logger/index.js";
|
|
11
|
+
import type { SourceItem } from "./types.js";
|
|
12
|
+
import type { CategorySlug } from "./taxonomy.js";
|
|
13
|
+
import { isCategorySlug } from "./taxonomy.js";
|
|
14
|
+
import { isNonEmptyString, isRecord } from "../../utils/types.js";
|
|
15
|
+
|
|
16
|
+
// ── Types ───────────────────────────────────────────────────────
|
|
17
|
+
|
|
18
|
+
/** User interest profile used to score and filter source items. */
export interface InterestsProfile {
  /** Keywords matched case-insensitively against title and summary. */
  keywords: string[];
  /** Categories whose presence on an item adds to its score. */
  categories: CategorySlug[];
  /** Minimum score an item needs to be kept. */
  minRelevance: number;
  /** Upper bound on the number of items returned per run. */
  maxNotificationsPerRun: number;
}
|
|
24
|
+
|
|
25
|
+
// ── Constants ───────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
// Profile location, relative to the workspace root.
const CONFIG_FILE = "config/interests.json";
// Fallbacks used when the profile omits the corresponding field.
const DEFAULT_MIN_RELEVANCE = 0.5;
const DEFAULT_MAX_NOTIFICATIONS = 5;

// Scoring weights (summed, then capped at 1.0 by `scoreItem`).
// Added per keyword found in the title.
const KEYWORD_TITLE_WEIGHT = 0.4;
// Added per keyword found only in the summary.
const KEYWORD_SUMMARY_WEIGHT = 0.2;
// Added once when any item category is in the profile.
const CATEGORY_MATCH_WEIGHT = 0.3;
// Added for severity "critical".
const SEVERITY_CRITICAL_WEIGHT = 0.3;
// Added for severity "warn".
const SEVERITY_WARN_WEIGHT = 0.1;
|
|
37
|
+
|
|
38
|
+
// ── Load ────────────────────────────────────────────────────────
|
|
39
|
+
|
|
40
|
+
/**
 * Load and validate the interest profile from `config/interests.json`.
 *
 * @param root base directory; defaults to the workspace path.
 * @returns the validated profile, or null when the file is missing,
 *   unreadable, not valid JSON, or fails validation. Read / parse
 *   failures are logged as warnings rather than thrown.
 */
export function loadInterests(root?: string): InterestsProfile | null {
  const filePath = path.join(root ?? workspacePath, CONFIG_FILE);
  try {
    // A missing file is the normal "no profile yet" case — no warning.
    if (!fs.existsSync(filePath)) {
      return null;
    }
    const parsed: unknown = JSON.parse(fs.readFileSync(filePath, "utf-8"));
    return validateInterests(parsed);
  } catch (err) {
    log.warn("interests", "failed to load interests.json", { error: String(err) });
    return null;
  }
}
|
|
55
|
+
|
|
56
|
+
// Validate and normalize a parsed `interests.json` payload.
//
// Returns null when the payload is not a record, or when it yields
// neither a usable keyword nor a known category (an empty profile
// cannot match anything). Otherwise returns a profile with:
//   - keywords: non-empty, non-whitespace strings only
//   - categories: only values accepted by `isCategorySlug`
//   - minRelevance: clamped to [0, 1]
//   - maxNotificationsPerRun: floored to an integer, minimum 1
// Missing, non-numeric or non-finite numbers fall back to defaults.
function validateInterests(raw: unknown): InterestsProfile | null {
  if (!isRecord(raw)) return null;
  const obj: Record<string, unknown> = raw;

  // Filter out blank/whitespace-only keywords — a blank keyword
  // matches virtually every title. The explicit trim check makes
  // this hold regardless of how `isNonEmptyString` treats whitespace.
  const keywords = Array.isArray(obj.keywords)
    ? obj.keywords.filter((k): k is string => isNonEmptyString(k) && k.trim().length > 0)
    : [];

  const categories = Array.isArray(obj.categories)
    ? obj.categories.filter((c): c is CategorySlug => isCategorySlug(c))
    : [];

  if (keywords.length === 0 && categories.length === 0) return null;

  // Clamp minRelevance to [0, 1] — values > 1 would make notifications
  // impossible since scores are clamped to 1.0. A NaN threshold would
  // do the same (every `score >= NaN` is false), so non-finite values
  // use the default instead.
  const minCandidate = obj.minRelevance;
  const rawMin =
    typeof minCandidate === "number" && Number.isFinite(minCandidate) ? minCandidate : DEFAULT_MIN_RELEVANCE;
  const minRelevance = Math.max(0, Math.min(1, rawMin));

  // Floor to integer, minimum 1. Non-finite values use the default.
  const maxCandidate = obj.maxNotificationsPerRun;
  const rawMax =
    typeof maxCandidate === "number" && Number.isFinite(maxCandidate) ? maxCandidate : DEFAULT_MAX_NOTIFICATIONS;
  const maxNotificationsPerRun = Math.max(1, Math.floor(rawMax));

  return { keywords, categories, minRelevance, maxNotificationsPerRun };
}
|
|
78
|
+
|
|
79
|
+
// ── Scoring ─────────────────────────────────────────────────────
|
|
80
|
+
|
|
81
|
+
/** A source item paired with the score `scoreItem` gave it. */
export interface ScoredItem {
  /** The item that was scored. */
  item: SourceItem;
  /** Its relevance score, capped at 1.0. */
  score: number;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Score one item against the interest profile.
 *
 * Each keyword contributes once: the title weight when it appears in
 * the title, otherwise the summary weight when it appears in the
 * summary. A category overlap and the item's severity add fixed
 * bonuses. Matching is case-insensitive substring matching.
 *
 * @returns the summed score, capped at 1.0.
 */
export function scoreItem(item: SourceItem, profile: InterestsProfile): number {
  const title = item.title.toLowerCase();
  const summary = (item.summary ?? "").toLowerCase();

  const keywordScore = profile.keywords.reduce((sum, keyword) => {
    const needle = keyword.toLowerCase();
    // A title hit takes precedence; the summary is not also counted.
    if (title.includes(needle)) return sum + KEYWORD_TITLE_WEIGHT;
    if (summary.includes(needle)) return sum + KEYWORD_SUMMARY_WEIGHT;
    return sum;
  }, 0);

  const categoryMatches = item.categories.some((c) => profile.categories.includes(c));
  const categoryScore = categoryMatches ? CATEGORY_MATCH_WEIGHT : 0;

  let severityScore = 0;
  if (item.severity === "critical") {
    severityScore = SEVERITY_CRITICAL_WEIGHT;
  } else if (item.severity === "warn") {
    severityScore = SEVERITY_WARN_WEIGHT;
  }

  return Math.min(keywordScore + categoryScore + severityScore, 1.0);
}
|
|
113
|
+
|
|
114
|
+
/**
 * Score every item, keep those at or above `profile.minRelevance`,
 * order them best-first, and return at most
 * `profile.maxNotificationsPerRun` of them. The input array is not
 * modified.
 */
export function scoreAndFilter(items: readonly SourceItem[], profile: InterestsProfile): ScoredItem[] {
  const kept: ScoredItem[] = [];
  for (const item of items) {
    const score = scoreItem(item, profile);
    if (score >= profile.minRelevance) {
      kept.push({ item, score });
    }
  }
  // Highest score first.
  kept.sort((left, right) => right.score - left.score);
  return kept.slice(0, profile.maxNotificationsPerRun);
}
|