explorbot 0.0.1 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +80 -26
- package/bin/explorbot-cli.ts +679 -0
- package/boat/api-tester/src/ai/chief/styles.ts +15 -0
- package/boat/api-tester/src/ai/chief.ts +335 -0
- package/boat/api-tester/src/ai/curler-tools.ts +278 -0
- package/boat/api-tester/src/ai/curler.ts +306 -0
- package/boat/api-tester/src/api-client.ts +28 -0
- package/boat/api-tester/src/apibot.ts +203 -0
- package/boat/api-tester/src/cli.ts +301 -0
- package/boat/api-tester/src/config.ts +190 -0
- package/dist/bin/explorbot-cli.js +19 -98
- package/dist/boat/api-tester/bin/apibot-cli.js +0 -1
- package/dist/boat/api-tester/src/ai/chief/styles.js +0 -1
- package/dist/boat/api-tester/src/ai/chief.js +0 -1
- package/dist/boat/api-tester/src/ai/curler-tools.js +0 -1
- package/dist/boat/api-tester/src/ai/curler.js +0 -1
- package/dist/boat/api-tester/src/api-client.js +0 -1
- package/dist/boat/api-tester/src/apibot.js +0 -1
- package/dist/boat/api-tester/src/cli.js +0 -1
- package/dist/boat/api-tester/src/config.js +0 -1
- package/dist/src/action-result.js +0 -1
- package/dist/src/action.js +0 -1
- package/dist/src/activity.js +0 -1
- package/dist/src/ai/agent.js +0 -1
- package/dist/src/ai/bosun.js +0 -1
- package/dist/src/ai/captain/idle-mode.js +0 -1
- package/dist/src/ai/captain/mixin.js +0 -1
- package/dist/src/ai/captain/test-mode.js +0 -1
- package/dist/src/ai/captain/web-mode.js +0 -1
- package/dist/src/ai/captain.js +0 -1
- package/dist/src/ai/conversation.js +0 -1
- package/dist/src/ai/experience-compactor.js +0 -1
- package/dist/src/ai/fisherman-tools.js +0 -1
- package/dist/src/ai/fisherman.js +0 -1
- package/dist/src/ai/historian.js +0 -1
- package/dist/src/ai/navigator.js +0 -1
- package/dist/src/ai/pilot.js +0 -1
- package/dist/src/ai/planner/session-dedup.js +0 -1
- package/dist/src/ai/planner/styles.js +0 -1
- package/dist/src/ai/planner/subpages.js +0 -1
- package/dist/src/ai/planner.js +0 -1
- package/dist/src/ai/provider.js +0 -1
- package/dist/src/ai/quartermaster.js +0 -1
- package/dist/src/ai/researcher/cache.js +0 -1
- package/dist/src/ai/researcher/coordinates.js +0 -1
- package/dist/src/ai/researcher/deep-analysis.js +0 -1
- package/dist/src/ai/researcher/fingerprint-worker.js +0 -1
- package/dist/src/ai/researcher/focus.js +0 -1
- package/dist/src/ai/researcher/locators.js +0 -1
- package/dist/src/ai/researcher/mixin.js +0 -1
- package/dist/src/ai/researcher/parser.js +0 -1
- package/dist/src/ai/researcher/research-result.js +0 -1
- package/dist/src/ai/researcher.js +0 -1
- package/dist/src/ai/rules.js +0 -1
- package/dist/src/ai/task-agent.js +0 -1
- package/dist/src/ai/tester.js +0 -1
- package/dist/src/ai/tools.js +0 -1
- package/dist/src/api/api-client.js +0 -1
- package/dist/src/api/request-result.js +0 -1
- package/dist/src/api/request-store.js +0 -1
- package/dist/src/api/spec-reader.js +0 -1
- package/dist/src/api/xhr-capture.js +0 -1
- package/dist/src/browser-server.js +0 -1
- package/dist/src/command-handler.js +0 -1
- package/dist/src/commands/add-rule-command.js +0 -1
- package/dist/src/commands/base-command.js +0 -1
- package/dist/src/commands/clean-command.js +0 -1
- package/dist/src/commands/context-aria-command.js +0 -1
- package/dist/src/commands/context-command.js +0 -1
- package/dist/src/commands/context-data-command.js +0 -1
- package/dist/src/commands/context-experience-command.js +0 -1
- package/dist/src/commands/context-html-command.js +0 -1
- package/dist/src/commands/context-knowledge-command.js +0 -1
- package/dist/src/commands/debug-command.js +0 -1
- package/dist/src/commands/drill-command.js +0 -1
- package/dist/src/commands/exit-command.js +0 -1
- package/dist/src/commands/explore-command.js +2 -2
- package/dist/src/commands/freesail-command.js +0 -1
- package/dist/src/commands/help-command.js +0 -1
- package/dist/src/commands/index.js +0 -1
- package/dist/src/commands/init-command.js +115 -0
- package/dist/src/commands/knows-command.js +0 -1
- package/dist/src/commands/learn-command.js +0 -1
- package/dist/src/commands/navigate-command.js +0 -1
- package/dist/src/commands/path-command.js +0 -1
- package/dist/src/commands/plan-clear-command.js +0 -1
- package/dist/src/commands/plan-command.js +0 -1
- package/dist/src/commands/plan-edit-command.js +0 -1
- package/dist/src/commands/plan-load-command.js +0 -1
- package/dist/src/commands/plan-reload-command.js +0 -1
- package/dist/src/commands/plan-save-command.js +0 -1
- package/dist/src/commands/research-command.js +0 -1
- package/dist/src/commands/start-command.js +0 -1
- package/dist/src/commands/status-command.js +0 -1
- package/dist/src/commands/test-command.js +0 -1
- package/dist/src/components/ActivityPane.js +0 -1
- package/dist/src/components/AddKnowledge.js +0 -1
- package/dist/src/components/AddRule.js +0 -1
- package/dist/src/components/App.js +0 -1
- package/dist/src/components/Autocomplete.js +0 -1
- package/dist/src/components/InputPane.js +0 -1
- package/dist/src/components/InputReadline.js +0 -1
- package/dist/src/components/LogPane.js +0 -1
- package/dist/src/components/PlanEditor.js +0 -1
- package/dist/src/components/PlanPane.js +0 -1
- package/dist/src/components/SessionTimer.js +0 -1
- package/dist/src/components/StateTransitionPane.js +0 -1
- package/dist/src/components/StatusPane.js +0 -1
- package/dist/src/components/TaskPane.js +0 -1
- package/dist/src/components/Welcome.js +0 -1
- package/dist/src/components/WelcomeChecklist.js +0 -1
- package/dist/src/components/WelcomeCommands.js +0 -1
- package/dist/src/components/autocomplete-store.js +0 -1
- package/dist/src/components/parse-keypress.js +0 -1
- package/dist/src/config.js +0 -1
- package/dist/src/execution-controller.js +0 -1
- package/dist/src/experience-tracker.js +0 -1
- package/dist/src/explorbot.js +0 -1
- package/dist/src/explorer.js +0 -1
- package/dist/src/index.js +0 -1
- package/dist/src/knowledge-tracker.js +2 -2
- package/dist/src/observability.js +0 -1
- package/dist/src/reporter.js +0 -1
- package/dist/src/state-manager.js +0 -1
- package/dist/src/stats.js +0 -1
- package/dist/src/test-plan.js +0 -1
- package/dist/src/utils/aria.js +0 -1
- package/dist/src/utils/cli-name.js +16 -0
- package/dist/src/utils/code-extractor.js +0 -1
- package/dist/src/utils/context-formatter.js +0 -1
- package/dist/src/utils/error-page.js +0 -1
- package/dist/src/utils/expandable.js +0 -1
- package/dist/src/utils/hooks-runner.js +0 -1
- package/dist/src/utils/html-diff.js +0 -1
- package/dist/src/utils/html.js +0 -1
- package/dist/src/utils/logger.js +0 -1
- package/dist/src/utils/loop.js +0 -1
- package/dist/src/utils/markdown-parser.js +0 -1
- package/dist/src/utils/markdown-query.js +0 -1
- package/dist/src/utils/markdown-terminal.js +0 -1
- package/dist/src/utils/research-parser.js +0 -1
- package/dist/src/utils/retry.js +0 -1
- package/dist/src/utils/rules-loader.js +0 -1
- package/dist/src/utils/strings.js +0 -1
- package/dist/src/utils/test-plan-markdown.js +0 -1
- package/dist/src/utils/throttle.js +0 -1
- package/dist/src/utils/unique-names.js +0 -1
- package/dist/src/utils/url-matcher.js +0 -1
- package/dist/src/utils/web-element.js +0 -1
- package/dist/src/utils/xpath.js +0 -1
- package/package.json +27 -3
- package/src/action-result.ts +694 -0
- package/src/action.ts +445 -0
- package/src/activity.ts +111 -0
- package/src/ai/agent.ts +3 -0
- package/src/ai/bosun.ts +557 -0
- package/src/ai/captain/idle-mode.ts +116 -0
- package/src/ai/captain/mixin.ts +22 -0
- package/src/ai/captain/test-mode.ts +262 -0
- package/src/ai/captain/web-mode.ts +136 -0
- package/src/ai/captain.ts +504 -0
- package/src/ai/conversation.ts +205 -0
- package/src/ai/experience-compactor.ts +284 -0
- package/src/ai/fisherman-tools.ts +181 -0
- package/src/ai/fisherman.ts +223 -0
- package/src/ai/historian.ts +457 -0
- package/src/ai/navigator.ts +572 -0
- package/src/ai/pilot.ts +776 -0
- package/src/ai/planner/session-dedup.ts +35 -0
- package/src/ai/planner/styles.ts +17 -0
- package/src/ai/planner/subpages.ts +141 -0
- package/src/ai/planner.ts +536 -0
- package/src/ai/provider.ts +613 -0
- package/src/ai/quartermaster.ts +286 -0
- package/src/ai/researcher/cache.ts +103 -0
- package/src/ai/researcher/coordinates.ts +238 -0
- package/src/ai/researcher/deep-analysis.ts +415 -0
- package/src/ai/researcher/fingerprint-worker.ts +59 -0
- package/src/ai/researcher/focus.ts +42 -0
- package/src/ai/researcher/locators.ts +282 -0
- package/src/ai/researcher/mixin.ts +4 -0
- package/src/ai/researcher/parser.ts +186 -0
- package/src/ai/researcher/research-result.ts +115 -0
- package/src/ai/researcher.ts +857 -0
- package/src/ai/rules.ts +376 -0
- package/src/ai/task-agent.ts +141 -0
- package/src/ai/tester.ts +939 -0
- package/src/ai/tools.ts +1117 -0
- package/src/api/api-client.ts +109 -0
- package/src/api/request-result.ts +212 -0
- package/src/api/request-store.ts +130 -0
- package/src/api/spec-reader.ts +174 -0
- package/src/api/xhr-capture.ts +100 -0
- package/src/browser-server.ts +74 -0
- package/src/command-handler.ts +454 -0
- package/src/commands/add-rule-command.ts +63 -0
- package/src/commands/base-command.ts +27 -0
- package/src/commands/clean-command.ts +73 -0
- package/src/commands/context-aria-command.ts +22 -0
- package/src/commands/context-command.ts +67 -0
- package/src/commands/context-data-command.ts +30 -0
- package/src/commands/context-experience-command.ts +48 -0
- package/src/commands/context-html-command.ts +33 -0
- package/src/commands/context-knowledge-command.ts +43 -0
- package/src/commands/debug-command.ts +13 -0
- package/src/commands/drill-command.ts +34 -0
- package/src/commands/exit-command.ts +32 -0
- package/src/commands/explore-command.ts +129 -0
- package/src/commands/freesail-command.ts +95 -0
- package/src/commands/help-command.ts +8 -0
- package/src/commands/index.ts +69 -0
- package/src/commands/init-command.ts +128 -0
- package/src/commands/knows-command.ts +68 -0
- package/src/commands/learn-command.ts +44 -0
- package/src/commands/navigate-command.ts +18 -0
- package/src/commands/path-command.ts +83 -0
- package/src/commands/plan-clear-command.ts +14 -0
- package/src/commands/plan-command.ts +41 -0
- package/src/commands/plan-edit-command.ts +9 -0
- package/src/commands/plan-load-command.ts +18 -0
- package/src/commands/plan-reload-command.ts +28 -0
- package/src/commands/plan-save-command.ts +25 -0
- package/src/commands/research-command.ts +45 -0
- package/src/commands/start-command.ts +13 -0
- package/src/commands/status-command.tsx +23 -0
- package/src/commands/test-command.ts +84 -0
- package/src/components/ActivityPane.tsx +80 -0
- package/src/components/AddKnowledge.tsx +169 -0
- package/src/components/AddRule.tsx +174 -0
- package/src/components/App.tsx +377 -0
- package/src/components/Autocomplete.tsx +63 -0
- package/src/components/InputPane.tsx +259 -0
- package/src/components/InputReadline.tsx +704 -0
- package/src/components/LogPane.tsx +187 -0
- package/src/components/PlanEditor.tsx +150 -0
- package/src/components/PlanPane.tsx +71 -0
- package/src/components/SessionTimer.tsx +35 -0
- package/src/components/StateTransitionPane.tsx +149 -0
- package/src/components/StatusPane.tsx +62 -0
- package/src/components/TaskPane.tsx +119 -0
- package/src/components/Welcome.tsx +83 -0
- package/src/components/WelcomeChecklist.tsx +118 -0
- package/src/components/WelcomeCommands.tsx +102 -0
- package/src/components/autocomplete-store.ts +35 -0
- package/src/components/parse-keypress.ts +170 -0
- package/src/config.ts +490 -0
- package/src/execution-controller.ts +109 -0
- package/src/experience-tracker.ts +350 -0
- package/src/explorbot.ts +405 -0
- package/src/explorer.ts +713 -0
- package/src/index.tsx +62 -0
- package/src/knowledge-tracker.ts +230 -0
- package/src/observability.ts +150 -0
- package/src/reporter.ts +224 -0
- package/src/state-manager.ts +556 -0
- package/src/stats.ts +53 -0
- package/src/test-plan.ts +432 -0
- package/src/utils/aria.ts +629 -0
- package/src/utils/cli-name.ts +13 -0
- package/src/utils/code-extractor.ts +22 -0
- package/src/utils/context-formatter.ts +239 -0
- package/src/utils/error-page.ts +23 -0
- package/src/utils/expandable.ts +38 -0
- package/src/utils/hooks-runner.ts +79 -0
- package/src/utils/html-diff.ts +918 -0
- package/src/utils/html.ts +1316 -0
- package/src/utils/logger.ts +534 -0
- package/src/utils/loop.ts +176 -0
- package/src/utils/markdown-parser.ts +127 -0
- package/src/utils/markdown-query.ts +466 -0
- package/src/utils/markdown-terminal.ts +43 -0
- package/src/utils/research-parser.ts +11 -0
- package/src/utils/retry.ts +73 -0
- package/src/utils/rules-loader.ts +118 -0
- package/src/utils/strings.ts +13 -0
- package/src/utils/test-plan-markdown.ts +332 -0
- package/src/utils/throttle.ts +18 -0
- package/src/utils/unique-names.ts +14 -0
- package/src/utils/url-matcher.ts +45 -0
- package/src/utils/web-element.ts +145 -0
- package/src/utils/xpath.ts +129 -0
- package/dist/bin/explorbot-cli.js.map +0 -1
- package/dist/boat/api-tester/bin/apibot-cli.js.map +0 -1
- package/dist/boat/api-tester/example/apibot.config.js +0 -31
- package/dist/boat/api-tester/example/apibot.config.js.map +0 -1
- package/dist/boat/api-tester/src/ai/chief/styles.js.map +0 -1
- package/dist/boat/api-tester/src/ai/chief.js.map +0 -1
- package/dist/boat/api-tester/src/ai/curler-tools.js.map +0 -1
- package/dist/boat/api-tester/src/ai/curler.js.map +0 -1
- package/dist/boat/api-tester/src/api-client.js.map +0 -1
- package/dist/boat/api-tester/src/apibot.js.map +0 -1
- package/dist/boat/api-tester/src/cli.js.map +0 -1
- package/dist/boat/api-tester/src/config.js.map +0 -1
- package/dist/prompts/audit-rules.md +0 -124
- package/dist/src/action-result.js.map +0 -1
- package/dist/src/action.js.map +0 -1
- package/dist/src/activity.js.map +0 -1
- package/dist/src/ai/agent.js.map +0 -1
- package/dist/src/ai/bosun.js.map +0 -1
- package/dist/src/ai/captain/idle-mode.js.map +0 -1
- package/dist/src/ai/captain/mixin.js.map +0 -1
- package/dist/src/ai/captain/test-mode.js.map +0 -1
- package/dist/src/ai/captain/web-mode.js.map +0 -1
- package/dist/src/ai/captain.js.map +0 -1
- package/dist/src/ai/conversation.js.map +0 -1
- package/dist/src/ai/experience-compactor.js.map +0 -1
- package/dist/src/ai/fisherman-tools.js.map +0 -1
- package/dist/src/ai/fisherman.js.map +0 -1
- package/dist/src/ai/historian.js.map +0 -1
- package/dist/src/ai/navigator.js.map +0 -1
- package/dist/src/ai/pilot.js.map +0 -1
- package/dist/src/ai/planner/session-dedup.js.map +0 -1
- package/dist/src/ai/planner/styles.js.map +0 -1
- package/dist/src/ai/planner/subpages.js.map +0 -1
- package/dist/src/ai/planner.js.map +0 -1
- package/dist/src/ai/provider.js.map +0 -1
- package/dist/src/ai/quartermaster.js.map +0 -1
- package/dist/src/ai/researcher/cache.js.map +0 -1
- package/dist/src/ai/researcher/coordinates.js.map +0 -1
- package/dist/src/ai/researcher/deep-analysis.js.map +0 -1
- package/dist/src/ai/researcher/fingerprint-worker.js.map +0 -1
- package/dist/src/ai/researcher/focus.js.map +0 -1
- package/dist/src/ai/researcher/locators.js.map +0 -1
- package/dist/src/ai/researcher/mixin.js.map +0 -1
- package/dist/src/ai/researcher/parser.js.map +0 -1
- package/dist/src/ai/researcher/research-result.js.map +0 -1
- package/dist/src/ai/researcher.js.map +0 -1
- package/dist/src/ai/rules.js.map +0 -1
- package/dist/src/ai/task-agent.js.map +0 -1
- package/dist/src/ai/tester.js.map +0 -1
- package/dist/src/ai/tools.js.map +0 -1
- package/dist/src/api/api-client.js.map +0 -1
- package/dist/src/api/request-result.js.map +0 -1
- package/dist/src/api/request-store.js.map +0 -1
- package/dist/src/api/spec-reader.js.map +0 -1
- package/dist/src/api/xhr-capture.js.map +0 -1
- package/dist/src/browser-server.js.map +0 -1
- package/dist/src/command-handler.js.map +0 -1
- package/dist/src/commands/add-rule-command.js.map +0 -1
- package/dist/src/commands/base-command.js.map +0 -1
- package/dist/src/commands/clean-command.js.map +0 -1
- package/dist/src/commands/context-aria-command.js.map +0 -1
- package/dist/src/commands/context-command.js.map +0 -1
- package/dist/src/commands/context-data-command.js.map +0 -1
- package/dist/src/commands/context-experience-command.js.map +0 -1
- package/dist/src/commands/context-html-command.js.map +0 -1
- package/dist/src/commands/context-knowledge-command.js.map +0 -1
- package/dist/src/commands/debug-command.js.map +0 -1
- package/dist/src/commands/drill-command.js.map +0 -1
- package/dist/src/commands/exit-command.js.map +0 -1
- package/dist/src/commands/explore-command.js.map +0 -1
- package/dist/src/commands/freesail-command.js.map +0 -1
- package/dist/src/commands/help-command.js.map +0 -1
- package/dist/src/commands/index.js.map +0 -1
- package/dist/src/commands/knows-command.js.map +0 -1
- package/dist/src/commands/learn-command.js.map +0 -1
- package/dist/src/commands/navigate-command.js.map +0 -1
- package/dist/src/commands/path-command.js.map +0 -1
- package/dist/src/commands/plan-clear-command.js.map +0 -1
- package/dist/src/commands/plan-command.js.map +0 -1
- package/dist/src/commands/plan-edit-command.js.map +0 -1
- package/dist/src/commands/plan-load-command.js.map +0 -1
- package/dist/src/commands/plan-reload-command.js.map +0 -1
- package/dist/src/commands/plan-save-command.js.map +0 -1
- package/dist/src/commands/research-command.js.map +0 -1
- package/dist/src/commands/start-command.js.map +0 -1
- package/dist/src/commands/status-command.js.map +0 -1
- package/dist/src/commands/test-command.js.map +0 -1
- package/dist/src/components/ActivityPane.js.map +0 -1
- package/dist/src/components/AddKnowledge.js.map +0 -1
- package/dist/src/components/AddRule.js.map +0 -1
- package/dist/src/components/App.js.map +0 -1
- package/dist/src/components/Autocomplete.js.map +0 -1
- package/dist/src/components/InputPane.js.map +0 -1
- package/dist/src/components/InputReadline.js.map +0 -1
- package/dist/src/components/LogPane.js.map +0 -1
- package/dist/src/components/PlanEditor.js.map +0 -1
- package/dist/src/components/PlanPane.js.map +0 -1
- package/dist/src/components/SessionTimer.js.map +0 -1
- package/dist/src/components/StateTransitionPane.js.map +0 -1
- package/dist/src/components/StatusPane.js.map +0 -1
- package/dist/src/components/TaskPane.js.map +0 -1
- package/dist/src/components/Welcome.js.map +0 -1
- package/dist/src/components/WelcomeChecklist.js.map +0 -1
- package/dist/src/components/WelcomeCommands.js.map +0 -1
- package/dist/src/components/autocomplete-store.js.map +0 -1
- package/dist/src/components/parse-keypress.js.map +0 -1
- package/dist/src/config.js.map +0 -1
- package/dist/src/execution-controller.js.map +0 -1
- package/dist/src/experience-tracker.js.map +0 -1
- package/dist/src/explorbot.js.map +0 -1
- package/dist/src/explorer.js.map +0 -1
- package/dist/src/index.js.map +0 -1
- package/dist/src/knowledge-tracker.js.map +0 -1
- package/dist/src/observability.js.map +0 -1
- package/dist/src/reporter.js.map +0 -1
- package/dist/src/state-manager.js.map +0 -1
- package/dist/src/stats.js.map +0 -1
- package/dist/src/test-plan.js.map +0 -1
- package/dist/src/utils/aria.js.map +0 -1
- package/dist/src/utils/code-extractor.js.map +0 -1
- package/dist/src/utils/context-formatter.js.map +0 -1
- package/dist/src/utils/error-page.js.map +0 -1
- package/dist/src/utils/expandable.js.map +0 -1
- package/dist/src/utils/hooks-runner.js.map +0 -1
- package/dist/src/utils/html-diff.js.map +0 -1
- package/dist/src/utils/html.js.map +0 -1
- package/dist/src/utils/logger.js.map +0 -1
- package/dist/src/utils/loop.js.map +0 -1
- package/dist/src/utils/markdown-parser.js.map +0 -1
- package/dist/src/utils/markdown-query.js.map +0 -1
- package/dist/src/utils/markdown-terminal.js.map +0 -1
- package/dist/src/utils/research-parser.js.map +0 -1
- package/dist/src/utils/retry.js.map +0 -1
- package/dist/src/utils/rules-loader.js.map +0 -1
- package/dist/src/utils/strings.js.map +0 -1
- package/dist/src/utils/test-plan-markdown.js.map +0 -1
- package/dist/src/utils/throttle.js.map +0 -1
- package/dist/src/utils/unique-names.js.map +0 -1
- package/dist/src/utils/url-matcher.js.map +0 -1
- package/dist/src/utils/web-element.js.map +0 -1
- package/dist/src/utils/xpath.js.map +0 -1
- package/prompts/audit-rules.md +0 -124
|
@@ -0,0 +1,1316 @@
|
|
|
1
|
+
import dedent from 'dedent';
|
|
2
|
+
import { minify } from 'html-minifier-next';
|
|
3
|
+
import { parse, parseFragment, serialize } from 'parse5';
|
|
4
|
+
import type * as parse5TreeAdapter from 'parse5/lib/tree-adapters/default';
|
|
5
|
+
import type { HtmlConfig } from '../config.ts';
|
|
6
|
+
|
|
7
|
+
/**
|
|
8
|
+
* HTML parsing library that preserves original structure while filtering content
|
|
9
|
+
* Based on CodeceptJS approach but with recursive parsing to maintain structure
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Simple CSS selector matcher.
 *
 * Supports exactly one simple selector: `tag`, `.class`, `#id`, `[attr]` or
 * `[attr=value]` (value optionally wrapped in quotes). Compound selectors
 * such as `div.card`, combinators and pseudo-classes are not supported and
 * yield `false`.
 *
 * @param element - parse5 element to test; a node without `tagName` never matches
 * @param selector - a single simple selector
 * @returns `true` when the element satisfies the selector
 */
function matchesSelector(element: parse5TreeAdapter.Element, selector: string): boolean {
  // Text nodes, comments etc. carry no tagName and can never match.
  if (!element || !element.tagName) {
    return false;
  }

  // Tag selector: the string contains none of the characters that introduce
  // another selector kind. Each marker needs its own includes() call: the
  // second argument of String#includes is a start position, not a second needle.
  const isPlainTag = !['.', '#', '[', ':'].some((marker) => selector.includes(marker));
  if (isPlainTag) {
    return element.tagName.toLowerCase() === selector.toLowerCase();
  }

  // Class selector
  if (selector.startsWith('.')) {
    const className = selector.slice(1);
    if (className === '') {
      return false;
    }
    const classAttr = element.attrs.find((attr) => attr.name === 'class');
    // Class tokens may be separated by any whitespace, not only single spaces.
    return classAttr ? classAttr.value.split(/\s+/).includes(className) : false;
  }

  // ID selector
  if (selector.startsWith('#')) {
    const id = selector.slice(1);
    const idAttr = element.attrs.find((attr) => attr.name === 'id');
    return idAttr ? idAttr.value === id : false;
  }

  // Attribute selector
  if (selector.startsWith('[') && selector.endsWith(']')) {
    const attrContent = selector.slice(1, -1);
    const eqIndex = attrContent.indexOf('=');

    if (eqIndex === -1) {
      // Bare `[attr]`: only the attribute's presence matters.
      return element.attrs.some((attr) => attr.name === attrContent);
    }

    const attrName = attrContent.slice(0, eqIndex);
    // Strip one leading and one trailing quote character, if present.
    const expectedValue = attrContent.slice(eqIndex + 1).replace(/^["']|["']$/g, '');
    const attr = element.attrs.find((a) => a.name === attrName);
    return attr ? attr.value === expectedValue : false;
  }

  // Anything else (compound selectors, pseudo-classes, ...) is unsupported.
  return false;
}
|
|
61
|
+
|
|
62
|
+
/**
 * Tell whether the element satisfies at least one selector from the list.
 *
 * @param element - parse5 element to test
 * @param selectors - simple selectors understood by `matchesSelector`
 * @returns `false` for a missing or empty list, otherwise whether any selector matches
 */
function matchesAnySelector(element: parse5TreeAdapter.Element, selectors: string[]): boolean {
  // `some` on an empty array is already false, so only the missing list needs a guard.
  return selectors ? selectors.some((candidate) => matchesSelector(element, candidate)) : false;
}
|
|
75
|
+
|
|
76
|
+
// Lowercase tag names: headings, p, li, table cells, label, div and span.
// NOTE(review): not referenced in the visible part of this file; presumably marks
// elements kept for their text content — confirm at the use site.
const TEXT_ELEMENT_TAGS = new Set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li', 'td', 'th', 'label', 'div', 'span']);
|
|
77
|
+
|
|
78
|
+
// Lowercase tag names of links, form controls, details/summary, plus iframe.
// NOTE(review): use site is outside this excerpt; presumably these elements are
// always treated as interactive — confirm.
const INTERACTIVE_TAGS = new Set(['a', 'button', 'details', 'input', 'option', 'select', 'summary', 'textarea', 'iframe']);
|
|
79
|
+
|
|
80
|
+
// Role names for widget-like elements (button, checkbox, link, tab, textbox, ...).
// NOTE(review): presumably compared against an element's `role` attribute — confirm at the use site.
const INTERACTIVE_ROLES = new Set(['button', 'checkbox', 'combobox', 'link', 'listbox', 'radio', 'search', 'switch', 'tab', 'textbox']);
|
|
81
|
+
|
|
82
|
+
// Inline event-handler attribute names.
// NOTE(review): presumably the presence of any of these marks an element as
// interactive — confirm at the use site (outside this excerpt).
const INTERACTIVE_EVENT_ATTRIBUTES = new Set(['onclick', 'onchange', 'onblur', 'onfocus', 'onmousedown', 'onmouseup']);
|
|
83
|
+
|
|
84
|
+
// Exact class names treated as "hidden" markers. The list mixes display-hiding
// utilities (e.g. `hidden`, `d-none`) with screen-reader-only helpers
// (e.g. `sr-only`, `visually-hidden`) and `opacity-0`.
// NOTE(review): presumably elements carrying one of these are skipped — confirm at the use site.
const HIDDEN_CLASSES = new Set(['hidden', 'invisible', 'd-none', 'hide', 'dn', 'u-hidden', 'is-hidden', 'visually-hidden', 'sr-only', 'screen-reader-only', 'visuallyhidden', 'opacity-0']);
|
|
85
|
+
|
|
86
|
+
// Matches class names that START with one of these prefixes:
// text-, color-, flex-, float-, v-, ember-, d-, border-.
// Note that the `d-` prefix also matches `d-none`, which is listed in HIDDEN_CLASSES.
// NOTE(review): presumably used to drop noisy presentational class names — confirm at the use site.
export const TRASH_HTML_CLASSES = /^(text-|color-|flex-|float-|v-|ember-|d-|border-)/;
|
|
87
|
+
|
|
88
|
+
/**
 * Patterns recognising Tailwind-style utility class names.
 *
 * Every pattern is anchored at the start of the class name and is
 * case-insensitive; patterns ending in `$` match one exact class name, the
 * rest match a prefix. None of them uses the `g`/`y` flag, so `.test()` can
 * be called repeatedly without resetting `lastIndex`.
 */
export const TAILWIND_CLASS_PATTERNS: RegExp[] = [
  // Spacing and sizing
  /^m[trblxy]?-/i,
  /^p[trblxy]?-/i,
  /^(min|max)-(w|h)-/i,
  /^(h|w)-/i,

  // Colour and typography
  /^bg-/i,
  /^text-/i,
  /^font-/i,
  /^leading-/i,
  /^tracking-/i,
  /^uppercase$/i,
  /^lowercase$/i,
  /^capitalize$/i,
  /^italic$/i,
  /^antialiased$/i,
  /^subpixel-antialiased$/i,
  /^whitespace-/i,
  /^break-/i,

  // Flexbox / grid layout
  /^flex$/i,
  /^inline-flex$/i,
  /^grid$/i,
  /^inline-grid$/i,
  /^items-/i,
  /^content-/i,
  /^justify-/i,
  /^place-/i,
  /^self-/i,
  /^gap-/i,
  /^space-[xy]-/i,
  /^order-/i,
  /^z-/i,

  // Borders and effects
  /^shadow/i,
  /^rounded/i,
  /^border/i,
  /^outline-/i,
  /^ring-/i,
  /^opacity-/i,
  /^fill-/i,
  /^stroke-/i,

  // Filters
  /^blur-/i,
  /^brightness-/i,
  /^contrast-/i,
  /^drop-shadow-/i,
  /^grayscale$/i,
  /^hue-rotate-/i,
  /^invert$/i,
  /^saturate-/i,
  /^sepia$/i,
  /^backdrop-/i,

  // Overflow, transitions and interaction
  /^overflow-/i,
  /^truncate$/i,
  /^transform$/i,
  /^transition$/i,
  /^duration-/i,
  /^delay-/i,
  /^ease-/i,
  /^animate-/i,
  /^cursor-/i,
  /^select-/i,
  /^pointer-events-/i,

  // Tables, lists and grid tracks
  /^align-/i,
  /^table-/i,
  /^list-/i,
  /^grid-cols-/i,
  /^grid-rows-/i,
  /^col-span-/i,
  /^row-span-/i,

  // Transforms
  /^translate-[xyz]-/i,
  // Covers both uniform (`scale-50`) and per-axis (`scale-x-50`) utilities;
  // requiring a second hyphen would miss the uniform form.
  /^scale-/i,
  /^rotate-/i,
  /^skew-[xy]-/i,
  /^origin-/i,

  // Positioning and misc
  /^inset-/i,
  /^top-/i,
  /^bottom-/i,
  /^left-/i,
  /^right-/i,
  /^aspect-/i,
  /^prose$/i,
];
|
|
168
|
+
|
|
169
|
+
/**
 * Tag names handed to `stripElementsByTag` by `sanitizeDocumentTree`.
 * Entries are lowercase because the lookup lowercases `tagName` first,
 * which is why SVG names appear as `clippath`, `animatetransform`, etc.
 */
const NON_SEMANTIC_TAGS = new Set([
  // Styling, scripting and document metadata
  'style', 'script', 'link', 'meta', 'base', 'template', 'slot', 'noscript',
  // Frames and embedded objects
  'frame', 'frameset', 'object', 'embed',
  // SVG shapes
  'path', 'polygon', 'polyline', 'circle', 'ellipse', 'line', 'rect',
  // SVG structure and paint servers
  'defs', 'g', 'symbol', 'use', 'mask', 'pattern', 'clippath',
  // SVG animation
  'animate', 'animatetransform', 'animatecolor',
]);
|
|
200
|
+
|
|
201
|
+
// The parse5 node kinds this module walks through their `childNodes` array.
type ParentNodeLike = parse5TreeAdapter.Document | parse5TreeAdapter.DocumentFragment | parse5TreeAdapter.Element;
|
|
202
|
+
|
|
203
|
+
/**
 * Type guard: does this value look like a node that owns children?
 *
 * @param node - any value
 * @returns `true` only for a non-null object whose `childNodes` property is an array
 */
function hasChildNodes(node: unknown): node is ParentNodeLike {
  if (!node || typeof node !== 'object') {
    return false;
  }

  const candidate = node as { childNodes?: unknown };
  return 'childNodes' in candidate && Array.isArray(candidate.childNodes);
}
|
|
206
|
+
|
|
207
|
+
/**
 * Recursively remove, in place, every comment node and every element whose
 * lowercased tag name is in `tagsToRemove`.
 *
 * @param node - subtree root; its `childNodes` array is mutated
 * @param tagsToRemove - lowercase tag names to drop together with their subtrees
 */
function stripElementsByTag(node: ParentNodeLike, tagsToRemove: Set<string>): void {
  const children = node.childNodes;
  if (!children) return;

  // Walk backwards so splicing never shifts an index that is still to be visited.
  let index = children.length;
  while (index-- > 0) {
    const current = children[index];

    const isComment = current.nodeName === '#comment';
    const isUnwantedElement = 'tagName' in current && !!current.tagName && tagsToRemove.has(current.tagName.toLowerCase());
    if (isComment || isUnwantedElement) {
      children.splice(index, 1);
      continue;
    }

    // Surviving elements are always descended into; other nodes only when
    // they expose a childNodes array.
    if (('tagName' in current && current.tagName) || hasChildNodes(current)) {
      stripElementsByTag(current as ParentNodeLike, tagsToRemove);
    }
  }
}
|
|
231
|
+
|
|
232
|
+
/**
 * Reduce `<head>` in place so that only `<title>` elements and text nodes
 * with non-whitespace content remain. Other elements, blank text nodes,
 * comments and any other node kinds are removed.
 *
 * Does nothing when the document has no `<html>` child or no `<head>` under it.
 *
 * @param document - parse5 document whose head is pruned
 */
function pruneDocumentHead(document: parse5TreeAdapter.Document): void {
  const findElement = (parent: parse5TreeAdapter.Document | parse5TreeAdapter.Element, tag: string) =>
    parent.childNodes?.find((node): node is parse5TreeAdapter.Element => 'tagName' in node && node.tagName?.toLowerCase() === tag);

  const html = findElement(document, 'html');
  const head = html ? findElement(html, 'head') : undefined;
  const headChildren = head?.childNodes;
  if (!headChildren) {
    return;
  }

  const shouldKeep = (node: (typeof headChildren)[number]): boolean => {
    if ('tagName' in node && node.tagName) {
      return node.tagName.toLowerCase() === 'title';
    }
    if (node.nodeName === '#text') {
      return (node as parse5TreeAdapter.TextNode).value.trim() !== '';
    }
    // Comments and every other node kind are dropped.
    return false;
  };

  // Backwards so removals do not disturb positions still to be checked.
  for (let position = headChildren.length - 1; position >= 0; position -= 1) {
    if (!shouldKeep(headChildren[position])) {
      headChildren.splice(position, 1);
    }
  }
}
|
|
274
|
+
|
|
275
|
+
/**
 * Clean a parsed document in place in two passes: first drop comments and
 * every NON_SEMANTIC_TAGS element anywhere in the tree, then prune `<head>`.
 *
 * @param document - parse5 document to clean (mutated)
 */
function sanitizeDocumentTree(document: parse5TreeAdapter.Document): void {
  // Pass 1: whole-tree removal of comments and non-semantic elements.
  stripElementsByTag(document, NON_SEMANTIC_TAGS);

  // Pass 2: head-specific pruning.
  pruneDocumentHead(document);
}
|
|
279
|
+
|
|
280
|
+
/**
 * Read the trimmed text of `html > head > title`.
 *
 * @param document - parse5 document to inspect
 * @returns the title text, or `null` when any step of the path is missing
 *          or the title is empty after trimming
 */
function getDocumentTitle(document: parse5TreeAdapter.Document): string | null {
  let scope: parse5TreeAdapter.Document | parse5TreeAdapter.Element = document;

  // Descend one level per tag; bail out as soon as a level is missing.
  for (const tag of ['html', 'head', 'title']) {
    if (!scope.childNodes) {
      return null;
    }
    const next = scope.childNodes.find((node): node is parse5TreeAdapter.Element => 'tagName' in node && node.tagName?.toLowerCase() === tag);
    if (!next) {
      return null;
    }
    scope = next;
  }

  const text = getTextContent(scope as parse5TreeAdapter.Element).trim();
  return text === '' ? null : text;
}
|
|
304
|
+
|
|
305
|
+
/**
 * Make `<head>` contain exactly one `<title>` holding `titleText`.
 *
 * - No-op when `titleText` is empty/null, or the document has no usable `<html>` element.
 * - An existing `<head>` has its children replaced by the new `<title>`.
 * - A missing `<head>` is created and inserted directly before `<body>`, or as
 *   the first child of `<html>` when there is no `<body>`.
 *
 * @param document - parse5 document to modify in place
 * @param titleText - title to write; falsy values leave the document untouched
 */
function ensureDocumentTitle(document: parse5TreeAdapter.Document, titleText: string | null): void {
  if (!titleText || !document.childNodes) {
    return;
  }

  const htmlElement = document.childNodes.find((node): node is parse5TreeAdapter.Element => 'tagName' in node && node.tagName?.toLowerCase() === 'html');

  // Guard childNodes too, matching the other head helpers in this file.
  if (!htmlElement || !htmlElement.childNodes) {
    return;
  }

  const namespace = htmlElement.namespaceURI || 'http://www.w3.org/1999/xhtml';

  let headElement = htmlElement.childNodes.find((node): node is parse5TreeAdapter.Element => 'tagName' in node && node.tagName?.toLowerCase() === 'head');

  if (!headElement) {
    headElement = {
      nodeName: 'head',
      tagName: 'head',
      attrs: [],
      namespaceURI: namespace,
      childNodes: [],
      parentNode: htmlElement,
    } as parse5TreeAdapter.Element;

    // Insert head before body if possible, otherwise prepend: head must come
    // first, so index 0 is used when no body exists.
    const bodyIndex = htmlElement.childNodes.findIndex((node) => 'tagName' in node && node.tagName?.toLowerCase() === 'body');
    htmlElement.childNodes.splice(bodyIndex === -1 ? 0 : bodyIndex, 0, headElement);
  } else {
    // Existing head: discard whatever it held; only the title is written back.
    headElement.childNodes = [];
  }

  const titleElement: parse5TreeAdapter.Element = {
    nodeName: 'title',
    tagName: 'title',
    attrs: [],
    namespaceURI: namespace,
    childNodes: [],
    parentNode: headElement,
  };

  const textNode = {
    nodeName: '#text',
    value: titleText,
    parentNode: titleElement,
  } as parse5TreeAdapter.TextNode;

  titleElement.childNodes.push(textNode);
  headElement.childNodes.push(titleElement);
}
|
|
359
|
+
|
|
360
|
+
/**
 * Parses HTML and returns the sanitized document tree.
 *
 * The title is read before sanitizing and written back afterwards, so it
 * survives whatever the sanitizing passes remove from the head.
 *
 * @param html - HTML source
 * @param _htmlConfig - accepted for signature compatibility; not used here
 * @returns the sanitized parsed document
 */
function createSanitizedDocument(html: string, _htmlConfig?: HtmlConfig): parse5TreeAdapter.Document {
  const parsed = parse(html);
  const originalTitle = getDocumentTitle(parsed);

  sanitizeDocumentTree(parsed);
  ensureDocumentTitle(parsed, originalTitle);

  return parsed;
}
|
|
367
|
+
|
|
368
|
+
/**
 * Public entry point returning the sanitized document tree for an HTML string.
 *
 * @param html - HTML source
 * @param htmlConfig - forwarded to createSanitizedDocument
 * @returns the sanitized parsed document
 */
export function sanitizeHtmlDocument(html: string, htmlConfig?: HtmlConfig): parse5TreeAdapter.Document {
  const sanitized = createSanitizedDocument(html, htmlConfig);
  return sanitized;
}
|
|
371
|
+
|
|
372
|
+
/**
 * Sanitizes an HTML string and serializes the result back to HTML.
 *
 * @param html - HTML source
 * @param htmlConfig - forwarded to createSanitizedDocument
 * @returns the sanitized document as an HTML string
 */
export function sanitizeHtmlString(html: string, htmlConfig?: HtmlConfig): string {
  return serialize(createSanitizedDocument(html, htmlConfig));
}
|
|
376
|
+
|
|
377
|
+
/**
 * Produces a minimal HTML snapshot containing interactive elements, the
 * label/h1/h2 text elements, and the ancestors required to reach them.
 * Based on the CodeceptJS HTML library.
 *
 * @param html - HTML source to parse
 * @param htmlConfig - optional `include` / `exclude` selector lists
 * @returns the pruned document serialized back to HTML
 */
export function htmlMinimalUISnapshot(html: string, htmlConfig?: HtmlConfig['minimal']) {
  const document = parse(html);

  const trashClassPattern = TRASH_HTML_CLASSES;
  const droppedTags = ['path', 'script'];
  const interactiveTags = ['a', 'input', 'button', 'select', 'textarea', 'option', 'iframe'];
  const textTags = ['label', 'h1', 'h2'];
  const interactiveRoles = ['button', 'checkbox', 'search', 'textbox', 'tab'];
  const keptAttrNames = ['id', 'for', 'class', 'name', 'type', 'value', 'tabindex', 'aria-labelledby', 'aria-label', 'label', 'placeholder', 'title', 'alt', 'src', 'width', 'height', 'role'];

  // Nodes that are always removed: excluded by config, <path>/<script>, or role="tooltip".
  const isFilteredOut = (node) => {
    if (htmlConfig?.exclude && matchesAnySelector(node, htmlConfig.exclude)) return true;
    if (droppedTags.includes(node.nodeName)) return true;
    return Boolean(node.attrs) && node.attrs.some((attr) => attr.name === 'role' && attr.value === 'tooltip');
  };

  // Include selectors win over exclude selectors, which win over the default rules.
  const isInteractive = (element) => {
    if (htmlConfig?.include && matchesAnySelector(element, htmlConfig.include)) return true;
    if (htmlConfig?.exclude && matchesAnySelector(element, htmlConfig.exclude)) return false;
    if (hasExplorbotAttributes(element)) return true;

    const tag = element.nodeName;
    if (tag === 'input' && element.attrs.some((attr) => attr.name === 'type' && attr.value === 'hidden')) return false;
    if (interactiveTags.includes(tag)) return true;
    if (!element.attrs) return false;

    if (element.attrs.some((attr) => attr.name === 'contenteditable')) return true;
    if (element.attrs.some((attr) => attr.name === 'tabindex')) return true;

    // Only the first role attribute is consulted.
    const roleAttr = element.attrs.find((attr) => attr.name === 'role');
    return Boolean(roleAttr) && interactiveRoles.includes(roleAttr.value);
  };

  const hasMeaningfulText = (node) => textTags.includes(node.nodeName);

  // True when any node below `node` is interactive or a text element.
  const hasInteractiveDescendant = (node) => {
    if (!node.childNodes) return false;
    return node.childNodes.some((child) => isInteractive(child) || hasMeaningfulText(child) || hasInteractiveDescendant(child));
  };

  // Drops class names with digits, trash classes, names containing ':' or '__', and tailwind utilities.
  const stripNoisyClasses = (classValue) =>
    classValue
      .split(' ')
      .filter(
        (className) =>
          !/\d/.test(className) &&
          !className.match(trashClassPattern) &&
          !className.match(/(:|__)/) &&
          !TAILWIND_CLASS_PATTERNS.some((pattern) => pattern.test(className))
      )
      .join(' ');

  function prune(node) {
    if (node.nodeName !== '#document') {
      const parent = node.parentNode;
      const position = parent.childNodes.indexOf(node);

      if (isFilteredOut(node)) {
        parent.childNodes.splice(position, 1);
        return true;
      }

      const parentInteractive = isInteractive(parent);

      // Text directly under an interactive or text element is kept, trimmed and capped at 200 chars.
      if ((parentInteractive || hasMeaningfulText(parent)) && node.nodeName === '#text') {
        node.value = node.value.trim().slice(0, 200);
        return Boolean(node.value);
      }

      // Children of an interactive parent are kept so the parent can still be matched through them.
      const keep = parentInteractive || isInteractive(node) || hasInteractiveDescendant(node) || hasMeaningfulText(node);
      if (!keep) {
        parent.childNodes.splice(position, 1);
        return true;
      }
    }

    if (node.nodeName === 'svg') {
      cleanElement(node as parse5TreeAdapter.Element);
      return false;
    }

    if (node.attrs) {
      const keptAttrs = [];
      for (const attr of node.attrs) {
        if (attr.name === 'class') {
          attr.value = stripNoisyClasses(attr.value);
        }
        if (keptAttrNames.includes(attr.name) || attr.name.startsWith('data-explorbot-')) {
          keptAttrs.push(attr);
        }
      }
      node.attrs = keptAttrs;

      // Turn data-explorbot-* attributes into their plain counterparts.
      convertExplorbotAttributes(node);
    }

    if (node.childNodes) {
      // Walk a reversed snapshot so removals never shift a child that is still pending.
      for (const child of [...node.childNodes].reverse()) {
        prune(child);
      }
    }
    return false;
  }

  prune(document);

  return serialize(document);
}
|
|
529
|
+
|
|
530
|
+
/**
 * Minifies HTML with `minify`: whitespace is collapsed and comments removed,
 * while empty elements and optional tags are left in place.
 *
 * @param html - HTML source
 * @returns the minified HTML
 */
export async function minifyHtml(html: string): Promise<string> {
  const minifyOptions = {
    collapseWhitespace: true,
    removeComments: true,
    removeEmptyElements: false,
    removeOptionalTags: false,
  };

  const minified = await minify(html, minifyOptions);
  return minified;
}
|
|
538
|
+
|
|
539
|
+
/**
 * Builds a combined snapshot: the sanitized document with its <body> filtered
 * by `shouldKeepCombined` and every remaining element cleaned.
 *
 * @param html - HTML source
 * @param htmlConfig - optional selector configuration passed to shouldKeepCombined
 * @param opts - `keepPositions` is forwarded to filterTree
 * @returns the serialized snapshot, or the input `html` unchanged when no <body> is found
 */
export function htmlCombinedSnapshot(html: string, htmlConfig?: HtmlConfig['combined'], opts?: { keepPositions?: boolean }): string {
  const document = createSanitizedDocument(html);
  const body = findBody(document);
  if (!body) return html;

  const keepPredicate = (element: parse5TreeAdapter.Element) => shouldKeepCombined(element, htmlConfig);

  filterTree(body, keepPredicate, opts?.keepPositions);
  cleanAllElements(body);

  return serialize(document);
}
|
|
557
|
+
|
|
558
|
+
/**
 * Builds a text-only, markdown-formatted snapshot of the document body.
 *
 * @param html - HTML source
 * @param htmlConfig - optional include/exclude selectors passed to processHtmlForText
 * @returns the trimmed text, or an empty string when no <body> is found
 */
export function htmlTextSnapshot(html: string, htmlConfig?: HtmlConfig['text']): string {
  const body = findBody(createSanitizedDocument(html));
  return body ? processHtmlForText(body, htmlConfig).trim() : '';
}
|
|
569
|
+
|
|
570
|
+
/**
 * Renders an element subtree as markdown-flavoured plain text.
 *
 * Output conventions:
 * - h1..h6 become `#`..`######` headings
 * - buttons, links and submit/button/reset inputs become `[text]`
 * - inputs, textareas and selects become `{placeholder-or-name}`
 * - list items become `- text` (indented by one space when nested), labels become `**text**`
 * - other text elements and bare text nodes are emitted when at least 5 characters long
 *
 * @param element - root of the subtree to render
 * @param htmlConfig - optional `include` / `exclude` selector lists
 * @returns the rendered text with normalized blank lines, trimmed
 */
function processHtmlForText(element: parse5TreeAdapter.Element, htmlConfig?: HtmlConfig['text']): string {
  const lines: string[] = [];

  // Matches real heading tags only. A plain "starts with h" test also matches
  // <header>, <hgroup> and <hr>, whose content would then be dropped entirely.
  const headingTag = /^h([1-6])$/;

  // Decides whether an element passes the include/exclude configuration.
  const shouldIncludeElement = (el: parse5TreeAdapter.Element): boolean => {
    if (!htmlConfig) return true;

    // Exclusion always wins.
    if (htmlConfig.exclude && matchesAnySelector(el, htmlConfig.exclude)) {
      return false;
    }

    // Without include selectors everything not excluded is included.
    if (!htmlConfig.include || htmlConfig.include.length === 0) {
      return true;
    }

    return matchesAnySelector(el, htmlConfig.include);
  };

  const processNode = (node: parse5TreeAdapter.Node): void => {
    if (node.nodeName === '#text') {
      // A text node follows the include/exclude decision of its parent element.
      if (node.parentNode && 'tagName' in node.parentNode) {
        const parentElement = node.parentNode as parse5TreeAdapter.Element;
        if (!shouldIncludeElement(parentElement)) {
          return;
        }
      }

      const text = (node as parse5TreeAdapter.TextNode).value.trim();
      if (text.length >= 5) {
        lines.push(text);
      }
      return;
    }

    if (!('tagName' in node)) {
      return;
    }

    const current = node as parse5TreeAdapter.Element;
    const tagName = current.tagName.toLowerCase();
    const processChildren = (): void => current.childNodes.forEach((child) => processNode(child));

    // Style and script content is never text.
    if (['style', 'script'].includes(tagName)) {
      return;
    }

    // An element that is not included may still contain included children.
    if (!shouldIncludeElement(current)) {
      processChildren();
      return;
    }

    // Headings are always rendered as markdown, one '#' per level.
    const heading = headingTag.exec(tagName);
    if (heading) {
      const text = getTextContent(current).trim();
      if (text) {
        lines.push(`${'#'.repeat(Number(heading[1]))} ${text}`);
      }
      return;
    }

    if (shouldKeepInteractive(current)) {
      if (tagName === 'button' || getAttribute(current, 'role') === 'button') {
        const buttonText = getTextContent(current).trim();
        lines.push(buttonText ? `[${buttonText}]` : '[Button]');
        return;
      }

      if (tagName === 'a' || getAttribute(current, 'role') === 'link') {
        const linkText = getTextContent(current).trim();
        lines.push(linkText ? `[${linkText}]` : '[Link]');
        return;
      }

      if (tagName === 'input') {
        const name = getAttribute(current, 'name') || getAttribute(current, 'id');
        const placeholder = getAttribute(current, 'placeholder');
        const type = getAttribute(current, 'type');

        if (type === 'submit' || type === 'button' || type === 'reset') {
          // Button-like inputs show their value, falling back to the type name.
          const value = getAttribute(current, 'value') || type;
          lines.push(`[${value}]`);
        } else if (placeholder) {
          lines.push(`{${placeholder}}`);
        } else if (name) {
          lines.push(`{${name}}`);
        } else {
          lines.push('{Input}');
        }
        return;
      }

      if (tagName === 'textarea') {
        const name = getAttribute(current, 'name') || getAttribute(current, 'id');
        const placeholder = getAttribute(current, 'placeholder');

        if (placeholder) {
          lines.push(`{${placeholder}}`);
        } else if (name) {
          lines.push(`{${name}}`);
        } else {
          lines.push('{Textarea}');
        }
        return;
      }

      if (tagName === 'select') {
        const name = getAttribute(current, 'name') || getAttribute(current, 'id');
        lines.push(name ? `{${name}}` : '{Select}');
        return;
      }

      // Any other interactive element contributes only through its children.
      processChildren();
      return;
    }

    // Text elements (headings were already handled above).
    if (TEXT_ELEMENT_TAGS.has(tagName)) {
      // Only the element's own text nodes count here, not text from descendants.
      const directText = current.childNodes
        .filter((child) => child.nodeName === '#text')
        .map((child) => (child as parse5TreeAdapter.TextNode).value)
        .join('')
        .trim();

      // Too short to emit on its own; the children may still have content.
      if (directText.length < 5) {
        processChildren();
        return;
      }

      if (tagName === 'li') {
        // Nested list items get a one-space indent and carry their full descendant text.
        const indent = hasListParent(current) ? ' ' : '';
        const fullText = getTextContent(current).trim();
        lines.push(`${indent}- ${fullText}`);
        return;
      }

      if (tagName === 'label') {
        lines.push(`**${directText}**`);
        return;
      }

      // Skip the text when an enclosing text element already accounts for it.
      if (!hasTextAncestor(current)) {
        lines.push(directText);
      }

      processChildren();
      return;
    }

    // Plain containers just forward to their children.
    processChildren();
  };

  processNode(element);

  let result = lines.join('\n\n');

  // Surround headings with blank lines.
  result = result.replace(/^(#{1,6} .+)$/gm, '\n$1\n');

  // Start form-field markers on their own line.
  result = result.replace(/(\{[^}]+\}| \[[^\]]+\])/g, '\n$1');

  // Collapse runs of three or more newlines.
  result = result.replace(/\n{3,}/g, '\n\n');

  result = result.trim();

  return result;
}
|
|
793
|
+
|
|
794
|
+
// Helper functions
|
|
795
|
+
|
|
796
|
+
/**
 * Locates the <body> element under the document's first `html` child.
 *
 * @param document - parsed document
 * @returns the body element, or null when there is no `html` node, the node
 *          has no child list, or no `body` child exists
 */
function findBody(document: parse5TreeAdapter.Document): parse5TreeAdapter.Element | null {
  for (const topLevel of document.childNodes) {
    if (topLevel.nodeName !== 'html') continue;
    if (!('childNodes' in topLevel)) return null;

    const bodyNode = topLevel.childNodes.find((candidate) => candidate.nodeName === 'body');
    return (bodyNode as parse5TreeAdapter.Element) || null;
  }

  return null;
}
|
|
802
|
+
|
|
803
|
+
/**
 * Decides whether an element counts as interactive.
 *
 * Checks run in this order: hidden class (reject), include selectors (accept),
 * exclude selectors (reject), data-explorbot-* attributes (accept), hidden
 * inputs (reject), interactive tag, interactive role, then event-handler,
 * contenteditable or tabindex attributes.
 *
 * @param element - element to classify
 * @param selectorConfig - optional `include` / `exclude` selector lists
 * @returns true when the element should be treated as interactive
 */
function shouldKeepInteractive(element: parse5TreeAdapter.Element, selectorConfig?: { include?: string[]; exclude?: string[] }): boolean {
  if (hasHiddenClass(element)) return false;

  if (selectorConfig?.include && matchesAnySelector(element, selectorConfig.include)) return true;
  if (selectorConfig?.exclude && matchesAnySelector(element, selectorConfig.exclude)) return false;

  if (hasExplorbotAttributes(element)) return true;

  const tag = element.tagName.toLowerCase();
  if (tag === 'input' && getAttribute(element, 'type')?.toLowerCase() === 'hidden') return false;

  if (INTERACTIVE_TAGS.has(tag)) return true;

  const roleValue = getAttribute(element, 'role');
  if (roleValue && INTERACTIVE_ROLES.has(roleValue.toLowerCase())) return true;

  return (element.attrs ?? []).some((attr) => {
    const lowered = attr.name.toLowerCase();
    return INTERACTIVE_EVENT_ATTRIBUTES.has(lowered) || lowered === 'contenteditable' || lowered === 'tabindex';
  });
}
|
|
836
|
+
|
|
837
|
+
/**
 * Decides whether an element belongs in the combined snapshot.
 *
 * Elements with a hidden class are rejected first; include selectors then win
 * over exclude selectors. After that an element is kept when it has
 * data-explorbot-* attributes, any role, is interactive, is an <svg> with a
 * class, or is a text element (tags starting with "h" always, others only
 * with more than 5 characters of text). Anything else is kept only if it has
 * keepable children.
 *
 * @param element - element to classify
 * @param htmlConfig - optional `include` / `exclude` selector lists
 * @returns true when the element should be kept
 */
function shouldKeepCombined(element: parse5TreeAdapter.Element, htmlConfig?: HtmlConfig['combined']): boolean {
  if (hasHiddenClass(element)) return false;

  if (htmlConfig?.include && matchesAnySelector(element, htmlConfig.include)) return true;
  if (htmlConfig?.exclude && matchesAnySelector(element, htmlConfig.exclude)) return false;

  if (hasExplorbotAttributes(element)) return true;
  if (getAttribute(element, 'role')) return true;
  if (shouldKeepInteractive(element, htmlConfig)) return true;

  const tag = element.tagName.toLowerCase();
  if (tag === 'svg' && getAttribute(element, 'class')) return true;

  if (!TEXT_ELEMENT_TAGS.has(tag)) {
    return hasKeepableChildren(element, htmlConfig);
  }

  if (tag.startsWith('h')) return true;

  return getTextContent(element).trim().length > 5;
}
|
|
866
|
+
|
|
867
|
+
/**
 * Tells whether at least one direct child justifies keeping the element:
 * an element child accepted by shouldKeepCombined, or a text child whose
 * trimmed length is 5 or more.
 *
 * @param element - parent element to inspect
 * @param htmlConfig - forwarded to shouldKeepCombined
 * @returns true when such a child exists
 */
function hasKeepableChildren(element: parse5TreeAdapter.Element, htmlConfig?: HtmlConfig['combined']): boolean {
  if (!element.childNodes) return false;

  return element.childNodes.some((child) => {
    if ('tagName' in child) {
      return shouldKeepCombined(child as parse5TreeAdapter.Element, htmlConfig);
    }
    // Direct text content counts as well.
    return child.nodeName === '#text' && (child as parse5TreeAdapter.TextNode).value.trim().length >= 5;
  });
}
|
|
886
|
+
|
|
887
|
+
/**
 * Checks whether any element ancestor is a heading (h1-h6), li, p, td, th or label.
 * The walk stops at the first ancestor that is not an element.
 *
 * @param element - element whose ancestors are inspected
 * @returns true when such an ancestor exists
 */
function hasTextAncestor(element: parse5TreeAdapter.Element): boolean {
  const textContainerTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'p', 'td', 'th', 'label'];

  for (let ancestor = element.parentNode; ancestor && 'tagName' in ancestor; ancestor = ancestor.parentNode) {
    const ancestorTag = (ancestor as parse5TreeAdapter.Element).tagName.toLowerCase();
    if (textContainerTags.includes(ancestorTag)) return true;
  }

  return false;
}
|
|
903
|
+
|
|
904
|
+
/**
 * Detects nesting inside a list: for each element ancestor, starting at the
 * direct parent, it looks at that ancestor's own parent and reports true when
 * that one is a <ul> or <ol>. The direct parent itself is therefore never
 * the match.
 *
 * @param element - element whose ancestor chain is inspected
 * @returns true when a ul/ol is found above the direct parent
 */
function hasListParent(element: parse5TreeAdapter.Element): boolean {
  const listTags = ['ul', 'ol'];

  for (let current = element.parentNode; current && 'tagName' in current; current = current.parentNode) {
    const above = current.parentNode;
    if (!above || !('tagName' in above)) continue;

    const aboveTag = (above as parse5TreeAdapter.Element).tagName.toLowerCase();
    if (listTags.includes(aboveTag)) return true;
  }

  return false;
}
|
|
922
|
+
|
|
923
|
+
/**
 * An interactive container is a <button> or any element with a non-empty role attribute.
 *
 * @param element - element to test
 * @returns true when the element is such a container
 */
function isInteractiveContainer(element: parse5TreeAdapter.Element): boolean {
  const isButton = element.tagName.toLowerCase() === 'button';
  return isButton || Boolean(getAttribute(element, 'role'));
}
|
|
929
|
+
|
|
930
|
+
/**
 * Recursively filters an element's children in place and reports whether the
 * element still holds content worth keeping.
 *
 * Children are visited last-to-first over a snapshot of the child list:
 * - element children are filtered recursively first; inside an interactive
 *   container every element child is kept and cleaned;
 * - otherwise a child accepted by `shouldKeep` is kept and cleaned, and a
 *   rejected child survives untouched only when it is not hidden and reported content;
 * - rejected children are removed, or emptied in place when `keepPositions` is set;
 * - whitespace-only text nodes are removed.
 *
 * @param element - subtree root to filter
 * @param shouldKeep - predicate deciding whether an element is kept
 * @param keepPositions - empty rejected elements instead of removing them
 * @returns true when the element is kept by `shouldKeep`, kept an element
 *          child, or has text (text elements need more than 5 characters)
 */
function filterTree(element: parse5TreeAdapter.Element, shouldKeep: (el: parse5TreeAdapter.Element) => boolean, keepPositions?: boolean): boolean {
  if (!element.childNodes) return false;

  const parentIsInteractive = isInteractiveContainer(element);
  let keptElementChild = false;
  let sawText = false;

  for (const child of [...element.childNodes].reverse()) {
    if (!('tagName' in child)) {
      if (child.nodeName === '#text') {
        if ((child as parse5TreeAdapter.TextNode).value.trim().length > 0) {
          sawText = true;
        } else {
          const blankAt = element.childNodes.indexOf(child);
          if (blankAt > -1) {
            element.childNodes.splice(blankAt, 1);
          }
        }
      }
      continue;
    }

    const childElement = child as parse5TreeAdapter.Element;
    const childHasContent = filterTree(childElement, shouldKeep, keepPositions);

    // Interactive containers keep all their element children without consulting shouldKeep.
    if (parentIsInteractive || shouldKeep(childElement)) {
      keptElementChild = true;
      cleanElement(childElement);
      continue;
    }

    // Rejected, but visible and still holding content: leave it as is.
    if (!hasHiddenClass(childElement) && childHasContent) {
      keptElementChild = true;
      continue;
    }

    const childAt = element.childNodes.indexOf(child);
    if (childAt === -1) continue;

    if (keepPositions) {
      emptyElement(childElement);
    } else {
      element.childNodes.splice(childAt, 1);
    }
  }

  // shouldKeep is evaluated after the children were filtered, and always before the other checks.
  if (shouldKeep(element) || keptElementChild) {
    return true;
  }

  const isTextElement = TEXT_ELEMENT_TAGS.has(element.tagName.toLowerCase());
  if (isTextElement && sawText) {
    return getTextContent(element).trim().length > 5;
  }

  return sawText;
}
|
|
999
|
+
|
|
1000
|
+
/**
 * Strips an element down to a bare tag by replacing its child and attribute
 * lists with fresh empty arrays.
 *
 * @param element - element to empty in place
 */
function emptyElement(element: parse5TreeAdapter.Element): void {
  Object.assign(element, { childNodes: [], attrs: [] });
}
|
|
1004
|
+
|
|
1005
|
+
/**
 * Applies cleanElement to an element and then to every element below it.
 *
 * @param element - subtree root to clean in place
 */
function cleanAllElements(element: parse5TreeAdapter.Element): void {
  cleanElement(element);

  if (!element.childNodes) return;

  element.childNodes
    .filter((child) => 'tagName' in child)
    .forEach((child) => cleanAllElements(child as parse5TreeAdapter.Element));
}
|
|
1016
|
+
|
|
1017
|
+
/**
 * Reduces an element to the attributes listed below, in place.
 *
 * - <svg> keeps only its class attribute and loses all children
 * - data-explorbot-* attributes are converted to plain attributes
 * - attributes outside the keep list are dropped
 * - class values lose names containing digits and tailwind utility names;
 *   if a class attribute ends up empty, class attributes are removed
 * - <script> loses all children
 *
 * @param element - element to clean
 */
function cleanElement(element: parse5TreeAdapter.Element): void {
  const tag = element.tagName.toLowerCase();

  if (tag === 'svg') {
    element.attrs = element.attrs.filter(({ name }) => name === 'class');
    element.childNodes = [];
  }

  const retainedNames = new Set([
    'id',
    'class',
    'name',
    'type',
    'value',
    'placeholder',
    'aria-label',
    'aria-labelledby',
    'aria-describedby',
    'aria-owns',
    'role',
    'title',
    'href',
    'src',
    'tabindex',
    'contenteditable',
    'onclick',
    'onmousedown',
    'onmouseup',
    'onchange',
    'onfocus',
    'required',
    'disabled',
    'checked',
    'selected',
    'action',
    'key',
    'label',
    'important',
    'eidx',
  ]);

  convertExplorbotAttributes(element);

  element.attrs = element.attrs.filter(({ name }) => retainedNames.has(name) || name.startsWith('data-explorbot-'));

  let classEmptied = false;
  for (const attr of element.attrs) {
    if (attr.name !== 'class') continue;

    attr.value = attr.value
      .split(/\s+/)
      .filter((token) => !/\d/.test(token) && !TAILWIND_CLASS_PATTERNS.some((pattern) => pattern.test(token)))
      .join(' ');

    if (!attr.value) classEmptied = true;
  }

  if (classEmptied) {
    element.attrs = element.attrs.filter(({ name }) => name !== 'class');
  }

  if (tag === 'script') {
    element.childNodes = [];
  }
}
|
|
1078
|
+
|
|
1079
|
+
/**
 * Concatenates the values of all text nodes in a subtree, in document order.
 *
 * @param element - subtree root
 * @returns the concatenated text with leading and trailing whitespace removed
 */
function getTextContent(element: parse5TreeAdapter.Element): string {
  let collected = '';
  const pending: parse5TreeAdapter.Node[] = [element];

  while (pending.length > 0) {
    const node = pending.pop() as parse5TreeAdapter.Node;

    if (node.nodeName === '#text') {
      collected += (node as parse5TreeAdapter.TextNode).value;
    } else if ('childNodes' in node) {
      // Push children right-to-left so they pop off the stack left-to-right.
      for (let i = node.childNodes.length - 1; i >= 0; i--) {
        pending.push(node.childNodes[i]);
      }
    }
  }

  return collected.trim();
}
|
|
1093
|
+
|
|
1094
|
+
/**
 * Looks up an attribute by exact name.
 *
 * @param element - element to read from
 * @param name - attribute name to match exactly
 * @returns the value of the first matching attribute, or undefined when absent
 */
function getAttribute(element: parse5TreeAdapter.Element, name: string): string | undefined {
  for (const candidate of element.attrs) {
    if (candidate.name === name) {
      return candidate.value;
    }
  }
  return undefined;
}
|
|
1098
|
+
|
|
1099
|
+
/**
 * Tells whether the element's class attribute contains a name from HIDDEN_CLASSES.
 * Only the first class attribute is examined.
 *
 * @param element - element to test
 * @returns true when one of its class names is in HIDDEN_CLASSES
 */
function hasHiddenClass(element: parse5TreeAdapter.Element): boolean {
  for (const attr of element.attrs) {
    if (attr.name !== 'class') continue;
    return attr.value.split(/\s+/).some((token) => HIDDEN_CLASSES.has(token));
  }
  return false;
}
|
|
1106
|
+
|
|
1107
|
+
/**
 * Checks whether a node carries at least one data-explorbot-* attribute.
 *
 * Nodes without an attribute list (text nodes, for example) yield false
 * rather than undefined, so the declared boolean return type always holds.
 *
 * @param element - node to inspect
 * @returns true when any attribute name starts with "data-explorbot-"
 */
function hasExplorbotAttributes(element: parse5TreeAdapter.Element): boolean {
  return element.attrs?.some((attr) => attr.name.startsWith('data-explorbot-')) ?? false;
}
|
|
1113
|
+
|
|
1114
|
+
/**
 * Converts data-explorbot-* attributes to regular attributes in place,
 * e.g. data-explorbot-value becomes value.
 *
 * A converted attribute replaces any existing attribute of the same name, so
 * the element never ends up with two attributes called e.g. `value`.
 * Converted attributes are appended after the remaining ones. An attribute
 * named exactly "data-explorbot-" has no target name and is left untouched.
 * Nodes without an attribute list are ignored.
 *
 * @param element - element whose attributes are rewritten
 */
function convertExplorbotAttributes(element: parse5TreeAdapter.Element): void {
  if (!element.attrs) return;

  const prefix = 'data-explorbot-';
  const converted: Array<{ name: string; value: string }> = [];
  const remaining = element.attrs.filter((attr) => {
    if (attr.name.startsWith(prefix) && attr.name.length > prefix.length) {
      converted.push({ name: attr.name.slice(prefix.length), value: attr.value });
      return false;
    }
    return true;
  });

  // Drop originals that a converted attribute overrides.
  const overridden = new Set(converted.map((attr) => attr.name));
  element.attrs = remaining.filter((attr) => !overridden.has(attr.name));
  element.attrs.push(...converted);
}
|
|
1132
|
+
|
|
1133
|
+
/** A hyperlink collected from an HTML fragment by extractLinks. */
export interface ExtractedLink {
  /** Link label: the anchor's aria-label, or else its text content, with whitespace collapsed. */
  title: string;
  /** The anchor's href attribute value as written in the markup. */
  url: string;
}
|
|
1137
|
+
|
|
1138
|
+
/**
 * Normalizes a link title: every run of whitespace becomes a single space and
 * leading/trailing whitespace is removed.
 *
 * @param text - raw title text
 * @returns the normalized title
 */
function sanitizeLinkTitle(text: string): string {
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
}
|
|
1141
|
+
|
|
1142
|
+
/**
 * Collects the navigable links of an HTML fragment.
 *
 * An <a> is reported when it has an href that is neither blank nor a
 * javascript:, mailto:, tel: or "#" reference, and a title (aria-label, else
 * text content, whitespace-collapsed) of 1 to 100 characters. The prefix
 * check ignores letter case and surrounding whitespace, because URL schemes
 * are case-insensitive and whitespace around an href is not significant.
 * Identical href/title pairs are reported once, in document order.
 *
 * @param html - HTML fragment to scan
 * @returns the extracted links; `url` is the href exactly as written
 */
export function extractLinks(html: string): ExtractedLink[] {
  const document = parseFragment(html);
  const links: ExtractedLink[] = [];
  const seen = new Set<string>();

  const skipPrefixes = ['javascript:', 'mailto:', 'tel:', '#'];

  // True for hrefs that do not lead to another page.
  const isSkippedHref = (href: string): boolean => {
    const normalized = href.trim().toLowerCase();
    if (normalized === '') return true;
    return skipPrefixes.some((prefix) => normalized.startsWith(prefix));
  };

  const collectAnchor = (anchor: parse5TreeAdapter.Element): void => {
    const href = getAttribute(anchor, 'href');
    if (!href || isSkippedHref(href)) return;

    const rawTitle = getAttribute(anchor, 'aria-label') || getTextContent(anchor);
    const title = sanitizeLinkTitle(rawTitle);
    if (!title || title.length > 100) return;

    const key = `${href}|${title}`;
    if (seen.has(key)) return;

    seen.add(key);
    links.push({ title, url: href });
  };

  function traverseNodes(node: parse5TreeAdapter.Node): void {
    if ('tagName' in node) {
      const element = node as parse5TreeAdapter.Element;
      if (element.tagName.toLowerCase() === 'a') {
        collectAnchor(element);
      }
    }

    if ('childNodes' in node) {
      for (const child of node.childNodes) {
        traverseNodes(child);
      }
    }
  }

  traverseNodes(document);

  return links;
}
|
|
1184
|
+
|
|
1185
|
+
/**
 * Gather the text of all `h1`–`h4` headings in an HTML fragment.
 *
 * Headings are visited in document order. For each level that has at least one
 * heading with non-empty (trimmed) text, the result holds those texts joined
 * with `' | '`; levels without headings are left out of the result.
 *
 * @param html - HTML markup, parsed with `parseFragment`.
 * @returns An object with optional `h1`..`h4` string properties.
 */
export function extractHeadings(html: string): {
  h1?: string;
  h2?: string;
  h3?: string;
  h4?: string;
} {
  type Level = 'h1' | 'h2' | 'h3' | 'h4';
  const levels: readonly Level[] = ['h1', 'h2', 'h3', 'h4'];
  const isLevel = (tag: string): tag is Level => (levels as readonly string[]).includes(tag);

  const collected: Record<Level, string[]> = { h1: [], h2: [], h3: [], h4: [] };

  // Explicit stack instead of recursion; children are pushed in reverse so
  // nodes are still popped in document (pre-order) sequence.
  const pending: parse5TreeAdapter.Node[] = [parseFragment(html)];
  while (pending.length > 0) {
    const node = pending.pop() as parse5TreeAdapter.Node;

    if ('tagName' in node) {
      const element = node as parse5TreeAdapter.Element;
      const tag = element.tagName.toLowerCase();
      if (isLevel(tag)) {
        const text = getTextContent(element).trim();
        if (text) {
          collected[tag].push(text);
        }
      }
    }

    if ('childNodes' in node) {
      const children = node.childNodes;
      for (let i = children.length - 1; i >= 0; i--) {
        pending.push(children[i]);
      }
    }
  }

  const result: { h1?: string; h2?: string; h3?: string; h4?: string } = {};
  for (const level of levels) {
    const texts = collected[level];
    if (texts.length > 0) {
      result[level] = texts.join(' | ');
    }
  }

  return result;
}
|
|
1238
|
+
|
|
1239
|
+
/**
 * Wrap a piece of code in a Markdown fenced code block.
 *
 * The fence is three backticks unless the code itself contains a run of three
 * or more backticks; in that case the fence is made one backtick longer than
 * the longest run so the code cannot terminate the block early.
 *
 * @param code - Text to place inside the fence.
 * @returns A string of the form `\n<fence>\n<code>\n<fence>\n`.
 */
export function codeToMarkdown(code: string): string {
  const body = `${code}`;

  let longestRun = 0;
  for (const run of body.match(/`+/g) ?? []) {
    longestRun = Math.max(longestRun, run.length);
  }

  const fence = '`'.repeat(Math.max(3, longestRun + 1));
  return `\n${fence}\n${body}\n${fence}\n`;
}
|
|
1246
|
+
|
|
1247
|
+
/**
 * Decide whether an HTML document has an empty `<body>`.
 *
 * @param html - Full HTML markup.
 * @returns `true` when `html` is empty, when no `<body ...>...</body>` pair is
 *          found, or when the first such pair contains only whitespace;
 *          `false` otherwise.
 */
export function isBodyEmpty(html: string): boolean {
  if (!html) {
    return true;
  }

  // Case-insensitive, dot matches newlines; the lazy group captures the inner
  // markup of the first body element.
  const innerMarkup = html.match(/<body[^>]*>(.*?)<\/body>/is)?.[1];
  if (innerMarkup === undefined) {
    return true;
  }

  return innerMarkup.trim() === '';
}
|
|
1254
|
+
|
|
1255
|
+
/** HTML elements that never have a closing tag, so they must not add nesting depth. */
const TARGET_SNIPPET_VOID_TAGS = new Set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'source', 'track', 'wbr']);

/**
 * Scan forward from `start` (expected to sit on a `<`) and return the index
 * just past the `>` that brings the tag nesting depth back to zero.
 *
 * Void elements, self-closing tags (`.../>`) and `<!...>` constructs open and
 * close within the same tag, so they leave the depth unchanged.
 *
 * @param html - Markup being scanned.
 * @param start - Index at which scanning begins.
 * @param maxScan - Maximum number of characters to examine.
 * @returns End index (exclusive) of the balanced span, or `-1` when the depth
 *          does not return to zero within the scanned window.
 */
function findTargetSnippetEnd(html: string, start: number, maxScan: number): number {
  const stop = Math.min(html.length, start + maxScan);
  let depth = 0;
  let insideOpeningTag = false;
  let closesItself = false;

  for (let i = start; i < stop; i++) {
    const ch = html[i];

    if (ch === '<') {
      if (html[i + 1] === '/') {
        depth--;
        insideOpeningTag = false;
      } else {
        depth++;
        insideOpeningTag = true;
        const tagName = /^[a-zA-Z][a-zA-Z0-9-]*/.exec(html.slice(i + 1, i + 33))?.[0].toLowerCase();
        closesItself = html[i + 1] === '!' || (tagName !== undefined && TARGET_SNIPPET_VOID_TAGS.has(tagName));
      }
    } else if (ch === '>') {
      if (insideOpeningTag && (closesItself || html[i - 1] === '/')) {
        depth--;
      }
      insideOpeningTag = false;
      if (depth === 0) {
        return i + 1;
      }
    }
  }

  return -1;
}

/**
 * Extract HTML snippet around a targeted element based on locator.
 * Used for accessibility analysis to show what element was being targeted.
 *
 * Search terms are derived from the locator:
 * - XPath (`//...` or `(//...`): the `text()='...'` value and every quoted
 *   `@attr='...'` value;
 * - JSON (`{...}`): the string `text` and `name` properties;
 * - anything else: the locator itself (quotes removed) when it does not look
 *   like a CSS class/id/attribute selector, plus the first `.class` and `#id`
 *   names it contains.
 *
 * The first term (of at least two characters) that occurs in `html` decides
 * the result: the snippet starts at the nearest preceding `<` and extends to
 * the end of that element when it can be found within 1000 characters,
 * capped at 500 characters. If no balanced end is found, the first 500
 * characters from the start are returned instead.
 *
 * @param html - Markup to search.
 * @param locator - XPath, JSON, CSS or plain-text locator.
 * @returns The trimmed snippet, or `''` when either argument is empty or no
 *          search term occurs in `html`.
 */
export function extractTargetedHtml(html: string, locator: string): string {
  if (!html || !locator) return '';

  const searchTerms: string[] = [];

  // XPath locator
  if (locator.startsWith('//') || locator.startsWith('(//')) {
    const textMatch = locator.match(/text\(\)\s*=\s*['"]([^'"]+)['"]/i);
    if (textMatch) searchTerms.push(textMatch[1]);

    const attrMatch = locator.match(/@[\w-]+\s*=\s*['"]([^'"]+)['"]/g);
    if (attrMatch) {
      for (const match of attrMatch) {
        const valueMatch = match.match(/['"]([^'"]+)['"]/);
        if (valueMatch) searchTerms.push(valueMatch[1]);
      }
    }
  } else if (locator.startsWith('{')) {
    // JSON locator (Playwright-style)
    try {
      const parsed = JSON.parse(locator);
      // Only strings can be searched for; ignore numbers, objects, etc.
      if (typeof parsed.text === 'string' && parsed.text) searchTerms.push(parsed.text);
      if (typeof parsed.name === 'string' && parsed.name) searchTerms.push(parsed.name);
    } catch {
      // Best effort: a locator that merely starts with '{' but is not valid
      // JSON contributes no search terms.
    }
  } else {
    // CSS selector or text
    const cleanLoc = locator.replace(/['"]/g, '');
    if (!cleanLoc.startsWith('.') && !cleanLoc.startsWith('#') && !cleanLoc.includes('[')) {
      searchTerms.push(cleanLoc);
    }
    const classMatch = cleanLoc.match(/\.([a-zA-Z0-9_-]+)/);
    if (classMatch) searchTerms.push(classMatch[1]);
    const idMatch = cleanLoc.match(/#([a-zA-Z0-9_-]+)/);
    if (idMatch) searchTerms.push(idMatch[1]);
  }

  for (const term of searchTerms) {
    if (term.length < 2) continue;
    const idx = html.indexOf(term);
    if (idx === -1) continue;

    // lastIndexOf returns -1 when no '<' precedes the term; clamp to 0.
    const start = Math.max(0, html.lastIndexOf('<', idx));
    const end = findTargetSnippetEnd(html, start, 1000);
    const limit = start + 500;
    // Without a balanced end, a truncated snippet is still more useful than
    // an empty string.
    const snippet = html.slice(start, end === -1 ? limit : Math.min(end, limit));
    return snippet.trim();
  }

  return '';
}
|