@booklib/core 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursor/rules/booklib-standards.mdc +40 -0
- package/.gemini/context.md +372 -0
- package/AGENTS.md +166 -0
- package/CHANGELOG.md +226 -0
- package/CLAUDE.md +81 -0
- package/CODE_OF_CONDUCT.md +31 -0
- package/CONTRIBUTING.md +304 -0
- package/LICENSE +21 -0
- package/PLAN.md +28 -0
- package/README.ja.md +198 -0
- package/README.ko.md +198 -0
- package/README.md +503 -0
- package/README.pt-BR.md +198 -0
- package/README.uk.md +241 -0
- package/README.zh-CN.md +198 -0
- package/SECURITY.md +9 -0
- package/agents/architecture-reviewer.md +136 -0
- package/agents/booklib-reviewer.md +90 -0
- package/agents/data-reviewer.md +107 -0
- package/agents/jvm-reviewer.md +146 -0
- package/agents/python-reviewer.md +128 -0
- package/agents/rust-reviewer.md +115 -0
- package/agents/ts-reviewer.md +110 -0
- package/agents/ui-reviewer.md +117 -0
- package/assets/logo.svg +36 -0
- package/bin/booklib-mcp.js +304 -0
- package/bin/booklib.js +1705 -0
- package/bin/skills.cjs +1292 -0
- package/booklib-router.mdc +36 -0
- package/booklib.config.json +19 -0
- package/commands/animation-at-work.md +10 -0
- package/commands/clean-code-reviewer.md +10 -0
- package/commands/data-intensive-patterns.md +10 -0
- package/commands/data-pipelines.md +10 -0
- package/commands/design-patterns.md +10 -0
- package/commands/domain-driven-design.md +10 -0
- package/commands/effective-java.md +10 -0
- package/commands/effective-kotlin.md +10 -0
- package/commands/effective-python.md +10 -0
- package/commands/effective-typescript.md +10 -0
- package/commands/kotlin-in-action.md +10 -0
- package/commands/lean-startup.md +10 -0
- package/commands/microservices-patterns.md +10 -0
- package/commands/programming-with-rust.md +10 -0
- package/commands/refactoring-ui.md +10 -0
- package/commands/rust-in-action.md +10 -0
- package/commands/skill-router.md +10 -0
- package/commands/spring-boot-in-action.md +10 -0
- package/commands/storytelling-with-data.md +10 -0
- package/commands/system-design-interview.md +10 -0
- package/commands/using-asyncio-python.md +10 -0
- package/commands/web-scraping-python.md +10 -0
- package/community/registry.json +1616 -0
- package/hooks/hooks.json +23 -0
- package/hooks/posttooluse-capture.mjs +67 -0
- package/hooks/suggest.js +153 -0
- package/lib/agent-behaviors.js +40 -0
- package/lib/agent-detector.js +96 -0
- package/lib/config-loader.js +39 -0
- package/lib/conflict-resolver.js +148 -0
- package/lib/context-builder.js +574 -0
- package/lib/discovery-engine.js +298 -0
- package/lib/doctor/hook-installer.js +83 -0
- package/lib/doctor/usage-tracker.js +87 -0
- package/lib/engine/ai-features.js +253 -0
- package/lib/engine/auditor.js +103 -0
- package/lib/engine/bm25-index.js +178 -0
- package/lib/engine/capture.js +120 -0
- package/lib/engine/corrections.js +198 -0
- package/lib/engine/doctor.js +195 -0
- package/lib/engine/graph-injector.js +137 -0
- package/lib/engine/graph.js +161 -0
- package/lib/engine/handoff.js +405 -0
- package/lib/engine/indexer.js +242 -0
- package/lib/engine/parser.js +53 -0
- package/lib/engine/query-expander.js +42 -0
- package/lib/engine/reranker.js +40 -0
- package/lib/engine/rrf.js +59 -0
- package/lib/engine/scanner.js +151 -0
- package/lib/engine/searcher.js +139 -0
- package/lib/engine/session-coordinator.js +306 -0
- package/lib/engine/session-manager.js +429 -0
- package/lib/engine/synthesizer.js +70 -0
- package/lib/installer.js +70 -0
- package/lib/instinct-block.js +33 -0
- package/lib/mcp-config-writer.js +88 -0
- package/lib/paths.js +57 -0
- package/lib/profiles/design.md +19 -0
- package/lib/profiles/general.md +16 -0
- package/lib/profiles/research-analysis.md +22 -0
- package/lib/profiles/software-development.md +23 -0
- package/lib/profiles/writing-content.md +19 -0
- package/lib/project-initializer.js +916 -0
- package/lib/registry/skills.js +102 -0
- package/lib/registry-searcher.js +99 -0
- package/lib/rules/rules-manager.js +169 -0
- package/lib/skill-fetcher.js +333 -0
- package/lib/well-known-builder.js +70 -0
- package/lib/wizard/index.js +404 -0
- package/lib/wizard/integration-detector.js +41 -0
- package/lib/wizard/project-detector.js +100 -0
- package/lib/wizard/prompt.js +156 -0
- package/lib/wizard/registry-embeddings.js +107 -0
- package/lib/wizard/skill-recommender.js +69 -0
- package/llms-full.txt +254 -0
- package/llms.txt +70 -0
- package/package.json +45 -0
- package/research-reports/2026-04-01-current-architecture.md +160 -0
- package/research-reports/IDEAS.md +93 -0
- package/rules/common/clean-code.md +42 -0
- package/rules/java/effective-java.md +42 -0
- package/rules/kotlin/effective-kotlin.md +37 -0
- package/rules/python/effective-python.md +38 -0
- package/rules/rust/rust.md +37 -0
- package/rules/typescript/effective-typescript.md +42 -0
- package/scripts/gen-llms-full.mjs +36 -0
- package/scripts/gen-og.mjs +142 -0
- package/scripts/validate-frontmatter.js +25 -0
- package/skills/animation-at-work/SKILL.md +270 -0
- package/skills/animation-at-work/assets/example_asset.txt +1 -0
- package/skills/animation-at-work/evals/evals.json +44 -0
- package/skills/animation-at-work/evals/results.json +13 -0
- package/skills/animation-at-work/examples/after.md +64 -0
- package/skills/animation-at-work/examples/before.md +35 -0
- package/skills/animation-at-work/references/api_reference.md +369 -0
- package/skills/animation-at-work/references/review-checklist.md +79 -0
- package/skills/animation-at-work/scripts/audit_animations.py +295 -0
- package/skills/animation-at-work/scripts/example.py +1 -0
- package/skills/clean-code-reviewer/SKILL.md +444 -0
- package/skills/clean-code-reviewer/audit.json +35 -0
- package/skills/clean-code-reviewer/evals/evals.json +185 -0
- package/skills/clean-code-reviewer/evals/results.json +13 -0
- package/skills/clean-code-reviewer/examples/after.md +48 -0
- package/skills/clean-code-reviewer/examples/before.md +33 -0
- package/skills/clean-code-reviewer/references/api_reference.md +158 -0
- package/skills/clean-code-reviewer/references/practices-catalog.md +282 -0
- package/skills/clean-code-reviewer/references/review-checklist.md +254 -0
- package/skills/clean-code-reviewer/scripts/pre-review.py +206 -0
- package/skills/data-intensive-patterns/SKILL.md +267 -0
- package/skills/data-intensive-patterns/assets/example_asset.txt +1 -0
- package/skills/data-intensive-patterns/evals/evals.json +54 -0
- package/skills/data-intensive-patterns/evals/results.json +13 -0
- package/skills/data-intensive-patterns/examples/after.md +61 -0
- package/skills/data-intensive-patterns/examples/before.md +38 -0
- package/skills/data-intensive-patterns/references/api_reference.md +34 -0
- package/skills/data-intensive-patterns/references/patterns-catalog.md +551 -0
- package/skills/data-intensive-patterns/references/review-checklist.md +193 -0
- package/skills/data-intensive-patterns/scripts/adr.py +213 -0
- package/skills/data-intensive-patterns/scripts/example.py +1 -0
- package/skills/data-pipelines/SKILL.md +259 -0
- package/skills/data-pipelines/assets/example_asset.txt +1 -0
- package/skills/data-pipelines/evals/evals.json +45 -0
- package/skills/data-pipelines/evals/results.json +13 -0
- package/skills/data-pipelines/examples/after.md +97 -0
- package/skills/data-pipelines/examples/before.md +37 -0
- package/skills/data-pipelines/references/api_reference.md +301 -0
- package/skills/data-pipelines/references/review-checklist.md +181 -0
- package/skills/data-pipelines/scripts/example.py +1 -0
- package/skills/data-pipelines/scripts/new_pipeline.py +444 -0
- package/skills/design-patterns/SKILL.md +271 -0
- package/skills/design-patterns/assets/example_asset.txt +1 -0
- package/skills/design-patterns/evals/evals.json +46 -0
- package/skills/design-patterns/evals/results.json +13 -0
- package/skills/design-patterns/examples/after.md +52 -0
- package/skills/design-patterns/examples/before.md +29 -0
- package/skills/design-patterns/references/api_reference.md +1 -0
- package/skills/design-patterns/references/patterns-catalog.md +726 -0
- package/skills/design-patterns/references/review-checklist.md +173 -0
- package/skills/design-patterns/scripts/example.py +1 -0
- package/skills/design-patterns/scripts/scaffold.py +807 -0
- package/skills/domain-driven-design/SKILL.md +142 -0
- package/skills/domain-driven-design/assets/example_asset.txt +1 -0
- package/skills/domain-driven-design/evals/evals.json +48 -0
- package/skills/domain-driven-design/evals/results.json +13 -0
- package/skills/domain-driven-design/examples/after.md +80 -0
- package/skills/domain-driven-design/examples/before.md +43 -0
- package/skills/domain-driven-design/references/api_reference.md +1 -0
- package/skills/domain-driven-design/references/patterns-catalog.md +545 -0
- package/skills/domain-driven-design/references/review-checklist.md +158 -0
- package/skills/domain-driven-design/scripts/example.py +1 -0
- package/skills/domain-driven-design/scripts/scaffold.py +421 -0
- package/skills/effective-java/SKILL.md +227 -0
- package/skills/effective-java/assets/example_asset.txt +1 -0
- package/skills/effective-java/evals/evals.json +46 -0
- package/skills/effective-java/evals/results.json +13 -0
- package/skills/effective-java/examples/after.md +83 -0
- package/skills/effective-java/examples/before.md +37 -0
- package/skills/effective-java/references/api_reference.md +1 -0
- package/skills/effective-java/references/items-catalog.md +955 -0
- package/skills/effective-java/references/review-checklist.md +216 -0
- package/skills/effective-java/scripts/checkstyle_setup.py +211 -0
- package/skills/effective-java/scripts/example.py +1 -0
- package/skills/effective-kotlin/SKILL.md +271 -0
- package/skills/effective-kotlin/assets/example_asset.txt +1 -0
- package/skills/effective-kotlin/audit.json +29 -0
- package/skills/effective-kotlin/evals/evals.json +45 -0
- package/skills/effective-kotlin/evals/results.json +13 -0
- package/skills/effective-kotlin/examples/after.md +36 -0
- package/skills/effective-kotlin/examples/before.md +38 -0
- package/skills/effective-kotlin/references/api_reference.md +1 -0
- package/skills/effective-kotlin/references/practices-catalog.md +1228 -0
- package/skills/effective-kotlin/references/review-checklist.md +126 -0
- package/skills/effective-kotlin/scripts/example.py +1 -0
- package/skills/effective-python/SKILL.md +441 -0
- package/skills/effective-python/evals/evals.json +44 -0
- package/skills/effective-python/evals/results.json +13 -0
- package/skills/effective-python/examples/after.md +56 -0
- package/skills/effective-python/examples/before.md +40 -0
- package/skills/effective-python/ref-01-pythonic-thinking.md +202 -0
- package/skills/effective-python/ref-02-lists-and-dicts.md +146 -0
- package/skills/effective-python/ref-03-functions.md +186 -0
- package/skills/effective-python/ref-04-comprehensions-generators.md +211 -0
- package/skills/effective-python/ref-05-classes-interfaces.md +188 -0
- package/skills/effective-python/ref-06-metaclasses-attributes.md +209 -0
- package/skills/effective-python/ref-07-concurrency.md +213 -0
- package/skills/effective-python/ref-08-robustness-performance.md +248 -0
- package/skills/effective-python/ref-09-testing-debugging.md +253 -0
- package/skills/effective-python/ref-10-collaboration.md +175 -0
- package/skills/effective-python/references/api_reference.md +218 -0
- package/skills/effective-python/references/practices-catalog.md +483 -0
- package/skills/effective-python/references/review-checklist.md +190 -0
- package/skills/effective-python/scripts/lint.py +173 -0
- package/skills/effective-typescript/SKILL.md +262 -0
- package/skills/effective-typescript/audit.json +29 -0
- package/skills/effective-typescript/evals/evals.json +37 -0
- package/skills/effective-typescript/evals/results.json +13 -0
- package/skills/effective-typescript/examples/after.md +70 -0
- package/skills/effective-typescript/examples/before.md +47 -0
- package/skills/effective-typescript/references/api_reference.md +118 -0
- package/skills/effective-typescript/references/practices-catalog.md +371 -0
- package/skills/effective-typescript/scripts/review.py +169 -0
- package/skills/kotlin-in-action/SKILL.md +261 -0
- package/skills/kotlin-in-action/assets/example_asset.txt +1 -0
- package/skills/kotlin-in-action/evals/evals.json +43 -0
- package/skills/kotlin-in-action/evals/results.json +13 -0
- package/skills/kotlin-in-action/examples/after.md +53 -0
- package/skills/kotlin-in-action/examples/before.md +39 -0
- package/skills/kotlin-in-action/references/api_reference.md +1 -0
- package/skills/kotlin-in-action/references/practices-catalog.md +436 -0
- package/skills/kotlin-in-action/references/review-checklist.md +204 -0
- package/skills/kotlin-in-action/scripts/example.py +1 -0
- package/skills/kotlin-in-action/scripts/setup_detekt.py +224 -0
- package/skills/lean-startup/SKILL.md +160 -0
- package/skills/lean-startup/assets/example_asset.txt +1 -0
- package/skills/lean-startup/evals/evals.json +43 -0
- package/skills/lean-startup/evals/results.json +13 -0
- package/skills/lean-startup/examples/after.md +80 -0
- package/skills/lean-startup/examples/before.md +34 -0
- package/skills/lean-startup/references/api_reference.md +319 -0
- package/skills/lean-startup/references/review-checklist.md +137 -0
- package/skills/lean-startup/scripts/example.py +1 -0
- package/skills/lean-startup/scripts/new_experiment.py +286 -0
- package/skills/microservices-patterns/SKILL.md +384 -0
- package/skills/microservices-patterns/evals/evals.json +45 -0
- package/skills/microservices-patterns/evals/results.json +13 -0
- package/skills/microservices-patterns/examples/after.md +69 -0
- package/skills/microservices-patterns/examples/before.md +40 -0
- package/skills/microservices-patterns/references/patterns-catalog.md +391 -0
- package/skills/microservices-patterns/references/review-checklist.md +169 -0
- package/skills/microservices-patterns/scripts/new_service.py +583 -0
- package/skills/programming-with-rust/SKILL.md +209 -0
- package/skills/programming-with-rust/evals/evals.json +37 -0
- package/skills/programming-with-rust/evals/results.json +13 -0
- package/skills/programming-with-rust/examples/after.md +107 -0
- package/skills/programming-with-rust/examples/before.md +59 -0
- package/skills/programming-with-rust/references/api_reference.md +152 -0
- package/skills/programming-with-rust/references/practices-catalog.md +335 -0
- package/skills/programming-with-rust/scripts/review.py +142 -0
- package/skills/refactoring-ui/SKILL.md +362 -0
- package/skills/refactoring-ui/assets/example_asset.txt +1 -0
- package/skills/refactoring-ui/evals/evals.json +45 -0
- package/skills/refactoring-ui/evals/results.json +13 -0
- package/skills/refactoring-ui/examples/after.md +85 -0
- package/skills/refactoring-ui/examples/before.md +58 -0
- package/skills/refactoring-ui/references/api_reference.md +355 -0
- package/skills/refactoring-ui/references/review-checklist.md +114 -0
- package/skills/refactoring-ui/scripts/audit_css.py +250 -0
- package/skills/refactoring-ui/scripts/example.py +1 -0
- package/skills/rust-in-action/SKILL.md +350 -0
- package/skills/rust-in-action/evals/evals.json +38 -0
- package/skills/rust-in-action/evals/results.json +13 -0
- package/skills/rust-in-action/examples/after.md +156 -0
- package/skills/rust-in-action/examples/before.md +56 -0
- package/skills/rust-in-action/references/practices-catalog.md +346 -0
- package/skills/rust-in-action/scripts/review.py +147 -0
- package/skills/skill-router/SKILL.md +186 -0
- package/skills/skill-router/evals/evals.json +38 -0
- package/skills/skill-router/evals/results.json +13 -0
- package/skills/skill-router/examples/after.md +63 -0
- package/skills/skill-router/examples/before.md +39 -0
- package/skills/skill-router/references/api_reference.md +24 -0
- package/skills/skill-router/references/routing-heuristics.md +89 -0
- package/skills/skill-router/references/skill-catalog.md +174 -0
- package/skills/skill-router/scripts/route.py +266 -0
- package/skills/spring-boot-in-action/SKILL.md +340 -0
- package/skills/spring-boot-in-action/evals/evals.json +39 -0
- package/skills/spring-boot-in-action/evals/results.json +13 -0
- package/skills/spring-boot-in-action/examples/after.md +185 -0
- package/skills/spring-boot-in-action/examples/before.md +84 -0
- package/skills/spring-boot-in-action/references/practices-catalog.md +403 -0
- package/skills/spring-boot-in-action/scripts/review.py +184 -0
- package/skills/storytelling-with-data/SKILL.md +241 -0
- package/skills/storytelling-with-data/assets/example_asset.txt +1 -0
- package/skills/storytelling-with-data/evals/evals.json +47 -0
- package/skills/storytelling-with-data/evals/results.json +13 -0
- package/skills/storytelling-with-data/examples/after.md +50 -0
- package/skills/storytelling-with-data/examples/before.md +33 -0
- package/skills/storytelling-with-data/references/api_reference.md +379 -0
- package/skills/storytelling-with-data/references/review-checklist.md +111 -0
- package/skills/storytelling-with-data/scripts/chart_review.py +301 -0
- package/skills/storytelling-with-data/scripts/example.py +1 -0
- package/skills/system-design-interview/SKILL.md +233 -0
- package/skills/system-design-interview/assets/example_asset.txt +1 -0
- package/skills/system-design-interview/evals/evals.json +46 -0
- package/skills/system-design-interview/evals/results.json +13 -0
- package/skills/system-design-interview/examples/after.md +94 -0
- package/skills/system-design-interview/examples/before.md +27 -0
- package/skills/system-design-interview/references/api_reference.md +582 -0
- package/skills/system-design-interview/references/review-checklist.md +201 -0
- package/skills/system-design-interview/scripts/example.py +1 -0
- package/skills/system-design-interview/scripts/new_design.py +421 -0
- package/skills/using-asyncio-python/SKILL.md +290 -0
- package/skills/using-asyncio-python/assets/example_asset.txt +1 -0
- package/skills/using-asyncio-python/evals/evals.json +43 -0
- package/skills/using-asyncio-python/evals/results.json +13 -0
- package/skills/using-asyncio-python/examples/after.md +68 -0
- package/skills/using-asyncio-python/examples/before.md +39 -0
- package/skills/using-asyncio-python/references/api_reference.md +267 -0
- package/skills/using-asyncio-python/references/review-checklist.md +149 -0
- package/skills/using-asyncio-python/scripts/check_blocking.py +270 -0
- package/skills/using-asyncio-python/scripts/example.py +1 -0
- package/skills/web-scraping-python/SKILL.md +280 -0
- package/skills/web-scraping-python/assets/example_asset.txt +1 -0
- package/skills/web-scraping-python/evals/evals.json +46 -0
- package/skills/web-scraping-python/evals/results.json +13 -0
- package/skills/web-scraping-python/examples/after.md +109 -0
- package/skills/web-scraping-python/examples/before.md +40 -0
- package/skills/web-scraping-python/references/api_reference.md +393 -0
- package/skills/web-scraping-python/references/review-checklist.md +163 -0
- package/skills/web-scraping-python/scripts/example.py +1 -0
- package/skills/web-scraping-python/scripts/new_scraper.py +231 -0
- package/skills/writing-plans/audit.json +34 -0
- package/tests/agent-detector.test.js +83 -0
- package/tests/corrections.test.js +245 -0
- package/tests/doctor/hook-installer.test.js +72 -0
- package/tests/doctor/usage-tracker.test.js +140 -0
- package/tests/engine/benchmark-eval.test.js +31 -0
- package/tests/engine/bm25-index.test.js +85 -0
- package/tests/engine/capture-command.test.js +35 -0
- package/tests/engine/capture.test.js +17 -0
- package/tests/engine/graph-augmented-search.test.js +107 -0
- package/tests/engine/graph-injector.test.js +44 -0
- package/tests/engine/graph.test.js +216 -0
- package/tests/engine/hybrid-searcher.test.js +74 -0
- package/tests/engine/indexer-bm25.test.js +37 -0
- package/tests/engine/mcp-tools.test.js +73 -0
- package/tests/engine/project-initializer-mcp.test.js +99 -0
- package/tests/engine/query-expander.test.js +36 -0
- package/tests/engine/reranker.test.js +51 -0
- package/tests/engine/rrf.test.js +49 -0
- package/tests/engine/srag-prefix.test.js +47 -0
- package/tests/instinct-block.test.js +23 -0
- package/tests/mcp-config-writer.test.js +60 -0
- package/tests/project-initializer-new-agents.test.js +48 -0
- package/tests/rules/rules-manager.test.js +230 -0
- package/tests/well-known-builder.test.js +40 -0
- package/tests/wizard/integration-detector.test.js +31 -0
- package/tests/wizard/project-detector.test.js +51 -0
- package/tests/wizard/prompt-session.test.js +61 -0
- package/tests/wizard/prompt.test.js +16 -0
- package/tests/wizard/registry-embeddings.test.js +35 -0
- package/tests/wizard/skill-recommender.test.js +34 -0
- package/tests/wizard/slot-count.test.js +25 -0
- package/vercel.json +21 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# Web Scraping with Python — Scraper Review Checklist
|
|
2
|
+
|
|
3
|
+
Systematic checklist for reviewing web scrapers against the 18 chapters
|
|
4
|
+
from *Web Scraping with Python* by Ryan Mitchell.
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## 1. Fetching & Connection (Chapters 1, 10–11)
|
|
9
|
+
|
|
10
|
+
### HTTP Requests
|
|
11
|
+
- [ ] **Ch 1 — Error handling** — Are HTTP errors (4xx, 5xx), connection errors, and timeouts caught and handled?
|
|
12
|
+
- [ ] **Ch 1 — Response validation** — Is status code checked before parsing? Are non-200 responses handled?
|
|
13
|
+
- [ ] **Ch 1 — Timeout configuration** — Are request timeouts set to avoid hanging on unresponsive servers?
|
|
14
|
+
- [ ] **Ch 10 — Session usage** — Is `requests.Session()` used for cookie persistence and connection pooling?
|
|
15
|
+
|
|
16
|
+
### Authentication
|
|
17
|
+
- [ ] **Ch 10 — Login handling** — Is login implemented correctly with CSRF tokens and proper POST data?
|
|
18
|
+
- [ ] **Ch 10 — Session persistence** — Are cookies maintained across requests for authenticated scraping?
|
|
19
|
+
- [ ] **Ch 10 — Credential security** — Are login credentials stored in environment variables, not hardcoded?
|
|
20
|
+
- [ ] **Ch 10 — Session expiry** — Is session expiry detected and handled with automatic re-authentication?
|
|
21
|
+
|
|
22
|
+
### JavaScript Rendering
|
|
23
|
+
- [ ] **Ch 11 — Rendering need** — Is JavaScript rendering actually needed, or does the data exist in raw HTML or an API?
|
|
24
|
+
- [ ] **Ch 11 — Headless mode** — Is the browser running headless for server/production use?
|
|
25
|
+
- [ ] **Ch 11 — Explicit waits** — Are `WebDriverWait` with `expected_conditions` used instead of `time.sleep()`?
|
|
26
|
+
- [ ] **Ch 11 — Resource cleanup** — Is `driver.quit()` called in a finally block or context manager?
|
|
27
|
+
- [ ] **Ch 11 — Page load strategy** — Is the page load strategy appropriate (normal, eager, none)?
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## 2. Parsing & Extraction (Chapters 2, 7)
|
|
32
|
+
|
|
33
|
+
### HTML Parsing
|
|
34
|
+
- [ ] **Ch 2 — Parser choice** — Is an appropriate parser used (html.parser, lxml, html5lib)?
|
|
35
|
+
- [ ] **Ch 2 — Selector quality** — Are selectors specific enough to avoid false matches but flexible enough to survive minor changes?
|
|
36
|
+
- [ ] **Ch 2 — None checking** — Is `find()` result checked for None before accessing attributes or text?
|
|
37
|
+
- [ ] **Ch 2 — Multiple strategies** — Are fallback selectors used in case the primary selector fails?
|
|
38
|
+
- [ ] **Ch 2 — CSS selectors vs find** — Is `select()` used for complex hierarchical selection where appropriate?
|
|
39
|
+
|
|
40
|
+
### Data Extraction
|
|
41
|
+
- [ ] **Ch 2 — Attribute access** — Is `tag.get('href')` used instead of `tag['href']` to avoid KeyError?
|
|
42
|
+
- [ ] **Ch 2 — Text extraction** — Is `get_text(strip=True)` used for clean text content?
|
|
43
|
+
- [ ] **Ch 2 — Regex usage** — Are regex patterns compiled and used appropriately (not for HTML parsing)?
|
|
44
|
+
- [ ] **Ch 7 — Document handling** — Are non-HTML documents (PDF, Word) handled with appropriate libraries?
|
|
45
|
+
- [ ] **Ch 7 — Encoding** — Is character encoding handled correctly? Is UTF-8 enforced?
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 3. Crawling & Navigation (Chapters 3–5)
|
|
50
|
+
|
|
51
|
+
### URL Management
|
|
52
|
+
- [ ] **Ch 3 — URL normalization** — Are URLs normalized (resolve relative, strip fragments, handle trailing slashes)?
|
|
53
|
+
- [ ] **Ch 3 — Deduplication** — Is a visited set maintained? Are URLs checked before adding to queue?
|
|
54
|
+
- [ ] **Ch 3 — Scope control** — Is crawl scope defined (same domain, specific paths, depth limit)?
|
|
55
|
+
- [ ] **Ch 3 — Relative URL resolution** — Is `urljoin` used to resolve relative links against the base URL?
|
|
56
|
+
|
|
57
|
+
### Crawl Strategy
|
|
58
|
+
- [ ] **Ch 3 — Traversal order** — Is the right traversal used (BFS for breadth, DFS for depth)?
|
|
59
|
+
- [ ] **Ch 4 — Layout handling** — Are different page layouts detected and parsed appropriately?
|
|
60
|
+
- [ ] **Ch 4 — Data normalization** — Is extracted data normalized to a consistent schema across pages?
|
|
61
|
+
- [ ] **Ch 3 — Pagination** — Is pagination handled correctly (next links, page numbers, cursor)?
|
|
62
|
+
|
|
63
|
+
### Scrapy-Specific
|
|
64
|
+
- [ ] **Ch 5 — Item definitions** — Are Scrapy Items defined for structured data extraction?
|
|
65
|
+
- [ ] **Ch 5 — Pipeline usage** — Are item pipelines used for validation, cleaning, and storage?
|
|
66
|
+
- [ ] **Ch 5 — Rules configuration** — Are CrawlSpider rules properly configured with LinkExtractor?
|
|
67
|
+
- [ ] **Ch 5 — Settings tuning** — Are CONCURRENT_REQUESTS, DOWNLOAD_DELAY, and AUTOTHROTTLE configured?
|
|
68
|
+
- [ ] **Ch 5 — Logging** — Is logging configured at the appropriate level for production?
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## 4. Data Storage (Chapter 6)
|
|
73
|
+
|
|
74
|
+
### Storage Patterns
|
|
75
|
+
- [ ] **Ch 6 — Format choice** — Is the right storage format used (CSV for simple, database for relational, JSON for nested)?
|
|
76
|
+
- [ ] **Ch 6 — Duplicate prevention** — Are duplicates detected and handled (UPSERT, unique constraints)?
|
|
77
|
+
- [ ] **Ch 6 — Batch operations** — Are database writes batched instead of per-row for efficiency?
|
|
78
|
+
- [ ] **Ch 6 — Connection management** — Are database connections properly opened, pooled, and closed?
|
|
79
|
+
|
|
80
|
+
### Data Integrity
|
|
81
|
+
- [ ] **Ch 6 — Schema enforcement** — Is extracted data validated against expected schema before storage?
|
|
82
|
+
- [ ] **Ch 6 — Raw preservation** — Is raw HTML/response stored alongside extracted data for re-parsing?
|
|
83
|
+
- [ ] **Ch 6 — Encoding handling** — Are files written with explicit UTF-8 encoding?
|
|
84
|
+
- [ ] **Ch 6 — Error on write** — Are storage errors caught and handled (disk full, DB connection lost)?
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## 5. Data Quality (Chapters 8, 15)
|
|
89
|
+
|
|
90
|
+
### Cleaning
|
|
91
|
+
- [ ] **Ch 8 — Whitespace normalization** — Is whitespace stripped and normalized in extracted text?
|
|
92
|
+
- [ ] **Ch 8 — Unicode normalization** — Is Unicode text normalized (NFKD or NFC) for consistency?
|
|
93
|
+
- [ ] **Ch 8 — Type conversion** — Are strings converted to appropriate types (int, float, date) with error handling?
|
|
94
|
+
- [ ] **Ch 8 — Pattern cleaning** — Are regex patterns used to extract clean data from messy strings?
|
|
95
|
+
|
|
96
|
+
### Testing
|
|
97
|
+
- [ ] **Ch 15 — Parser unit tests** — Are parsing functions tested with saved HTML fixtures?
|
|
98
|
+
- [ ] **Ch 15 — Edge case tests** — Are missing elements, empty pages, and malformed HTML tested?
|
|
99
|
+
- [ ] **Ch 15 — Integration tests** — Is the full pipeline tested end-to-end?
|
|
100
|
+
- [ ] **Ch 15 — Change detection** — Is there monitoring for when the target site changes structure?
|
|
101
|
+
- [ ] **Ch 15 — CI integration** — Are scraper tests automated in a CI pipeline?
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## 6. Resilience & Performance (Chapters 14, 16)
|
|
106
|
+
|
|
107
|
+
### Anti-Detection
|
|
108
|
+
- [ ] **Ch 14 — User-Agent** — Is a realistic User-Agent header set? Is rotation implemented for scale?
|
|
109
|
+
- [ ] **Ch 14 — Request headers** — Are Accept, Accept-Language, and other standard headers included?
|
|
110
|
+
- [ ] **Ch 14 — Request delays** — Are random delays added between requests (not fixed intervals)?
|
|
111
|
+
- [ ] **Ch 14 — Cookie handling** — Are cookies accepted and maintained properly?
|
|
112
|
+
- [ ] **Ch 14 — Honeypot avoidance** — Are hidden links (display:none, visibility:hidden) detected and avoided?
|
|
113
|
+
|
|
114
|
+
### Performance
|
|
115
|
+
- [ ] **Ch 16 — Parallelism** — Is parallel scraping used for large-scale jobs (threading or multiprocessing)?
|
|
116
|
+
- [ ] **Ch 16 — Thread safety** — Are shared data structures properly protected with locks or queues?
|
|
117
|
+
- [ ] **Ch 16 — Per-domain limits** — Are concurrent requests limited per domain even with parallel scraping?
|
|
118
|
+
- [ ] **Ch 16 — Graceful shutdown** — Can the scraper shut down cleanly, saving state for resumption?
|
|
119
|
+
|
|
120
|
+
### Error Recovery
|
|
121
|
+
- [ ] **Ch 14 — Retry logic** — Are transient errors retried with backoff? Are permanent errors skipped?
|
|
122
|
+
- [ ] **Ch 14 — Block detection** — Are 403/captcha responses detected as potential blocks?
|
|
123
|
+
- [ ] **Ch 16 — Worker isolation** — Does one worker's failure not crash the entire scraper?
|
|
124
|
+
- [ ] **Ch 14 — State persistence** — Can the scraper resume from where it left off after a crash?
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## 7. Ethics & Legal (Chapters 17–18)
|
|
129
|
+
|
|
130
|
+
### Compliance
|
|
131
|
+
- [ ] **Ch 18 — robots.txt** — Is robots.txt fetched and respected before crawling?
|
|
132
|
+
- [ ] **Ch 18 — Terms of Service** — Has the target site's ToS been reviewed for scraping restrictions?
|
|
133
|
+
- [ ] **Ch 18 — Rate respect** — Is the scraping rate respectful of server resources?
|
|
134
|
+
- [ ] **Ch 18 — Data rights** — Is scraped data handled in compliance with copyright and privacy laws?
|
|
135
|
+
- [ ] **Ch 18 — GDPR compliance** — If scraping personal data, are GDPR obligations met?
|
|
136
|
+
|
|
137
|
+
### Anonymity & Infrastructure
|
|
138
|
+
- [ ] **Ch 17 — Proxy usage** — Are proxies used appropriately when needed for scale or anonymity?
|
|
139
|
+
- [ ] **Ch 17 — Tor appropriateness** — Is Tor used only when genuinely needed, not as a default?
|
|
140
|
+
- [ ] **Ch 17 — IP verification** — Is proxy/Tor IP verified before scraping sensitive targets?
|
|
141
|
+
- [ ] **Ch 14 — Identification** — Does the User-Agent identify the scraper and provide contact info?
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Quick Review Workflow
|
|
146
|
+
|
|
147
|
+
1. **Fetching pass** — Verify request handling, error handling, session usage, JS rendering needs
|
|
148
|
+
2. **Parsing pass** — Check selector quality, None handling, defensive parsing, fallback strategies
|
|
149
|
+
3. **Crawling pass** — Verify URL management, deduplication, pagination, scope control
|
|
150
|
+
4. **Storage pass** — Check data format, duplicate handling, raw preservation, encoding
|
|
151
|
+
5. **Quality pass** — Verify data cleaning, testing coverage, change detection
|
|
152
|
+
6. **Resilience pass** — Check rate limiting, parallelism, retry logic, anti-detection
|
|
153
|
+
7. **Ethics pass** — Verify robots.txt compliance, legal awareness, respectful crawling
|
|
154
|
+
8. **Prioritize findings** — Rank by severity: legal risk > data loss > reliability > performance > best practices
|
|
155
|
+
|
|
156
|
+
## Severity Levels
|
|
157
|
+
|
|
158
|
+
| Severity | Description | Example |
|
|
159
|
+
|----------|-------------|---------|
|
|
160
|
+
| **Critical** | Legal risk, data loss, or server harm | Ignoring robots.txt, no rate limiting (hammering server), hardcoded credentials, GDPR violations |
|
|
161
|
+
| **High** | Reliability or data quality issues | No error handling, missing None checks, no session management, no deduplication |
|
|
162
|
+
| **Medium** | Performance, maintainability, or operational gaps | No parallel scraping for large jobs, no testing, fixed delays instead of random, no logging |
|
|
163
|
+
| **Low** | Best practice improvements | Missing User-Agent rotation, no raw HTML storage, no change detection, minor code organization |
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
new_scraper.py — Scaffold a best-practice web scraper.
|
|
4
|
+
Usage: python new_scraper.py <scraper-name> <target-url>
|
|
5
|
+
|
|
6
|
+
Generates <scraper-name>.py — a real, runnable scraper with retry, rate limiting,
|
|
7
|
+
robots.txt checking, BeautifulSoup parsing, and CSV output.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from string import Template
|
|
13
|
+
|
|
14
|
+
# Template for the generated scraper, rendered with string.Template.
# NOTE: placeholders followed by identifier characters must be brace-delimited
# (e.g. ${scraper_name}_output) — otherwise Template greedily parses the longer
# identifier ("scraper_name_output") and safe_substitute leaves it unreplaced.
SCRAPER_TEMPLATE = '''\
#!/usr/bin/env python3
"""
$scraper_name — scraper for $target_url
Generated by new_scraper.py. Edit the parse() function for your target site.
"""

import csv
import logging
import time
import urllib.parse
import urllib.robotparser
from datetime import datetime, timezone
from pathlib import Path

try:
    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    from bs4 import BeautifulSoup
except ImportError as exc:
    raise SystemExit(
        f"Missing dependency: {exc}\\n"
        "Install with: pip install requests beautifulsoup4"
    ) from exc

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%dT%H:%M:%S",
)
logger = logging.getLogger(__name__)

TARGET_URL = "$target_url"
OUTPUT_CSV = "${scraper_name}_output.csv"
REQUEST_DELAY = 1.5  # seconds between requests — be polite
USER_AGENT = "research-bot/1.0 (+https://example.com/bot)"


# ---------------------------------------------------------------------------
# Session with retry
# ---------------------------------------------------------------------------

def make_session() -> requests.Session:
    """Build a requests Session with automatic retries on transient errors."""
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        backoff_factor=1.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "HEAD"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    session.headers.update({"User-Agent": USER_AGENT})
    return session


# ---------------------------------------------------------------------------
# Robots.txt
# ---------------------------------------------------------------------------

def check_robots(url: str, user_agent: str = USER_AGENT) -> bool:
    """Return True if scraping the URL is permitted by robots.txt."""
    parsed = urllib.parse.urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        allowed = rp.can_fetch(user_agent, url)
        if not allowed:
            logger.warning("robots.txt disallows scraping: %s", url)
        return allowed
    except Exception as exc:
        logger.warning("Could not read robots.txt (%s) — proceeding cautiously.", exc)
        return True  # assume allowed if robots.txt is unreachable


# ---------------------------------------------------------------------------
# Parse — EDIT THIS FUNCTION for your target site
# ---------------------------------------------------------------------------

def parse(html: str, source_url: str) -> list[dict]:
    """
    Extract structured data from a page. Returns a list of dicts.
    Edit the selectors below for your actual target.
    """
    soup = BeautifulSoup(html, "html.parser")
    records = []

    # Example: scrape all hyperlinks with their text
    # Replace this block with selectors for your target site.
    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.get_text(strip=True)
        if not text:
            continue
        # Resolve relative URLs
        full_url = urllib.parse.urljoin(source_url, href)
        records.append({
            "text": text,
            "url": full_url,
            "source_page": source_url,
            # timezone-aware UTC timestamp (datetime.utcnow() is deprecated)
            "scraped_at": datetime.now(timezone.utc).isoformat(),
        })

    return records


# ---------------------------------------------------------------------------
# Core fetch + crawl logic
# ---------------------------------------------------------------------------

def fetch_page(session: requests.Session, url: str) -> str | None:
    """Fetch a single page and return HTML. Returns None on failure."""
    try:
        response = session.get(url, timeout=20)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as exc:
        logger.error("Failed to fetch %s: %s", url, exc)
        return None


def scrape(urls: list[str] | None = None) -> list[dict]:
    """
    Main scrape loop. Pass a list of URLs or leave None to scrape TARGET_URL.
    Respects robots.txt and rate-limits requests.
    """
    urls = urls or [TARGET_URL]
    session = make_session()
    all_records: list[dict] = []

    for i, url in enumerate(urls):
        if not check_robots(url):
            logger.info("Skipping disallowed URL: %s", url)
            continue

        logger.info("Fetching (%d/%d): %s", i + 1, len(urls), url)
        html = fetch_page(session, url)
        if html is None:
            continue

        records = parse(html, url)
        logger.info(" -> %d records found", len(records))
        all_records.extend(records)

        if i < len(urls) - 1:
            time.sleep(REQUEST_DELAY)  # rate limit between pages

    return all_records


# ---------------------------------------------------------------------------
# CSV output
# ---------------------------------------------------------------------------

def save_csv(records: list[dict], path: str = OUTPUT_CSV) -> None:
    """Write records to a CSV file."""
    if not records:
        logger.warning("No records to save.")
        return
    out = Path(path)
    with out.open("w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=records[0].keys())
        writer.writeheader()
        writer.writerows(records)
    logger.info("Saved %d records to %s", len(records), out)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    records = scrape()
    save_csv(records)
'''
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def main():
    """Scaffold a new scraper script from the CLI arguments.

    Usage: python new_scraper.py <scraper-name> <target-url>

    Writes <scraper-name>.py (executable) in the current directory and prints
    next steps. Exits with status 1 if arguments are missing or the output
    file already exists.
    """
    if len(sys.argv) < 3:
        print("Usage: python new_scraper.py <scraper-name> <target-url>")
        sys.exit(1)

    scraper_name = sys.argv[1]
    target_url = sys.argv[2]

    # Basic URL sanity check — warn but do not block, so intranet schemes work
    if not target_url.startswith(("http://", "https://")):
        print(f"Warning: target URL '{target_url}' doesn't look like a full URL.")

    output_path = Path(f"{scraper_name}.py")
    if output_path.exists():
        print(f"Error: '{output_path}' already exists. Choose a different name.")
        sys.exit(1)

    # Hyphens are common in CLI names but invalid inside Python identifiers,
    # so normalize before substituting into the template.
    safe_name = scraper_name.replace("-", "_")
    content = Template(SCRAPER_TEMPLATE).safe_substitute(
        scraper_name=safe_name,
        target_url=target_url,
    )
    output_path.write_text(content, encoding="utf-8")
    output_path.chmod(0o755)  # make the generated scraper directly runnable

    print(f"\nScraper '{scraper_name}' created: {output_path}\n")
    print(f" Target URL : {target_url}")
    print(f" Output CSV : {safe_name}_output.csv")
    print("\nNext steps:")
    print(" 1. pip install requests beautifulsoup4")
    print(f" 2. Edit the parse() function in {output_path} for your target site")
    print(f" 3. python {output_path}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "writing-plans",
|
|
3
|
+
"description": "Static prose quality checks for markdown documents — specs, PRDs, user stories, plans.",
|
|
4
|
+
"rules": [
|
|
5
|
+
{
|
|
6
|
+
"id": "wp-001",
|
|
7
|
+
"name": "Passive voice",
|
|
8
|
+
"pattern": "\\b(was|were)\\s+[a-z]+ed\\b",
|
|
9
|
+
"message": "Passive voice detected — prefer active voice for clarity",
|
|
10
|
+
"severity": "LOW"
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "wp-002",
|
|
14
|
+
"name": "Unresolved placeholder",
|
|
15
|
+
"pattern": "\\b(TBD|TBC|TODO|FIXME|lorem ipsum)\\b",
|
|
16
|
+
"message": "Unresolved placeholder — fill in before publishing",
|
|
17
|
+
"severity": "HIGH"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "wp-003",
|
|
21
|
+
"name": "Hedge words",
|
|
22
|
+
"pattern": "\\b(should probably|might want to|could potentially|sort of|kind of|basically|essentially)\\b",
|
|
23
|
+
"message": "Hedge word weakens intent — use precise language",
|
|
24
|
+
"severity": "LOW"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"id": "wp-004",
|
|
28
|
+
"name": "User story without acceptance criteria",
|
|
29
|
+
"pattern": "[Aa]s a (?:user|customer|admin|developer).{0,80}(?:[Ii] want|[Ii] need|[Ii] can)",
|
|
30
|
+
"message": "User story found — ensure acceptance criteria (Given/When/Then) are defined nearby",
|
|
31
|
+
"severity": "MEDIUM"
|
|
32
|
+
}
|
|
33
|
+
]
|
|
34
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import { test } from 'node:test';
|
|
2
|
+
import assert from 'node:assert/strict';
|
|
3
|
+
import { mkdtempSync, rmSync, mkdirSync, writeFileSync } from 'node:fs';
|
|
4
|
+
import { tmpdir } from 'node:os';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
import { AgentDetector } from '../lib/agent-detector.js';
|
|
7
|
+
|
|
8
|
+
// Create a fresh, uniquely named temporary working directory for one test case.
function tmpDir() {
  const prefix = path.join(tmpdir(), 'booklib-detect-test-');
  return mkdtempSync(prefix);
}
|
|
11
|
+
|
|
12
|
+
// claude is the unconditional baseline agent; detection needs no fixtures.
test('always detects claude', () => {
  const cwd = tmpDir();
  try {
    const detector = new AgentDetector({ cwd, checkPath: false });
    const detected = detector.detect();
    assert.ok(detected.includes('claude'), 'claude should always be detected');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
  }
});
|
|
19
|
+
|
|
20
|
+
// A bare .cursor/ directory in the project root signals the Cursor agent.
test('detects cursor when .cursor/ directory exists', () => {
  const cwd = tmpDir();
  try {
    mkdirSync(path.join(cwd, '.cursor'));
    const detector = new AgentDetector({ cwd, checkPath: false });
    const detected = detector.detect();
    assert.ok(detected.includes('cursor'), 'cursor not detected from .cursor/');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
  }
});
|
|
28
|
+
|
|
29
|
+
// A .roo/ directory in the project root signals the Roo Code agent.
test('detects roo-code when .roo/ directory exists', () => {
  const cwd = tmpDir();
  try {
    mkdirSync(path.join(cwd, '.roo'));
    const detector = new AgentDetector({ cwd, checkPath: false });
    const detected = detector.detect();
    assert.ok(detected.includes('roo-code'), 'roo-code not detected from .roo/');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
  }
});
|
|
37
|
+
|
|
38
|
+
// A .junie/ directory in the project root signals the Junie agent.
test('detects junie when .junie/ directory exists', () => {
  const cwd = tmpDir();
  try {
    mkdirSync(path.join(cwd, '.junie'));
    const detector = new AgentDetector({ cwd, checkPath: false });
    const detected = detector.detect();
    assert.ok(detected.includes('junie'), 'junie not detected from .junie/');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
  }
});
|
|
46
|
+
|
|
47
|
+
// Negative case: with no goose fixtures present, goose must not appear.
test('does not detect goose when no signals present', () => {
  const cwd = tmpDir();
  try {
    const detector = new AgentDetector({ cwd, checkPath: false });
    const detected = detector.detect();
    assert.ok(!detected.includes('goose'), 'goose falsely detected');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
  }
});
|
|
54
|
+
|
|
55
|
+
// An opencode.toml config file (even empty) signals the opencode agent.
test('detects opencode when opencode.toml exists', () => {
  const cwd = tmpDir();
  try {
    writeFileSync(path.join(cwd, 'opencode.toml'), '');
    const detector = new AgentDetector({ cwd, checkPath: false });
    const detected = detector.detect();
    assert.ok(detected.includes('opencode'), 'opencode not detected from opencode.toml');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
  }
});
|
|
63
|
+
|
|
64
|
+
// A github.copilot-* VS Code extension directory under the (injected) home
// directory signals Copilot; `home` overrides the real home for isolation.
test('detects copilot when VS Code extension directory exists', () => {
  const cwd = tmpDir();
  const home = tmpDir();
  try {
    mkdirSync(path.join(home, '.vscode', 'extensions', 'github.copilot-1.234.0'), { recursive: true });
    const detector = new AgentDetector({ cwd, checkPath: false, home });
    const detected = detector.detect();
    assert.ok(detected.includes('copilot'), 'copilot not detected from VS Code extension');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
    rmSync(home, { recursive: true });
  }
});
|
|
74
|
+
|
|
75
|
+
// Negative case: an empty home directory must not trigger copilot detection.
test('does not detect copilot when no VS Code extensions exist', () => {
  const cwd = tmpDir();
  const home = tmpDir();
  try {
    const detector = new AgentDetector({ cwd, checkPath: false, home });
    const detected = detector.detect();
    assert.ok(!detected.includes('copilot'), 'copilot falsely detected');
  } finally {
    // Clean up even when the assertion throws, so temp dirs don't leak.
    rmSync(cwd, { recursive: true });
    rmSync(home, { recursive: true });
  }
});
|