@booklib/core 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cursor/rules/booklib-standards.mdc +40 -0
- package/.gemini/context.md +372 -0
- package/AGENTS.md +166 -0
- package/CHANGELOG.md +226 -0
- package/CLAUDE.md +81 -0
- package/CODE_OF_CONDUCT.md +31 -0
- package/CONTRIBUTING.md +304 -0
- package/LICENSE +21 -0
- package/PLAN.md +28 -0
- package/README.ja.md +198 -0
- package/README.ko.md +198 -0
- package/README.md +503 -0
- package/README.pt-BR.md +198 -0
- package/README.uk.md +241 -0
- package/README.zh-CN.md +198 -0
- package/SECURITY.md +9 -0
- package/agents/architecture-reviewer.md +136 -0
- package/agents/booklib-reviewer.md +90 -0
- package/agents/data-reviewer.md +107 -0
- package/agents/jvm-reviewer.md +146 -0
- package/agents/python-reviewer.md +128 -0
- package/agents/rust-reviewer.md +115 -0
- package/agents/ts-reviewer.md +110 -0
- package/agents/ui-reviewer.md +117 -0
- package/assets/logo.svg +36 -0
- package/bin/booklib-mcp.js +304 -0
- package/bin/booklib.js +1705 -0
- package/bin/skills.cjs +1292 -0
- package/booklib-router.mdc +36 -0
- package/booklib.config.json +19 -0
- package/commands/animation-at-work.md +10 -0
- package/commands/clean-code-reviewer.md +10 -0
- package/commands/data-intensive-patterns.md +10 -0
- package/commands/data-pipelines.md +10 -0
- package/commands/design-patterns.md +10 -0
- package/commands/domain-driven-design.md +10 -0
- package/commands/effective-java.md +10 -0
- package/commands/effective-kotlin.md +10 -0
- package/commands/effective-python.md +10 -0
- package/commands/effective-typescript.md +10 -0
- package/commands/kotlin-in-action.md +10 -0
- package/commands/lean-startup.md +10 -0
- package/commands/microservices-patterns.md +10 -0
- package/commands/programming-with-rust.md +10 -0
- package/commands/refactoring-ui.md +10 -0
- package/commands/rust-in-action.md +10 -0
- package/commands/skill-router.md +10 -0
- package/commands/spring-boot-in-action.md +10 -0
- package/commands/storytelling-with-data.md +10 -0
- package/commands/system-design-interview.md +10 -0
- package/commands/using-asyncio-python.md +10 -0
- package/commands/web-scraping-python.md +10 -0
- package/community/registry.json +1616 -0
- package/hooks/hooks.json +23 -0
- package/hooks/posttooluse-capture.mjs +67 -0
- package/hooks/suggest.js +153 -0
- package/lib/agent-behaviors.js +40 -0
- package/lib/agent-detector.js +96 -0
- package/lib/config-loader.js +39 -0
- package/lib/conflict-resolver.js +148 -0
- package/lib/context-builder.js +574 -0
- package/lib/discovery-engine.js +298 -0
- package/lib/doctor/hook-installer.js +83 -0
- package/lib/doctor/usage-tracker.js +87 -0
- package/lib/engine/ai-features.js +253 -0
- package/lib/engine/auditor.js +103 -0
- package/lib/engine/bm25-index.js +178 -0
- package/lib/engine/capture.js +120 -0
- package/lib/engine/corrections.js +198 -0
- package/lib/engine/doctor.js +195 -0
- package/lib/engine/graph-injector.js +137 -0
- package/lib/engine/graph.js +161 -0
- package/lib/engine/handoff.js +405 -0
- package/lib/engine/indexer.js +242 -0
- package/lib/engine/parser.js +53 -0
- package/lib/engine/query-expander.js +42 -0
- package/lib/engine/reranker.js +40 -0
- package/lib/engine/rrf.js +59 -0
- package/lib/engine/scanner.js +151 -0
- package/lib/engine/searcher.js +139 -0
- package/lib/engine/session-coordinator.js +306 -0
- package/lib/engine/session-manager.js +429 -0
- package/lib/engine/synthesizer.js +70 -0
- package/lib/installer.js +70 -0
- package/lib/instinct-block.js +33 -0
- package/lib/mcp-config-writer.js +88 -0
- package/lib/paths.js +57 -0
- package/lib/profiles/design.md +19 -0
- package/lib/profiles/general.md +16 -0
- package/lib/profiles/research-analysis.md +22 -0
- package/lib/profiles/software-development.md +23 -0
- package/lib/profiles/writing-content.md +19 -0
- package/lib/project-initializer.js +916 -0
- package/lib/registry/skills.js +102 -0
- package/lib/registry-searcher.js +99 -0
- package/lib/rules/rules-manager.js +169 -0
- package/lib/skill-fetcher.js +333 -0
- package/lib/well-known-builder.js +70 -0
- package/lib/wizard/index.js +404 -0
- package/lib/wizard/integration-detector.js +41 -0
- package/lib/wizard/project-detector.js +100 -0
- package/lib/wizard/prompt.js +156 -0
- package/lib/wizard/registry-embeddings.js +107 -0
- package/lib/wizard/skill-recommender.js +69 -0
- package/llms-full.txt +254 -0
- package/llms.txt +70 -0
- package/package.json +45 -0
- package/research-reports/2026-04-01-current-architecture.md +160 -0
- package/research-reports/IDEAS.md +93 -0
- package/rules/common/clean-code.md +42 -0
- package/rules/java/effective-java.md +42 -0
- package/rules/kotlin/effective-kotlin.md +37 -0
- package/rules/python/effective-python.md +38 -0
- package/rules/rust/rust.md +37 -0
- package/rules/typescript/effective-typescript.md +42 -0
- package/scripts/gen-llms-full.mjs +36 -0
- package/scripts/gen-og.mjs +142 -0
- package/scripts/validate-frontmatter.js +25 -0
- package/skills/animation-at-work/SKILL.md +270 -0
- package/skills/animation-at-work/assets/example_asset.txt +1 -0
- package/skills/animation-at-work/evals/evals.json +44 -0
- package/skills/animation-at-work/evals/results.json +13 -0
- package/skills/animation-at-work/examples/after.md +64 -0
- package/skills/animation-at-work/examples/before.md +35 -0
- package/skills/animation-at-work/references/api_reference.md +369 -0
- package/skills/animation-at-work/references/review-checklist.md +79 -0
- package/skills/animation-at-work/scripts/audit_animations.py +295 -0
- package/skills/animation-at-work/scripts/example.py +1 -0
- package/skills/clean-code-reviewer/SKILL.md +444 -0
- package/skills/clean-code-reviewer/audit.json +35 -0
- package/skills/clean-code-reviewer/evals/evals.json +185 -0
- package/skills/clean-code-reviewer/evals/results.json +13 -0
- package/skills/clean-code-reviewer/examples/after.md +48 -0
- package/skills/clean-code-reviewer/examples/before.md +33 -0
- package/skills/clean-code-reviewer/references/api_reference.md +158 -0
- package/skills/clean-code-reviewer/references/practices-catalog.md +282 -0
- package/skills/clean-code-reviewer/references/review-checklist.md +254 -0
- package/skills/clean-code-reviewer/scripts/pre-review.py +206 -0
- package/skills/data-intensive-patterns/SKILL.md +267 -0
- package/skills/data-intensive-patterns/assets/example_asset.txt +1 -0
- package/skills/data-intensive-patterns/evals/evals.json +54 -0
- package/skills/data-intensive-patterns/evals/results.json +13 -0
- package/skills/data-intensive-patterns/examples/after.md +61 -0
- package/skills/data-intensive-patterns/examples/before.md +38 -0
- package/skills/data-intensive-patterns/references/api_reference.md +34 -0
- package/skills/data-intensive-patterns/references/patterns-catalog.md +551 -0
- package/skills/data-intensive-patterns/references/review-checklist.md +193 -0
- package/skills/data-intensive-patterns/scripts/adr.py +213 -0
- package/skills/data-intensive-patterns/scripts/example.py +1 -0
- package/skills/data-pipelines/SKILL.md +259 -0
- package/skills/data-pipelines/assets/example_asset.txt +1 -0
- package/skills/data-pipelines/evals/evals.json +45 -0
- package/skills/data-pipelines/evals/results.json +13 -0
- package/skills/data-pipelines/examples/after.md +97 -0
- package/skills/data-pipelines/examples/before.md +37 -0
- package/skills/data-pipelines/references/api_reference.md +301 -0
- package/skills/data-pipelines/references/review-checklist.md +181 -0
- package/skills/data-pipelines/scripts/example.py +1 -0
- package/skills/data-pipelines/scripts/new_pipeline.py +444 -0
- package/skills/design-patterns/SKILL.md +271 -0
- package/skills/design-patterns/assets/example_asset.txt +1 -0
- package/skills/design-patterns/evals/evals.json +46 -0
- package/skills/design-patterns/evals/results.json +13 -0
- package/skills/design-patterns/examples/after.md +52 -0
- package/skills/design-patterns/examples/before.md +29 -0
- package/skills/design-patterns/references/api_reference.md +1 -0
- package/skills/design-patterns/references/patterns-catalog.md +726 -0
- package/skills/design-patterns/references/review-checklist.md +173 -0
- package/skills/design-patterns/scripts/example.py +1 -0
- package/skills/design-patterns/scripts/scaffold.py +807 -0
- package/skills/domain-driven-design/SKILL.md +142 -0
- package/skills/domain-driven-design/assets/example_asset.txt +1 -0
- package/skills/domain-driven-design/evals/evals.json +48 -0
- package/skills/domain-driven-design/evals/results.json +13 -0
- package/skills/domain-driven-design/examples/after.md +80 -0
- package/skills/domain-driven-design/examples/before.md +43 -0
- package/skills/domain-driven-design/references/api_reference.md +1 -0
- package/skills/domain-driven-design/references/patterns-catalog.md +545 -0
- package/skills/domain-driven-design/references/review-checklist.md +158 -0
- package/skills/domain-driven-design/scripts/example.py +1 -0
- package/skills/domain-driven-design/scripts/scaffold.py +421 -0
- package/skills/effective-java/SKILL.md +227 -0
- package/skills/effective-java/assets/example_asset.txt +1 -0
- package/skills/effective-java/evals/evals.json +46 -0
- package/skills/effective-java/evals/results.json +13 -0
- package/skills/effective-java/examples/after.md +83 -0
- package/skills/effective-java/examples/before.md +37 -0
- package/skills/effective-java/references/api_reference.md +1 -0
- package/skills/effective-java/references/items-catalog.md +955 -0
- package/skills/effective-java/references/review-checklist.md +216 -0
- package/skills/effective-java/scripts/checkstyle_setup.py +211 -0
- package/skills/effective-java/scripts/example.py +1 -0
- package/skills/effective-kotlin/SKILL.md +271 -0
- package/skills/effective-kotlin/assets/example_asset.txt +1 -0
- package/skills/effective-kotlin/audit.json +29 -0
- package/skills/effective-kotlin/evals/evals.json +45 -0
- package/skills/effective-kotlin/evals/results.json +13 -0
- package/skills/effective-kotlin/examples/after.md +36 -0
- package/skills/effective-kotlin/examples/before.md +38 -0
- package/skills/effective-kotlin/references/api_reference.md +1 -0
- package/skills/effective-kotlin/references/practices-catalog.md +1228 -0
- package/skills/effective-kotlin/references/review-checklist.md +126 -0
- package/skills/effective-kotlin/scripts/example.py +1 -0
- package/skills/effective-python/SKILL.md +441 -0
- package/skills/effective-python/evals/evals.json +44 -0
- package/skills/effective-python/evals/results.json +13 -0
- package/skills/effective-python/examples/after.md +56 -0
- package/skills/effective-python/examples/before.md +40 -0
- package/skills/effective-python/ref-01-pythonic-thinking.md +202 -0
- package/skills/effective-python/ref-02-lists-and-dicts.md +146 -0
- package/skills/effective-python/ref-03-functions.md +186 -0
- package/skills/effective-python/ref-04-comprehensions-generators.md +211 -0
- package/skills/effective-python/ref-05-classes-interfaces.md +188 -0
- package/skills/effective-python/ref-06-metaclasses-attributes.md +209 -0
- package/skills/effective-python/ref-07-concurrency.md +213 -0
- package/skills/effective-python/ref-08-robustness-performance.md +248 -0
- package/skills/effective-python/ref-09-testing-debugging.md +253 -0
- package/skills/effective-python/ref-10-collaboration.md +175 -0
- package/skills/effective-python/references/api_reference.md +218 -0
- package/skills/effective-python/references/practices-catalog.md +483 -0
- package/skills/effective-python/references/review-checklist.md +190 -0
- package/skills/effective-python/scripts/lint.py +173 -0
- package/skills/effective-typescript/SKILL.md +262 -0
- package/skills/effective-typescript/audit.json +29 -0
- package/skills/effective-typescript/evals/evals.json +37 -0
- package/skills/effective-typescript/evals/results.json +13 -0
- package/skills/effective-typescript/examples/after.md +70 -0
- package/skills/effective-typescript/examples/before.md +47 -0
- package/skills/effective-typescript/references/api_reference.md +118 -0
- package/skills/effective-typescript/references/practices-catalog.md +371 -0
- package/skills/effective-typescript/scripts/review.py +169 -0
- package/skills/kotlin-in-action/SKILL.md +261 -0
- package/skills/kotlin-in-action/assets/example_asset.txt +1 -0
- package/skills/kotlin-in-action/evals/evals.json +43 -0
- package/skills/kotlin-in-action/evals/results.json +13 -0
- package/skills/kotlin-in-action/examples/after.md +53 -0
- package/skills/kotlin-in-action/examples/before.md +39 -0
- package/skills/kotlin-in-action/references/api_reference.md +1 -0
- package/skills/kotlin-in-action/references/practices-catalog.md +436 -0
- package/skills/kotlin-in-action/references/review-checklist.md +204 -0
- package/skills/kotlin-in-action/scripts/example.py +1 -0
- package/skills/kotlin-in-action/scripts/setup_detekt.py +224 -0
- package/skills/lean-startup/SKILL.md +160 -0
- package/skills/lean-startup/assets/example_asset.txt +1 -0
- package/skills/lean-startup/evals/evals.json +43 -0
- package/skills/lean-startup/evals/results.json +13 -0
- package/skills/lean-startup/examples/after.md +80 -0
- package/skills/lean-startup/examples/before.md +34 -0
- package/skills/lean-startup/references/api_reference.md +319 -0
- package/skills/lean-startup/references/review-checklist.md +137 -0
- package/skills/lean-startup/scripts/example.py +1 -0
- package/skills/lean-startup/scripts/new_experiment.py +286 -0
- package/skills/microservices-patterns/SKILL.md +384 -0
- package/skills/microservices-patterns/evals/evals.json +45 -0
- package/skills/microservices-patterns/evals/results.json +13 -0
- package/skills/microservices-patterns/examples/after.md +69 -0
- package/skills/microservices-patterns/examples/before.md +40 -0
- package/skills/microservices-patterns/references/patterns-catalog.md +391 -0
- package/skills/microservices-patterns/references/review-checklist.md +169 -0
- package/skills/microservices-patterns/scripts/new_service.py +583 -0
- package/skills/programming-with-rust/SKILL.md +209 -0
- package/skills/programming-with-rust/evals/evals.json +37 -0
- package/skills/programming-with-rust/evals/results.json +13 -0
- package/skills/programming-with-rust/examples/after.md +107 -0
- package/skills/programming-with-rust/examples/before.md +59 -0
- package/skills/programming-with-rust/references/api_reference.md +152 -0
- package/skills/programming-with-rust/references/practices-catalog.md +335 -0
- package/skills/programming-with-rust/scripts/review.py +142 -0
- package/skills/refactoring-ui/SKILL.md +362 -0
- package/skills/refactoring-ui/assets/example_asset.txt +1 -0
- package/skills/refactoring-ui/evals/evals.json +45 -0
- package/skills/refactoring-ui/evals/results.json +13 -0
- package/skills/refactoring-ui/examples/after.md +85 -0
- package/skills/refactoring-ui/examples/before.md +58 -0
- package/skills/refactoring-ui/references/api_reference.md +355 -0
- package/skills/refactoring-ui/references/review-checklist.md +114 -0
- package/skills/refactoring-ui/scripts/audit_css.py +250 -0
- package/skills/refactoring-ui/scripts/example.py +1 -0
- package/skills/rust-in-action/SKILL.md +350 -0
- package/skills/rust-in-action/evals/evals.json +38 -0
- package/skills/rust-in-action/evals/results.json +13 -0
- package/skills/rust-in-action/examples/after.md +156 -0
- package/skills/rust-in-action/examples/before.md +56 -0
- package/skills/rust-in-action/references/practices-catalog.md +346 -0
- package/skills/rust-in-action/scripts/review.py +147 -0
- package/skills/skill-router/SKILL.md +186 -0
- package/skills/skill-router/evals/evals.json +38 -0
- package/skills/skill-router/evals/results.json +13 -0
- package/skills/skill-router/examples/after.md +63 -0
- package/skills/skill-router/examples/before.md +39 -0
- package/skills/skill-router/references/api_reference.md +24 -0
- package/skills/skill-router/references/routing-heuristics.md +89 -0
- package/skills/skill-router/references/skill-catalog.md +174 -0
- package/skills/skill-router/scripts/route.py +266 -0
- package/skills/spring-boot-in-action/SKILL.md +340 -0
- package/skills/spring-boot-in-action/evals/evals.json +39 -0
- package/skills/spring-boot-in-action/evals/results.json +13 -0
- package/skills/spring-boot-in-action/examples/after.md +185 -0
- package/skills/spring-boot-in-action/examples/before.md +84 -0
- package/skills/spring-boot-in-action/references/practices-catalog.md +403 -0
- package/skills/spring-boot-in-action/scripts/review.py +184 -0
- package/skills/storytelling-with-data/SKILL.md +241 -0
- package/skills/storytelling-with-data/assets/example_asset.txt +1 -0
- package/skills/storytelling-with-data/evals/evals.json +47 -0
- package/skills/storytelling-with-data/evals/results.json +13 -0
- package/skills/storytelling-with-data/examples/after.md +50 -0
- package/skills/storytelling-with-data/examples/before.md +33 -0
- package/skills/storytelling-with-data/references/api_reference.md +379 -0
- package/skills/storytelling-with-data/references/review-checklist.md +111 -0
- package/skills/storytelling-with-data/scripts/chart_review.py +301 -0
- package/skills/storytelling-with-data/scripts/example.py +1 -0
- package/skills/system-design-interview/SKILL.md +233 -0
- package/skills/system-design-interview/assets/example_asset.txt +1 -0
- package/skills/system-design-interview/evals/evals.json +46 -0
- package/skills/system-design-interview/evals/results.json +13 -0
- package/skills/system-design-interview/examples/after.md +94 -0
- package/skills/system-design-interview/examples/before.md +27 -0
- package/skills/system-design-interview/references/api_reference.md +582 -0
- package/skills/system-design-interview/references/review-checklist.md +201 -0
- package/skills/system-design-interview/scripts/example.py +1 -0
- package/skills/system-design-interview/scripts/new_design.py +421 -0
- package/skills/using-asyncio-python/SKILL.md +290 -0
- package/skills/using-asyncio-python/assets/example_asset.txt +1 -0
- package/skills/using-asyncio-python/evals/evals.json +43 -0
- package/skills/using-asyncio-python/evals/results.json +13 -0
- package/skills/using-asyncio-python/examples/after.md +68 -0
- package/skills/using-asyncio-python/examples/before.md +39 -0
- package/skills/using-asyncio-python/references/api_reference.md +267 -0
- package/skills/using-asyncio-python/references/review-checklist.md +149 -0
- package/skills/using-asyncio-python/scripts/check_blocking.py +270 -0
- package/skills/using-asyncio-python/scripts/example.py +1 -0
- package/skills/web-scraping-python/SKILL.md +280 -0
- package/skills/web-scraping-python/assets/example_asset.txt +1 -0
- package/skills/web-scraping-python/evals/evals.json +46 -0
- package/skills/web-scraping-python/evals/results.json +13 -0
- package/skills/web-scraping-python/examples/after.md +109 -0
- package/skills/web-scraping-python/examples/before.md +40 -0
- package/skills/web-scraping-python/references/api_reference.md +393 -0
- package/skills/web-scraping-python/references/review-checklist.md +163 -0
- package/skills/web-scraping-python/scripts/example.py +1 -0
- package/skills/web-scraping-python/scripts/new_scraper.py +231 -0
- package/skills/writing-plans/audit.json +34 -0
- package/tests/agent-detector.test.js +83 -0
- package/tests/corrections.test.js +245 -0
- package/tests/doctor/hook-installer.test.js +72 -0
- package/tests/doctor/usage-tracker.test.js +140 -0
- package/tests/engine/benchmark-eval.test.js +31 -0
- package/tests/engine/bm25-index.test.js +85 -0
- package/tests/engine/capture-command.test.js +35 -0
- package/tests/engine/capture.test.js +17 -0
- package/tests/engine/graph-augmented-search.test.js +107 -0
- package/tests/engine/graph-injector.test.js +44 -0
- package/tests/engine/graph.test.js +216 -0
- package/tests/engine/hybrid-searcher.test.js +74 -0
- package/tests/engine/indexer-bm25.test.js +37 -0
- package/tests/engine/mcp-tools.test.js +73 -0
- package/tests/engine/project-initializer-mcp.test.js +99 -0
- package/tests/engine/query-expander.test.js +36 -0
- package/tests/engine/reranker.test.js +51 -0
- package/tests/engine/rrf.test.js +49 -0
- package/tests/engine/srag-prefix.test.js +47 -0
- package/tests/instinct-block.test.js +23 -0
- package/tests/mcp-config-writer.test.js +60 -0
- package/tests/project-initializer-new-agents.test.js +48 -0
- package/tests/rules/rules-manager.test.js +230 -0
- package/tests/well-known-builder.test.js +40 -0
- package/tests/wizard/integration-detector.test.js +31 -0
- package/tests/wizard/project-detector.test.js +51 -0
- package/tests/wizard/prompt-session.test.js +61 -0
- package/tests/wizard/prompt.test.js +16 -0
- package/tests/wizard/registry-embeddings.test.js +35 -0
- package/tests/wizard/skill-recommender.test.js +34 -0
- package/tests/wizard/slot-count.test.js +25 -0
- package/vercel.json +21 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
check_blocking.py — Static analyser for blocking calls inside async functions.
|
|
4
|
+
|
|
5
|
+
Usage: python check_blocking.py <file_or_directory> [<file_or_directory> ...]
|
|
6
|
+
|
|
7
|
+
Flags:
|
|
8
|
+
--exit-zero Exit 0 even when issues are found (useful in CI to report only)
|
|
9
|
+
--summary Print a summary table at the end
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import ast
|
|
13
|
+
import argparse
|
|
14
|
+
import sys
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Iterator
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# Rules
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Each rule is (description, fix_hint, matcher_function)
|
|
23
|
+
# matcher_function(node) -> bool
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _call_matches(node: ast.expr, *name_parts: str) -> bool:
|
|
27
|
+
"""True if node is a Call whose function matches the dotted name."""
|
|
28
|
+
if not isinstance(node, ast.Call):
|
|
29
|
+
return False
|
|
30
|
+
func = node.func
|
|
31
|
+
# Simple name: open, sleep, etc.
|
|
32
|
+
if len(name_parts) == 1 and isinstance(func, ast.Name):
|
|
33
|
+
return func.id == name_parts[0]
|
|
34
|
+
# Attribute: requests.get, time.sleep, etc.
|
|
35
|
+
if len(name_parts) == 2 and isinstance(func, ast.Attribute):
|
|
36
|
+
obj = func.value
|
|
37
|
+
return isinstance(obj, ast.Name) and obj.id == name_parts[0] and func.attr == name_parts[1]
|
|
38
|
+
return False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _is_sync_open(node: ast.expr) -> bool:
    """Return True for any bare ``open(...)`` call.

    This is a pure name match: it flags every ``open()`` call inside an
    async function.  It does NOT inspect the surrounding context (the old
    docstring wrongly claimed 'async with' calls were skipped) — async
    wrappers such as ``aiofiles.open`` are attribute calls and therefore
    never match this rule in the first place.
    """
    return _call_matches(node, "open")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _is_file_rw(node: ast.expr) -> bool:
|
|
47
|
+
"""Flags .read() / .write() attribute calls (heuristic)."""
|
|
48
|
+
if not isinstance(node, ast.Call):
|
|
49
|
+
return False
|
|
50
|
+
func = node.func
|
|
51
|
+
return isinstance(func, ast.Attribute) and func.attr in {"read", "write", "readlines"}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class Rule:
    """One blocking-call detection rule: a matcher plus its report text."""

    # Stable rule code (e.g. "ASYNC001"); printed in findings and summaries.
    id: str
    # One-line problem statement printed with each finding.
    description: str
    # Suggested non-blocking replacement printed under each finding.
    fix: str
    matcher: object  # callable(node) -> bool
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Ordered rule table.  Rule IDs are stable and sorted.  The four
# near-identical requests.* rules are generated from one template so their
# description/fix wording cannot drift apart.
RULES: list[Rule] = [
    *(
        Rule(
            id=f"ASYNC{num:03d}",
            description=f"requests.{verb}() blocks the event loop",
            fix=f"Use aiohttp.ClientSession().{verb}() or httpx.AsyncClient().{verb}()",
            # Bind verb through a default argument so each lambda captures
            # its own value rather than the loop variable's final value.
            matcher=lambda n, _v=verb: _call_matches(n, "requests", _v),
        )
        for num, verb in enumerate(("get", "post", "put", "delete"), start=1)
    ),
    Rule(
        id="ASYNC005",
        description="time.sleep() blocks the event loop",
        fix="Use 'await asyncio.sleep(seconds)' instead",
        matcher=lambda n: _call_matches(n, "time", "sleep"),
    ),
    Rule(
        id="ASYNC006",
        description="open() is a synchronous file operation",
        fix="Use 'async with aiofiles.open(...)' from the aiofiles package",
        matcher=_is_sync_open,
    ),
    Rule(
        id="ASYNC007",
        description="subprocess.run() blocks the event loop",
        fix="Use 'await asyncio.create_subprocess_exec()' or asyncio.create_subprocess_shell()",
        matcher=lambda n: _call_matches(n, "subprocess", "run"),
    ),
    Rule(
        id="ASYNC008",
        description="subprocess.call() blocks the event loop",
        fix="Use 'await asyncio.create_subprocess_exec()' instead",
        matcher=lambda n: _call_matches(n, "subprocess", "call"),
    ),
    Rule(
        id="ASYNC009",
        description=".read()/.write()/.readlines() on a synchronous file handle",
        fix="Open the file with aiofiles and use 'await file.read()' / 'await file.write()'",
        matcher=_is_file_rw,
    ),
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
# Finding
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
@dataclass
class Finding:
    """A single blocking call detected inside an async function."""

    # Source file the call was found in.
    file: Path
    # 1-based line number of the offending call node.
    line: int
    # 0-based column offset of the offending call node.
    col: int
    # Name of the enclosing 'async def' the call lives in.
    async_func: str
    # The Rule that matched; carries the description and fix text.
    rule: Rule
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _collect_async_funcs(tree: ast.AST) -> Iterator[ast.AsyncFunctionDef]:
|
|
134
|
+
"""Yield all async def nodes in the tree, including nested ones."""
|
|
135
|
+
for node in ast.walk(tree):
|
|
136
|
+
if isinstance(node, ast.AsyncFunctionDef):
|
|
137
|
+
yield node
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _nodes_inside_sync_context(func_node: ast.AsyncFunctionDef) -> set[int]:
|
|
141
|
+
"""
|
|
142
|
+
Return the set of node ids that are inside a nested sync def or class,
|
|
143
|
+
so we don't flag blocking calls that are legitimately in sync helpers.
|
|
144
|
+
"""
|
|
145
|
+
excluded: set[int] = set()
|
|
146
|
+
for node in ast.walk(func_node):
|
|
147
|
+
if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
|
|
148
|
+
for child in ast.walk(node):
|
|
149
|
+
excluded.add(id(child))
|
|
150
|
+
return excluded
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def check_file(path: Path) -> list[Finding]:
    """Parse *path* and return all blocking-call findings in its async defs.

    Unreadable or syntactically invalid files are reported on stderr and
    yield an empty result instead of raising.
    """
    try:
        source = path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        print(f"ERROR: Cannot read {path}: {exc}", file=sys.stderr)
        return []

    try:
        tree = ast.parse(source, filename=str(path))
    except SyntaxError as exc:
        print(f"ERROR: Syntax error in {path}: {exc}", file=sys.stderr)
        return []

    results: list[Finding] = []
    for func in _collect_async_funcs(tree):
        skip_ids = _nodes_inside_sync_context(func)
        # Walk the async body, skipping anything inside a nested sync scope.
        for node in (n for n in ast.walk(func) if id(n) not in skip_ids):
            for rule in (r for r in RULES if r.matcher(node)):
                results.append(
                    Finding(
                        file=path,
                        line=node.lineno,
                        col=node.col_offset,
                        async_func=func.name,
                        rule=rule,
                    )
                )
    return results
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def iter_python_files(path: Path) -> Iterator[Path]:
    """Yield *path* itself when it is a ``.py`` file, or every ``.py`` file
    under it (recursively, sorted) when it is a directory."""
    if path.is_dir():
        yield from sorted(path.rglob("*.py"))
    elif path.is_file():
        if path.suffix == ".py":
            yield path
    else:
        print(f"WARNING: {path} is not a file or directory — skipping.", file=sys.stderr)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
# Reporting
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
def print_findings(findings: list[Finding]) -> None:
    """Print each finding as ``file:line:col: [RULE] ...`` plus its fix hint."""
    for finding in findings:
        location = f"{finding.file}:{finding.line}:{finding.col}"
        header = f"In 'async def {finding.async_func}': {finding.rule.description}"
        print(f"{location}: [{finding.rule.id}] {header}")
        print(f"  Fix: {finding.rule.fix}")
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def print_summary(all_findings: list[Finding]) -> None:
    """Print totals grouped by rule id and by file; short message when clean."""
    if not all_findings:
        print("\nSummary: No blocking call issues found.")
        return

    from collections import Counter
    rule_counts: Counter = Counter(f.rule.id for f in all_findings)
    file_counts: Counter = Counter(str(f.file) for f in all_findings)
    rules_by_id = {r.id: r for r in RULES}

    print("\n--- Summary ---")
    print(f"Total issues: {len(all_findings)}")
    print("\nBy rule:")
    for rule_id in sorted(rule_counts):
        print(f"  {rule_id}: {rule_counts[rule_id]}x ({rules_by_id[rule_id].description})")
    print("\nBy file:")
    for filepath in sorted(file_counts):
        print(f"  {file_counts[filepath]:3d} {filepath}")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
# Entry point
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
|
|
234
|
+
def main() -> None:
    """CLI entry point: analyse every given path, report findings, and exit
    with status 1 when issues are found (unless --exit-zero is set)."""
    parser = argparse.ArgumentParser(
        description="Find blocking calls inside async functions."
    )
    parser.add_argument(
        "paths", nargs="+", type=Path, metavar="file_or_dir",
        help="Python file(s) or director(ies) to analyse"
    )
    parser.add_argument(
        "--exit-zero", action="store_true",
        help="Always exit 0 (useful for non-blocking CI report)"
    )
    parser.add_argument(
        "--summary", action="store_true",
        help="Print a summary table after the findings"
    )
    args = parser.parse_args()

    collected: list[Finding] = []
    for root in args.paths:
        for source_file in iter_python_files(root):
            file_findings = check_file(source_file)
            collected.extend(file_findings)
            # Report per-file so output streams as the scan progresses.
            print_findings(file_findings)

    if args.summary:
        print_summary(collected)

    if not collected:
        print("No blocking call issues detected.")
    elif not args.exit_zero:
        sys.exit(1)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Allow use both as a standalone script and as an importable module.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: web-scraping-python
|
|
3
|
+
version: "1.0"
|
|
4
|
+
license: MIT
|
|
5
|
+
tags: [python, web-scraping, data]
|
|
6
|
+
description: >
|
|
7
|
+
Apply Web Scraping with Python practices (Ryan Mitchell). Covers First
|
|
8
|
+
Scrapers (Ch 1: urllib, BeautifulSoup), HTML Parsing (Ch 2: find, findAll,
|
|
9
|
+
CSS selectors, regex, lambda), Crawling (Ch 3-4: single-domain, cross-site,
|
|
10
|
+
crawl models), Scrapy (Ch 5: spiders, items, pipelines, rules), Storing Data
|
|
11
|
+
(Ch 6: CSV, MySQL, files, email), Reading Documents (Ch 7: PDF, Word,
|
|
12
|
+
encoding), Cleaning Data (Ch 8: normalization, OpenRefine), NLP (Ch 9: n-grams,
|
|
13
|
+
Markov, NLTK), Forms & Logins (Ch 10: POST, sessions, cookies), JavaScript
|
|
14
|
+
(Ch 11: Selenium, headless, Ajax), APIs (Ch 12: REST, undocumented), Image/OCR
|
|
15
|
+
(Ch 13: Pillow, Tesseract), Avoiding Traps (Ch 14: headers, honeypots),
|
|
16
|
+
Testing (Ch 15: unittest, Selenium), Parallel (Ch 16: threads, processes),
|
|
17
|
+
Remote (Ch 17: Tor, proxies), Legalities (Ch 18: robots.txt, CFAA, ethics).
|
|
18
|
+
Trigger on "web scraping", "BeautifulSoup", "Scrapy", "crawler", "spider",
|
|
19
|
+
"scraper", "parse HTML", "Selenium scraping", "data extraction".
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
# Web Scraping with Python Skill
|
|
23
|
+
|
|
24
|
+
You are an expert web scraping engineer grounded in the 18 chapters from
|
|
25
|
+
*Web Scraping with Python* (Collecting More Data from the Modern Web)
|
|
26
|
+
by Ryan Mitchell. You help developers in two modes:
|
|
27
|
+
|
|
28
|
+
1. **Scraper Building** — Design and implement web scrapers with idiomatic, production-ready patterns
|
|
29
|
+
2. **Scraper Review** — Analyze existing scrapers against the book's practices and recommend improvements
|
|
30
|
+
|
|
31
|
+
## How to Decide Which Mode
|
|
32
|
+
|
|
33
|
+
- If the user asks to *build*, *create*, *scrape*, *extract*, *crawl*, or *collect* data → **Scraper Building**
|
|
34
|
+
- If the user asks to *review*, *audit*, *improve*, *debug*, *optimize*, or *fix* a scraper → **Scraper Review**
|
|
35
|
+
- If ambiguous, ask briefly which mode they'd prefer
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Mode 1: Scraper Building
|
|
40
|
+
|
|
41
|
+
When designing or building web scrapers, follow this decision flow:
|
|
42
|
+
|
|
43
|
+
### Step 1 — Understand the Requirements
|
|
44
|
+
|
|
45
|
+
Ask (or infer from context):
|
|
46
|
+
|
|
47
|
+
- **What target?** — Single page, single domain, multiple domains, API endpoints?
|
|
48
|
+
- **What data?** — Text, tables, images, documents, forms, dynamic JavaScript content?
|
|
49
|
+
- **What scale?** — One-off extraction, recurring crawl, large-scale parallel scraping?
|
|
50
|
+
- **What challenges?** — Login required, JavaScript rendering, rate limiting, anti-bot measures?
|
|
51
|
+
|
|
52
|
+
### Step 2 — Apply the Right Practices
|
|
53
|
+
|
|
54
|
+
Read `references/practices-catalog.md` for the full chapter-by-chapter catalog. Quick decision guide:
|
|
55
|
+
|
|
56
|
+
| Concern | Chapters to Apply |
|
|
57
|
+
|---------|-------------------|
|
|
58
|
+
| Basic page fetching and parsing | Ch 1: urllib/requests, BeautifulSoup setup, first scraper |
|
|
59
|
+
| Finding elements in HTML | Ch 2: find/findAll, CSS selectors, navigating DOM trees, regex, lambda filters |
|
|
60
|
+
| Crawling within a site | Ch 3: Following links, building crawlers, breadth-first vs depth-first |
|
|
61
|
+
| Crawling across sites | Ch 4: Planning crawl models, handling different site layouts, normalizing data |
|
|
62
|
+
| Framework-based scraping | Ch 5: Scrapy spiders, items, pipelines, rules, CrawlSpider, logging |
|
|
63
|
+
| Saving scraped data | Ch 6: CSV, MySQL/database storage, downloading files, sending email |
|
|
64
|
+
| Non-HTML documents | Ch 7: PDF text extraction, Word docs, encoding handling |
|
|
65
|
+
| Data cleaning | Ch 8: String normalization, regex cleaning, OpenRefine, UTF-8 handling |
|
|
66
|
+
| Text analysis on scraped data | Ch 9: N-grams, Markov models, NLTK, summarization |
|
|
67
|
+
| Login-protected pages | Ch 10: POST requests, sessions, cookies, HTTP basic auth, handling tokens |
|
|
68
|
+
| JavaScript-rendered pages | Ch 11: Selenium WebDriver, headless browsers, waiting for Ajax, executing JS |
|
|
69
|
+
| Working with APIs | Ch 12: REST methods, JSON parsing, authentication, undocumented APIs |
|
|
70
|
+
| Images and OCR | Ch 13: Pillow image processing, Tesseract OCR, CAPTCHA handling |
|
|
71
|
+
| Avoiding detection | Ch 14: User-Agent headers, cookie handling, timing/delays, honeypot avoidance |
|
|
72
|
+
| Testing scrapers | Ch 15: unittest for scrapers, Selenium-based testing, handling site changes |
|
|
73
|
+
| Parallel scraping | Ch 16: Multithreading, multiprocessing, thread-safe queues |
|
|
74
|
+
| Remote/anonymous scraping | Ch 17: Tor, proxies, rotating IPs, cloud-based scraping |
|
|
75
|
+
| Legal and ethical concerns | Ch 18: robots.txt, Terms of Service, CFAA, copyright, ethical scraping |
|
|
76
|
+
|
|
77
|
+
### Step 3 — Follow Web Scraping Principles
|
|
78
|
+
|
|
79
|
+
Every scraper implementation should honor these principles:
|
|
80
|
+
|
|
81
|
+
1. **Respect robots.txt** — Always check and honor robots.txt directives; be a good citizen of the web
|
|
82
|
+
2. **Identify yourself** — Set a descriptive User-Agent string; consider providing contact info
|
|
83
|
+
3. **Rate limit requests** — Add delays between requests (1-3 seconds minimum); never hammer servers
|
|
84
|
+
4. **Handle errors gracefully** — Catch connection errors, timeouts, HTTP errors, and missing elements
|
|
85
|
+
5. **Use sessions wisely** — Reuse HTTP sessions for connection pooling and cookie persistence
|
|
86
|
+
6. **Parse defensively** — Never assume HTML structure is stable; use multiple selectors as fallbacks
|
|
87
|
+
7. **Store raw data first** — Save raw HTML/responses before parsing; enables re-parsing without re-scraping
|
|
88
|
+
8. **Validate extracted data** — Check for None/empty values; verify data types and formats
|
|
89
|
+
9. **Design for re-runs** — Make scrapers idempotent; track what's already been scraped
|
|
90
|
+
10. **Stay legal and ethical** — Understand applicable laws (CFAA, GDPR); respect Terms of Service
|
|
91
|
+
|
|
92
|
+
### Step 4 — Build the Scraper
|
|
93
|
+
|
|
94
|
+
Follow these guidelines:
|
|
95
|
+
|
|
96
|
+
- **Production-ready** — Include error handling, retries, logging, rate limiting from the start
|
|
97
|
+
- **Configurable** — Externalize URLs, selectors, delays, credentials; use config files or arguments
|
|
98
|
+
- **Testable** — Write unit tests for parsing functions; integration tests for full scrape flows
|
|
99
|
+
- **Observable** — Log page fetches, items extracted, errors encountered, timing stats
|
|
100
|
+
- **Documented** — README with setup, usage, target site info, legal notes
|
|
101
|
+
|
|
102
|
+
When building scrapers, produce:
|
|
103
|
+
|
|
104
|
+
1. **Approach identification** — Which chapters/concepts apply and why
|
|
105
|
+
2. **Target analysis** — Site structure, pagination, authentication needs, JS rendering
|
|
106
|
+
3. **Implementation** — Production-ready code with error handling and rate limiting
|
|
107
|
+
4. **Storage setup** — How and where data is stored (CSV, database, files)
|
|
108
|
+
5. **Monitoring notes** — What to watch for (site changes, blocks, data quality)
|
|
109
|
+
|
|
110
|
+
### Scraper Building Examples
|
|
111
|
+
|
|
112
|
+
**Example 1 — Static Site Data Extraction:**
|
|
113
|
+
```
|
|
114
|
+
User: "Scrape product listings from an e-commerce category page"
|
|
115
|
+
|
|
116
|
+
Apply: Ch 1 (fetching pages), Ch 2 (parsing product elements),
|
|
117
|
+
Ch 3 (pagination/crawling), Ch 6 (storing to CSV/DB)
|
|
118
|
+
|
|
119
|
+
Generate:
|
|
120
|
+
- requests + BeautifulSoup scraper
|
|
121
|
+
- CSS selector-based product extraction
|
|
122
|
+
- Pagination handler following next-page links
|
|
123
|
+
- CSV or database storage with schema
|
|
124
|
+
- Rate limiting and error handling
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
**Example 2 — JavaScript-Heavy Site:**
|
|
128
|
+
```
|
|
129
|
+
User: "Extract data from a React single-page application"
|
|
130
|
+
|
|
131
|
+
Apply: Ch 11 (Selenium, headless browser), Ch 2 (parsing rendered HTML),
|
|
132
|
+
Ch 14 (avoiding detection), Ch 15 (testing)
|
|
133
|
+
|
|
134
|
+
Generate:
|
|
135
|
+
- Selenium WebDriver with headless Chrome
|
|
136
|
+
- Explicit waits for dynamic content loading
|
|
137
|
+
- JavaScript execution for scrolling/interaction
|
|
138
|
+
- Data extraction from rendered DOM
|
|
139
|
+
- Headless browser configuration
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Example 3 — Authenticated Scraping:**
|
|
143
|
+
```
|
|
144
|
+
User: "Scrape data from a site that requires login"
|
|
145
|
+
|
|
146
|
+
Apply: Ch 10 (forms, sessions, cookies), Ch 14 (headers, tokens),
|
|
147
|
+
Ch 6 (data storage)
|
|
148
|
+
|
|
149
|
+
Generate:
|
|
150
|
+
- Session-based login with CSRF token handling
|
|
151
|
+
- Cookie persistence across requests
|
|
152
|
+
- POST request for form submission
|
|
153
|
+
- Authenticated page navigation
|
|
154
|
+
- Session expiry detection and re-login
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
**Example 4 — Large-Scale Crawl with Scrapy:**
|
|
158
|
+
```
|
|
159
|
+
User: "Build a crawler to scrape thousands of pages from multiple domains"
|
|
160
|
+
|
|
161
|
+
Apply: Ch 5 (Scrapy framework), Ch 4 (crawl models),
|
|
162
|
+
Ch 16 (parallel scraping), Ch 14 (avoiding blocks)
|
|
163
|
+
|
|
164
|
+
Generate:
|
|
165
|
+
- Scrapy spider with item definitions and pipelines
|
|
166
|
+
- CrawlSpider with Rule and LinkExtractor
|
|
167
|
+
- Pipeline for database storage
|
|
168
|
+
- Settings for concurrent requests, delays, user agents
|
|
169
|
+
- Middleware for proxy rotation
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Mode 2: Scraper Review
|
|
175
|
+
|
|
176
|
+
When reviewing web scrapers, read `references/review-checklist.md` for the full checklist.
|
|
177
|
+
|
|
178
|
+
### Review Process
|
|
179
|
+
|
|
180
|
+
1. **Fetching scan** — Check Ch 1, 10, 11: HTTP method, session usage, JS rendering needs, authentication
|
|
181
|
+
2. **Parsing scan** — Check Ch 2, 7: selector quality, defensive parsing, edge case handling
|
|
182
|
+
3. **Crawling scan** — Check Ch 3-5: URL management, deduplication, pagination, depth control
|
|
183
|
+
4. **Storage scan** — Check Ch 6: data format, schema, duplicates, file management
|
|
184
|
+
5. **Resilience scan** — Check Ch 14-16: error handling, retries, rate limiting, parallel safety
|
|
185
|
+
6. **Ethics scan** — Check Ch 17-18: robots.txt, legal compliance, identification, respectful crawling
|
|
186
|
+
7. **Quality scan** — Check Ch 8, 15: data cleaning, testing, validation
|
|
187
|
+
|
|
188
|
+
### Calibrating Review Tone
|
|
189
|
+
|
|
190
|
+
**CRITICAL: Match your tone to what you actually find.**
|
|
191
|
+
|
|
192
|
+
- If the scraper is well-structured and follows best practices, say so explicitly in the summary and spend the majority of the review praising what it does right. Specifically praise:
|
|
193
|
+
- `RobotFileParser` / robots.txt check before fetching (Ch 18)
|
|
194
|
+
- Descriptive User-Agent with contact info (Ch 14)
|
|
195
|
+
- `requests.Session()` with `Retry` adapter (Ch 10, 14)
|
|
196
|
+
- CSS selectors via `soup.select()` / `soup.select_one()` (Ch 2)
|
|
197
|
+
- Defensive None checks on extracted elements before accessing text (Ch 2)
|
|
198
|
+
- `resp.raise_for_status()` and catching `requests.RequestException` (Ch 1, 14)
|
|
199
|
+
- `time.sleep()` between requests (Ch 14)
|
|
200
|
+
- Structured logging of page number and item counts at each step (Ch 5)
|
|
201
|
+
- Any suggestions on an already-good scraper MUST be framed as **minor optional improvements**, never as critical or high-priority issues. Do not manufacture severity.
|
|
202
|
+
|
|
203
|
+
### Review Output Format
|
|
204
|
+
|
|
205
|
+
Structure your review as:
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
## Summary
|
|
209
|
+
One paragraph: overall scraper quality, pattern adherence, main concerns.
|
|
210
|
+
|
|
211
|
+
## Fetching & Connection Issues
|
|
212
|
+
For each issue (Ch 1, 10-11):
|
|
213
|
+
- **Topic**: chapter and concept
|
|
214
|
+
- **Location**: where in the code
|
|
215
|
+
- **Problem**: what's wrong
|
|
216
|
+
- **Fix**: recommended change with code snippet
|
|
217
|
+
|
|
218
|
+
## Parsing & Extraction Issues
|
|
219
|
+
For each issue (Ch 2, 7):
|
|
220
|
+
- Same structure
|
|
221
|
+
|
|
222
|
+
## Crawling & Navigation Issues
|
|
223
|
+
For each issue (Ch 3-5):
|
|
224
|
+
- Same structure
|
|
225
|
+
|
|
226
|
+
## Storage & Data Issues
|
|
227
|
+
For each issue (Ch 6, 8):
|
|
228
|
+
- Same structure
|
|
229
|
+
|
|
230
|
+
## Resilience & Performance Issues
|
|
231
|
+
For each issue (Ch 14-16):
|
|
232
|
+
- Same structure
|
|
233
|
+
|
|
234
|
+
## Ethics & Legal Issues
|
|
235
|
+
For each issue (Ch 17-18):
|
|
236
|
+
- Same structure
|
|
237
|
+
|
|
238
|
+
## Testing & Quality Issues
|
|
239
|
+
For each issue (Ch 9, 15):
|
|
240
|
+
- Same structure
|
|
241
|
+
|
|
242
|
+
## Recommendations
|
|
243
|
+
Priority-ordered from most critical to nice-to-have.
|
|
244
|
+
Each recommendation references the specific chapter/concept.
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Common Web Scraping Anti-Patterns to Flag
|
|
248
|
+
|
|
249
|
+
- **No error handling on requests** → Ch 1, 14: Wrap requests in try/except; handle `requests.RequestException` (covers ConnectionError, Timeout, HTTPError); always call `resp.raise_for_status()` to surface non-200 responses
|
|
250
|
+
- **Hardcoded selectors without fallbacks** → Ch 2: Use multiple selector strategies; check for None before accessing attributes
|
|
251
|
+
- **No rate limiting** → Ch 14: Add `time.sleep()` between requests; respect server resources
|
|
252
|
+
- **Missing User-Agent header** → Ch 14: Set a descriptive User-Agent with contact info; rotate if needed for scale
|
|
253
|
+
- **Not using sessions** → Ch 10: Use `requests.Session()` for cookie persistence and connection pooling
|
|
254
|
+
- **Ignoring robots.txt** → Ch 18: Parse and respect robots.txt via `RobotFileParser` before crawling
|
|
255
|
+
- **No URL deduplication** → Ch 3: Track visited URLs in a set; normalize URLs before comparing
|
|
256
|
+
- **Using regex to parse HTML** → Ch 2: Use BeautifulSoup or lxml, not regex, for HTML parsing. In particular:
|
|
257
|
+
- `re.DOTALL` patterns on `<p>` or block elements will incorrectly merge content from nested inline tags (`<strong>`, `<a>`, etc.) producing wrong output
|
|
258
|
+
- Regex patterns like `href=["\'](.*?)["\']` will match `href` attributes inside `<script>` blocks, `<style>` blocks, and HTML comments, producing many false positives
|
|
259
|
+
- Recommend `soup.select_one()` and `soup.select()` CSS-selector API as the idiomatic BeautifulSoup replacement (preferred over `find()`/`find_all()` for clarity)
|
|
260
|
+
- **Not handling JavaScript content** → Ch 11: If data loads via Ajax, use Selenium or find the underlying API
|
|
261
|
+
- **Storing data without validation** → Ch 6, 8: Validate and clean data before storage; handle encoding
|
|
262
|
+
- **No logging** → Ch 5: Log page fetches, item counts, and errors at each step; use structured logging with page number and item count per page
|
|
263
|
+
- **Sequential when parallel is needed** → Ch 16: Use threading/multiprocessing for large-scale scraping
|
|
264
|
+
- **Ignoring encoding issues** → Ch 7, 8: Handle UTF-8, detect encoding, normalize Unicode
|
|
265
|
+
- **No tests for parsers** → Ch 15: Write unit tests with saved HTML fixtures; test selector robustness
|
|
266
|
+
- **Credentials in code** → Ch 10: Use environment variables or config files for login credentials
|
|
267
|
+
- **Not storing raw responses** → Ch 6: Save raw HTML for re-parsing; don't rely only on extracted data
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## General Guidelines
|
|
272
|
+
|
|
273
|
+
- **BeautifulSoup for simple scraping, Scrapy for scale** — Match the tool to the complexity
|
|
274
|
+
- **Check for APIs first** — Many sites have APIs (documented or undocumented) that are easier than scraping
|
|
275
|
+
- **Respect the site** — Rate limit, identify yourself, follow robots.txt, check ToS
|
|
276
|
+
- **Parse defensively** — HTML structure changes; always handle missing elements gracefully
|
|
277
|
+
- **Test with saved pages** — Save HTML fixtures and test parsers offline; reduces requests and enables CI
|
|
278
|
+
- **Clean data early** — Normalize strings, handle encoding, strip whitespace at extraction time
|
|
279
|
+
- For deeper practice details, read `references/practices-catalog.md` before building scrapers.
|
|
280
|
+
- For review checklists, read `references/review-checklist.md` before reviewing scrapers.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evals": [
|
|
3
|
+
{
|
|
4
|
+
"id": "eval-01-no-rate-limiting-no-error-handling-no-robots",
|
|
5
|
+
"prompt": "Review this web scraper:\n\n```python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\nBASE_URL = 'https://books.example.com'\n\ndef scrape_all_books():\n all_books = []\n page = 1\n\n while True:\n url = f'{BASE_URL}/catalogue/page-{page}.html'\n response = requests.get(url)\n soup = BeautifulSoup(response.text, 'html.parser')\n\n books = soup.find_all('article', class_='product_pod')\n if not books:\n break\n\n for book in books:\n title = book.find('h3').find('a')['title']\n price = book.find('p', class_='price_color').text\n rating = book.find('p', class_='star-rating')['class'][1]\n all_books.append({'title': title, 'price': price, 'rating': rating})\n\n page += 1\n\n return all_books\n\nresult = scrape_all_books()\nwith open('books.json', 'w') as f:\n json.dump(result, f)\n```",
|
|
6
|
+
"expectations": [
|
|
7
|
+
"Flags no robots.txt check: the scraper does not check or respect the site's robots.txt before crawling (Ch 18: always check and honor robots.txt)",
|
|
8
|
+
"Flags no rate limiting: requests are issued as fast as possible with no delay between pages; recommends adding `time.sleep()` of at least 1-3 seconds between requests (Ch 14: rate limit requests)",
|
|
9
|
+
"Flags no error handling on `requests.get()`: a network error, timeout, or non-200 response will raise an exception or silently produce garbage HTML (Ch 1, 14: wrap requests in try/except, check response status)",
|
|
10
|
+
"Flags no User-Agent header: the scraper uses the default requests User-Agent which may be blocked and does not identify the bot (Ch 14: set a descriptive User-Agent header)",
|
|
11
|
+
"Flags no session reuse: `requests.get()` called in a loop creates a new connection for each page; recommends `requests.Session()` for connection pooling (Ch 10: use sessions for connection pooling)",
|
|
12
|
+
"Flags defensive parsing issues: `book.find('h3').find('a')['title']` will raise AttributeError if any element is missing; recommends checking for None before accessing attributes (Ch 2: parse defensively)",
|
|
13
|
+
"Flags no logging of progress or errors (Ch 5: log page fetches, errors, items extracted)"
|
|
14
|
+
]
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "eval-02-regex-for-html-parsing",
|
|
18
|
+
"prompt": "Review this data extraction code:\n\n```python\nimport requests\nimport re\n\ndef extract_product_data(url: str) -> dict:\n response = requests.get(url)\n html = response.text\n\n # Extract product name\n name_match = re.search(r'<h1[^>]*>([^<]+)</h1>', html)\n name = name_match.group(1) if name_match else None\n\n # Extract price\n price_match = re.search(r'<span class=\"price\">\\$([\\d\\.]+)</span>', html)\n price = float(price_match.group(1)) if price_match else None\n\n # Extract description paragraphs\n desc_matches = re.findall(r'<p class=\"desc\">(.+?)</p>', html, re.DOTALL)\n description = ' '.join(desc_matches)\n\n # Extract all href links on the page\n links = re.findall(r'href=[\"\\']([^\"\\']+)[\"\\']', html)\n\n # Check if in stock\n in_stock = bool(re.search(r'<span class=\"stock\">In Stock</span>', html))\n\n return {\n 'name': name,\n 'price': price,\n 'description': description,\n 'links': links,\n 'in_stock': in_stock\n }\n```",
|
|
19
|
+
"expectations": [
|
|
20
|
+
"Flags parsing HTML with regex as the primary anti-pattern: regex cannot reliably parse HTML because HTML is not a regular language; attribute order can vary, whitespace can differ, and nested tags break simple patterns (Ch 2: use BeautifulSoup or lxml, not regex, for HTML parsing)",
|
|
21
|
+
"Flags that the price regex `\\$([\\d\\.]+)` will fail silently on prices with commas (e.g., $1,299.99) or different currency formats without any warning (Ch 2: parse defensively)",
|
|
22
|
+
"Flags the description regex with `re.DOTALL` will incorrectly merge content from separate `<p>` tags that contain nested HTML tags like `<strong>` or `<a>` (Ch 2: regex cannot handle nested HTML)",
|
|
23
|
+
"Flags the link extraction regex `href=[\"\\']([^\"\\']+)[\"\\']` will match hrefs in script tags, style tags, and HTML comments, returning many false positives (Ch 2: use a parser with proper DOM traversal)",
|
|
24
|
+
"Flags no error handling on `requests.get()` and no status code check (Ch 1, 14: check response.raise_for_status())",
|
|
25
|
+
"Flags no session usage for connection pooling (Ch 10: use requests.Session())",
|
|
26
|
+
"Recommends replacing all regex parsing with BeautifulSoup CSS selectors or XPath, providing a corrected example using soup.select_one() and soup.select()"
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": "eval-03-clean-scraper-session-retry-css-selectors",
|
|
31
|
+
"prompt": "Review this web scraper:\n\n```python\nimport logging\nimport time\nfrom urllib.robotparser import RobotFileParser\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\nfrom bs4 import BeautifulSoup\n\nlogger = logging.getLogger(__name__)\n\nUSER_AGENT = 'ResearchBot/1.0 (contact: bot@example.com)'\nREQUEST_DELAY = 1.5 # seconds between requests\n\n\ndef build_session() -> requests.Session:\n session = requests.Session()\n session.headers['User-Agent'] = USER_AGENT\n retry = Retry(\n total=3,\n backoff_factor=1,\n status_forcelist=[429, 500, 502, 503, 504]\n )\n session.mount('https://', HTTPAdapter(max_retries=retry))\n return session\n\n\ndef can_fetch(base_url: str, path: str) -> bool:\n rp = RobotFileParser()\n rp.set_url(f'{base_url}/robots.txt')\n rp.read()\n return rp.can_fetch(USER_AGENT, f'{base_url}{path}')\n\n\ndef parse_listing(html: str) -> list[dict]:\n soup = BeautifulSoup(html, 'html.parser')\n items = []\n for card in soup.select('article.product-card'):\n title_el = card.select_one('h2.product-title')\n price_el = card.select_one('span.price')\n if title_el is None or price_el is None:\n logger.warning('Skipping card with missing elements')\n continue\n items.append({\n 'title': title_el.get_text(strip=True),\n 'price': price_el.get_text(strip=True),\n })\n return items\n\n\ndef scrape_category(base_url: str, category_path: str) -> list[dict]:\n if not can_fetch(base_url, category_path):\n logger.error('robots.txt disallows scraping %s', category_path)\n return []\n\n session = build_session()\n all_items: list[dict] = []\n page = 1\n\n while True:\n url = f'{base_url}{category_path}?page={page}'\n try:\n resp = session.get(url, timeout=10)\n resp.raise_for_status()\n except requests.RequestException as exc:\n logger.error('Request failed for %s: %s', url, exc)\n break\n\n items = parse_listing(resp.text)\n if not items:\n break\n\n logger.info('Page %d: extracted %d items', page, 
len(items))\n all_items.extend(items)\n page += 1\n time.sleep(REQUEST_DELAY)\n\n return all_items\n```",
|
|
32
|
+
"expectations": [
|
|
33
|
+
"Recognizes this is a well-structured, responsible scraper and says so explicitly",
|
|
34
|
+
"Praises robots.txt check via `RobotFileParser` before any requests are made (Ch 18: always check and honor robots.txt)",
|
|
35
|
+
"Praises the descriptive User-Agent with contact information making the bot identifiable (Ch 14: identify yourself with a descriptive User-Agent)",
|
|
36
|
+
"Praises `requests.Session()` with a `Retry` adapter providing automatic retry on transient server errors and rate-limit responses (Ch 14, 10: sessions with retry logic)",
|
|
37
|
+
"Praises CSS selectors via `soup.select()` and `soup.select_one()` instead of regex for HTML parsing (Ch 2: use BeautifulSoup CSS selectors)",
|
|
38
|
+
"Praises defensive None checks on extracted elements before accessing text, with a warning log for skipped cards (Ch 2: parse defensively)",
|
|
39
|
+
"Praises `resp.raise_for_status()` and catching `requests.RequestException` for all HTTP/network errors (Ch 1, 14: handle connection errors, timeouts, and HTTP errors)",
|
|
40
|
+
"Praises `time.sleep(REQUEST_DELAY)` between pages to be polite to the server (Ch 14: rate limit requests)",
|
|
41
|
+
"Praises structured logging of page number and item counts at each step (Ch 5: log progress)",
|
|
42
|
+
"Does NOT manufacture issues to appear thorough; any suggestions are explicitly framed as minor optional improvements"
|
|
43
|
+
]
|
|
44
|
+
}
|
|
45
|
+
]
|
|
46
|
+
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"pass_rate": 0.958,
|
|
3
|
+
"passed": 23,
|
|
4
|
+
"total": 24,
|
|
5
|
+
"baseline_pass_rate": 0.375,
|
|
6
|
+
"baseline_passed": 9,
|
|
7
|
+
"baseline_total": 24,
|
|
8
|
+
"delta": 0.583,
|
|
9
|
+
"model": "default",
|
|
10
|
+
"evals_run": 3,
|
|
11
|
+
"date": "2026-03-28",
|
|
12
|
+
"non_standard_provider": true
|
|
13
|
+
}
|