@booklib/core 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (374) hide show
  1. package/.cursor/rules/booklib-standards.mdc +40 -0
  2. package/.gemini/context.md +372 -0
  3. package/AGENTS.md +166 -0
  4. package/CHANGELOG.md +226 -0
  5. package/CLAUDE.md +81 -0
  6. package/CODE_OF_CONDUCT.md +31 -0
  7. package/CONTRIBUTING.md +304 -0
  8. package/LICENSE +21 -0
  9. package/PLAN.md +28 -0
  10. package/README.ja.md +198 -0
  11. package/README.ko.md +198 -0
  12. package/README.md +503 -0
  13. package/README.pt-BR.md +198 -0
  14. package/README.uk.md +241 -0
  15. package/README.zh-CN.md +198 -0
  16. package/SECURITY.md +9 -0
  17. package/agents/architecture-reviewer.md +136 -0
  18. package/agents/booklib-reviewer.md +90 -0
  19. package/agents/data-reviewer.md +107 -0
  20. package/agents/jvm-reviewer.md +146 -0
  21. package/agents/python-reviewer.md +128 -0
  22. package/agents/rust-reviewer.md +115 -0
  23. package/agents/ts-reviewer.md +110 -0
  24. package/agents/ui-reviewer.md +117 -0
  25. package/assets/logo.svg +36 -0
  26. package/bin/booklib-mcp.js +304 -0
  27. package/bin/booklib.js +1705 -0
  28. package/bin/skills.cjs +1292 -0
  29. package/booklib-router.mdc +36 -0
  30. package/booklib.config.json +19 -0
  31. package/commands/animation-at-work.md +10 -0
  32. package/commands/clean-code-reviewer.md +10 -0
  33. package/commands/data-intensive-patterns.md +10 -0
  34. package/commands/data-pipelines.md +10 -0
  35. package/commands/design-patterns.md +10 -0
  36. package/commands/domain-driven-design.md +10 -0
  37. package/commands/effective-java.md +10 -0
  38. package/commands/effective-kotlin.md +10 -0
  39. package/commands/effective-python.md +10 -0
  40. package/commands/effective-typescript.md +10 -0
  41. package/commands/kotlin-in-action.md +10 -0
  42. package/commands/lean-startup.md +10 -0
  43. package/commands/microservices-patterns.md +10 -0
  44. package/commands/programming-with-rust.md +10 -0
  45. package/commands/refactoring-ui.md +10 -0
  46. package/commands/rust-in-action.md +10 -0
  47. package/commands/skill-router.md +10 -0
  48. package/commands/spring-boot-in-action.md +10 -0
  49. package/commands/storytelling-with-data.md +10 -0
  50. package/commands/system-design-interview.md +10 -0
  51. package/commands/using-asyncio-python.md +10 -0
  52. package/commands/web-scraping-python.md +10 -0
  53. package/community/registry.json +1616 -0
  54. package/hooks/hooks.json +23 -0
  55. package/hooks/posttooluse-capture.mjs +67 -0
  56. package/hooks/suggest.js +153 -0
  57. package/lib/agent-behaviors.js +40 -0
  58. package/lib/agent-detector.js +96 -0
  59. package/lib/config-loader.js +39 -0
  60. package/lib/conflict-resolver.js +148 -0
  61. package/lib/context-builder.js +574 -0
  62. package/lib/discovery-engine.js +298 -0
  63. package/lib/doctor/hook-installer.js +83 -0
  64. package/lib/doctor/usage-tracker.js +87 -0
  65. package/lib/engine/ai-features.js +253 -0
  66. package/lib/engine/auditor.js +103 -0
  67. package/lib/engine/bm25-index.js +178 -0
  68. package/lib/engine/capture.js +120 -0
  69. package/lib/engine/corrections.js +198 -0
  70. package/lib/engine/doctor.js +195 -0
  71. package/lib/engine/graph-injector.js +137 -0
  72. package/lib/engine/graph.js +161 -0
  73. package/lib/engine/handoff.js +405 -0
  74. package/lib/engine/indexer.js +242 -0
  75. package/lib/engine/parser.js +53 -0
  76. package/lib/engine/query-expander.js +42 -0
  77. package/lib/engine/reranker.js +40 -0
  78. package/lib/engine/rrf.js +59 -0
  79. package/lib/engine/scanner.js +151 -0
  80. package/lib/engine/searcher.js +139 -0
  81. package/lib/engine/session-coordinator.js +306 -0
  82. package/lib/engine/session-manager.js +429 -0
  83. package/lib/engine/synthesizer.js +70 -0
  84. package/lib/installer.js +70 -0
  85. package/lib/instinct-block.js +33 -0
  86. package/lib/mcp-config-writer.js +88 -0
  87. package/lib/paths.js +57 -0
  88. package/lib/profiles/design.md +19 -0
  89. package/lib/profiles/general.md +16 -0
  90. package/lib/profiles/research-analysis.md +22 -0
  91. package/lib/profiles/software-development.md +23 -0
  92. package/lib/profiles/writing-content.md +19 -0
  93. package/lib/project-initializer.js +916 -0
  94. package/lib/registry/skills.js +102 -0
  95. package/lib/registry-searcher.js +99 -0
  96. package/lib/rules/rules-manager.js +169 -0
  97. package/lib/skill-fetcher.js +333 -0
  98. package/lib/well-known-builder.js +70 -0
  99. package/lib/wizard/index.js +404 -0
  100. package/lib/wizard/integration-detector.js +41 -0
  101. package/lib/wizard/project-detector.js +100 -0
  102. package/lib/wizard/prompt.js +156 -0
  103. package/lib/wizard/registry-embeddings.js +107 -0
  104. package/lib/wizard/skill-recommender.js +69 -0
  105. package/llms-full.txt +254 -0
  106. package/llms.txt +70 -0
  107. package/package.json +45 -0
  108. package/research-reports/2026-04-01-current-architecture.md +160 -0
  109. package/research-reports/IDEAS.md +93 -0
  110. package/rules/common/clean-code.md +42 -0
  111. package/rules/java/effective-java.md +42 -0
  112. package/rules/kotlin/effective-kotlin.md +37 -0
  113. package/rules/python/effective-python.md +38 -0
  114. package/rules/rust/rust.md +37 -0
  115. package/rules/typescript/effective-typescript.md +42 -0
  116. package/scripts/gen-llms-full.mjs +36 -0
  117. package/scripts/gen-og.mjs +142 -0
  118. package/scripts/validate-frontmatter.js +25 -0
  119. package/skills/animation-at-work/SKILL.md +270 -0
  120. package/skills/animation-at-work/assets/example_asset.txt +1 -0
  121. package/skills/animation-at-work/evals/evals.json +44 -0
  122. package/skills/animation-at-work/evals/results.json +13 -0
  123. package/skills/animation-at-work/examples/after.md +64 -0
  124. package/skills/animation-at-work/examples/before.md +35 -0
  125. package/skills/animation-at-work/references/api_reference.md +369 -0
  126. package/skills/animation-at-work/references/review-checklist.md +79 -0
  127. package/skills/animation-at-work/scripts/audit_animations.py +295 -0
  128. package/skills/animation-at-work/scripts/example.py +1 -0
  129. package/skills/clean-code-reviewer/SKILL.md +444 -0
  130. package/skills/clean-code-reviewer/audit.json +35 -0
  131. package/skills/clean-code-reviewer/evals/evals.json +185 -0
  132. package/skills/clean-code-reviewer/evals/results.json +13 -0
  133. package/skills/clean-code-reviewer/examples/after.md +48 -0
  134. package/skills/clean-code-reviewer/examples/before.md +33 -0
  135. package/skills/clean-code-reviewer/references/api_reference.md +158 -0
  136. package/skills/clean-code-reviewer/references/practices-catalog.md +282 -0
  137. package/skills/clean-code-reviewer/references/review-checklist.md +254 -0
  138. package/skills/clean-code-reviewer/scripts/pre-review.py +206 -0
  139. package/skills/data-intensive-patterns/SKILL.md +267 -0
  140. package/skills/data-intensive-patterns/assets/example_asset.txt +1 -0
  141. package/skills/data-intensive-patterns/evals/evals.json +54 -0
  142. package/skills/data-intensive-patterns/evals/results.json +13 -0
  143. package/skills/data-intensive-patterns/examples/after.md +61 -0
  144. package/skills/data-intensive-patterns/examples/before.md +38 -0
  145. package/skills/data-intensive-patterns/references/api_reference.md +34 -0
  146. package/skills/data-intensive-patterns/references/patterns-catalog.md +551 -0
  147. package/skills/data-intensive-patterns/references/review-checklist.md +193 -0
  148. package/skills/data-intensive-patterns/scripts/adr.py +213 -0
  149. package/skills/data-intensive-patterns/scripts/example.py +1 -0
  150. package/skills/data-pipelines/SKILL.md +259 -0
  151. package/skills/data-pipelines/assets/example_asset.txt +1 -0
  152. package/skills/data-pipelines/evals/evals.json +45 -0
  153. package/skills/data-pipelines/evals/results.json +13 -0
  154. package/skills/data-pipelines/examples/after.md +97 -0
  155. package/skills/data-pipelines/examples/before.md +37 -0
  156. package/skills/data-pipelines/references/api_reference.md +301 -0
  157. package/skills/data-pipelines/references/review-checklist.md +181 -0
  158. package/skills/data-pipelines/scripts/example.py +1 -0
  159. package/skills/data-pipelines/scripts/new_pipeline.py +444 -0
  160. package/skills/design-patterns/SKILL.md +271 -0
  161. package/skills/design-patterns/assets/example_asset.txt +1 -0
  162. package/skills/design-patterns/evals/evals.json +46 -0
  163. package/skills/design-patterns/evals/results.json +13 -0
  164. package/skills/design-patterns/examples/after.md +52 -0
  165. package/skills/design-patterns/examples/before.md +29 -0
  166. package/skills/design-patterns/references/api_reference.md +1 -0
  167. package/skills/design-patterns/references/patterns-catalog.md +726 -0
  168. package/skills/design-patterns/references/review-checklist.md +173 -0
  169. package/skills/design-patterns/scripts/example.py +1 -0
  170. package/skills/design-patterns/scripts/scaffold.py +807 -0
  171. package/skills/domain-driven-design/SKILL.md +142 -0
  172. package/skills/domain-driven-design/assets/example_asset.txt +1 -0
  173. package/skills/domain-driven-design/evals/evals.json +48 -0
  174. package/skills/domain-driven-design/evals/results.json +13 -0
  175. package/skills/domain-driven-design/examples/after.md +80 -0
  176. package/skills/domain-driven-design/examples/before.md +43 -0
  177. package/skills/domain-driven-design/references/api_reference.md +1 -0
  178. package/skills/domain-driven-design/references/patterns-catalog.md +545 -0
  179. package/skills/domain-driven-design/references/review-checklist.md +158 -0
  180. package/skills/domain-driven-design/scripts/example.py +1 -0
  181. package/skills/domain-driven-design/scripts/scaffold.py +421 -0
  182. package/skills/effective-java/SKILL.md +227 -0
  183. package/skills/effective-java/assets/example_asset.txt +1 -0
  184. package/skills/effective-java/evals/evals.json +46 -0
  185. package/skills/effective-java/evals/results.json +13 -0
  186. package/skills/effective-java/examples/after.md +83 -0
  187. package/skills/effective-java/examples/before.md +37 -0
  188. package/skills/effective-java/references/api_reference.md +1 -0
  189. package/skills/effective-java/references/items-catalog.md +955 -0
  190. package/skills/effective-java/references/review-checklist.md +216 -0
  191. package/skills/effective-java/scripts/checkstyle_setup.py +211 -0
  192. package/skills/effective-java/scripts/example.py +1 -0
  193. package/skills/effective-kotlin/SKILL.md +271 -0
  194. package/skills/effective-kotlin/assets/example_asset.txt +1 -0
  195. package/skills/effective-kotlin/audit.json +29 -0
  196. package/skills/effective-kotlin/evals/evals.json +45 -0
  197. package/skills/effective-kotlin/evals/results.json +13 -0
  198. package/skills/effective-kotlin/examples/after.md +36 -0
  199. package/skills/effective-kotlin/examples/before.md +38 -0
  200. package/skills/effective-kotlin/references/api_reference.md +1 -0
  201. package/skills/effective-kotlin/references/practices-catalog.md +1228 -0
  202. package/skills/effective-kotlin/references/review-checklist.md +126 -0
  203. package/skills/effective-kotlin/scripts/example.py +1 -0
  204. package/skills/effective-python/SKILL.md +441 -0
  205. package/skills/effective-python/evals/evals.json +44 -0
  206. package/skills/effective-python/evals/results.json +13 -0
  207. package/skills/effective-python/examples/after.md +56 -0
  208. package/skills/effective-python/examples/before.md +40 -0
  209. package/skills/effective-python/ref-01-pythonic-thinking.md +202 -0
  210. package/skills/effective-python/ref-02-lists-and-dicts.md +146 -0
  211. package/skills/effective-python/ref-03-functions.md +186 -0
  212. package/skills/effective-python/ref-04-comprehensions-generators.md +211 -0
  213. package/skills/effective-python/ref-05-classes-interfaces.md +188 -0
  214. package/skills/effective-python/ref-06-metaclasses-attributes.md +209 -0
  215. package/skills/effective-python/ref-07-concurrency.md +213 -0
  216. package/skills/effective-python/ref-08-robustness-performance.md +248 -0
  217. package/skills/effective-python/ref-09-testing-debugging.md +253 -0
  218. package/skills/effective-python/ref-10-collaboration.md +175 -0
  219. package/skills/effective-python/references/api_reference.md +218 -0
  220. package/skills/effective-python/references/practices-catalog.md +483 -0
  221. package/skills/effective-python/references/review-checklist.md +190 -0
  222. package/skills/effective-python/scripts/lint.py +173 -0
  223. package/skills/effective-typescript/SKILL.md +262 -0
  224. package/skills/effective-typescript/audit.json +29 -0
  225. package/skills/effective-typescript/evals/evals.json +37 -0
  226. package/skills/effective-typescript/evals/results.json +13 -0
  227. package/skills/effective-typescript/examples/after.md +70 -0
  228. package/skills/effective-typescript/examples/before.md +47 -0
  229. package/skills/effective-typescript/references/api_reference.md +118 -0
  230. package/skills/effective-typescript/references/practices-catalog.md +371 -0
  231. package/skills/effective-typescript/scripts/review.py +169 -0
  232. package/skills/kotlin-in-action/SKILL.md +261 -0
  233. package/skills/kotlin-in-action/assets/example_asset.txt +1 -0
  234. package/skills/kotlin-in-action/evals/evals.json +43 -0
  235. package/skills/kotlin-in-action/evals/results.json +13 -0
  236. package/skills/kotlin-in-action/examples/after.md +53 -0
  237. package/skills/kotlin-in-action/examples/before.md +39 -0
  238. package/skills/kotlin-in-action/references/api_reference.md +1 -0
  239. package/skills/kotlin-in-action/references/practices-catalog.md +436 -0
  240. package/skills/kotlin-in-action/references/review-checklist.md +204 -0
  241. package/skills/kotlin-in-action/scripts/example.py +1 -0
  242. package/skills/kotlin-in-action/scripts/setup_detekt.py +224 -0
  243. package/skills/lean-startup/SKILL.md +160 -0
  244. package/skills/lean-startup/assets/example_asset.txt +1 -0
  245. package/skills/lean-startup/evals/evals.json +43 -0
  246. package/skills/lean-startup/evals/results.json +13 -0
  247. package/skills/lean-startup/examples/after.md +80 -0
  248. package/skills/lean-startup/examples/before.md +34 -0
  249. package/skills/lean-startup/references/api_reference.md +319 -0
  250. package/skills/lean-startup/references/review-checklist.md +137 -0
  251. package/skills/lean-startup/scripts/example.py +1 -0
  252. package/skills/lean-startup/scripts/new_experiment.py +286 -0
  253. package/skills/microservices-patterns/SKILL.md +384 -0
  254. package/skills/microservices-patterns/evals/evals.json +45 -0
  255. package/skills/microservices-patterns/evals/results.json +13 -0
  256. package/skills/microservices-patterns/examples/after.md +69 -0
  257. package/skills/microservices-patterns/examples/before.md +40 -0
  258. package/skills/microservices-patterns/references/patterns-catalog.md +391 -0
  259. package/skills/microservices-patterns/references/review-checklist.md +169 -0
  260. package/skills/microservices-patterns/scripts/new_service.py +583 -0
  261. package/skills/programming-with-rust/SKILL.md +209 -0
  262. package/skills/programming-with-rust/evals/evals.json +37 -0
  263. package/skills/programming-with-rust/evals/results.json +13 -0
  264. package/skills/programming-with-rust/examples/after.md +107 -0
  265. package/skills/programming-with-rust/examples/before.md +59 -0
  266. package/skills/programming-with-rust/references/api_reference.md +152 -0
  267. package/skills/programming-with-rust/references/practices-catalog.md +335 -0
  268. package/skills/programming-with-rust/scripts/review.py +142 -0
  269. package/skills/refactoring-ui/SKILL.md +362 -0
  270. package/skills/refactoring-ui/assets/example_asset.txt +1 -0
  271. package/skills/refactoring-ui/evals/evals.json +45 -0
  272. package/skills/refactoring-ui/evals/results.json +13 -0
  273. package/skills/refactoring-ui/examples/after.md +85 -0
  274. package/skills/refactoring-ui/examples/before.md +58 -0
  275. package/skills/refactoring-ui/references/api_reference.md +355 -0
  276. package/skills/refactoring-ui/references/review-checklist.md +114 -0
  277. package/skills/refactoring-ui/scripts/audit_css.py +250 -0
  278. package/skills/refactoring-ui/scripts/example.py +1 -0
  279. package/skills/rust-in-action/SKILL.md +350 -0
  280. package/skills/rust-in-action/evals/evals.json +38 -0
  281. package/skills/rust-in-action/evals/results.json +13 -0
  282. package/skills/rust-in-action/examples/after.md +156 -0
  283. package/skills/rust-in-action/examples/before.md +56 -0
  284. package/skills/rust-in-action/references/practices-catalog.md +346 -0
  285. package/skills/rust-in-action/scripts/review.py +147 -0
  286. package/skills/skill-router/SKILL.md +186 -0
  287. package/skills/skill-router/evals/evals.json +38 -0
  288. package/skills/skill-router/evals/results.json +13 -0
  289. package/skills/skill-router/examples/after.md +63 -0
  290. package/skills/skill-router/examples/before.md +39 -0
  291. package/skills/skill-router/references/api_reference.md +24 -0
  292. package/skills/skill-router/references/routing-heuristics.md +89 -0
  293. package/skills/skill-router/references/skill-catalog.md +174 -0
  294. package/skills/skill-router/scripts/route.py +266 -0
  295. package/skills/spring-boot-in-action/SKILL.md +340 -0
  296. package/skills/spring-boot-in-action/evals/evals.json +39 -0
  297. package/skills/spring-boot-in-action/evals/results.json +13 -0
  298. package/skills/spring-boot-in-action/examples/after.md +185 -0
  299. package/skills/spring-boot-in-action/examples/before.md +84 -0
  300. package/skills/spring-boot-in-action/references/practices-catalog.md +403 -0
  301. package/skills/spring-boot-in-action/scripts/review.py +184 -0
  302. package/skills/storytelling-with-data/SKILL.md +241 -0
  303. package/skills/storytelling-with-data/assets/example_asset.txt +1 -0
  304. package/skills/storytelling-with-data/evals/evals.json +47 -0
  305. package/skills/storytelling-with-data/evals/results.json +13 -0
  306. package/skills/storytelling-with-data/examples/after.md +50 -0
  307. package/skills/storytelling-with-data/examples/before.md +33 -0
  308. package/skills/storytelling-with-data/references/api_reference.md +379 -0
  309. package/skills/storytelling-with-data/references/review-checklist.md +111 -0
  310. package/skills/storytelling-with-data/scripts/chart_review.py +301 -0
  311. package/skills/storytelling-with-data/scripts/example.py +1 -0
  312. package/skills/system-design-interview/SKILL.md +233 -0
  313. package/skills/system-design-interview/assets/example_asset.txt +1 -0
  314. package/skills/system-design-interview/evals/evals.json +46 -0
  315. package/skills/system-design-interview/evals/results.json +13 -0
  316. package/skills/system-design-interview/examples/after.md +94 -0
  317. package/skills/system-design-interview/examples/before.md +27 -0
  318. package/skills/system-design-interview/references/api_reference.md +582 -0
  319. package/skills/system-design-interview/references/review-checklist.md +201 -0
  320. package/skills/system-design-interview/scripts/example.py +1 -0
  321. package/skills/system-design-interview/scripts/new_design.py +421 -0
  322. package/skills/using-asyncio-python/SKILL.md +290 -0
  323. package/skills/using-asyncio-python/assets/example_asset.txt +1 -0
  324. package/skills/using-asyncio-python/evals/evals.json +43 -0
  325. package/skills/using-asyncio-python/evals/results.json +13 -0
  326. package/skills/using-asyncio-python/examples/after.md +68 -0
  327. package/skills/using-asyncio-python/examples/before.md +39 -0
  328. package/skills/using-asyncio-python/references/api_reference.md +267 -0
  329. package/skills/using-asyncio-python/references/review-checklist.md +149 -0
  330. package/skills/using-asyncio-python/scripts/check_blocking.py +270 -0
  331. package/skills/using-asyncio-python/scripts/example.py +1 -0
  332. package/skills/web-scraping-python/SKILL.md +280 -0
  333. package/skills/web-scraping-python/assets/example_asset.txt +1 -0
  334. package/skills/web-scraping-python/evals/evals.json +46 -0
  335. package/skills/web-scraping-python/evals/results.json +13 -0
  336. package/skills/web-scraping-python/examples/after.md +109 -0
  337. package/skills/web-scraping-python/examples/before.md +40 -0
  338. package/skills/web-scraping-python/references/api_reference.md +393 -0
  339. package/skills/web-scraping-python/references/review-checklist.md +163 -0
  340. package/skills/web-scraping-python/scripts/example.py +1 -0
  341. package/skills/web-scraping-python/scripts/new_scraper.py +231 -0
  342. package/skills/writing-plans/audit.json +34 -0
  343. package/tests/agent-detector.test.js +83 -0
  344. package/tests/corrections.test.js +245 -0
  345. package/tests/doctor/hook-installer.test.js +72 -0
  346. package/tests/doctor/usage-tracker.test.js +140 -0
  347. package/tests/engine/benchmark-eval.test.js +31 -0
  348. package/tests/engine/bm25-index.test.js +85 -0
  349. package/tests/engine/capture-command.test.js +35 -0
  350. package/tests/engine/capture.test.js +17 -0
  351. package/tests/engine/graph-augmented-search.test.js +107 -0
  352. package/tests/engine/graph-injector.test.js +44 -0
  353. package/tests/engine/graph.test.js +216 -0
  354. package/tests/engine/hybrid-searcher.test.js +74 -0
  355. package/tests/engine/indexer-bm25.test.js +37 -0
  356. package/tests/engine/mcp-tools.test.js +73 -0
  357. package/tests/engine/project-initializer-mcp.test.js +99 -0
  358. package/tests/engine/query-expander.test.js +36 -0
  359. package/tests/engine/reranker.test.js +51 -0
  360. package/tests/engine/rrf.test.js +49 -0
  361. package/tests/engine/srag-prefix.test.js +47 -0
  362. package/tests/instinct-block.test.js +23 -0
  363. package/tests/mcp-config-writer.test.js +60 -0
  364. package/tests/project-initializer-new-agents.test.js +48 -0
  365. package/tests/rules/rules-manager.test.js +230 -0
  366. package/tests/well-known-builder.test.js +40 -0
  367. package/tests/wizard/integration-detector.test.js +31 -0
  368. package/tests/wizard/project-detector.test.js +51 -0
  369. package/tests/wizard/prompt-session.test.js +61 -0
  370. package/tests/wizard/prompt.test.js +16 -0
  371. package/tests/wizard/registry-embeddings.test.js +35 -0
  372. package/tests/wizard/skill-recommender.test.js +34 -0
  373. package/tests/wizard/slot-count.test.js +25 -0
  374. package/vercel.json +21 -0
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ check_blocking.py — Static analyser for blocking calls inside async functions.
4
+
5
+ Usage: python check_blocking.py <file_or_directory> [<file_or_directory> ...]
6
+
7
+ Flags:
8
+ --exit-zero Exit 0 even when issues are found (useful in CI to report only)
9
+ --summary Print a summary table at the end
10
+ """
11
+
12
+ import ast
13
+ import argparse
14
+ import sys
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Iterator
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Rules
21
+ # ---------------------------------------------------------------------------
22
+ # Each rule is (description, fix_hint, matcher_function)
23
+ # matcher_function(node) -> bool
24
+
25
+
26
+ def _call_matches(node: ast.expr, *name_parts: str) -> bool:
27
+ """True if node is a Call whose function matches the dotted name."""
28
+ if not isinstance(node, ast.Call):
29
+ return False
30
+ func = node.func
31
+ # Simple name: open, sleep, etc.
32
+ if len(name_parts) == 1 and isinstance(func, ast.Name):
33
+ return func.id == name_parts[0]
34
+ # Attribute: requests.get, time.sleep, etc.
35
+ if len(name_parts) == 2 and isinstance(func, ast.Attribute):
36
+ obj = func.value
37
+ return isinstance(obj, ast.Name) and obj.id == name_parts[0] and func.attr == name_parts[1]
38
+ return False
39
+
40
+
41
def _is_sync_open(node: ast.expr) -> bool:
    """Heuristic: flag any bare ``open()`` call.

    NOTE(review): no surrounding 'async with' context is actually inspected
    here — every ``open()`` call seen inside an async function is flagged,
    which may produce false positives; confirm against intended behavior.
    """
    return _call_matches(node, "open")
44
+
45
+
46
+ def _is_file_rw(node: ast.expr) -> bool:
47
+ """Flags .read() / .write() attribute calls (heuristic)."""
48
+ if not isinstance(node, ast.Call):
49
+ return False
50
+ func = node.func
51
+ return isinstance(func, ast.Attribute) and func.attr in {"read", "write", "readlines"}
52
+
53
+
54
@dataclass
class Rule:
    """A single blocking-call detection rule."""

    # Stable identifier, e.g. "ASYNC005".
    id: str
    # Human-readable explanation of why the call blocks the event loop.
    description: str
    # Suggested async-friendly replacement.
    fix: str
    # Predicate: callable(node) -> bool, True when the node violates the rule.
    matcher: object
60
+
61
+
62
# The four blocking `requests` HTTP verbs, in rule-number order (ASYNC001-004).
_BLOCKING_HTTP_VERBS = ("get", "post", "put", "delete")


def _http_rule(number: int, verb: str) -> Rule:
    """Build the rule flagging a blocking requests.<verb>() call."""
    return Rule(
        id=f"ASYNC{number:03d}",
        description=f"requests.{verb}() blocks the event loop",
        fix=f"Use aiohttp.ClientSession().{verb}() or httpx.AsyncClient().{verb}()",
        # Bind `verb` eagerly so each lambda keeps its own value.
        matcher=lambda n, _verb=verb: _call_matches(n, "requests", _verb),
    )


RULES: list[Rule] = [
    # ASYNC001-ASYNC004: blocking HTTP calls via the `requests` library.
    *(_http_rule(i, verb) for i, verb in enumerate(_BLOCKING_HTTP_VERBS, start=1)),
    Rule(
        id="ASYNC005",
        description="time.sleep() blocks the event loop",
        fix="Use 'await asyncio.sleep(seconds)' instead",
        matcher=lambda n: _call_matches(n, "time", "sleep"),
    ),
    Rule(
        id="ASYNC006",
        description="open() is a synchronous file operation",
        fix="Use 'async with aiofiles.open(...)' from the aiofiles package",
        matcher=_is_sync_open,
    ),
    Rule(
        id="ASYNC007",
        description="subprocess.run() blocks the event loop",
        fix="Use 'await asyncio.create_subprocess_exec()' or asyncio.create_subprocess_shell()",
        matcher=lambda n: _call_matches(n, "subprocess", "run"),
    ),
    Rule(
        id="ASYNC008",
        description="subprocess.call() blocks the event loop",
        fix="Use 'await asyncio.create_subprocess_exec()' instead",
        matcher=lambda n: _call_matches(n, "subprocess", "call"),
    ),
    Rule(
        id="ASYNC009",
        description=".read()/.write()/.readlines() on a synchronous file handle",
        fix="Open the file with aiofiles and use 'await file.read()' / 'await file.write()'",
        matcher=_is_file_rw,
    ),
]
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Finding
122
+ # ---------------------------------------------------------------------------
123
+
124
@dataclass
class Finding:
    """One blocking call detected inside an async function."""

    # Source file containing the call.
    file: Path
    # 1-based line number of the offending node.
    line: int
    # 0-based column offset of the offending node.
    col: int
    # Name of the enclosing 'async def'.
    async_func: str
    # The rule that matched.
    rule: Rule
131
+
132
+
133
+ def _collect_async_funcs(tree: ast.AST) -> Iterator[ast.AsyncFunctionDef]:
134
+ """Yield all async def nodes in the tree, including nested ones."""
135
+ for node in ast.walk(tree):
136
+ if isinstance(node, ast.AsyncFunctionDef):
137
+ yield node
138
+
139
+
140
+ def _nodes_inside_sync_context(func_node: ast.AsyncFunctionDef) -> set[int]:
141
+ """
142
+ Return the set of node ids that are inside a nested sync def or class,
143
+ so we don't flag blocking calls that are legitimately in sync helpers.
144
+ """
145
+ excluded: set[int] = set()
146
+ for node in ast.walk(func_node):
147
+ if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
148
+ for child in ast.walk(node):
149
+ excluded.add(id(child))
150
+ return excluded
151
+
152
+
153
def check_file(path: Path) -> list[Finding]:
    """Parse *path* and return all blocking-call findings in its async functions.

    Unreadable or syntactically invalid files print an error to stderr and
    produce an empty result instead of raising.
    """
    try:
        source = path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        print(f"ERROR: Cannot read {path}: {exc}", file=sys.stderr)
        return []

    try:
        tree = ast.parse(source, filename=str(path))
    except SyntaxError as exc:
        print(f"ERROR: Syntax error in {path}: {exc}", file=sys.stderr)
        return []

    findings: list[Finding] = []
    for async_func in _collect_async_funcs(tree):
        skip_ids = _nodes_inside_sync_context(async_func)
        # Only consider nodes attributable to this async function.
        candidates = (n for n in ast.walk(async_func) if id(n) not in skip_ids)
        for node in candidates:
            for matched_rule in (r for r in RULES if r.matcher(node)):
                findings.append(
                    Finding(
                        file=path,
                        line=node.lineno,
                        col=node.col_offset,
                        async_func=async_func.name,
                        rule=matched_rule,
                    )
                )
    return findings
185
+
186
+
187
def iter_python_files(path: Path) -> Iterator[Path]:
    """Yield Python files under *path*.

    A directory yields its ``*.py`` files recursively in sorted order; a
    ``.py`` file yields itself; any other file yields nothing. Paths that are
    neither file nor directory print a warning to stderr.
    """
    if path.is_dir():
        yield from sorted(path.rglob("*.py"))
    elif path.is_file():
        if path.suffix == ".py":
            yield path
    else:
        print(f"WARNING: {path} is not a file or directory — skipping.", file=sys.stderr)
195
+
196
+
197
+ # ---------------------------------------------------------------------------
198
+ # Reporting
199
+ # ---------------------------------------------------------------------------
200
+
201
def print_findings(findings: list[Finding]) -> None:
    """Print one location line plus an indented fix hint per finding."""
    for finding in findings:
        location = f"{finding.file}:{finding.line}:{finding.col}"
        header = (
            f"{location}: [{finding.rule.id}] "
            f"In 'async def {finding.async_func}': {finding.rule.description}"
        )
        print(header)
        print(f"  Fix: {finding.rule.fix}")
208
+
209
+
210
def print_summary(all_findings: list[Finding]) -> None:
    """Print aggregate issue counts grouped by rule id and by file."""
    if not all_findings:
        print("\nSummary: No blocking call issues found.")
        return

    from collections import Counter
    rule_counts: Counter = Counter(f.rule.id for f in all_findings)
    file_counts: Counter = Counter(str(f.file) for f in all_findings)
    # Look up rule descriptions once instead of scanning RULES per line.
    rules_by_id = {rule.id: rule for rule in RULES}

    print("\n--- Summary ---")
    print(f"Total issues: {len(all_findings)}")
    print("\nBy rule:")
    for rule_id, count in sorted(rule_counts.items()):
        print(f"  {rule_id}: {count}x ({rules_by_id[rule_id].description})")
    print("\nBy file:")
    for filepath, count in sorted(file_counts.items()):
        print(f"  {count:3d} {filepath}")
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Entry point
232
+ # ---------------------------------------------------------------------------
233
+
234
def main() -> None:
    """CLI entry point: parse arguments, scan the given paths, report findings.

    Exits with status 1 when issues are found, unless --exit-zero is set.
    """
    parser = argparse.ArgumentParser(
        description="Find blocking calls inside async functions."
    )
    parser.add_argument(
        "paths", nargs="+", type=Path, metavar="file_or_dir",
        help="Python file(s) or director(ies) to analyse",
    )
    parser.add_argument(
        "--exit-zero", action="store_true",
        help="Always exit 0 (useful for non-blocking CI report)",
    )
    parser.add_argument(
        "--summary", action="store_true",
        help="Print a summary table after the findings",
    )
    args = parser.parse_args()

    all_findings: list[Finding] = []
    for raw_path in args.paths:
        for py_file in iter_python_files(raw_path):
            per_file = check_file(py_file)
            all_findings.extend(per_file)
            # Report per file as we go, so output streams during long scans.
            print_findings(per_file)

    if args.summary:
        print_summary(all_findings)

    if not all_findings:
        print("No blocking call issues detected.")
        return

    if not args.exit_zero:
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,280 @@
1
+ ---
2
+ name: web-scraping-python
3
+ version: "1.0"
4
+ license: MIT
5
+ tags: [python, web-scraping, data]
6
+ description: >
7
+ Apply Web Scraping with Python practices (Ryan Mitchell). Covers First
8
+ Scrapers (Ch 1: urllib, BeautifulSoup), HTML Parsing (Ch 2: find, findAll,
9
+ CSS selectors, regex, lambda), Crawling (Ch 3-4: single-domain, cross-site,
10
+ crawl models), Scrapy (Ch 5: spiders, items, pipelines, rules), Storing Data
11
+ (Ch 6: CSV, MySQL, files, email), Reading Documents (Ch 7: PDF, Word,
12
+ encoding), Cleaning Data (Ch 8: normalization, OpenRefine), NLP (Ch 9: n-grams,
13
+ Markov, NLTK), Forms & Logins (Ch 10: POST, sessions, cookies), JavaScript
14
+ (Ch 11: Selenium, headless, Ajax), APIs (Ch 12: REST, undocumented), Image/OCR
15
+ (Ch 13: Pillow, Tesseract), Avoiding Traps (Ch 14: headers, honeypots),
16
+ Testing (Ch 15: unittest, Selenium), Parallel (Ch 16: threads, processes),
17
+ Remote (Ch 17: Tor, proxies), Legalities (Ch 18: robots.txt, CFAA, ethics).
18
+ Trigger on "web scraping", "BeautifulSoup", "Scrapy", "crawler", "spider",
19
+ "scraper", "parse HTML", "Selenium scraping", "data extraction".
20
+ ---
21
+
22
+ # Web Scraping with Python Skill
23
+
24
+ You are an expert web scraping engineer grounded in the 18 chapters from
25
+ *Web Scraping with Python* (Collecting More Data from the Modern Web)
26
+ by Ryan Mitchell. You help developers in two modes:
27
+
28
+ 1. **Scraper Building** — Design and implement web scrapers with idiomatic, production-ready patterns
29
+ 2. **Scraper Review** — Analyze existing scrapers against the book's practices and recommend improvements
30
+
31
+ ## How to Decide Which Mode
32
+
33
+ - If the user asks to *build*, *create*, *scrape*, *extract*, *crawl*, or *collect* data → **Scraper Building**
34
+ - If the user asks to *review*, *audit*, *improve*, *debug*, *optimize*, or *fix* a scraper → **Scraper Review**
35
+ - If ambiguous, ask briefly which mode they'd prefer
36
+
37
+ ---
38
+
39
+ ## Mode 1: Scraper Building
40
+
41
+ When designing or building web scrapers, follow this decision flow:
42
+
43
+ ### Step 1 — Understand the Requirements
44
+
45
+ Ask (or infer from context):
46
+
47
+ - **What target?** — Single page, single domain, multiple domains, API endpoints?
48
+ - **What data?** — Text, tables, images, documents, forms, dynamic JavaScript content?
49
+ - **What scale?** — One-off extraction, recurring crawl, large-scale parallel scraping?
50
+ - **What challenges?** — Login required, JavaScript rendering, rate limiting, anti-bot measures?
51
+
52
+ ### Step 2 — Apply the Right Practices
53
+
54
+ Read `references/practices-catalog.md` for the full chapter-by-chapter catalog. Quick decision guide:
55
+
56
+ | Concern | Chapters to Apply |
57
+ |---------|-------------------|
58
+ | Basic page fetching and parsing | Ch 1: urllib/requests, BeautifulSoup setup, first scraper |
59
+ | Finding elements in HTML | Ch 2: find/findAll, CSS selectors, navigating DOM trees, regex, lambda filters |
60
+ | Crawling within a site | Ch 3: Following links, building crawlers, breadth-first vs depth-first |
61
+ | Crawling across sites | Ch 4: Planning crawl models, handling different site layouts, normalizing data |
62
+ | Framework-based scraping | Ch 5: Scrapy spiders, items, pipelines, rules, CrawlSpider, logging |
63
+ | Saving scraped data | Ch 6: CSV, MySQL/database storage, downloading files, sending email |
64
+ | Non-HTML documents | Ch 7: PDF text extraction, Word docs, encoding handling |
65
+ | Data cleaning | Ch 8: String normalization, regex cleaning, OpenRefine, UTF-8 handling |
66
+ | Text analysis on scraped data | Ch 9: N-grams, Markov models, NLTK, summarization |
67
+ | Login-protected pages | Ch 10: POST requests, sessions, cookies, HTTP basic auth, handling tokens |
68
+ | JavaScript-rendered pages | Ch 11: Selenium WebDriver, headless browsers, waiting for Ajax, executing JS |
69
+ | Working with APIs | Ch 12: REST methods, JSON parsing, authentication, undocumented APIs |
70
+ | Images and OCR | Ch 13: Pillow image processing, Tesseract OCR, CAPTCHA handling |
71
+ | Avoiding detection | Ch 14: User-Agent headers, cookie handling, timing/delays, honeypot avoidance |
72
+ | Testing scrapers | Ch 15: unittest for scrapers, Selenium-based testing, handling site changes |
73
+ | Parallel scraping | Ch 16: Multithreading, multiprocessing, thread-safe queues |
74
+ | Remote/anonymous scraping | Ch 17: Tor, proxies, rotating IPs, cloud-based scraping |
75
+ | Legal and ethical concerns | Ch 18: robots.txt, Terms of Service, CFAA, copyright, ethical scraping |
76
+
77
+ ### Step 3 — Follow Web Scraping Principles
78
+
79
+ Every scraper implementation should honor these principles:
80
+
81
+ 1. **Respect robots.txt** — Always check and honor robots.txt directives; be a good citizen of the web
82
+ 2. **Identify yourself** — Set a descriptive User-Agent string; consider providing contact info
83
+ 3. **Rate limit requests** — Add delays between requests (1-3 seconds minimum); never hammer servers
84
+ 4. **Handle errors gracefully** — Catch connection errors, timeouts, HTTP errors, and missing elements
85
+ 5. **Use sessions wisely** — Reuse HTTP sessions for connection pooling and cookie persistence
86
+ 6. **Parse defensively** — Never assume HTML structure is stable; use multiple selectors as fallbacks
87
+ 7. **Store raw data first** — Save raw HTML/responses before parsing; enables re-parsing without re-scraping
88
+ 8. **Validate extracted data** — Check for None/empty values; verify data types and formats
89
+ 9. **Design for re-runs** — Make scrapers idempotent; track what's already been scraped
90
+ 10. **Stay legal and ethical** — Understand applicable laws (CFAA, GDPR); respect Terms of Service
91
+
92
+ ### Step 4 — Build the Scraper
93
+
94
+ Follow these guidelines:
95
+
96
+ - **Production-ready** — Include error handling, retries, logging, rate limiting from the start
97
+ - **Configurable** — Externalize URLs, selectors, delays, credentials; use config files or arguments
98
+ - **Testable** — Write unit tests for parsing functions; integration tests for full scrape flows
99
+ - **Observable** — Log page fetches, items extracted, errors encountered, timing stats
100
+ - **Documented** — README with setup, usage, target site info, legal notes
101
+
102
+ When building scrapers, produce:
103
+
104
+ 1. **Approach identification** — Which chapters/concepts apply and why
105
+ 2. **Target analysis** — Site structure, pagination, authentication needs, JS rendering
106
+ 3. **Implementation** — Production-ready code with error handling and rate limiting
107
+ 4. **Storage setup** — How and where data is stored (CSV, database, files)
108
+ 5. **Monitoring notes** — What to watch for (site changes, blocks, data quality)
109
+
110
+ ### Scraper Building Examples
111
+
112
+ **Example 1 — Static Site Data Extraction:**
113
+ ```
114
+ User: "Scrape product listings from an e-commerce category page"
115
+
116
+ Apply: Ch 1 (fetching pages), Ch 2 (parsing product elements),
117
+ Ch 3 (pagination/crawling), Ch 6 (storing to CSV/DB)
118
+
119
+ Generate:
120
+ - requests + BeautifulSoup scraper
121
+ - CSS selector-based product extraction
122
+ - Pagination handler following next-page links
123
+ - CSV or database storage with schema
124
+ - Rate limiting and error handling
125
+ ```
126
+
127
+ **Example 2 — JavaScript-Heavy Site:**
128
+ ```
129
+ User: "Extract data from a React single-page application"
130
+
131
+ Apply: Ch 11 (Selenium, headless browser), Ch 2 (parsing rendered HTML),
132
+ Ch 14 (avoiding detection), Ch 15 (testing)
133
+
134
+ Generate:
135
+ - Selenium WebDriver with headless Chrome
136
+ - Explicit waits for dynamic content loading
137
+ - JavaScript execution for scrolling/interaction
138
+ - Data extraction from rendered DOM
139
+ - Headless browser configuration
140
+ ```
141
+
142
+ **Example 3 — Authenticated Scraping:**
143
+ ```
144
+ User: "Scrape data from a site that requires login"
145
+
146
+ Apply: Ch 10 (forms, sessions, cookies), Ch 14 (headers, tokens),
147
+ Ch 6 (data storage)
148
+
149
+ Generate:
150
+ - Session-based login with CSRF token handling
151
+ - Cookie persistence across requests
152
+ - POST request for form submission
153
+ - Authenticated page navigation
154
+ - Session expiry detection and re-login
155
+ ```
156
+
157
+ **Example 4 — Large-Scale Crawl with Scrapy:**
158
+ ```
159
+ User: "Build a crawler to scrape thousands of pages from multiple domains"
160
+
161
+ Apply: Ch 5 (Scrapy framework), Ch 4 (crawl models),
162
+ Ch 16 (parallel scraping), Ch 14 (avoiding blocks)
163
+
164
+ Generate:
165
+ - Scrapy spider with item definitions and pipelines
166
+ - CrawlSpider with Rule and LinkExtractor
167
+ - Pipeline for database storage
168
+ - Settings for concurrent requests, delays, user agents
169
+ - Middleware for proxy rotation
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Mode 2: Scraper Review
175
+
176
+ When reviewing web scrapers, read `references/review-checklist.md` for the full checklist.
177
+
178
+ ### Review Process
179
+
180
+ 1. **Fetching scan** — Check Ch 1, 10, 11: HTTP method, session usage, JS rendering needs, authentication
181
+ 2. **Parsing scan** — Check Ch 2, 7: selector quality, defensive parsing, edge case handling
182
+ 3. **Crawling scan** — Check Ch 3-5: URL management, deduplication, pagination, depth control
183
+ 4. **Storage scan** — Check Ch 6: data format, schema, duplicates, file management
184
+ 5. **Resilience scan** — Check Ch 14-16: error handling, retries, rate limiting, parallel safety
185
+ 6. **Ethics scan** — Check Ch 17-18: robots.txt, legal compliance, identification, respectful crawling
186
+ 7. **Quality scan** — Check Ch 8, 15: data cleaning, testing, validation
187
+
188
+ ### Calibrating Review Tone
189
+
190
+ **CRITICAL: Match your tone to what you actually find.**
191
+
192
+ - If the scraper is well-structured and follows best practices, say so explicitly in the summary and spend the majority of the review praising what it does right. Specifically praise:
193
+ - `RobotFileParser` / robots.txt check before fetching (Ch 18)
194
+ - Descriptive User-Agent with contact info (Ch 14)
195
+ - `requests.Session()` with `Retry` adapter (Ch 10, 14)
196
+ - CSS selectors via `soup.select()` / `soup.select_one()` (Ch 2)
197
+ - Defensive None checks on extracted elements before accessing text (Ch 2)
198
+ - `resp.raise_for_status()` and catching `requests.RequestException` (Ch 1, 14)
199
+ - `time.sleep()` between requests (Ch 14)
200
+ - Structured logging of page number and item counts at each step (Ch 5)
201
+ - Any suggestions on an already-good scraper MUST be framed as **minor optional improvements**, never as critical or high-priority issues. Do not manufacture severity.
202
+
203
+ ### Review Output Format
204
+
205
+ Structure your review as:
206
+
207
+ ```
208
+ ## Summary
209
+ One paragraph: overall scraper quality, pattern adherence, main concerns.
210
+
211
+ ## Fetching & Connection Issues
212
+ For each issue (Ch 1, 10-11):
213
+ - **Topic**: chapter and concept
214
+ - **Location**: where in the code
215
+ - **Problem**: what's wrong
216
+ - **Fix**: recommended change with code snippet
217
+
218
+ ## Parsing & Extraction Issues
219
+ For each issue (Ch 2, 7):
220
+ - Same structure
221
+
222
+ ## Crawling & Navigation Issues
223
+ For each issue (Ch 3-5):
224
+ - Same structure
225
+
226
+ ## Storage & Data Issues
227
+ For each issue (Ch 6, 8):
228
+ - Same structure
229
+
230
+ ## Resilience & Performance Issues
231
+ For each issue (Ch 14-16):
232
+ - Same structure
233
+
234
+ ## Ethics & Legal Issues
235
+ For each issue (Ch 17-18):
236
+ - Same structure
237
+
238
+ ## Testing & Quality Issues
239
+ For each issue (Ch 9, 15):
240
+ - Same structure
241
+
242
+ ## Recommendations
243
+ Priority-ordered from most critical to nice-to-have.
244
+ Each recommendation references the specific chapter/concept.
245
+ ```
246
+
247
+ ### Common Web Scraping Anti-Patterns to Flag
248
+
249
+ - **No error handling on requests** → Ch 1, 14: Wrap requests in try/except; handle `requests.RequestException` (covers ConnectionError, Timeout, HTTPError); always call `resp.raise_for_status()` to surface non-200 responses
250
+ - **Hardcoded selectors without fallbacks** → Ch 2: Use multiple selector strategies; check for None before accessing attributes
251
+ - **No rate limiting** → Ch 14: Add `time.sleep()` between requests; respect server resources
252
+ - **Missing User-Agent header** → Ch 14: Set a descriptive User-Agent with contact info; rotate if needed for scale
253
+ - **Not using sessions** → Ch 10: Use `requests.Session()` for cookie persistence and connection pooling
254
+ - **Ignoring robots.txt** → Ch 18: Parse and respect robots.txt via `RobotFileParser` before crawling
255
+ - **No URL deduplication** → Ch 3: Track visited URLs in a set; normalize URLs before comparing
256
+ - **Using regex to parse HTML** → Ch 2: Use BeautifulSoup or lxml, not regex, for HTML parsing. In particular:
257
+ - `re.DOTALL` patterns on `<p>` or block elements will incorrectly merge content from nested inline tags (`<strong>`, `<a>`, etc.) producing wrong output
258
+ - Regex patterns like `href=["\'](.*?)["\']` will match `href` attributes inside `<script>` blocks, `<style>` blocks, and HTML comments, producing many false positives
259
+ - Recommend `soup.select_one()` and `soup.select()` CSS-selector API as the idiomatic BeautifulSoup replacement (preferred over `find()`/`find_all()` for clarity)
260
+ - **Not handling JavaScript content** → Ch 11: If data loads via Ajax, use Selenium or find the underlying API
261
+ - **Storing data without validation** → Ch 6, 8: Validate and clean data before storage; handle encoding
262
+ - **No logging** → Ch 5: Log page fetches, item counts, and errors at each step; use structured logging with page number and item count per page
263
+ - **Sequential when parallel is needed** → Ch 16: Use threading/multiprocessing for large-scale scraping
264
+ - **Ignoring encoding issues** → Ch 7, 8: Handle UTF-8, detect encoding, normalize Unicode
265
+ - **No tests for parsers** → Ch 15: Write unit tests with saved HTML fixtures; test selector robustness
266
+ - **Credentials in code** → Ch 10: Use environment variables or config files for login credentials
267
+ - **Not storing raw responses** → Ch 6: Save raw HTML for re-parsing; don't rely only on extracted data
268
+
269
+ ---
270
+
271
+ ## General Guidelines
272
+
273
+ - **BeautifulSoup for simple scraping, Scrapy for scale** — Match the tool to the complexity
274
+ - **Check for APIs first** — Many sites have APIs (documented or undocumented) that are easier than scraping
275
+ - **Respect the site** — Rate limit, identify yourself, follow robots.txt, check ToS
276
+ - **Parse defensively** — HTML structure changes; always handle missing elements gracefully
277
+ - **Test with saved pages** — Save HTML fixtures and test parsers offline; reduces requests and enables CI
278
+ - **Clean data early** — Normalize strings, handle encoding, strip whitespace at extraction time
279
+ - For deeper practice details, read `references/practices-catalog.md` before building scrapers.
280
+ - For review checklists, read `references/review-checklist.md` before reviewing scrapers.
@@ -0,0 +1,46 @@
1
+ {
2
+ "evals": [
3
+ {
4
+ "id": "eval-01-no-rate-limiting-no-error-handling-no-robots",
5
+ "prompt": "Review this web scraper:\n\n```python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\nBASE_URL = 'https://books.example.com'\n\ndef scrape_all_books():\n all_books = []\n page = 1\n\n while True:\n url = f'{BASE_URL}/catalogue/page-{page}.html'\n response = requests.get(url)\n soup = BeautifulSoup(response.text, 'html.parser')\n\n books = soup.find_all('article', class_='product_pod')\n if not books:\n break\n\n for book in books:\n title = book.find('h3').find('a')['title']\n price = book.find('p', class_='price_color').text\n rating = book.find('p', class_='star-rating')['class'][1]\n all_books.append({'title': title, 'price': price, 'rating': rating})\n\n page += 1\n\n return all_books\n\nresult = scrape_all_books()\nwith open('books.json', 'w') as f:\n json.dump(result, f)\n```",
6
+ "expectations": [
7
+ "Flags no robots.txt check: the scraper does not check or respect the site's robots.txt before crawling (Ch 18: always check and honor robots.txt)",
8
+ "Flags no rate limiting: requests are issued as fast as possible with no delay between pages; recommends adding `time.sleep()` of at least 1-3 seconds between requests (Ch 14: rate limit requests)",
9
+ "Flags no error handling on `requests.get()`: a network error, timeout, or non-200 response will raise an exception or silently produce garbage HTML (Ch 1, 14: wrap requests in try/except, check response status)",
10
+ "Flags no User-Agent header: the scraper uses the default requests User-Agent which may be blocked and does not identify the bot (Ch 14: set a descriptive User-Agent header)",
11
+ "Flags no session reuse: `requests.get()` called in a loop creates a new connection for each page; recommends `requests.Session()` for connection pooling (Ch 10: use sessions for connection pooling)",
12
+ "Flags defensive parsing issues: `book.find('h3').find('a')['title']` will raise AttributeError if any element is missing; recommends checking for None before accessing attributes (Ch 2: parse defensively)",
13
+ "Flags no logging of progress or errors (Ch 5: log page fetches, errors, items extracted)"
14
+ ]
15
+ },
16
+ {
17
+ "id": "eval-02-regex-for-html-parsing",
18
+ "prompt": "Review this data extraction code:\n\n```python\nimport requests\nimport re\n\ndef extract_product_data(url: str) -> dict:\n response = requests.get(url)\n html = response.text\n\n # Extract product name\n name_match = re.search(r'<h1[^>]*>([^<]+)</h1>', html)\n name = name_match.group(1) if name_match else None\n\n # Extract price\n price_match = re.search(r'<span class=\"price\">\\$([\\d\\.]+)</span>', html)\n price = float(price_match.group(1)) if price_match else None\n\n # Extract description paragraphs\n desc_matches = re.findall(r'<p class=\"desc\">(.+?)</p>', html, re.DOTALL)\n description = ' '.join(desc_matches)\n\n # Extract all href links on the page\n links = re.findall(r'href=[\"\\']([^\"\\']+)[\"\\']', html)\n\n # Check if in stock\n in_stock = bool(re.search(r'<span class=\"stock\">In Stock</span>', html))\n\n return {\n 'name': name,\n 'price': price,\n 'description': description,\n 'links': links,\n 'in_stock': in_stock\n }\n```",
19
+ "expectations": [
20
+ "Flags parsing HTML with regex as the primary anti-pattern: regex cannot reliably parse HTML because HTML is not a regular language; attribute order can vary, whitespace can differ, and nested tags break simple patterns (Ch 2: use BeautifulSoup or lxml, not regex, for HTML parsing)",
21
+ "Flags that the price regex `\\$([\\d\\.]+)` will fail silently on prices with commas (e.g., $1,299.99) or different currency formats without any warning (Ch 2: parse defensively)",
22
+ "Flags the description regex with `re.DOTALL` will incorrectly merge content from separate `<p>` tags that contain nested HTML tags like `<strong>` or `<a>` (Ch 2: regex cannot handle nested HTML)",
23
+ "Flags the link extraction regex `href=[\"\\']([^\"\\']+)[\"\\']` will match hrefs in script tags, style tags, and HTML comments, returning many false positives (Ch 2: use a parser with proper DOM traversal)",
24
+ "Flags no error handling on `requests.get()` and no status code check (Ch 1, 14: check response.raise_for_status())",
25
+ "Flags no session usage for connection pooling (Ch 10: use requests.Session())",
26
+ "Recommends replacing all regex parsing with BeautifulSoup CSS selectors or XPath, providing a corrected example using soup.select_one() and soup.select()"
27
+ ]
28
+ },
29
+ {
30
+ "id": "eval-03-clean-scraper-session-retry-css-selectors",
31
+ "prompt": "Review this web scraper:\n\n```python\nimport logging\nimport time\nfrom urllib.robotparser import RobotFileParser\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\nfrom bs4 import BeautifulSoup\n\nlogger = logging.getLogger(__name__)\n\nUSER_AGENT = 'ResearchBot/1.0 (contact: bot@example.com)'\nREQUEST_DELAY = 1.5  # seconds between requests\n\n\ndef build_session() -> requests.Session:\n    session = requests.Session()\n    session.headers['User-Agent'] = USER_AGENT\n    retry = Retry(\n        total=3,\n        backoff_factor=1,\n        status_forcelist=[429, 500, 502, 503, 504]\n    )\n    session.mount('https://', HTTPAdapter(max_retries=retry))\n    return session\n\n\ndef can_fetch(base_url: str, path: str) -> bool:\n    rp = RobotFileParser()\n    rp.set_url(f'{base_url}/robots.txt')\n    rp.read()\n    return rp.can_fetch(USER_AGENT, f'{base_url}{path}')\n\n\ndef parse_listing(html: str) -> list[dict]:\n    soup = BeautifulSoup(html, 'html.parser')\n    items = []\n    for card in soup.select('article.product-card'):\n        title_el = card.select_one('h2.product-title')\n        price_el = card.select_one('span.price')\n        if title_el is None or price_el is None:\n            logger.warning('Skipping card with missing elements')\n            continue\n        items.append({\n            'title': title_el.get_text(strip=True),\n            'price': price_el.get_text(strip=True),\n        })\n    return items\n\n\ndef scrape_category(base_url: str, category_path: str) -> list[dict]:\n    if not can_fetch(base_url, category_path):\n        logger.error('robots.txt disallows scraping %s', category_path)\n        return []\n\n    session = build_session()\n    all_items: list[dict] = []\n    page = 1\n\n    while True:\n        url = f'{base_url}{category_path}?page={page}'\n        try:\n            resp = session.get(url, timeout=10)\n            resp.raise_for_status()\n        except requests.RequestException as exc:\n            logger.error('Request failed for %s: %s', url, exc)\n            break\n\n        items = parse_listing(resp.text)\n        if not items:\n            break\n\n        logger.info('Page %d: extracted %d items', page, len(items))\n        all_items.extend(items)\n        page += 1\n        time.sleep(REQUEST_DELAY)\n\n    return all_items\n```",
32
+ "expectations": [
33
+ "Recognizes this is a well-structured, responsible scraper and says so explicitly",
34
+ "Praises robots.txt check via `RobotFileParser` before any requests are made (Ch 18: always check and honor robots.txt)",
35
+ "Praises the descriptive User-Agent with contact information making the bot identifiable (Ch 14: identify yourself with a descriptive User-Agent)",
36
+ "Praises `requests.Session()` with a `Retry` adapter providing automatic retry on transient server errors and rate-limit responses (Ch 14, 10: sessions with retry logic)",
37
+ "Praises CSS selectors via `soup.select()` and `soup.select_one()` instead of regex for HTML parsing (Ch 2: use BeautifulSoup CSS selectors)",
38
+ "Praises defensive None checks on extracted elements before accessing text, with a warning log for skipped cards (Ch 2: parse defensively)",
39
+ "Praises `resp.raise_for_status()` and catching `requests.RequestException` for all HTTP/network errors (Ch 1, 14: handle connection errors, timeouts, and HTTP errors)",
40
+ "Praises `time.sleep(REQUEST_DELAY)` between pages to be polite to the server (Ch 14: rate limit requests)",
41
+ "Praises structured logging of page number and item counts at each step (Ch 5: log progress)",
42
+ "Does NOT manufacture issues to appear thorough; any suggestions are explicitly framed as minor optional improvements"
43
+ ]
44
+ }
45
+ ]
46
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "pass_rate": 0.958,
3
+ "passed": 23,
4
+ "total": 24,
5
+ "baseline_pass_rate": 0.375,
6
+ "baseline_passed": 9,
7
+ "baseline_total": 24,
8
+ "delta": 0.583,
9
+ "model": "default",
10
+ "evals_run": 3,
11
+ "date": "2026-03-28",
12
+ "non_standard_provider": true
13
+ }