@booklib/skills 1.2.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +122 -0
- package/README.md +20 -2
- package/ROADMAP.md +36 -0
- package/animation-at-work/evals/evals.json +44 -0
- package/animation-at-work/examples/after.md +64 -0
- package/animation-at-work/examples/before.md +35 -0
- package/animation-at-work/scripts/audit_animations.py +295 -0
- package/bin/skills.js +552 -42
- package/clean-code-reviewer/SKILL.md +109 -1
- package/clean-code-reviewer/evals/evals.json +121 -3
- package/clean-code-reviewer/examples/after.md +48 -0
- package/clean-code-reviewer/examples/before.md +33 -0
- package/clean-code-reviewer/references/api_reference.md +158 -0
- package/clean-code-reviewer/references/practices-catalog.md +282 -0
- package/clean-code-reviewer/references/review-checklist.md +254 -0
- package/clean-code-reviewer/scripts/pre-review.py +206 -0
- package/data-intensive-patterns/evals/evals.json +43 -0
- package/data-intensive-patterns/examples/after.md +61 -0
- package/data-intensive-patterns/examples/before.md +38 -0
- package/data-intensive-patterns/scripts/adr.py +213 -0
- package/data-pipelines/evals/evals.json +45 -0
- package/data-pipelines/examples/after.md +97 -0
- package/data-pipelines/examples/before.md +37 -0
- package/data-pipelines/scripts/new_pipeline.py +444 -0
- package/design-patterns/evals/evals.json +46 -0
- package/design-patterns/examples/after.md +52 -0
- package/design-patterns/examples/before.md +29 -0
- package/design-patterns/scripts/scaffold.py +807 -0
- package/domain-driven-design/SKILL.md +120 -0
- package/domain-driven-design/evals/evals.json +48 -0
- package/domain-driven-design/examples/after.md +80 -0
- package/domain-driven-design/examples/before.md +43 -0
- package/domain-driven-design/scripts/scaffold.py +421 -0
- package/effective-java/evals/evals.json +46 -0
- package/effective-java/examples/after.md +83 -0
- package/effective-java/examples/before.md +37 -0
- package/effective-java/scripts/checkstyle_setup.py +211 -0
- package/effective-kotlin/evals/evals.json +45 -0
- package/effective-kotlin/examples/after.md +36 -0
- package/effective-kotlin/examples/before.md +38 -0
- package/effective-python/evals/evals.json +44 -0
- package/effective-python/examples/after.md +56 -0
- package/effective-python/examples/before.md +40 -0
- package/effective-python/references/api_reference.md +218 -0
- package/effective-python/references/practices-catalog.md +483 -0
- package/effective-python/references/review-checklist.md +190 -0
- package/effective-python/scripts/lint.py +173 -0
- package/kotlin-in-action/evals/evals.json +43 -0
- package/kotlin-in-action/examples/after.md +53 -0
- package/kotlin-in-action/examples/before.md +39 -0
- package/kotlin-in-action/scripts/setup_detekt.py +224 -0
- package/lean-startup/evals/evals.json +43 -0
- package/lean-startup/examples/after.md +80 -0
- package/lean-startup/examples/before.md +34 -0
- package/lean-startup/scripts/new_experiment.py +286 -0
- package/microservices-patterns/SKILL.md +140 -0
- package/microservices-patterns/evals/evals.json +45 -0
- package/microservices-patterns/examples/after.md +69 -0
- package/microservices-patterns/examples/before.md +40 -0
- package/microservices-patterns/scripts/new_service.py +583 -0
- package/package.json +2 -8
- package/refactoring-ui/evals/evals.json +45 -0
- package/refactoring-ui/examples/after.md +85 -0
- package/refactoring-ui/examples/before.md +58 -0
- package/refactoring-ui/scripts/audit_css.py +250 -0
- package/skill-router/SKILL.md +142 -0
- package/skill-router/evals/evals.json +38 -0
- package/skill-router/examples/after.md +63 -0
- package/skill-router/examples/before.md +39 -0
- package/skill-router/references/api_reference.md +24 -0
- package/skill-router/references/routing-heuristics.md +89 -0
- package/skill-router/references/skill-catalog.md +156 -0
- package/skill-router/scripts/route.py +266 -0
- package/storytelling-with-data/evals/evals.json +47 -0
- package/storytelling-with-data/examples/after.md +50 -0
- package/storytelling-with-data/examples/before.md +33 -0
- package/storytelling-with-data/scripts/chart_review.py +301 -0
- package/system-design-interview/evals/evals.json +45 -0
- package/system-design-interview/examples/after.md +94 -0
- package/system-design-interview/examples/before.md +27 -0
- package/system-design-interview/scripts/new_design.py +421 -0
- package/using-asyncio-python/evals/evals.json +43 -0
- package/using-asyncio-python/examples/after.md +68 -0
- package/using-asyncio-python/examples/before.md +39 -0
- package/using-asyncio-python/scripts/check_blocking.py +270 -0
- package/web-scraping-python/evals/evals.json +46 -0
- package/web-scraping-python/examples/after.md +109 -0
- package/web-scraping-python/examples/before.md +40 -0
- package/web-scraping-python/scripts/new_scraper.py +231 -0
- /package/{effective-python-skill → effective-python}/SKILL.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-01-pythonic-thinking.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-02-lists-and-dicts.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-03-functions.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-04-comprehensions-generators.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-05-classes-interfaces.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-06-metaclasses-attributes.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-07-concurrency.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-08-robustness-performance.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-09-testing-debugging.md +0 -0
- /package/{effective-python-skill → effective-python}/ref-10-collaboration.md +0 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
check_blocking.py — Static analyser for blocking calls inside async functions.
|
|
4
|
+
|
|
5
|
+
Usage: python check_blocking.py <file_or_directory> [<file_or_directory> ...]
|
|
6
|
+
|
|
7
|
+
Flags:
|
|
8
|
+
--exit-zero Exit 0 even when issues are found (useful in CI to report only)
|
|
9
|
+
--summary Print a summary table at the end
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import ast
|
|
13
|
+
import argparse
|
|
14
|
+
import sys
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Iterator
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# Rules
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# Each rule is (description, fix_hint, matcher_function)
|
|
23
|
+
# matcher_function(node) -> bool
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _call_matches(node: ast.expr, *name_parts: str) -> bool:
|
|
27
|
+
"""True if node is a Call whose function matches the dotted name."""
|
|
28
|
+
if not isinstance(node, ast.Call):
|
|
29
|
+
return False
|
|
30
|
+
func = node.func
|
|
31
|
+
# Simple name: open, sleep, etc.
|
|
32
|
+
if len(name_parts) == 1 and isinstance(func, ast.Name):
|
|
33
|
+
return func.id == name_parts[0]
|
|
34
|
+
# Attribute: requests.get, time.sleep, etc.
|
|
35
|
+
if len(name_parts) == 2 and isinstance(func, ast.Attribute):
|
|
36
|
+
obj = func.value
|
|
37
|
+
return isinstance(obj, ast.Name) and obj.id == name_parts[0] and func.attr == name_parts[1]
|
|
38
|
+
return False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _is_sync_open(node: ast.expr) -> bool:
    """Flag every bare ``open(...)`` call (always synchronous).

    Only a plain-name call matches: ``aiofiles.open(...)`` is an attribute
    call and is therefore never flagged by this rule.
    """
    return _call_matches(node, "open")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _is_file_rw(node: ast.expr) -> bool:
|
|
47
|
+
"""Flags .read() / .write() attribute calls (heuristic)."""
|
|
48
|
+
if not isinstance(node, ast.Call):
|
|
49
|
+
return False
|
|
50
|
+
func = node.func
|
|
51
|
+
return isinstance(func, ast.Attribute) and func.attr in {"read", "write", "readlines"}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class Rule:
    """A single blocking-call detection rule: what to match and how to fix it."""

    id: str           # stable rule identifier, e.g. "ASYNC001"
    description: str  # why the matched call blocks the event loop
    fix: str          # suggested non-blocking replacement, shown to the user
    matcher: object   # predicate: callable(node: ast.expr) -> bool
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Ordered catalogue of detection rules; check_file() tries each rule on every
# node inside an async function body.
RULES: list[Rule] = [
    # -- HTTP via the synchronous 'requests' library ------------------------
    Rule(
        id="ASYNC001",
        description="requests.get() blocks the event loop",
        fix="Use aiohttp.ClientSession().get() or httpx.AsyncClient().get()",
        matcher=lambda n: _call_matches(n, "requests", "get"),
    ),
    Rule(
        id="ASYNC002",
        description="requests.post() blocks the event loop",
        fix="Use aiohttp.ClientSession().post() or httpx.AsyncClient().post()",
        matcher=lambda n: _call_matches(n, "requests", "post"),
    ),
    Rule(
        id="ASYNC003",
        description="requests.put() blocks the event loop",
        fix="Use aiohttp.ClientSession().put() or httpx.AsyncClient().put()",
        matcher=lambda n: _call_matches(n, "requests", "put"),
    ),
    Rule(
        id="ASYNC004",
        description="requests.delete() blocks the event loop",
        fix="Use aiohttp.ClientSession().delete() or httpx.AsyncClient().delete()",
        matcher=lambda n: _call_matches(n, "requests", "delete"),
    ),
    # -- Sleeping ------------------------------------------------------------
    Rule(
        id="ASYNC005",
        description="time.sleep() blocks the event loop",
        fix="Use 'await asyncio.sleep(seconds)' instead",
        matcher=lambda n: _call_matches(n, "time", "sleep"),
    ),
    # -- File I/O ------------------------------------------------------------
    Rule(
        id="ASYNC006",
        description="open() is a synchronous file operation",
        fix="Use 'async with aiofiles.open(...)' from the aiofiles package",
        matcher=_is_sync_open,
    ),
    # -- Subprocesses --------------------------------------------------------
    Rule(
        id="ASYNC007",
        description="subprocess.run() blocks the event loop",
        fix="Use 'await asyncio.create_subprocess_exec()' or asyncio.create_subprocess_shell()",
        matcher=lambda n: _call_matches(n, "subprocess", "run"),
    ),
    Rule(
        id="ASYNC008",
        description="subprocess.call() blocks the event loop",
        fix="Use 'await asyncio.create_subprocess_exec()' instead",
        matcher=lambda n: _call_matches(n, "subprocess", "call"),
    ),
    # -- Method-level file I/O heuristic ------------------------------------
    Rule(
        id="ASYNC009",
        description=".read()/.write()/.readlines() on a synchronous file handle",
        fix="Open the file with aiofiles and use 'await file.read()' / 'await file.write()'",
        matcher=_is_file_rw,
    ),
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# ---------------------------------------------------------------------------
|
|
121
|
+
# Finding
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
|
|
124
|
+
@dataclass
class Finding:
    """One blocking-call occurrence detected inside an async function."""

    file: Path       # path of the analysed source file
    line: int        # line number of the offending call (ast node.lineno)
    col: int         # column offset of the offending call (ast node.col_offset)
    async_func: str  # name of the enclosing 'async def'
    rule: Rule       # the rule that matched
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _collect_async_funcs(tree: ast.AST) -> Iterator[ast.AsyncFunctionDef]:
|
|
134
|
+
"""Yield all async def nodes in the tree, including nested ones."""
|
|
135
|
+
for node in ast.walk(tree):
|
|
136
|
+
if isinstance(node, ast.AsyncFunctionDef):
|
|
137
|
+
yield node
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _nodes_inside_sync_context(func_node: ast.AsyncFunctionDef) -> set[int]:
|
|
141
|
+
"""
|
|
142
|
+
Return the set of node ids that are inside a nested sync def or class,
|
|
143
|
+
so we don't flag blocking calls that are legitimately in sync helpers.
|
|
144
|
+
"""
|
|
145
|
+
excluded: set[int] = set()
|
|
146
|
+
for node in ast.walk(func_node):
|
|
147
|
+
if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
|
|
148
|
+
for child in ast.walk(node):
|
|
149
|
+
excluded.add(id(child))
|
|
150
|
+
return excluded
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def check_file(path: Path) -> list[Finding]:
    """Parse *path* and return all blocking-call findings in its async defs.

    Unreadable or syntactically invalid files are reported on stderr and
    yield an empty result instead of raising.
    """
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        print(f"ERROR: Cannot read {path}: {exc}", file=sys.stderr)
        return []

    try:
        module = ast.parse(text, filename=str(path))
    except SyntaxError as exc:
        print(f"ERROR: Syntax error in {path}: {exc}", file=sys.stderr)
        return []

    results: list[Finding] = []
    for coroutine in _collect_async_funcs(module):
        # Nodes inside nested sync scopes are ignored for this coroutine.
        skipped = _nodes_inside_sync_context(coroutine)
        for candidate in ast.walk(coroutine):
            if id(candidate) in skipped:
                continue
            results.extend(
                Finding(
                    file=path,
                    line=candidate.lineno,
                    col=candidate.col_offset,
                    async_func=coroutine.name,
                    rule=rule,
                )
                for rule in RULES
                if rule.matcher(candidate)
            )
    return results
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def iter_python_files(path: Path) -> Iterator[Path]:
    """Yield the ``.py`` files denoted by *path*: itself, or recursively within it.

    A path that is neither a file nor a directory gets a stderr warning and
    yields nothing.
    """
    if path.is_dir():
        yield from sorted(path.rglob("*.py"))
        return
    if path.is_file():
        if path.suffix == ".py":
            yield path
        return
    print(f"WARNING: {path} is not a file or directory — skipping.", file=sys.stderr)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
# ---------------------------------------------------------------------------
|
|
198
|
+
# Reporting
|
|
199
|
+
# ---------------------------------------------------------------------------
|
|
200
|
+
|
|
201
|
+
def print_findings(findings: list[Finding]) -> None:
    """Print one location line per finding, each followed by an indented fix hint."""
    for finding in findings:
        location = f"{finding.file}:{finding.line}:{finding.col}"
        detail = f"In 'async def {finding.async_func}': {finding.rule.description}"
        print(f"{location}: [{finding.rule.id}] {detail}")
        print(f"    Fix: {finding.rule.fix}")
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def print_summary(all_findings: list[Finding]) -> None:
    """Print issue counts grouped by rule and by file, or an all-clear line."""
    if not all_findings:
        print("\nSummary: No blocking call issues found.")
        return

    from collections import Counter
    rule_counts: Counter = Counter(f.rule.id for f in all_findings)
    file_counts: Counter = Counter(str(f.file) for f in all_findings)
    rules_by_id = {r.id: r for r in RULES}

    print("\n--- Summary ---")
    print(f"Total issues: {len(all_findings)}")
    print("\nBy rule:")
    for rule_id in sorted(rule_counts):
        print(f"  {rule_id}: {rule_counts[rule_id]}x ({rules_by_id[rule_id].description})")
    print("\nBy file:")
    for filename in sorted(file_counts):
        print(f"  {file_counts[filename]:3d} {filename}")
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
# Entry point
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
|
|
234
|
+
def main() -> None:
    """CLI entry point: parse arguments, scan the given paths, report, exit.

    Exits with status 1 when findings exist, unless --exit-zero is given.
    """
    parser = argparse.ArgumentParser(
        description="Find blocking calls inside async functions."
    )
    parser.add_argument(
        "paths", nargs="+", type=Path, metavar="file_or_dir",
        help="Python file(s) or director(ies) to analyse"
    )
    parser.add_argument(
        "--exit-zero", action="store_true",
        help="Always exit 0 (useful for non-blocking CI report)"
    )
    parser.add_argument(
        "--summary", action="store_true",
        help="Print a summary table after the findings"
    )
    args = parser.parse_args()

    collected: list[Finding] = []
    for target in args.paths:
        for source_file in iter_python_files(target):
            file_findings = check_file(source_file)
            # Report per file as we go, then accumulate for the summary/exit code.
            print_findings(file_findings)
            collected.extend(file_findings)

    if args.summary:
        print_summary(collected)

    if not collected:
        print("No blocking call issues detected.")

    if collected and not args.exit_zero:
        sys.exit(1)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"evals": [
|
|
3
|
+
{
|
|
4
|
+
"id": "eval-01-no-rate-limiting-no-error-handling-no-robots",
|
|
5
|
+
"prompt": "Review this web scraper:\n\n```python\nimport requests\nfrom bs4 import BeautifulSoup\nimport json\n\nBASE_URL = 'https://books.example.com'\n\ndef scrape_all_books():\n all_books = []\n page = 1\n\n while True:\n url = f'{BASE_URL}/catalogue/page-{page}.html'\n response = requests.get(url)\n soup = BeautifulSoup(response.text, 'html.parser')\n\n books = soup.find_all('article', class_='product_pod')\n if not books:\n break\n\n for book in books:\n title = book.find('h3').find('a')['title']\n price = book.find('p', class_='price_color').text\n rating = book.find('p', class_='star-rating')['class'][1]\n all_books.append({'title': title, 'price': price, 'rating': rating})\n\n page += 1\n\n return all_books\n\nresult = scrape_all_books()\nwith open('books.json', 'w') as f:\n json.dump(result, f)\n```",
|
|
6
|
+
"expectations": [
|
|
7
|
+
"Flags no robots.txt check: the scraper does not check or respect the site's robots.txt before crawling (Ch 18: always check and honor robots.txt)",
|
|
8
|
+
"Flags no rate limiting: requests are issued as fast as possible with no delay between pages; recommends adding `time.sleep()` of at least 1-3 seconds between requests (Ch 14: rate limit requests)",
|
|
9
|
+
"Flags no error handling on `requests.get()`: a network error, timeout, or non-200 response will raise an exception or silently produce garbage HTML (Ch 1, 14: wrap requests in try/except, check response status)",
|
|
10
|
+
"Flags no User-Agent header: the scraper uses the default requests User-Agent which may be blocked and does not identify the bot (Ch 14: set a descriptive User-Agent header)",
|
|
11
|
+
"Flags no session reuse: `requests.get()` called in a loop creates a new connection for each page; recommends `requests.Session()` for connection pooling (Ch 10: use sessions for connection pooling)",
|
|
12
|
+
"Flags defensive parsing issues: `book.find('h3').find('a')['title']` will raise AttributeError if any element is missing; recommends checking for None before accessing attributes (Ch 2: parse defensively)",
|
|
13
|
+
"Flags no logging of progress or errors (Ch 5: log page fetches, errors, items extracted)"
|
|
14
|
+
]
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "eval-02-regex-for-html-parsing",
|
|
18
|
+
"prompt": "Review this data extraction code:\n\n```python\nimport requests\nimport re\n\ndef extract_product_data(url: str) -> dict:\n response = requests.get(url)\n html = response.text\n\n # Extract product name\n name_match = re.search(r'<h1[^>]*>([^<]+)</h1>', html)\n name = name_match.group(1) if name_match else None\n\n # Extract price\n price_match = re.search(r'<span class=\"price\">\\$([\\d\\.]+)</span>', html)\n price = float(price_match.group(1)) if price_match else None\n\n # Extract description paragraphs\n desc_matches = re.findall(r'<p class=\"desc\">(.+?)</p>', html, re.DOTALL)\n description = ' '.join(desc_matches)\n\n # Extract all href links on the page\n links = re.findall(r'href=[\"\\']([^\"\\']+)[\"\\']', html)\n\n # Check if in stock\n in_stock = bool(re.search(r'<span class=\"stock\">In Stock</span>', html))\n\n return {\n 'name': name,\n 'price': price,\n 'description': description,\n 'links': links,\n 'in_stock': in_stock\n }\n```",
|
|
19
|
+
"expectations": [
|
|
20
|
+
"Flags parsing HTML with regex as the primary anti-pattern: regex cannot reliably parse HTML because HTML is not a regular language; attribute order can vary, whitespace can differ, and nested tags break simple patterns (Ch 2: use BeautifulSoup or lxml, not regex, for HTML parsing)",
|
|
21
|
+
"Flags that the price regex `\\$([\\d\\.]+)` will fail silently on prices with commas (e.g., $1,299.99) or different currency formats without any warning (Ch 2: parse defensively)",
|
|
22
|
+
"Flags the description regex with `re.DOTALL` will incorrectly merge content from separate `<p>` tags that contain nested HTML tags like `<strong>` or `<a>` (Ch 2: regex cannot handle nested HTML)",
|
|
23
|
+
"Flags the link extraction regex `href=[\"\\']([^\"\\']+)[\"\\']` will match hrefs in script tags, style tags, and HTML comments, returning many false positives (Ch 2: use a parser with proper DOM traversal)",
|
|
24
|
+
"Flags no error handling on `requests.get()` and no status code check (Ch 1, 14: check response.raise_for_status())",
|
|
25
|
+
"Flags no session usage for connection pooling (Ch 10: use requests.Session())",
|
|
26
|
+
"Recommends replacing all regex parsing with BeautifulSoup CSS selectors or XPath, providing a corrected example using soup.select_one() and soup.select()"
|
|
27
|
+
]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"id": "eval-03-clean-scraper-session-retry-css-selectors",
|
|
31
|
+
"prompt": "Review this web scraper:\n\n```python\nimport logging\nimport time\nfrom urllib.robotparser import RobotFileParser\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom urllib3.util.retry import Retry\nfrom bs4 import BeautifulSoup\n\nlogger = logging.getLogger(__name__)\n\nUSER_AGENT = 'ResearchBot/1.0 (contact: bot@example.com)'\nREQUEST_DELAY = 1.5 # seconds between requests\n\n\ndef build_session() -> requests.Session:\n session = requests.Session()\n session.headers['User-Agent'] = USER_AGENT\n retry = Retry(\n total=3,\n backoff_factor=1,\n status_forcelist=[429, 500, 502, 503, 504]\n )\n session.mount('https://', HTTPAdapter(max_retries=retry))\n return session\n\n\ndef can_fetch(base_url: str, path: str) -> bool:\n rp = RobotFileParser()\n rp.set_url(f'{base_url}/robots.txt')\n rp.read()\n return rp.can_fetch(USER_AGENT, f'{base_url}{path}')\n\n\ndef parse_listing(html: str) -> list[dict]:\n soup = BeautifulSoup(html, 'html.parser')\n items = []\n for card in soup.select('article.product-card'):\n title_el = card.select_one('h2.product-title')\n price_el = card.select_one('span.price')\n if title_el is None or price_el is None:\n logger.warning('Skipping card with missing elements')\n continue\n items.append({\n 'title': title_el.get_text(strip=True),\n 'price': price_el.get_text(strip=True),\n })\n return items\n\n\ndef scrape_category(base_url: str, category_path: str) -> list[dict]:\n if not can_fetch(base_url, category_path):\n logger.error('robots.txt disallows scraping %s', category_path)\n return []\n\n session = build_session()\n all_items: list[dict] = []\n page = 1\n\n while True:\n url = f'{base_url}{category_path}?page={page}'\n try:\n resp = session.get(url, timeout=10)\n resp.raise_for_status()\n except requests.RequestException as exc:\n logger.error('Request failed for %s: %s', url, exc)\n break\n\n items = parse_listing(resp.text)\n if not items:\n break\n\n logger.info('Page %d: extracted %d items', page, 
len(items))\n all_items.extend(items)\n page += 1\n time.sleep(REQUEST_DELAY)\n\n return all_items\n```",
|
|
32
|
+
"expectations": [
|
|
33
|
+
"Recognizes this is a well-structured, responsible scraper and says so explicitly",
|
|
34
|
+
"Praises robots.txt check via `RobotFileParser` before any requests are made (Ch 18: always check and honor robots.txt)",
|
|
35
|
+
"Praises the descriptive User-Agent with contact information making the bot identifiable (Ch 14: identify yourself with a descriptive User-Agent)",
|
|
36
|
+
"Praises `requests.Session()` with a `Retry` adapter providing automatic retry on transient server errors and rate-limit responses (Ch 14, 10: sessions with retry logic)",
|
|
37
|
+
"Praises CSS selectors via `soup.select()` and `soup.select_one()` instead of regex for HTML parsing (Ch 2: use BeautifulSoup CSS selectors)",
|
|
38
|
+
"Praises defensive None checks on extracted elements before accessing text, with a warning log for skipped cards (Ch 2: parse defensively)",
|
|
39
|
+
"Praises `resp.raise_for_status()` and catching `requests.RequestException` for all HTTP/network errors (Ch 1, 14: handle connection errors, timeouts, and HTTP errors)",
|
|
40
|
+
"Praises `time.sleep(REQUEST_DELAY)` between pages to be polite to the server (Ch 14: rate limit requests)",
|
|
41
|
+
"Praises structured logging of page number and item counts at each step (Ch 5: log progress)",
|
|
42
|
+
"Does NOT manufacture issues to appear thorough; any suggestions are explicitly framed as minor optional improvements"
|
|
43
|
+
]
|
|
44
|
+
}
|
|
45
|
+
]
|
|
46
|
+
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# After
|
|
2
|
+
|
|
3
|
+
A scraper using `requests.Session` for connection reuse, `BeautifulSoup` for HTML parsing, per-request retry logic, and polite rate limiting between pages.
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
|
|
10
|
+
import requests
|
|
11
|
+
from bs4 import BeautifulSoup
|
|
12
|
+
from requests.adapters import HTTPAdapter
|
|
13
|
+
from urllib3.util.retry import Retry
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
USER_AGENT = "JobResearchBot/1.0 (contact: scraping@mycompany.com)"
|
|
18
|
+
REQUEST_DELAY_SECONDS = 2.0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
|
|
22
|
+
class JobListing:
|
|
23
|
+
title: str
|
|
24
|
+
company: str
|
|
25
|
+
salary: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def make_session() -> requests.Session:
|
|
29
|
+
"""Create a session with retry logic and a descriptive User-Agent."""
|
|
30
|
+
session = requests.Session()
|
|
31
|
+
session.headers.update({"User-Agent": USER_AGENT})
|
|
32
|
+
|
|
33
|
+
retry_policy = Retry(
|
|
34
|
+
total=3,
|
|
35
|
+
backoff_factor=1.5,
|
|
36
|
+
status_forcelist=[429, 500, 502, 503, 504],
|
|
37
|
+
allowed_methods=["GET"],
|
|
38
|
+
)
|
|
39
|
+
adapter = HTTPAdapter(max_retries=retry_policy)
|
|
40
|
+
session.mount("https://", adapter)
|
|
41
|
+
session.mount("http://", adapter)
|
|
42
|
+
return session
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def parse_job_listings(html: str) -> list[JobListing]:
|
|
46
|
+
"""Extract job listings from a page of HTML using BeautifulSoup."""
|
|
47
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
48
|
+
jobs = []
|
|
49
|
+
|
|
50
|
+
for card in soup.select("article.job-card"):
|
|
51
|
+
title_el = card.select_one("h2.job-title")
|
|
52
|
+
company_el = card.select_one("span.company")
|
|
53
|
+
salary_el = card.select_one("div.salary")
|
|
54
|
+
|
|
55
|
+
if title_el is None:
|
|
56
|
+
logger.debug("Skipping card with no title element")
|
|
57
|
+
continue
|
|
58
|
+
|
|
59
|
+
jobs.append(JobListing(
|
|
60
|
+
title=title_el.get_text(strip=True),
|
|
61
|
+
company=company_el.get_text(strip=True) if company_el else "",
|
|
62
|
+
salary=salary_el.get_text(strip=True) if salary_el else "Not specified",
|
|
63
|
+
))
|
|
64
|
+
|
|
65
|
+
return jobs
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def scrape_jobs(base_url: str, num_pages: int) -> list[JobListing]:
|
|
69
|
+
"""Scrape job listings across multiple pages with rate limiting."""
|
|
70
|
+
session = make_session()
|
|
71
|
+
all_jobs: list[JobListing] = []
|
|
72
|
+
|
|
73
|
+
for page in range(1, num_pages + 1):
|
|
74
|
+
url = f"{base_url}?page={page}"
|
|
75
|
+
logger.info("Fetching page %d: %s", page, url)
|
|
76
|
+
|
|
77
|
+
try:
|
|
78
|
+
response = session.get(url, timeout=15)
|
|
79
|
+
response.raise_for_status()
|
|
80
|
+
except requests.HTTPError as exc:
|
|
81
|
+
logger.error("HTTP error on page %d: %s", page, exc)
|
|
82
|
+
break
|
|
83
|
+
except requests.RequestException as exc:
|
|
84
|
+
logger.error("Request failed on page %d: %s — stopping", page, exc)
|
|
85
|
+
break
|
|
86
|
+
|
|
87
|
+
page_jobs = parse_job_listings(response.text)
|
|
88
|
+
logger.info("Extracted %d listings from page %d", len(page_jobs), page)
|
|
89
|
+
all_jobs.extend(page_jobs)
|
|
90
|
+
|
|
91
|
+
if page < num_pages:
|
|
92
|
+
time.sleep(REQUEST_DELAY_SECONDS) # be polite
|
|
93
|
+
|
|
94
|
+
return all_jobs
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
if __name__ == "__main__":
|
|
98
|
+
logging.basicConfig(level=logging.INFO)
|
|
99
|
+
jobs = scrape_jobs("https://jobs.example.com/listings", num_pages=20)
|
|
100
|
+
print(f"Total jobs scraped: {len(jobs)}")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Key improvements:
|
|
104
|
+
- `requests.Session` with `HTTPAdapter` reuses TCP connections and retries on transient server errors — one session for all pages instead of a new connection per request (Ch 1, 14: Session reuse and retry)
|
|
105
|
+
- `BeautifulSoup` with CSS selectors replaces regex HTML parsing — correct, readable, and resilient to attribute ordering changes (Ch 2: Use BeautifulSoup, not regex, for HTML)
|
|
106
|
+
- `parse_job_listings` is a pure function that takes an HTML string and returns typed `JobListing` dataclasses — easily unit-tested with saved HTML fixtures (Ch 15: Testing scrapers)
|
|
107
|
+
- `None` checks on each element before `.get_text()` prevent `AttributeError` when elements are missing (Ch 2: Defensive parsing)
|
|
108
|
+
- `time.sleep(REQUEST_DELAY_SECONDS)` between pages respects the server; `USER_AGENT` identifies the bot with a contact address (Ch 14, 18: Rate limiting and identification)
|
|
109
|
+
- Specific `requests.HTTPError` and `requests.RequestException` replace the bare `except` — errors are logged with page context and the crawl stops gracefully (Ch 1, 14: Error handling)
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Before
|
|
2
|
+
|
|
3
|
+
A scraper that hammers a job listings site with no delays, parses HTML with regex, swallows all errors, and creates a new TCP connection for every page.
|
|
4
|
+
|
|
5
|
+
```python
|
|
6
|
+
import urllib.request
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
def scrape_jobs(base_url, num_pages):
|
|
10
|
+
all_jobs = []
|
|
11
|
+
|
|
12
|
+
for page in range(1, num_pages + 1):
|
|
13
|
+
url = base_url + "?page=" + str(page)
|
|
14
|
+
try:
|
|
15
|
+
# New connection every request, no headers, no rate limiting
|
|
16
|
+
response = urllib.request.urlopen(url)
|
|
17
|
+
html = response.read().decode("utf-8")
|
|
18
|
+
except:
|
|
19
|
+
# Swallows every error — silent failures
|
|
20
|
+
continue
|
|
21
|
+
|
|
22
|
+
# Parsing HTML with regex — fragile and incorrect
|
|
23
|
+
titles = re.findall(r'<h2 class="job-title">(.*?)</h2>', html)
|
|
24
|
+
companies = re.findall(r'<span class="company">(.*?)</span>', html)
|
|
25
|
+
salaries = re.findall(r'<div class="salary">(.*?)</div>', html)
|
|
26
|
+
|
|
27
|
+
for i in range(len(titles)):
|
|
28
|
+
job = {
|
|
29
|
+
"title": titles[i],
|
|
30
|
+
"company": companies[i] if i < len(companies) else "",
|
|
31
|
+
"salary": salaries[i] if i < len(salaries) else "",
|
|
32
|
+
}
|
|
33
|
+
all_jobs.append(job)
|
|
34
|
+
|
|
35
|
+
return all_jobs
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
jobs = scrape_jobs("https://jobs.example.com/listings", 20)
|
|
39
|
+
print(f"Scraped {len(jobs)} jobs")
|
|
40
|
+
```
|