oh-my-customcode 0.48.0 → 0.48.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/cli/index.js +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/templates/.claude/skills/deep-verify/SKILL.md +19 -4
- package/templates/.claude/skills/professor-triage/SKILL.md +205 -0
- package/templates/CLAUDE.md +1 -1
- package/templates/guides/index.yaml +45 -0
- package/templates/guides/web-scraping/README.md +926 -0
- package/templates/guides/web-scraping/index.yaml +19 -0
- package/templates/manifest.json +3 -3
|
@@ -0,0 +1,926 @@
|
|
|
1
|
+
# Web Scraping Best Practices
|
|
2
|
+
|
|
3
|
+
Reliable patterns for BeautifulSoup and Playwright-based web scraping, with emphasis on Korean government site parsing (QC crawling).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 1. BeautifulSoup Parsing Patterns
|
|
8
|
+
|
|
9
|
+
### Table Parsing
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from bs4 import BeautifulSoup
|
|
13
|
+
|
|
14
|
+
def parse_table(html: str, table_index: int = 0) -> list[dict]:
    """Parse the table at *table_index* into dicts keyed by header text.

    Cells beyond the number of headers fall back to positional
    ``col_<i>`` keys. Returns an empty list when the requested table
    does not exist.
    """
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all("table")
    if table_index >= len(tables):
        return []

    table = tables[table_index]
    headers = [th.get_text(strip=True) for th in table.find_all("th")]

    parsed = []
    for row_el in table.find_all("tr"):
        data_cells = row_el.find_all(["td"])
        if not data_cells:
            # Header-only rows carry no <td> cells.
            continue
        record = {
            (headers[idx] if idx < len(headers) else f"col_{idx}"): cell.get_text(strip=True)
            for idx, cell in enumerate(data_cells)
        }
        parsed.append(record)
    return parsed
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
#### Handling rowspan/colspan
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
def parse_complex_table(table_element) -> list[list[str]]:
    """Handle rowspan and colspan by expanding cells into a 2D grid.

    Returns one list per ``<tr>``; positions covered by a span repeat
    the spanning cell's text, and positions no cell covers become "".
    """

    def _span(cell, attr: str) -> int:
        # Real-world markup carries junk such as colspan="100%" or
        # rowspan="0"; the original int() call raised ValueError on the
        # former. Treat anything non-numeric or non-positive as 1.
        try:
            return max(1, int(cell.get(attr, 1)))
        except (TypeError, ValueError):
            return 1

    rows = table_element.find_all("tr")
    if not rows:
        return []

    # Grid width = widest row, counting colspans.
    max_cols = 0
    for tr in rows:
        col_count = sum(
            _span(cell, "colspan") for cell in tr.find_all(["td", "th"])
        )
        max_cols = max(max_cols, col_count)

    grid: list[list[str | None]] = [
        [None] * max_cols for _ in range(len(rows))
    ]

    for row_idx, tr in enumerate(rows):
        col_idx = 0
        for cell in tr.find_all(["td", "th"]):
            # Skip positions already claimed by a rowspan from above.
            while col_idx < max_cols and grid[row_idx][col_idx] is not None:
                col_idx += 1
            if col_idx >= max_cols:
                break

            text = cell.get_text(strip=True)
            rowspan = _span(cell, "rowspan")
            colspan = _span(cell, "colspan")

            # Stamp the text into every position the cell covers.
            for dr in range(rowspan):
                for dc in range(colspan):
                    r, c = row_idx + dr, col_idx + dc
                    if r < len(grid) and c < max_cols:
                        grid[r][c] = text
            col_idx += colspan

    return [[cell or "" for cell in row] for row in grid]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### List Extraction
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
def extract_definition_list(soup: BeautifulSoup) -> dict[str, str]:
    """Collect every <dl> on the page as {dt text: dd text} pairs."""
    pairs: dict[str, str] = {}
    for dl in soup.find_all("dl"):
        terms = dl.find_all("dt")
        definitions = dl.find_all("dd")
        # zip() silently drops unmatched trailing <dt>/<dd> entries;
        # later lists overwrite duplicate term text.
        pairs.update(
            (term.get_text(strip=True), definition.get_text(strip=True))
            for term, definition in zip(terms, definitions)
        )
    return pairs
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def extract_nested_list(ul_element) -> list:
    """Recursively extract nested ul/ol into a tree structure."""
    tree = []
    for item in ul_element.find_all("li", recursive=False):
        # First text node directly under the <li>, before any sub-list.
        label = item.find(string=True, recursive=False)
        label = label.strip() if label else ""
        sub_list = item.find(["ul", "ol"])
        if sub_list:
            tree.append(
                {"text": label, "children": extract_nested_list(sub_list)}
            )
        else:
            # Leaf item: take the full flattened text instead.
            tree.append({"text": item.get_text(strip=True)})
    return tree
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### iframe Content Access
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import httpx
|
|
113
|
+
|
|
114
|
+
async def fetch_iframe_content(
    page_html: str, base_url: str, client: httpx.AsyncClient
) -> list[BeautifulSoup]:
    """Fetch and parse all iframe sources from a page.

    Every ``src`` — absolute, protocol-relative ("//host/x"),
    root-relative ("/x"), or document-relative ("frame.html") — is
    resolved against *base_url* before fetching.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(page_html, "html.parser")
    results = []

    for iframe in soup.find_all("iframe"):
        src = iframe.get("src")
        if not src:
            continue
        # urljoin covers all relative forms in one call; absolute URLs
        # pass through unchanged. The previous prefix checks missed
        # plain document-relative paths such as "frame.html", leaving
        # them unresolved and the fetch doomed to fail.
        src = urljoin(base_url, src)

        resp = await client.get(src, follow_redirects=True)
        results.append(BeautifulSoup(resp.text, "html.parser"))
    return results
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Korean Text Encoding
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
import httpx
|
|
142
|
+
|
|
143
|
+
def fetch_with_encoding(url: str, *, fallback_encoding: str = "euc-kr") -> str:
    """Fetch a page, auto-detecting EUC-KR/CP949 encoding."""
    resp = httpx.get(url, follow_redirects=True)
    body = resp.content

    # 1. Trust an explicit EUC-KR-family charset in the HTTP header.
    #    CP949 is a superset of EUC-KR, so decode with it either way.
    content_type = resp.headers.get("content-type", "")
    if "charset=" in content_type.lower():
        declared = content_type.split("charset=")[-1].strip().lower()
        if declared in ("euc-kr", "euckr", "cp949"):
            return body.decode("cp949", errors="replace")

    # 2. Probe the first 2 KiB for an in-page charset declaration.
    head = body[:2048].decode("ascii", errors="ignore").lower()
    if "euc-kr" in head or "euckr" in head:
        return body.decode("cp949", errors="replace")

    # 3. Optimistic UTF-8, then the configured fallback.
    try:
        return body.decode("utf-8")
    except UnicodeDecodeError:
        return body.decode(fallback_encoding, errors="replace")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### CSS Selector vs find/find_all
|
|
168
|
+
|
|
169
|
+
| Method | Best For | Example |
|
|
170
|
+
|--------|----------|---------|
|
|
171
|
+
| `soup.select("div.content > p")` | Complex nested paths, class/id combos | Multi-level CSS paths |
|
|
172
|
+
| `soup.find("div", class_="content")` | Simple single-element lookup | Known structure |
|
|
173
|
+
| `soup.find_all("a", href=True)` | Attribute filtering | Collecting all links |
|
|
174
|
+
| `soup.select_one("#main-table tr:nth-child(2)")` | Positional targeting | Specific row/cell |
|
|
175
|
+
|
|
176
|
+
**Rule of thumb**: Use `select()` for paths, `find()`/`find_all()` for attribute filters.
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## 2. Playwright Navigation & Wait Strategies
|
|
181
|
+
|
|
182
|
+
### Wait Event Comparison
|
|
183
|
+
|
|
184
|
+
| Event | When to Use | Caveat |
|
|
185
|
+
|-------|-------------|--------|
|
|
186
|
+
| `networkidle` | SPA with lazy-loaded data | Slow; waits for 500ms of no requests |
|
|
187
|
+
| `domcontentloaded` | Server-rendered pages | JS may not have executed yet |
|
|
188
|
+
| `load` | Traditional pages with images/fonts | Blocks on all resources |
|
|
189
|
+
| `commit` | Fastest; navigation started | Page not rendered yet |
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from playwright.async_api import async_playwright
|
|
193
|
+
|
|
194
|
+
async def scrape_dynamic_page(url: str) -> str:
    """Render *url* in headless Chromium and return the final HTML.

    Uses a Korean locale and a desktop Chrome user agent, waits for
    DOMContentLoaded plus the main content element, and always closes
    the browser — previously a timeout in goto() or the selector wait
    leaked the browser process.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                locale="ko-KR",
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
            )
            page = await context.new_page()

            # Use domcontentloaded for gov sites (faster than networkidle)
            await page.goto(url, wait_until="domcontentloaded", timeout=30_000)

            # Then wait for the specific content element
            await page.wait_for_selector("#content-area", timeout=10_000)

            return await page.content()
        finally:
            await browser.close()
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### JavaScript Redirect Detection
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
async def follow_js_redirects(page, url: str, max_redirects: int = 5) -> str:
    """Follow JS redirects (window.location, meta refresh) up to N hops.

    Returns the final URL once no further navigation occurs within the
    per-hop timeout, or stops early when a URL repeats (redirect loop).
    """
    visited = set()

    for _ in range(max_redirects):
        if page.url in visited:
            break  # Redirect loop detected
        visited.add(page.url)

        # Wait for a potential JS-driven navigation. playwright-python
        # has no page.wait_for_navigation (that is the Node.js API);
        # the Python equivalent is the expect_navigation() context
        # manager, which times out if nothing navigates.
        try:
            async with page.expect_navigation(timeout=3_000):
                pass
        except Exception:
            break  # No redirect happened

    return page.url
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Dynamic Content Waiting
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
async def wait_for_ajax_table(page, table_selector: str = "table") -> str:
    """Wait for a table to be populated by AJAX, then return the HTML."""
    # A data row appearing in the DOM means the AJAX call has landed.
    await page.wait_for_selector(
        f"{table_selector} tbody tr",
        state="attached",
        timeout=15_000,
    )

    # Best effort: also wait for any loading spinner to go away.
    try:
        await page.wait_for_selector(
            ".loading, .spinner",
            state="detached",
            timeout=5_000,
        )
    except Exception:
        pass  # No spinner on this page; carry on.

    return await page.content()
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### page.evaluate() for Complex Extraction
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
async def extract_table_via_js(page) -> list[dict]:
    """Use page.evaluate() when DOM is complex or heavily JS-rendered.

    The arrow function runs inside the browser page, so it sees the
    live DOM after any framework rendering; only JSON-serializable
    values come back across the evaluate() boundary.
    """
    # Keys reuse the <th> texts; extra cells fall back to `col_<i>`.
    return await page.evaluate("""
        () => {
            const table = document.querySelector('#data-table');
            if (!table) return [];

            const headers = [...table.querySelectorAll('th')]
                .map(th => th.textContent.trim());
            const rows = [...table.querySelectorAll('tbody tr')];

            return rows.map(tr => {
                const cells = [...tr.querySelectorAll('td')];
                const obj = {};
                cells.forEach((td, i) => {
                    obj[headers[i] || `col_${i}`] = td.textContent.trim();
                });
                return obj;
            });
        }
    """)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Browser Context Isolation for Parallel Scraping
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
import asyncio
|
|
294
|
+
|
|
295
|
+
async def parallel_scrape(urls: list[str]) -> list[str]:
    """Scrape multiple URLs in parallel using isolated browser contexts."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        async def scrape_one(target: str) -> str:
            # A fresh context per URL keeps cookies/storage isolated.
            context = await browser.new_context()
            page = await context.new_page()
            try:
                await page.goto(
                    target, wait_until="domcontentloaded", timeout=30_000
                )
                return await page.content()
            finally:
                await context.close()

        # Cap concurrency so we do not hammer the target host.
        semaphore = asyncio.Semaphore(3)

        async def bounded_scrape(target: str) -> str:
            async with semaphore:
                return await scrape_one(target)

        results = await asyncio.gather(
            *(bounded_scrape(u) for u in urls),
            return_exceptions=True,
        )
        await browser.close()
        # Failed URLs (exceptions) are reported as empty strings.
        return [r if isinstance(r, str) else "" for r in results]
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
## 3. Government Site Common Patterns
|
|
328
|
+
|
|
329
|
+
### Korean Government Portal Structures
|
|
330
|
+
|
|
331
|
+
Korean government sites (`.go.kr`) share common patterns:
|
|
332
|
+
|
|
333
|
+
| Pattern | Sites | Handling |
|
|
334
|
+
|---------|-------|----------|
|
|
335
|
+
| Board-list pagination | data.go.kr, me.go.kr | `page=N` or `pageIndex=N` query params |
|
|
336
|
+
| iframe-wrapped content | KOSIS, e-Nara | Fetch iframe `src` separately |
|
|
337
|
+
| JavaScript-only navigation | Various ministries | Playwright required |
|
|
338
|
+
| EUC-KR encoding | Older systems | CP949 decoding (superset of EUC-KR) |
|
|
339
|
+
| Session-gated downloads | data.go.kr API | Login + session cookie forwarding |
|
|
340
|
+
|
|
341
|
+
### JS Redirect Chains
|
|
342
|
+
|
|
343
|
+
```python
|
|
344
|
+
async def handle_gov_redirects(page) -> None:
    """Handle common Korean gov site redirect patterns.

    Covers:
      1. window.location.href = '...'
      2. document.location.replace('...')
      3. <meta http-equiv="refresh" content="0;url=...">
    """
    # Let any script-driven navigation settle first (patterns 1 and 2).
    await page.wait_for_load_state("domcontentloaded")

    # Then chase an explicit meta refresh (pattern 3).
    meta_refresh = await page.query_selector('meta[http-equiv="refresh"]')
    if meta_refresh:
        content = await meta_refresh.get_attribute("content")
        if content:
            # The attribute is case-insensitive in the wild ("0;URL=...").
            # The old code lowercased only the containment check, so an
            # uppercase "URL=" slipped past split("url=") and produced a
            # bogus navigation target. Search case-insensitively instead.
            pos = content.lower().find("url=")
            if pos != -1:
                target_url = content[pos + len("url="):].strip("'\"")
                await page.goto(target_url, wait_until="domcontentloaded")
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
### Session/Cookie Handling
|
|
363
|
+
|
|
364
|
+
```python
|
|
365
|
+
async def authenticated_gov_scrape(
    login_url: str,
    target_url: str,
    credentials: dict,
) -> str:
    """Login to a government portal and scrape authenticated content.

    *credentials* must contain "user_id" and "password". The browser is
    now closed even when login or navigation fails — previously any
    timeout above the final close() leaked the browser process.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()

            # Step 1: Navigate to login
            await page.goto(login_url, wait_until="networkidle")

            # Step 2: Fill credentials
            await page.fill("#userId", credentials["user_id"])
            await page.fill("#userPw", credentials["password"])
            await page.click("#loginBtn")

            # Step 3: Wait for redirect after login
            await page.wait_for_url("**/main**", timeout=10_000)

            # Step 4: Navigate to target with session cookies
            await page.goto(target_url, wait_until="domcontentloaded")
            return await page.content()
        finally:
            await browser.close()
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
### iframe-Based Content (Common in Korean Gov Sites)
|
|
396
|
+
|
|
397
|
+
```python
|
|
398
|
+
async def extract_iframe_content(page) -> str:
    """Extract content from nested iframes (common in KOSIS, e-Nara)."""
    # The outer frame must be attached before we can enter it.
    handle = await page.wait_for_selector("iframe#contentFrame")
    frame = await handle.content_frame()
    if frame is None:
        return ""

    # Some sites nest iframes 2-3 levels deep; descend one more level
    # when an inner iframe exists.
    inner = await frame.query_selector("iframe")
    if inner:
        frame = await inner.content_frame()

    return await frame.content() if frame else ""
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### CAPTCHA and Bot Detection Indicators
|
|
416
|
+
|
|
417
|
+
| Indicator | Detection | Mitigation |
|
|
418
|
+
|-----------|-----------|------------|
|
|
419
|
+
| 403 Forbidden | Status code check | Rotate user-agent, add delays |
|
|
420
|
+
| Empty response body | `len(html) < 100` | Retry with Playwright |
|
|
421
|
+
| CAPTCHA form | `soup.find("form", id="captchaForm")` | Flag for manual intervention |
|
|
422
|
+
| Rate limit headers | `Retry-After` header | Respect backoff period |
|
|
423
|
+
| JavaScript challenge | Cloudflare/WAF JS | Use Playwright, not httpx |
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
def detect_bot_block(html: str, status_code: int) -> str | None:
|
|
427
|
+
"""Detect common bot-blocking patterns. Returns block type or None."""
|
|
428
|
+
if status_code == 403:
|
|
429
|
+
return "forbidden"
|
|
430
|
+
if status_code == 429:
|
|
431
|
+
return "rate_limited"
|
|
432
|
+
if len(html) < 200:
|
|
433
|
+
return "empty_response"
|
|
434
|
+
|
|
435
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
436
|
+
if soup.find("form", id=lambda x: x and "captcha" in x.lower()):
|
|
437
|
+
return "captcha"
|
|
438
|
+
if soup.find("div", class_=lambda x: x and "cf-" in str(x)):
|
|
439
|
+
return "cloudflare"
|
|
440
|
+
|
|
441
|
+
return None
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
---
|
|
445
|
+
|
|
446
|
+
## 4. Smart Parser Design Patterns
|
|
447
|
+
|
|
448
|
+
### SmartTableDetector
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
from dataclasses import dataclass
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
@dataclass
|
|
455
|
+
class TableSignature:
|
|
456
|
+
"""Describes expected table structure for auto-detection."""
|
|
457
|
+
required_headers: list[str]
|
|
458
|
+
optional_headers: list[str] = None
|
|
459
|
+
min_rows: int = 1
|
|
460
|
+
header_row_index: int = 0
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def detect_table(
    soup: BeautifulSoup,
    signature: TableSignature,
) -> "Tag | None":
    """Find a table matching the given signature.

    A table matches when the row at ``header_row_index`` contains every
    required header and at least ``min_rows`` data rows follow it.
    Tables too short to even hold the header row are skipped — the
    original indexed ``find_all("tr")[...]`` blindly and raised
    IndexError on them.
    """
    for table in soup.find_all("table"):
        rows = table.find_all("tr")
        if len(rows) <= signature.header_row_index:
            continue  # Too short to contain the header row
        headers = [
            cell.get_text(strip=True)
            for cell in rows[signature.header_row_index].find_all(["th", "td"])
        ]
        if all(h in headers for h in signature.required_headers):
            data_rows = rows[signature.header_row_index + 1 :]
            if len(data_rows) >= signature.min_rows:
                return table
    return None
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
# Usage
|
|
483
|
+
sig = TableSignature(
|
|
484
|
+
required_headers=["항목명", "기준일", "수치"],
|
|
485
|
+
optional_headers=["단위", "비고"],
|
|
486
|
+
min_rows=3,
|
|
487
|
+
)
|
|
488
|
+
target_table = detect_table(soup, sig)
|
|
489
|
+
```
|
|
490
|
+
|
|
491
|
+
### Adaptive Selectors
|
|
492
|
+
|
|
493
|
+
```python
|
|
494
|
+
class AdaptiveSelector:
    """Try multiple selectors in order, surviving minor layout changes."""

    def __init__(self, selectors: list[str], description: str = ""):
        # Ordered from most specific/preferred to broadest fallback.
        self.selectors = selectors
        self.description = description

    def find(self, soup: BeautifulSoup):
        """Return all matches for the first selector that hits anything."""
        for css in self.selectors:
            matches = soup.select(css)
            if matches:
                return matches
        return []

    def find_one(self, soup: BeautifulSoup):
        """Like find(), but return only the first match (or None)."""
        hits = self.find(soup)
        return hits[0] if hits else None
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# Define selectors with fallbacks
|
|
514
|
+
CONTENT_AREA = AdaptiveSelector(
|
|
515
|
+
selectors=[
|
|
516
|
+
"#content-area", # Primary: ID-based
|
|
517
|
+
"div.content_area", # Fallback 1: class-based
|
|
518
|
+
"main > div:first-child", # Fallback 2: structural
|
|
519
|
+
"body > div.wrapper > div.content", # Fallback 3: full path
|
|
520
|
+
],
|
|
521
|
+
description="Main content area",
|
|
522
|
+
)
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
### Schema-First Parsing
|
|
526
|
+
|
|
527
|
+
```python
|
|
528
|
+
from pydantic import BaseModel, field_validator
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
class QCInspectionResult(BaseModel):
    """Define expected output shape before writing the parser.

    Validated row model for Korean QC inspection tables; ``result`` is
    constrained to the three values QC reports use, so malformed parses
    fail at model-construction time rather than downstream.
    """

    item_name: str  # 항목명
    inspection_date: str  # 검사일자 — kept as raw string, not parsed to a date
    result: str  # "적합" | "부적합" | "해당없음"
    standard_value: str | None = None  # 기준치
    measured_value: str | None = None  # 측정치
    unit: str | None = None  # 단위

    @field_validator("result")
    @classmethod
    def validate_result(cls, v: str) -> str:
        # Closed vocabulary check — anything else is a parsing error.
        allowed = {"적합", "부적합", "해당없음"}
        if v not in allowed:
            raise ValueError(f"Result must be one of {allowed}, got '{v}'")
        return v
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def parse_inspection_table(
    table_element,
    header_map: dict[str, str] | None = None,
) -> list[QCInspectionResult]:
    """Parse table rows into validated Pydantic models."""
    # Korean header text -> model field name.
    default_map = {
        "항목명": "item_name",
        "검사일자": "inspection_date",
        "결과": "result",
        "기준치": "standard_value",
        "측정치": "measured_value",
        "단위": "unit",
    }
    mapping = header_map or default_map

    headers = [th.get_text(strip=True) for th in table_element.find_all("th")]
    parsed: list[QCInspectionResult] = []

    for tr in table_element.find_all("tr"):
        cells = tr.find_all("td")
        if not cells:
            continue  # header-only rows carry no <td>

        field_values = {
            mapping[headers[i]]: td.get_text(strip=True)
            for i, td in enumerate(cells)
            if i < len(headers) and headers[i] in mapping
        }

        try:
            parsed.append(QCInspectionResult(**field_values))
        except Exception:
            continue  # Skip malformed rows, log in production

    return parsed
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
### Fallback Chains
|
|
588
|
+
|
|
589
|
+
```python
|
|
590
|
+
import re
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def extract_value(
|
|
594
|
+
soup: BeautifulSoup,
|
|
595
|
+
label: str,
|
|
596
|
+
) -> str | None:
|
|
597
|
+
"""Extract a value using a fallback chain of strategies."""
|
|
598
|
+
|
|
599
|
+
# Strategy 1: CSS selector — label in <th>, value in next <td>
|
|
600
|
+
for th in soup.find_all("th"):
|
|
601
|
+
if label in th.get_text(strip=True):
|
|
602
|
+
td = th.find_next_sibling("td")
|
|
603
|
+
if td:
|
|
604
|
+
return td.get_text(strip=True)
|
|
605
|
+
|
|
606
|
+
# Strategy 2: Definition list
|
|
607
|
+
for dt in soup.find_all("dt"):
|
|
608
|
+
if label in dt.get_text(strip=True):
|
|
609
|
+
dd = dt.find_next_sibling("dd")
|
|
610
|
+
if dd:
|
|
611
|
+
return dd.get_text(strip=True)
|
|
612
|
+
|
|
613
|
+
# Strategy 3: Label + adjacent text pattern
|
|
614
|
+
label_el = soup.find(string=re.compile(re.escape(label)))
|
|
615
|
+
if label_el:
|
|
616
|
+
parent = label_el.parent
|
|
617
|
+
next_sib = parent.find_next_sibling()
|
|
618
|
+
if next_sib:
|
|
619
|
+
return next_sib.get_text(strip=True)
|
|
620
|
+
|
|
621
|
+
# Strategy 4: Regex on raw text
|
|
622
|
+
text = soup.get_text()
|
|
623
|
+
pattern = rf"{re.escape(label)}\s*[:\uff1a]\s*(.+?)(?:\n|$)"
|
|
624
|
+
match = re.search(pattern, text)
|
|
625
|
+
if match:
|
|
626
|
+
return match.group(1).strip()
|
|
627
|
+
|
|
628
|
+
return None
|
|
629
|
+
```
|
|
630
|
+
|
|
631
|
+
---
|
|
632
|
+
|
|
633
|
+
## 5. Error Handling
|
|
634
|
+
|
|
635
|
+
### Timeout Strategies
|
|
636
|
+
|
|
637
|
+
```python
|
|
638
|
+
import httpx
|
|
639
|
+
|
|
640
|
+
def create_scraping_client() -> httpx.AsyncClient:
    """Create an HTTP client with layered timeout strategy.

    Each request phase (connect, read, write, pool acquisition) gets
    its own budget, so one slow phase cannot consume the whole request
    window. Redirects are followed automatically, and the connection
    pool is kept small to stay polite toward scraped hosts.
    """
    return httpx.AsyncClient(
        timeout=httpx.Timeout(
            connect=5.0,  # TCP connection timeout
            read=15.0,  # Read timeout (waiting for response body)
            write=5.0,  # Write timeout (sending request body)
            pool=10.0,  # Connection pool timeout
        ),
        follow_redirects=True,
        limits=httpx.Limits(
            max_connections=10,
            max_keepalive_connections=5,
        ),
    )
|
|
655
|
+
```
|
|
656
|
+
|
|
657
|
+
### Retry with Exponential Backoff
|
|
658
|
+
|
|
659
|
+
```python
|
|
660
|
+
import asyncio
|
|
661
|
+
import random
|
|
662
|
+
from collections.abc import Callable
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
async def retry_with_backoff(
    fn: Callable,
    *args,
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 30.0,
    retryable_status: set[int] = frozenset({429, 500, 502, 503, 504}),
    **kwargs,
):
    """Retry an async function with exponential backoff and jitter.

    Args:
        fn: Async callable to invoke; positional/keyword args are
            forwarded as-is.
        max_retries: Number of retries AFTER the first attempt
            (so at most ``max_retries + 1`` calls).
        base_delay: Delay before the first retry; doubles each attempt.
        max_delay: Cap applied to the exponential delay.
        retryable_status: HTTP status codes that trigger a retry when
            the result exposes ``status_code``.

    Returns:
        The result of the first successful call.

    Raises:
        httpx.TimeoutException | httpx.HTTPStatusError: the last
        failure, once all attempts are exhausted.
    """
    last_exception = None

    for attempt in range(max_retries + 1):
        try:
            result = await fn(*args, **kwargs)

            # Check HTTP status if result has one; converting a
            # retryable status into an exception funnels it into the
            # same backoff path as timeouts.
            if hasattr(result, "status_code"):
                if result.status_code in retryable_status:
                    raise httpx.HTTPStatusError(
                        f"Status {result.status_code}",
                        request=result.request,
                        response=result,
                    )
            return result

        except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
            last_exception = e
            if attempt == max_retries:
                break  # Out of attempts; fall through to re-raise

            # Exponential backoff (capped) plus up to 10% jitter to
            # avoid thundering-herd retries.
            delay = min(base_delay * (2 ** attempt), max_delay)
            jitter = random.uniform(0, delay * 0.1)
            await asyncio.sleep(delay + jitter)

    raise last_exception
|
|
701
|
+
```
|
|
702
|
+
|
|
703
|
+
### Structure Change Detection
|
|
704
|
+
|
|
705
|
+
```python
|
|
706
|
+
import hashlib
|
|
707
|
+
import json
|
|
708
|
+
from pathlib import Path
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
class StructureValidator:
    """Detect when a target site's HTML structure has changed.

    Stores a per-site JSON fingerprint file on disk; a mismatch on a
    later scrape signals that the page layout (and so the parser's
    selectors) may need attention.
    """

    def __init__(self, fingerprint_dir: str = ".scraper_fingerprints"):
        # Fingerprints persist across runs in this directory.
        self.fp_dir = Path(fingerprint_dir)
        self.fp_dir.mkdir(exist_ok=True)

    def compute_fingerprint(self, soup: BeautifulSoup, selectors: list[str]) -> str:
        """Create a structural fingerprint from CSS selectors.

        The fingerprint hashes, per selector: the match count plus the
        tag name and class list of the first three matches — enough to
        notice renamed/removed containers without hashing content.
        """
        parts = []
        for sel in selectors:
            elements = soup.select(sel)
            parts.append(f"{sel}:{len(elements)}")
            for el in elements[:3]:  # Sample first 3
                parts.append(f"  tag={el.name},classes={el.get('class')}")
        return hashlib.sha256("\n".join(parts).encode()).hexdigest()[:16]

    def check(self, site_key: str, soup: BeautifulSoup, selectors: list[str]) -> bool:
        """Returns True if structure matches previous fingerprint.

        On first sight of *site_key* the fingerprint is recorded and
        True is returned. NOTE: on a mismatch the stored fingerprint is
        NOT updated, so check() keeps returning False until the stored
        file is removed or updated manually.
        """
        fp_file = self.fp_dir / f"{site_key}.json"
        current_fp = self.compute_fingerprint(soup, selectors)

        if fp_file.exists():
            stored = json.loads(fp_file.read_text())
            if stored["fingerprint"] != current_fp:
                return False  # Structure changed!

        # Update fingerprint
        fp_file.write_text(json.dumps({
            "fingerprint": current_fp,
            "selectors": selectors,
        }))
        return True
|
|
744
|
+
```
|
|
745
|
+
|
|
746
|
+
### Stale Content Detection
|
|
747
|
+
|
|
748
|
+
```python
|
|
749
|
+
import hashlib
|
|
750
|
+
from datetime import datetime
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
class ContentFreshnessChecker:
    """Detect when scraped content hasn't actually changed."""

    def __init__(self):
        # url -> (md5 hex digest of last content, time of last change)
        self._hashes: dict[str, tuple[str, datetime]] = {}

    def is_stale(self, url: str, content: str) -> bool:
        """Returns True if content is identical to last scrape."""
        digest = hashlib.md5(content.encode()).hexdigest()
        previous = self._hashes.get(url)
        if previous is not None and previous[0] == digest:
            # Unchanged: keep the original timestamp.
            return True

        self._hashes[url] = (digest, datetime.now())
        return False
|
|
769
|
+
```
|
|
770
|
+
|
|
771
|
+
---
|
|
772
|
+
|
|
773
|
+
## 6. Testing
|
|
774
|
+
|
|
775
|
+
### Snapshot Testing for Parser Outputs
|
|
776
|
+
|
|
777
|
+
```python
|
|
778
|
+
import json
|
|
779
|
+
from pathlib import Path
|
|
780
|
+
|
|
781
|
+
import pytest
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
|
785
|
+
SNAPSHOTS_DIR = Path(__file__).parent / "snapshots"
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def load_fixture(name: str) -> str:
    """Load an HTML fixture file."""
    fixture_path = FIXTURES_DIR / f"{name}.html"
    return fixture_path.read_text(encoding="utf-8")
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
def assert_snapshot(name: str, data: list[dict]) -> None:
    """Compare parser output against a stored snapshot.

    On first run the snapshot file is created and the test is skipped;
    on later runs any mismatch fails the assertion.
    """
    snapshot_file = SNAPSHOTS_DIR / f"{name}.json"
    serialized = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)

    if not snapshot_file.exists():
        # First run: create snapshot
        snapshot_file.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: read_text below assumes UTF-8, and the
        # platform default (e.g. cp949/cp1252 on Windows) would reject
        # the Korean text that ensure_ascii=False preserves.
        snapshot_file.write_text(serialized, encoding="utf-8")
        pytest.skip(f"Snapshot created: {snapshot_file}")

    expected = snapshot_file.read_text(encoding="utf-8")
    assert serialized == expected, (
        f"Snapshot mismatch for {name}. "
        f"Run with --update-snapshots to update."
    )
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
# Usage in tests
|
|
812
|
+
def test_parse_qc_inspection_table():
    """Snapshot test: parse the 2024 QC fixture and diff against the
    stored JSON snapshot (created on first run)."""
    html = load_fixture("qc_inspection_2024")
    results = parse_inspection_table(
        BeautifulSoup(html, "html.parser").find("table")
    )
    assert_snapshot("qc_inspection_2024", [r.model_dump() for r in results])
|
|
818
|
+
```
|
|
819
|
+
|
|
820
|
+
### Mock HTML Fixtures for Unit Tests
|
|
821
|
+
|
|
822
|
+
```python
|
|
823
|
+
# tests/fixtures/simple_table.html
|
|
824
|
+
SIMPLE_TABLE_HTML = """
|
|
825
|
+
<html>
|
|
826
|
+
<body>
|
|
827
|
+
<table id="result-table">
|
|
828
|
+
<thead>
|
|
829
|
+
<tr><th>항목명</th><th>검사일자</th><th>결과</th></tr>
|
|
830
|
+
</thead>
|
|
831
|
+
<tbody>
|
|
832
|
+
<tr><td>수질검사</td><td>2024-01-15</td><td>적합</td></tr>
|
|
833
|
+
<tr><td>대기질검사</td><td>2024-01-16</td><td>부적합</td></tr>
|
|
834
|
+
</tbody>
|
|
835
|
+
</table>
|
|
836
|
+
</body>
|
|
837
|
+
</html>
|
|
838
|
+
"""
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
def test_parse_simple_table():
    """Unit test: parse the inline two-row fixture into validated models."""
    soup = BeautifulSoup(SIMPLE_TABLE_HTML, "html.parser")
    table = soup.find("table", id="result-table")
    results = parse_inspection_table(table)
    # Two data rows; header row carries no <td> and is skipped.
    assert len(results) == 2
    assert results[0].item_name == "수질검사"
    assert results[0].result == "적합"
    assert results[1].result == "부적합"
|
|
849
|
+
```
|
|
850
|
+
|
|
851
|
+
### VCR-Style Recording for Integration Tests
|
|
852
|
+
|
|
853
|
+
```python
|
|
854
|
+
import json
|
|
855
|
+
import hashlib
|
|
856
|
+
from pathlib import Path
|
|
857
|
+
|
|
858
|
+
import httpx
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
class ResponseRecorder:
    """Record and replay HTTP responses for deterministic testing.

    Cassettes are JSON files keyed by an MD5 of "METHOD:URL", stored in
    *cassette_dir*. In replay mode a stored cassette is returned without
    any network access; in record mode (or when no cassette exists) the
    real request is made and its response saved.
    """

    def __init__(self, cassette_dir: str = "tests/cassettes"):
        self.cassette_dir = Path(cassette_dir)
        self.cassette_dir.mkdir(parents=True, exist_ok=True)

    def _cassette_path(self, url: str, method: str = "GET") -> Path:
        # MD5 here is a filename key, not a security measure.
        key = hashlib.md5(f"{method}:{url}".encode()).hexdigest()
        return self.cassette_dir / f"{key}.json"

    async def get(
        self,
        url: str,
        client: httpx.AsyncClient,
        *,
        record: bool = False,
    ) -> httpx.Response:
        """GET *url*, replaying a stored cassette unless *record* is True.

        NOTE(review): replay reconstructs the body from the stored text
        as UTF-8, so non-UTF-8 upstream bodies (e.g. EUC-KR pages) are
        not replayed byte-exact — confirm before recording such sites.
        """
        cassette = self._cassette_path(url)

        if not record and cassette.exists():
            # Replay mode
            data = json.loads(cassette.read_text())
            return httpx.Response(
                status_code=data["status_code"],
                headers=data["headers"],
                content=data["body"].encode("utf-8"),
            )

        # Record mode
        resp = await client.get(url)
        cassette.write_text(json.dumps({
            "url": url,
            "status_code": resp.status_code,
            "headers": dict(resp.headers),
            "body": resp.text,
        }, ensure_ascii=False, indent=2))
        return resp
|
|
899
|
+
|
|
900
|
+
|
|
901
|
+
# Usage
|
|
902
|
+
recorder = ResponseRecorder()
|
|
903
|
+
|
|
904
|
+
async def test_fetch_gov_data():
    """Integration test: replay a recorded data.go.kr response.

    With record=False this never touches the network once the cassette
    exists; set record=True on the first run to capture it.
    """
    async with httpx.AsyncClient() as client:
        resp = await recorder.get(
            "https://data.go.kr/api/sample",
            client,
            record=False,  # Set True on first run
        )
        assert resp.status_code == 200
|
|
912
|
+
```
|
|
913
|
+
|
|
914
|
+
---
|
|
915
|
+
|
|
916
|
+
## Quick Reference
|
|
917
|
+
|
|
918
|
+
| Task | Tool | Key Pattern |
|
|
919
|
+
|------|------|-------------|
|
|
920
|
+
| Static HTML parsing | BeautifulSoup | `parse_table()`, `select()` |
|
|
921
|
+
| JS-rendered content | Playwright | `wait_for_selector()`, `evaluate()` |
|
|
922
|
+
| Korean encoding | httpx + CP949 | `fetch_with_encoding()` |
|
|
923
|
+
| Gov site login | Playwright contexts | `authenticated_gov_scrape()` |
|
|
924
|
+
| Parallel scraping | Playwright + asyncio | `Semaphore(3)` per domain |
|
|
925
|
+
| Layout change detection | Structural fingerprint | `StructureValidator.check()` |
|
|
926
|
+
| Test reproducibility | VCR cassettes | `ResponseRecorder` |
|