oh-my-customcode 0.47.2 → 0.48.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/cli/index.js +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/templates/.claude/hooks/hooks.json +1 -1
- package/templates/.claude/skills/deep-verify/SKILL.md +111 -0
- package/templates/CLAUDE.md +3 -2
- package/templates/guides/index.yaml +45 -0
- package/templates/guides/web-scraping/README.md +926 -0
- package/templates/guides/web-scraping/index.yaml +19 -0
- package/templates/manifest.json +3 -3
package/README.md
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
|
|
14
14
|
**[한국어 문서 (Korean)](./README_ko.md)**
|
|
15
15
|
|
|
16
|
-
45 agents.
|
|
16
|
+
45 agents. 84 skills. 21 rules. One command.
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
19
|
npm install -g oh-my-customcode && cd your-project && omcustom init
|
|
@@ -138,7 +138,7 @@ Each agent declares its tools, model, memory scope, and limitations in YAML fron
|
|
|
138
138
|
|
|
139
139
|
---
|
|
140
140
|
|
|
141
|
-
### Skills (
|
|
141
|
+
### Skills (84)
|
|
142
142
|
|
|
143
143
|
| Category | Count | Includes |
|
|
144
144
|
|----------|-------|----------|
|
|
@@ -226,7 +226,7 @@ Key rules: R010 (orchestrator never writes files), R009 (parallel execution mand
|
|
|
226
226
|
|
|
227
227
|
---
|
|
228
228
|
|
|
229
|
-
### Guides (
|
|
229
|
+
### Guides (29)
|
|
230
230
|
|
|
231
231
|
Reference documentation covering best practices, architecture decisions, and integration patterns. Located in `guides/` at project root, covering topics from agent design to CI/CD to observability.
|
|
232
232
|
|
|
@@ -272,14 +272,14 @@ your-project/
|
|
|
272
272
|
├── CLAUDE.md # Entry point
|
|
273
273
|
├── .claude/
|
|
274
274
|
│ ├── agents/ # 45 agent definitions
|
|
275
|
-
│ ├── skills/ #
|
|
275
|
+
│ ├── skills/ # 84 skill modules
|
|
276
276
|
│ ├── rules/ # 21 governance rules (R000-R021)
|
|
277
277
|
│ ├── hooks/ # 15 lifecycle hook scripts
|
|
278
278
|
│ ├── schemas/ # Tool input validation schemas
|
|
279
279
|
│ ├── specs/ # Extracted canonical specs
|
|
280
280
|
│ ├── contexts/ # 4 shared context files
|
|
281
281
|
│ └── ontology/ # Knowledge graph for RAG
|
|
282
|
-
└── guides/ #
|
|
282
|
+
└── guides/ # 29 reference documents
|
|
283
283
|
```
|
|
284
284
|
|
|
285
285
|
---
|
package/dist/cli/index.js
CHANGED
|
@@ -9323,7 +9323,7 @@ var init_package = __esm(() => {
|
|
|
9323
9323
|
package_default = {
|
|
9324
9324
|
name: "oh-my-customcode",
|
|
9325
9325
|
workspaces: ["packages/*"],
|
|
9326
|
-
version: "0.
|
|
9326
|
+
version: "0.48.1",
|
|
9327
9327
|
description: "Batteries-included agent harness for Claude Code",
|
|
9328
9328
|
type: "module",
|
|
9329
9329
|
bin: {
|
package/dist/index.js
CHANGED
|
@@ -1642,7 +1642,7 @@ import { join as join6 } from "node:path";
|
|
|
1642
1642
|
var package_default = {
|
|
1643
1643
|
name: "oh-my-customcode",
|
|
1644
1644
|
workspaces: ["packages/*"],
|
|
1645
|
-
version: "0.
|
|
1645
|
+
version: "0.48.1",
|
|
1646
1646
|
description: "Batteries-included agent harness for Claude Code",
|
|
1647
1647
|
type: "module",
|
|
1648
1648
|
bin: {
|
package/package.json
CHANGED
|
package/templates/.claude/hooks/hooks.json
CHANGED
|
@@ -139,7 +139,7 @@
|
|
|
139
139
|
},
|
|
140
140
|
{
|
|
141
141
|
"type": "command",
|
|
142
|
-
"command": "count_file=\"/tmp/.claude-loop-count-$PPID\"; if [ -f \"$count_file\" ]; then last_mod=$(stat -f%m \"$count_file\" 2>/dev/null || echo 0); now=$(date +%s); if [ $((now - last_mod)) -gt 60 ]; then echo 0 > \"$count_file\"; fi; fi; count=$(cat \"$count_file\" 2>/dev/null || echo 0); count=$((count + 1)); echo \"$count\" > \"$count_file\"; if [ \"$count\" -ge 4 ]; then echo '[AutoContinue] SAFETY: auto-continue limit (3) reached. Pausing.' >&2; fi; cat"
|
|
142
|
+
"command": "count_file=\"/tmp/.claude-loop-count-$PPID\"; if [ -f \"$count_file\" ]; then last_mod=$(stat -c%Y \"$count_file\" 2>/dev/null || stat -f%m \"$count_file\" 2>/dev/null || echo 0); now=$(date +%s); if [ $((now - last_mod)) -gt 60 ]; then echo 0 > \"$count_file\"; fi; fi; count=$(cat \"$count_file\" 2>/dev/null || echo 0); count=$((count + 1)); echo \"$count\" > \"$count_file\"; if [ \"$count\" -ge 4 ]; then echo '[AutoContinue] SAFETY: auto-continue limit (3) reached. Pausing.' >&2; fi; cat"
|
|
143
143
|
},
|
|
144
144
|
{
|
|
145
145
|
"type": "prompt",
|
package/templates/.claude/skills/deep-verify/SKILL.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: deep-verify
|
|
3
|
+
description: Multi-angle release quality verification using parallel expert review teams
|
|
4
|
+
scope: core
|
|
5
|
+
version: 1.1.0
|
|
6
|
+
user-invocable: true
|
|
7
|
+
effort: high
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# /deep-verify — Multi-Angle Release Quality Verification
|
|
11
|
+
|
|
12
|
+
## Purpose
|
|
13
|
+
|
|
14
|
+
Performs deep cross-iterative verification of code changes before release, using multiple independent review perspectives to catch issues that single-pass review misses.
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
/deep-verify [branch|PR]
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
If no argument is given, verifies the current branch against its base (usually `develop`).
|
|
23
|
+
|
|
24
|
+
## Workflow
|
|
25
|
+
|
|
26
|
+
### Round 1: Baseline Assessment
|
|
27
|
+
- Gather the full diff (`git diff develop...HEAD`)
|
|
28
|
+
- Run test suite, lint, and type check
|
|
29
|
+
- Collect results as baseline
|
|
30
|
+
|
|
31
|
+
### Round 2: Parallel Expert Review (6 agents)
|
|
32
|
+
Spawn 6 parallel review agents, each with a different focus:
|
|
33
|
+
|
|
34
|
+
1. **Correctness Reviewer** — Logic errors, edge cases, off-by-one, null handling
|
|
35
|
+
2. **Security Reviewer** — Injection, auth bypass, data exposure, OWASP top 10
|
|
36
|
+
3. **Performance Reviewer** — O(n^2) loops, unbounded queries, memory leaks, missing indexes
|
|
37
|
+
4. **Integration Reviewer** — API contract breaks, migration safety, cross-module side effects
|
|
38
|
+
5. **Philosophy Reviewer** — Project concept/metaphor alignment, separation of concerns (R006), orchestrator rules (R010), advisory-first enforcement (R021), compilation metaphor integrity
|
|
39
|
+
6. **Regression & Performance Reviewer** — Feature regression risk, API contract preservation, query performance impact, index effectiveness, algorithm complexity at realistic scale
|
|
40
|
+
|
|
41
|
+
Each agent receives the full diff and returns findings as structured JSON:
|
|
42
|
+
```json
|
|
43
|
+
{
|
|
44
|
+
"severity": "HIGH|MEDIUM|LOW",
|
|
45
|
+
"file": "path/to/file",
|
|
46
|
+
"line": 42,
|
|
47
|
+
"finding": "description",
|
|
48
|
+
"suggestion": "fix suggestion"
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Round 3: Cross-Verification
|
|
53
|
+
- Merge all findings from Round 2
|
|
54
|
+
- Deduplicate (same file+line+similar finding = 1 entry)
|
|
55
|
+
- For each HIGH finding: spawn a verification agent to confirm or dismiss as FALSE POSITIVE
|
|
56
|
+
- Evidence-based: each confirmation must include proof (e.g., `toQuery()` output, test result)
|
|
57
|
+
|
|
58
|
+
### Round 4: FALSE POSITIVE Filter
|
|
59
|
+
- Remove confirmed false positives with evidence
|
|
60
|
+
- Remaining findings are CONFIRMED issues
|
|
61
|
+
|
|
62
|
+
### Round 5: Fix Application
|
|
63
|
+
- For each CONFIRMED HIGH/MEDIUM finding: spawn fix agent
|
|
64
|
+
- Run tests after fixes
|
|
65
|
+
- If tests fail: revert fix, report as "needs manual review"
|
|
66
|
+
|
|
67
|
+
### Round 6: Final Verification
|
|
68
|
+
- Re-run full test suite
|
|
69
|
+
- Re-run lint and type check
|
|
70
|
+
- Generate summary report
|
|
71
|
+
|
|
72
|
+
### Round 7: Philosophy & Regression Gate
|
|
73
|
+
- Verify all changes align with project's compilation metaphor (Skills=source, Agents=artifacts, Rules=spec)
|
|
74
|
+
- Check separation of concerns: no agents containing skill logic, no skills with agent definitions
|
|
75
|
+
- Verify orchestrator rules: no new file writes from orchestrator context
|
|
76
|
+
- Check advisory-first: no new hard-blocking hooks introduced
|
|
77
|
+
- Confirm no feature regressions: existing APIs preserved, test coverage maintained
|
|
78
|
+
- Performance sanity: no O(n^2) on large datasets, no missing indexes for new queries
|
|
79
|
+
- If any CONCERN or VIOLATION found: report for manual review before release
|
|
80
|
+
|
|
81
|
+
## Output Format
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
╔══════════════════════════════════════════════════════╗
|
|
85
|
+
║ Deep Verification Report ║
|
|
86
|
+
╠══════════════════════════════════════════════════════╣
|
|
87
|
+
║ Branch: {branch} ║
|
|
88
|
+
║ Commits: {count} ║
|
|
89
|
+
║ Files changed: {count} ║
|
|
90
|
+
╠══════════════════════════════════════════════════════╣
|
|
91
|
+
║ Findings: ║
|
|
92
|
+
║ HIGH: {n} ({confirmed} confirmed, {fp} FP) ║
|
|
93
|
+
║ MEDIUM: {n} ({confirmed} confirmed, {fp} FP) ║
|
|
94
|
+
║ LOW: {n} ║
|
|
95
|
+
╠══════════════════════════════════════════════════════╣
|
|
96
|
+
║ Fixes Applied: {n} ║
|
|
97
|
+
║ Tests: {pass}/{total} passing ║
|
|
98
|
+
║ Verdict: READY / NEEDS REVIEW / BLOCKED ║
|
|
99
|
+
║ Philosophy: ALIGNED / {n} CONCERNS ║
|
|
100
|
+
║ Regression: CLEAN / {n} RISKS ║
|
|
101
|
+
╚══════════════════════════════════════════════════════╝
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Notes
|
|
105
|
+
|
|
106
|
+
- Round 2 agents use `model: sonnet` for cost efficiency
|
|
107
|
+
- Round 3 verification agents use `model: opus` for reasoning depth
|
|
108
|
+
- FALSE POSITIVE filtering is critical — previous releases showed 80%+ FP rate on automated review
|
|
109
|
+
- This skill replaces ad-hoc cross-verification with a repeatable process
|
|
110
|
+
- Round 7 philosophy check references CLAUDE.md architecture section and R006/R010/R021 rules
|
|
111
|
+
- Regression check compares function signatures, export lists, and test counts against develop baseline
|
package/templates/CLAUDE.md
CHANGED
|
@@ -120,6 +120,7 @@ oh-my-customcode로 구동됩니다.
|
|
|
120
120
|
| `/optimize-report` | 최적화 리포트 생성 |
|
|
121
121
|
| `/research` | 10-team 병렬 딥 분석 및 교차 검증 |
|
|
122
122
|
| `/deep-plan` | 연구 검증 기반 계획 수립 (research → plan → verify) |
|
|
123
|
+
| `/deep-verify` | 다중 관점 릴리즈 품질 검증 |
|
|
123
124
|
| `/omcustom:sauron-watch` | 전체 R017 검증 |
|
|
124
125
|
| `/structured-dev-cycle` | 6단계 구조적 개발 사이클 (Plan → Verify → Implement → Verify → Compound → Done) |
|
|
125
126
|
| `/omcustom:loop` | 백그라운드 에이전트 자동 계속 실행 |
|
|
@@ -134,11 +135,11 @@ project/
|
|
|
134
135
|
+-- CLAUDE.md # 진입점
|
|
135
136
|
+-- .claude/
|
|
136
137
|
| +-- agents/ # 서브에이전트 정의 (45 파일)
|
|
137
|
-
| +-- skills/ # 스킬 (
|
|
138
|
+
| +-- skills/ # 스킬 (84 디렉토리)
|
|
138
139
|
| +-- rules/ # 전역 규칙 (R000-R021)
|
|
139
140
|
| +-- hooks/ # 훅 스크립트 (보안, 검증, HUD)
|
|
140
141
|
| +-- contexts/ # 컨텍스트 파일 (ecomode)
|
|
141
|
-
+-- guides/ # 레퍼런스 문서 (
|
|
142
|
+
+-- guides/ # 레퍼런스 문서 (29 토픽)
|
|
142
143
|
```
|
|
143
144
|
|
|
144
145
|
## 오케스트레이션
|
package/templates/guides/index.yaml
CHANGED
|
@@ -34,6 +34,14 @@ guides:
|
|
|
34
34
|
origin: go.dev
|
|
35
35
|
url: https://go.dev/doc/effective_go
|
|
36
36
|
|
|
37
|
+
- name: java21
|
|
38
|
+
description: Java 21 language reference and modern feature documentation
|
|
39
|
+
path: ./java21/
|
|
40
|
+
source:
|
|
41
|
+
type: external
|
|
42
|
+
origin: docs.oracle.com
|
|
43
|
+
url: https://docs.oracle.com/en/java/javase/21/
|
|
44
|
+
|
|
37
45
|
- name: python
|
|
38
46
|
description: Python reference from PEP 8 and PEP 20
|
|
39
47
|
path: ./python/
|
|
@@ -67,6 +75,14 @@ guides:
|
|
|
67
75
|
url: https://www.typescriptlang.org/docs/handbook/
|
|
68
76
|
|
|
69
77
|
# Backend
|
|
78
|
+
- name: django-best-practices
|
|
79
|
+
description: Django patterns for production-ready Python web applications
|
|
80
|
+
path: ./django-best-practices/
|
|
81
|
+
source:
|
|
82
|
+
type: external
|
|
83
|
+
origin: djangoproject.com
|
|
84
|
+
url: https://docs.djangoproject.com/en/6.0/
|
|
85
|
+
|
|
70
86
|
- name: fastapi
|
|
71
87
|
description: FastAPI framework reference
|
|
72
88
|
path: ./fastapi/
|
|
@@ -158,6 +174,14 @@ guides:
|
|
|
158
174
|
url: https://iceberg.apache.org/docs/latest/
|
|
159
175
|
|
|
160
176
|
# Database
|
|
177
|
+
- name: alembic
|
|
178
|
+
description: Alembic database migration framework for SQLAlchemy
|
|
179
|
+
path: ./alembic/
|
|
180
|
+
source:
|
|
181
|
+
type: external
|
|
182
|
+
origin: alembic.sqlalchemy.org
|
|
183
|
+
url: https://alembic.sqlalchemy.org/en/latest/
|
|
184
|
+
|
|
161
185
|
- name: supabase-postgres
|
|
162
186
|
description: Supabase and PostgreSQL best practices reference
|
|
163
187
|
path: ./supabase-postgres/
|
|
@@ -182,6 +206,20 @@ guides:
|
|
|
182
206
|
origin: redis.io
|
|
183
207
|
url: https://redis.io/docs/
|
|
184
208
|
|
|
209
|
+
# Git
|
|
210
|
+
- name: git-worktree-workflow
|
|
211
|
+
description: Git worktree workflow for parallel branch development
|
|
212
|
+
path: ./git-worktree-workflow/
|
|
213
|
+
source:
|
|
214
|
+
type: internal
|
|
215
|
+
|
|
216
|
+
# Architecture
|
|
217
|
+
- name: skill-bundle-design
|
|
218
|
+
description: Domain skill bundle design patterns for Author/Test/Troubleshoot tri-pattern
|
|
219
|
+
path: ./skill-bundle-design/
|
|
220
|
+
source:
|
|
221
|
+
type: internal
|
|
222
|
+
|
|
185
223
|
# Writing
|
|
186
224
|
- name: elements-of-style
|
|
187
225
|
description: The Elements of Style writing clarity guidelines
|
|
@@ -190,3 +228,10 @@ guides:
|
|
|
190
228
|
type: external
|
|
191
229
|
origin: public-domain
|
|
192
230
|
url: https://www.gutenberg.org/ebooks/37134
|
|
231
|
+
|
|
232
|
+
# Web Scraping
|
|
233
|
+
- name: web-scraping
|
|
234
|
+
description: BeautifulSoup and Playwright patterns for reliable web scraping and government site parsing
|
|
235
|
+
path: ./web-scraping/
|
|
236
|
+
source:
|
|
237
|
+
type: internal
|
package/templates/guides/web-scraping/README.md
ADDED
|
@@ -0,0 +1,926 @@
|
|
|
1
|
+
# Web Scraping Best Practices
|
|
2
|
+
|
|
3
|
+
Reliable patterns for BeautifulSoup and Playwright-based web scraping, with emphasis on Korean government site parsing (QC crawling).
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 1. BeautifulSoup Parsing Patterns
|
|
8
|
+
|
|
9
|
+
### Table Parsing
|
|
10
|
+
|
|
11
|
+
```python
|
|
12
|
+
from bs4 import BeautifulSoup
|
|
13
|
+
|
|
14
|
+
def parse_table(html: str, table_index: int = 0) -> list[dict]:
|
|
15
|
+
"""Parse an HTML table into a list of dicts keyed by header text."""
|
|
16
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
17
|
+
tables = soup.find_all("table")
|
|
18
|
+
if table_index >= len(tables):
|
|
19
|
+
return []
|
|
20
|
+
|
|
21
|
+
table = tables[table_index]
|
|
22
|
+
headers = [th.get_text(strip=True) for th in table.find_all("th")]
|
|
23
|
+
|
|
24
|
+
rows = []
|
|
25
|
+
for tr in table.find_all("tr"):
|
|
26
|
+
cells = tr.find_all(["td"])
|
|
27
|
+
if not cells:
|
|
28
|
+
continue
|
|
29
|
+
row = {}
|
|
30
|
+
for i, td in enumerate(cells):
|
|
31
|
+
key = headers[i] if i < len(headers) else f"col_{i}"
|
|
32
|
+
row[key] = td.get_text(strip=True)
|
|
33
|
+
rows.append(row)
|
|
34
|
+
return rows
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
#### Handling rowspan/colspan
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
def parse_complex_table(table_element) -> list[list[str]]:
|
|
41
|
+
"""Handle rowspan and colspan by expanding cells into a 2D grid."""
|
|
42
|
+
rows = table_element.find_all("tr")
|
|
43
|
+
if not rows:
|
|
44
|
+
return []
|
|
45
|
+
|
|
46
|
+
# Determine grid dimensions
|
|
47
|
+
max_cols = 0
|
|
48
|
+
for tr in rows:
|
|
49
|
+
col_count = sum(
|
|
50
|
+
int(cell.get("colspan", 1)) for cell in tr.find_all(["td", "th"])
|
|
51
|
+
)
|
|
52
|
+
max_cols = max(max_cols, col_count)
|
|
53
|
+
|
|
54
|
+
grid: list[list[str | None]] = [
|
|
55
|
+
[None] * max_cols for _ in range(len(rows))
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
for row_idx, tr in enumerate(rows):
|
|
59
|
+
col_idx = 0
|
|
60
|
+
for cell in tr.find_all(["td", "th"]):
|
|
61
|
+
# Skip cells already filled by rowspan
|
|
62
|
+
while col_idx < max_cols and grid[row_idx][col_idx] is not None:
|
|
63
|
+
col_idx += 1
|
|
64
|
+
if col_idx >= max_cols:
|
|
65
|
+
break
|
|
66
|
+
|
|
67
|
+
text = cell.get_text(strip=True)
|
|
68
|
+
rowspan = int(cell.get("rowspan", 1))
|
|
69
|
+
colspan = int(cell.get("colspan", 1))
|
|
70
|
+
|
|
71
|
+
for dr in range(rowspan):
|
|
72
|
+
for dc in range(colspan):
|
|
73
|
+
r, c = row_idx + dr, col_idx + dc
|
|
74
|
+
if r < len(grid) and c < max_cols:
|
|
75
|
+
grid[r][c] = text
|
|
76
|
+
col_idx += colspan
|
|
77
|
+
|
|
78
|
+
return [[cell or "" for cell in row] for row in grid]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### List Extraction
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
def extract_definition_list(soup: BeautifulSoup) -> dict[str, str]:
|
|
85
|
+
"""Extract <dl> definition lists into key-value pairs."""
|
|
86
|
+
result = {}
|
|
87
|
+
for dl in soup.find_all("dl"):
|
|
88
|
+
dts = dl.find_all("dt")
|
|
89
|
+
dds = dl.find_all("dd")
|
|
90
|
+
for dt, dd in zip(dts, dds):
|
|
91
|
+
result[dt.get_text(strip=True)] = dd.get_text(strip=True)
|
|
92
|
+
return result
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def extract_nested_list(ul_element) -> list:
|
|
96
|
+
"""Recursively extract nested ul/ol into a tree structure."""
|
|
97
|
+
items = []
|
|
98
|
+
for li in ul_element.find_all("li", recursive=False):
|
|
99
|
+
text = li.find(string=True, recursive=False)
|
|
100
|
+
text = text.strip() if text else ""
|
|
101
|
+
children_ul = li.find(["ul", "ol"])
|
|
102
|
+
if children_ul:
|
|
103
|
+
items.append({"text": text, "children": extract_nested_list(children_ul)})
|
|
104
|
+
else:
|
|
105
|
+
items.append({"text": li.get_text(strip=True)})
|
|
106
|
+
return items
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### iframe Content Access
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import httpx
|
|
113
|
+
|
|
114
|
+
async def fetch_iframe_content(
|
|
115
|
+
page_html: str, base_url: str, client: httpx.AsyncClient
|
|
116
|
+
) -> list[BeautifulSoup]:
|
|
117
|
+
"""Fetch and parse all iframe sources from a page."""
|
|
118
|
+
soup = BeautifulSoup(page_html, "html.parser")
|
|
119
|
+
iframes = soup.find_all("iframe")
|
|
120
|
+
results = []
|
|
121
|
+
|
|
122
|
+
for iframe in iframes:
|
|
123
|
+
src = iframe.get("src")
|
|
124
|
+
if not src:
|
|
125
|
+
continue
|
|
126
|
+
# Resolve relative URLs
|
|
127
|
+
if src.startswith("//"):
|
|
128
|
+
src = "https:" + src
|
|
129
|
+
elif src.startswith("/"):
|
|
130
|
+
from urllib.parse import urljoin
|
|
131
|
+
src = urljoin(base_url, src)
|
|
132
|
+
|
|
133
|
+
resp = await client.get(src, follow_redirects=True)
|
|
134
|
+
results.append(BeautifulSoup(resp.text, "html.parser"))
|
|
135
|
+
return results
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Korean Text Encoding
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
import httpx
|
|
142
|
+
|
|
143
|
+
def fetch_with_encoding(url: str, *, fallback_encoding: str = "euc-kr") -> str:
|
|
144
|
+
"""Fetch a page, auto-detecting EUC-KR/CP949 encoding."""
|
|
145
|
+
resp = httpx.get(url, follow_redirects=True)
|
|
146
|
+
|
|
147
|
+
# 1. Check HTTP header
|
|
148
|
+
content_type = resp.headers.get("content-type", "")
|
|
149
|
+
if "charset=" in content_type.lower():
|
|
150
|
+
declared = content_type.split("charset=")[-1].strip().lower()
|
|
151
|
+
if declared in ("euc-kr", "euckr", "cp949"):
|
|
152
|
+
return resp.content.decode("cp949", errors="replace")
|
|
153
|
+
|
|
154
|
+
# 2. Check meta tag
|
|
155
|
+
raw = resp.content
|
|
156
|
+
probe = raw[:2048].decode("ascii", errors="ignore").lower()
|
|
157
|
+
if "euc-kr" in probe or "euckr" in probe:
|
|
158
|
+
return raw.decode("cp949", errors="replace")
|
|
159
|
+
|
|
160
|
+
# 3. Try UTF-8, then fallback
|
|
161
|
+
try:
|
|
162
|
+
return raw.decode("utf-8")
|
|
163
|
+
except UnicodeDecodeError:
|
|
164
|
+
return raw.decode(fallback_encoding, errors="replace")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### CSS Selector vs find/find_all
|
|
168
|
+
|
|
169
|
+
| Method | Best For | Example |
|
|
170
|
+
|--------|----------|---------|
|
|
171
|
+
| `soup.select("div.content > p")` | Complex nested paths, class/id combos | Multi-level CSS paths |
|
|
172
|
+
| `soup.find("div", class_="content")` | Simple single-element lookup | Known structure |
|
|
173
|
+
| `soup.find_all("a", href=True)` | Attribute filtering | Collecting all links |
|
|
174
|
+
| `soup.select_one("#main-table tr:nth-child(2)")` | Positional targeting | Specific row/cell |
|
|
175
|
+
|
|
176
|
+
**Rule of thumb**: Use `select()` for paths, `find()`/`find_all()` for attribute filters.
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## 2. Playwright Navigation & Wait Strategies
|
|
181
|
+
|
|
182
|
+
### Wait Event Comparison
|
|
183
|
+
|
|
184
|
+
| Event | When to Use | Caveat |
|
|
185
|
+
|-------|-------------|--------|
|
|
186
|
+
| `networkidle` | SPA with lazy-loaded data | Slow; waits for 500ms of no requests |
|
|
187
|
+
| `domcontentloaded` | Server-rendered pages | JS may not have executed yet |
|
|
188
|
+
| `load` | Traditional pages with images/fonts | Blocks on all resources |
|
|
189
|
+
| `commit` | Fastest; navigation started | Page not rendered yet |
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from playwright.async_api import async_playwright
|
|
193
|
+
|
|
194
|
+
async def scrape_dynamic_page(url: str) -> str:
|
|
195
|
+
async with async_playwright() as p:
|
|
196
|
+
browser = await p.chromium.launch(headless=True)
|
|
197
|
+
context = await browser.new_context(
|
|
198
|
+
locale="ko-KR",
|
|
199
|
+
user_agent=(
|
|
200
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
201
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
202
|
+
"Chrome/120.0.0.0 Safari/537.36"
|
|
203
|
+
),
|
|
204
|
+
)
|
|
205
|
+
page = await context.new_page()
|
|
206
|
+
|
|
207
|
+
# Use domcontentloaded for gov sites (faster than networkidle)
|
|
208
|
+
await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
|
|
209
|
+
|
|
210
|
+
# Then wait for the specific content element
|
|
211
|
+
await page.wait_for_selector("#content-area", timeout=10_000)
|
|
212
|
+
|
|
213
|
+
html = await page.content()
|
|
214
|
+
await browser.close()
|
|
215
|
+
return html
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### JavaScript Redirect Detection
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
async def follow_js_redirects(page, url: str, max_redirects: int = 5) -> str:
|
|
222
|
+
"""Follow JS redirects (window.location, meta refresh) up to N hops."""
|
|
223
|
+
visited = set()
|
|
224
|
+
|
|
225
|
+
for _ in range(max_redirects):
|
|
226
|
+
if page.url in visited:
|
|
227
|
+
break
|
|
228
|
+
visited.add(page.url)
|
|
229
|
+
|
|
230
|
+
# Wait for potential JS redirect
|
|
231
|
+
try:
|
|
232
|
+
await page.wait_for_navigation(timeout=3_000)
|
|
233
|
+
except Exception:
|
|
234
|
+
break # No redirect happened
|
|
235
|
+
|
|
236
|
+
return page.url
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Dynamic Content Waiting
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
async def wait_for_ajax_table(page, table_selector: str = "table") -> str:
|
|
243
|
+
"""Wait for a table to be populated by AJAX."""
|
|
244
|
+
# Wait for at least one data row
|
|
245
|
+
await page.wait_for_selector(
|
|
246
|
+
f"{table_selector} tbody tr",
|
|
247
|
+
state="attached",
|
|
248
|
+
timeout=15_000,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
# Optional: wait for a loading spinner to disappear
|
|
252
|
+
try:
|
|
253
|
+
await page.wait_for_selector(
|
|
254
|
+
".loading, .spinner",
|
|
255
|
+
state="detached",
|
|
256
|
+
timeout=5_000,
|
|
257
|
+
)
|
|
258
|
+
except Exception:
|
|
259
|
+
pass # No spinner found, proceed
|
|
260
|
+
|
|
261
|
+
return await page.content()
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
### page.evaluate() for Complex Extraction
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
async def extract_table_via_js(page) -> list[dict]:
|
|
268
|
+
"""Use page.evaluate() when DOM is complex or heavily JS-rendered."""
|
|
269
|
+
return await page.evaluate("""
|
|
270
|
+
() => {
|
|
271
|
+
const table = document.querySelector('#data-table');
|
|
272
|
+
if (!table) return [];
|
|
273
|
+
|
|
274
|
+
const headers = [...table.querySelectorAll('th')]
|
|
275
|
+
.map(th => th.textContent.trim());
|
|
276
|
+
const rows = [...table.querySelectorAll('tbody tr')];
|
|
277
|
+
|
|
278
|
+
return rows.map(tr => {
|
|
279
|
+
const cells = [...tr.querySelectorAll('td')];
|
|
280
|
+
const obj = {};
|
|
281
|
+
cells.forEach((td, i) => {
|
|
282
|
+
obj[headers[i] || `col_${i}`] = td.textContent.trim();
|
|
283
|
+
});
|
|
284
|
+
return obj;
|
|
285
|
+
});
|
|
286
|
+
}
|
|
287
|
+
""")
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### Browser Context Isolation for Parallel Scraping
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
import asyncio
|
|
294
|
+
|
|
295
|
+
async def parallel_scrape(urls: list[str]) -> list[str]:
|
|
296
|
+
"""Scrape multiple URLs in parallel using isolated browser contexts."""
|
|
297
|
+
async with async_playwright() as p:
|
|
298
|
+
browser = await p.chromium.launch(headless=True)
|
|
299
|
+
|
|
300
|
+
async def scrape_one(url: str) -> str:
|
|
301
|
+
# Each URL gets its own context (cookies, storage isolated)
|
|
302
|
+
context = await browser.new_context()
|
|
303
|
+
page = await context.new_page()
|
|
304
|
+
try:
|
|
305
|
+
await page.goto(url, wait_until="domcontentloaded", timeout=30_000)
|
|
306
|
+
return await page.content()
|
|
307
|
+
finally:
|
|
308
|
+
await context.close()
|
|
309
|
+
|
|
310
|
+
# Limit concurrency to avoid overloading target
|
|
311
|
+
semaphore = asyncio.Semaphore(3)
|
|
312
|
+
|
|
313
|
+
async def bounded_scrape(url: str) -> str:
|
|
314
|
+
async with semaphore:
|
|
315
|
+
return await scrape_one(url)
|
|
316
|
+
|
|
317
|
+
results = await asyncio.gather(
|
|
318
|
+
*[bounded_scrape(url) for url in urls],
|
|
319
|
+
return_exceptions=True,
|
|
320
|
+
)
|
|
321
|
+
await browser.close()
|
|
322
|
+
return [r if isinstance(r, str) else "" for r in results]
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
## 3. Government Site Common Patterns
|
|
328
|
+
|
|
329
|
+
### Korean Government Portal Structures
|
|
330
|
+
|
|
331
|
+
Korean government sites (`.go.kr`) share common patterns:
|
|
332
|
+
|
|
333
|
+
| Pattern | Sites | Handling |
|
|
334
|
+
|---------|-------|----------|
|
|
335
|
+
| Board-list pagination | data.go.kr, me.go.kr | `page=N` or `pageIndex=N` query params |
|
|
336
|
+
| iframe-wrapped content | KOSIS, e-Nara | Fetch iframe `src` separately |
|
|
337
|
+
| JavaScript-only navigation | Various ministries | Playwright required |
|
|
338
|
+
| EUC-KR encoding | Older systems | CP949 decoding (superset of EUC-KR) |
|
|
339
|
+
| Session-gated downloads | data.go.kr API | Login + session cookie forwarding |
|
|
340
|
+
|
|
341
|
+
### JS Redirect Chains
|
|
342
|
+
|
|
343
|
+
```python
|
|
344
|
+
async def handle_gov_redirects(page) -> None:
|
|
345
|
+
"""Handle common Korean gov site redirect patterns."""
|
|
346
|
+
# Pattern 1: window.location.href = '...'
|
|
347
|
+
# Pattern 2: document.location.replace('...')
|
|
348
|
+
# Pattern 3: <meta http-equiv="refresh" content="0;url=...">
|
|
349
|
+
|
|
350
|
+
# Wait for final destination
|
|
351
|
+
await page.wait_for_load_state("domcontentloaded")
|
|
352
|
+
|
|
353
|
+
# Check for meta refresh
|
|
354
|
+
meta_refresh = await page.query_selector('meta[http-equiv="refresh"]')
|
|
355
|
+
if meta_refresh:
|
|
356
|
+
content = await meta_refresh.get_attribute("content")
|
|
357
|
+
if content and "url=" in content.lower():
|
|
358
|
+
target_url = content.split("url=", 1)[-1].strip("'\"")
|
|
359
|
+
await page.goto(target_url, wait_until="domcontentloaded")
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
### Session/Cookie Handling
|
|
363
|
+
|
|
364
|
+
```python
|
|
365
|
+
async def authenticated_gov_scrape(
|
|
366
|
+
login_url: str,
|
|
367
|
+
target_url: str,
|
|
368
|
+
credentials: dict,
|
|
369
|
+
) -> str:
|
|
370
|
+
"""Login to a government portal and scrape authenticated content."""
|
|
371
|
+
async with async_playwright() as p:
|
|
372
|
+
browser = await p.chromium.launch(headless=True)
|
|
373
|
+
context = await browser.new_context()
|
|
374
|
+
page = await context.new_page()
|
|
375
|
+
|
|
376
|
+
# Step 1: Navigate to login
|
|
377
|
+
await page.goto(login_url, wait_until="networkidle")
|
|
378
|
+
|
|
379
|
+
# Step 2: Fill credentials
|
|
380
|
+
await page.fill("#userId", credentials["user_id"])
|
|
381
|
+
await page.fill("#userPw", credentials["password"])
|
|
382
|
+
await page.click("#loginBtn")
|
|
383
|
+
|
|
384
|
+
# Step 3: Wait for redirect after login
|
|
385
|
+
await page.wait_for_url("**/main**", timeout=10_000)
|
|
386
|
+
|
|
387
|
+
# Step 4: Navigate to target with session cookies
|
|
388
|
+
await page.goto(target_url, wait_until="domcontentloaded")
|
|
389
|
+
html = await page.content()
|
|
390
|
+
|
|
391
|
+
await browser.close()
|
|
392
|
+
return html
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
### iframe-Based Content (Common in Korean Gov Sites)
|
|
396
|
+
|
|
397
|
+
```python
|
|
398
|
+
async def extract_iframe_content(page) -> str:
|
|
399
|
+
"""Extract content from nested iframes (common in KOSIS, e-Nara)."""
|
|
400
|
+
# Wait for iframe to load
|
|
401
|
+
iframe_element = await page.wait_for_selector("iframe#contentFrame")
|
|
402
|
+
iframe = await iframe_element.content_frame()
|
|
403
|
+
|
|
404
|
+
if iframe is None:
|
|
405
|
+
return ""
|
|
406
|
+
|
|
407
|
+
# Some sites nest iframes 2-3 levels deep
|
|
408
|
+
nested_iframe = await iframe.query_selector("iframe")
|
|
409
|
+
if nested_iframe:
|
|
410
|
+
iframe = await nested_iframe.content_frame()
|
|
411
|
+
|
|
412
|
+
return await iframe.content() if iframe else ""
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### CAPTCHA and Bot Detection Indicators
|
|
416
|
+
|
|
417
|
+
| Indicator | Detection | Mitigation |
|
|
418
|
+
|-----------|-----------|------------|
|
|
419
|
+
| 403 Forbidden | Status code check | Rotate user-agent, add delays |
|
|
420
|
+
| Empty response body | `len(html) < 200` | Retry with Playwright |
|
|
421
|
+
| CAPTCHA form | `soup.find("form", id="captchaForm")` | Flag for manual intervention |
|
|
422
|
+
| Rate limit headers | `Retry-After` header | Respect backoff period |
|
|
423
|
+
| JavaScript challenge | Cloudflare/WAF JS | Use Playwright, not httpx |
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
def detect_bot_block(html: str, status_code: int) -> str | None:
|
|
427
|
+
"""Detect common bot-blocking patterns. Returns block type or None."""
|
|
428
|
+
if status_code == 403:
|
|
429
|
+
return "forbidden"
|
|
430
|
+
if status_code == 429:
|
|
431
|
+
return "rate_limited"
|
|
432
|
+
if len(html) < 200:
|
|
433
|
+
return "empty_response"
|
|
434
|
+
|
|
435
|
+
soup = BeautifulSoup(html, "html.parser")
|
|
436
|
+
if soup.find("form", id=lambda x: x and "captcha" in x.lower()):
|
|
437
|
+
return "captcha"
|
|
438
|
+
if soup.find("div", class_=lambda x: x and "cf-" in str(x)):
|
|
439
|
+
return "cloudflare"
|
|
440
|
+
|
|
441
|
+
return None
|
|
442
|
+
```
|
|
443
|
+
|
|
444
|
+
---
|
|
445
|
+
|
|
446
|
+
## 4. Smart Parser Design Patterns
|
|
447
|
+
|
|
448
|
+
### SmartTableDetector
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
from dataclasses import dataclass
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
@dataclass
|
|
455
|
+
class TableSignature:
|
|
456
|
+
"""Describes expected table structure for auto-detection."""
|
|
457
|
+
required_headers: list[str]
|
|
458
|
+
optional_headers: list[str] = None
|
|
459
|
+
min_rows: int = 1
|
|
460
|
+
header_row_index: int = 0
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def detect_table(
    soup: BeautifulSoup,
    signature: TableSignature,
) -> "Tag | None":
    """Find the first table matching the given signature.

    A table matches when the row at ``signature.header_row_index`` contains
    every entry of ``signature.required_headers`` (compared against the
    stripped text of its ``th``/``td`` cells) and at least
    ``signature.min_rows`` rows follow that header row.
    ``signature.optional_headers`` is not consulted here.

    Args:
        soup: Parsed document to search.
        signature: Expected table structure.

    Returns:
        The first matching table element, or ``None`` if no table matches.
    """
    header_idx = signature.header_row_index

    for table in soup.find_all("table"):
        rows = table.find_all("tr")

        # A table with no row at the header index (e.g. an empty table)
        # cannot match; skip it instead of raising IndexError.
        try:
            header_row = rows[header_idx]
        except IndexError:
            continue

        headers = [
            cell.get_text(strip=True)
            for cell in header_row.find_all(["th", "td"])
        ]
        if not all(h in headers for h in signature.required_headers):
            continue

        data_rows = rows[header_idx + 1 :]
        if len(data_rows) >= signature.min_rows:
            return table

    return None
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
# Usage
|
|
483
|
+
sig = TableSignature(
|
|
484
|
+
required_headers=["항목명", "기준일", "수치"],
|
|
485
|
+
optional_headers=["단위", "비고"],
|
|
486
|
+
min_rows=3,
|
|
487
|
+
)
|
|
488
|
+
target_table = detect_table(soup, sig)
|
|
489
|
+
```
|
|
490
|
+
|
|
491
|
+
### Adaptive Selectors
|
|
492
|
+
|
|
493
|
+
```python
|
|
494
|
+
class AdaptiveSelector:
    """Ordered CSS selectors tried one after another; the first hit wins."""

    def __init__(self, selectors: list[str], description: str = ""):
        self.description = description
        self.selectors = selectors

    def find(self, soup: BeautifulSoup):
        """Return the matches of the first selector that yields any, else ``[]``."""
        # The generator is lazy, so later selectors are evaluated only when
        # all earlier ones produced no match.
        attempts = (soup.select(candidate) for candidate in self.selectors)
        return next((matched for matched in attempts if matched), [])

    def find_one(self, soup: BeautifulSoup):
        """Return the first matched element, or ``None`` when nothing matched."""
        matched = self.find(soup)
        if not matched:
            return None
        return matched[0]
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# Define selectors with fallbacks
|
|
514
|
+
CONTENT_AREA = AdaptiveSelector(
|
|
515
|
+
selectors=[
|
|
516
|
+
"#content-area", # Primary: ID-based
|
|
517
|
+
"div.content_area", # Fallback 1: class-based
|
|
518
|
+
"main > div:first-child", # Fallback 2: structural
|
|
519
|
+
"body > div.wrapper > div.content", # Fallback 3: full path
|
|
520
|
+
],
|
|
521
|
+
description="Main content area",
|
|
522
|
+
)
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
### Schema-First Parsing
|
|
526
|
+
|
|
527
|
+
```python
|
|
528
|
+
from pydantic import BaseModel, field_validator
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
class QCInspectionResult(BaseModel):
    """Validated shape of one inspection row, fixed before the parser is written."""

    item_name: str
    inspection_date: str
    result: str  # one of "적합" | "부적합" | "해당없음"
    standard_value: str | None = None
    measured_value: str | None = None
    unit: str | None = None

    @field_validator("result")
    @classmethod
    def validate_result(cls, v: str) -> str:
        """Accept only the three known result labels; raise ValueError otherwise."""
        allowed = {"적합", "부적합", "해당없음"}
        if v in allowed:
            return v
        raise ValueError(f"Result must be one of {allowed}, got '{v}'")
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
def parse_inspection_table(
    table_element,
    header_map: dict[str, str] | None = None,
) -> list[QCInspectionResult]:
    """Parse table rows into validated ``QCInspectionResult`` models.

    Header texts are collected from every ``th`` in the table and matched by
    position against the ``td`` cells of each row. Rows without ``td`` cells
    are ignored. Rows that fail model validation are skipped and logged.

    Args:
        table_element: Table element exposing ``find_all``.
        header_map: Mapping from header text to model field name. When
            omitted (or empty), the built-in Korean header mapping is used.

    Returns:
        One model per row that validated successfully, in document order.
    """
    import logging  # function-scope import keeps the snippet self-contained

    logger = logging.getLogger(__name__)

    default_map = {
        "항목명": "item_name",
        "검사일자": "inspection_date",
        "결과": "result",
        "기준치": "standard_value",
        "측정치": "measured_value",
        "단위": "unit",
    }
    mapping = header_map or default_map

    headers = [
        th.get_text(strip=True) for th in table_element.find_all("th")
    ]
    results = []

    for row_index, tr in enumerate(table_element.find_all("tr")):
        cells = tr.find_all("td")
        if not cells:
            continue

        # zip() stops at the shorter sequence, so cells beyond the known
        # headers are ignored.
        raw = {
            mapping[header]: td.get_text(strip=True)
            for header, td in zip(headers, cells)
            if header in mapping
        }

        try:
            results.append(QCInspectionResult(**raw))
        except ValueError as exc:
            # pydantic's ValidationError is a ValueError subclass; anything
            # else is a programming error and should propagate.
            logger.warning("Skipping malformed row %d: %s", row_index, exc)
            continue

    return results
|
|
585
|
+
```
|
|
586
|
+
|
|
587
|
+
### Fallback Chains
|
|
588
|
+
|
|
589
|
+
```python
|
|
590
|
+
import re
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def extract_value(
    soup: BeautifulSoup,
    label: str,
) -> str | None:
    """Look up the value shown next to ``label``, trying four strategies in order.

    1. A ``th`` containing the label, with the value in its next ``td`` sibling.
    2. A ``dt`` containing the label, with the value in its next ``dd`` sibling.
    3. Any text node matching the label; the value is the next sibling of
       that node's parent element.
    4. A ``label: value`` pattern (ASCII or full-width colon) in the plain text.

    Returns the stripped value from the first strategy that succeeds, or
    ``None`` when none does.
    """
    # Strategies 1 and 2 share one shape: a labelling tag followed by a
    # sibling value tag. All <th> candidates are tried before any <dt>.
    for label_tag, value_tag in (("th", "td"), ("dt", "dd")):
        for heading in soup.find_all(label_tag):
            if label not in heading.get_text(strip=True):
                continue
            value_el = heading.find_next_sibling(value_tag)
            if value_el:
                return value_el.get_text(strip=True)

    # Strategy 3: label text anywhere, value in the element after its parent.
    text_node = soup.find(string=re.compile(re.escape(label)))
    if text_node:
        neighbour = text_node.parent.find_next_sibling()
        if neighbour:
            return neighbour.get_text(strip=True)

    # Strategy 4: regex over the document's plain text.
    found = re.search(
        rf"{re.escape(label)}\s*[:\uff1a]\s*(.+?)(?:\n|$)",
        soup.get_text(),
    )
    if found is None:
        return None
    return found.group(1).strip()
|
|
629
|
+
```
|
|
630
|
+
|
|
631
|
+
---
|
|
632
|
+
|
|
633
|
+
## 5. Error Handling
|
|
634
|
+
|
|
635
|
+
### Timeout Strategies
|
|
636
|
+
|
|
637
|
+
```python
|
|
638
|
+
import httpx
|
|
639
|
+
|
|
640
|
+
def create_scraping_client() -> httpx.AsyncClient:
    """Build an ``httpx.AsyncClient`` with per-phase timeouts and pool limits."""
    phase_timeouts = httpx.Timeout(
        connect=5.0,  # TCP connection timeout
        read=15.0,  # read timeout (waiting for response body)
        write=5.0,  # write timeout (sending request body)
        pool=10.0,  # connection pool timeout
    )
    pool_limits = httpx.Limits(
        max_connections=10,
        max_keepalive_connections=5,
    )
    return httpx.AsyncClient(
        timeout=phase_timeouts,
        follow_redirects=True,
        limits=pool_limits,
    )
|
|
655
|
+
```
|
|
656
|
+
|
|
657
|
+
### Retry with Exponential Backoff
|
|
658
|
+
|
|
659
|
+
```python
|
|
660
|
+
import asyncio
|
|
661
|
+
import random
|
|
662
|
+
from collections.abc import Callable
|
|
663
|
+
|
|
664
|
+
|
|
665
|
+
async def retry_with_backoff(
|
|
666
|
+
fn: Callable,
|
|
667
|
+
*args,
|
|
668
|
+
max_retries: int = 3,
|
|
669
|
+
base_delay: float = 1.0,
|
|
670
|
+
max_delay: float = 30.0,
|
|
671
|
+
retryable_status: set[int] = frozenset({429, 500, 502, 503, 504}),
|
|
672
|
+
**kwargs,
|
|
673
|
+
):
|
|
674
|
+
"""Retry an async function with exponential backoff and jitter."""
|
|
675
|
+
last_exception = None
|
|
676
|
+
|
|
677
|
+
for attempt in range(max_retries + 1):
|
|
678
|
+
try:
|
|
679
|
+
result = await fn(*args, **kwargs)
|
|
680
|
+
|
|
681
|
+
# Check HTTP status if result has one
|
|
682
|
+
if hasattr(result, "status_code"):
|
|
683
|
+
if result.status_code in retryable_status:
|
|
684
|
+
raise httpx.HTTPStatusError(
|
|
685
|
+
f"Status {result.status_code}",
|
|
686
|
+
request=result.request,
|
|
687
|
+
response=result,
|
|
688
|
+
)
|
|
689
|
+
return result
|
|
690
|
+
|
|
691
|
+
except (httpx.TimeoutException, httpx.HTTPStatusError) as e:
|
|
692
|
+
last_exception = e
|
|
693
|
+
if attempt == max_retries:
|
|
694
|
+
break
|
|
695
|
+
|
|
696
|
+
delay = min(base_delay * (2 ** attempt), max_delay)
|
|
697
|
+
jitter = random.uniform(0, delay * 0.1)
|
|
698
|
+
await asyncio.sleep(delay + jitter)
|
|
699
|
+
|
|
700
|
+
raise last_exception
|
|
701
|
+
```
|
|
702
|
+
|
|
703
|
+
### Structure Change Detection
|
|
704
|
+
|
|
705
|
+
```python
|
|
706
|
+
import hashlib
|
|
707
|
+
import json
|
|
708
|
+
from pathlib import Path
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
class StructureValidator:
    """Detect when a target site's HTML structure has changed.

    A fingerprint per site is stored as ``<site_key>.json`` inside the
    fingerprint directory and compared on later checks.
    """

    def __init__(self, fingerprint_dir: str = ".scraper_fingerprints"):
        self.fp_dir = Path(fingerprint_dir)
        # parents=True so nested directories work.
        self.fp_dir.mkdir(parents=True, exist_ok=True)

    def compute_fingerprint(self, soup: BeautifulSoup, selectors: list[str]) -> str:
        """Create a structural fingerprint from CSS selectors.

        For each selector the match count is recorded, plus the tag name and
        class attribute of the first three matches. The joined description is
        hashed with SHA-256 and truncated to 16 hex characters.
        """
        parts = []
        for sel in selectors:
            elements = soup.select(sel)
            parts.append(f"{sel}:{len(elements)}")
            for el in elements[:3]:  # sample the first 3 matches only
                parts.append(f" tag={el.name},classes={el.get('class')}")
        return hashlib.sha256("\n".join(parts).encode()).hexdigest()[:16]

    def check(self, site_key: str, soup: BeautifulSoup, selectors: list[str]) -> bool:
        """Compare the current structure against the stored fingerprint.

        Returns:
            ``True`` when no fingerprint was stored yet (it is written now) or
            when the stored fingerprint equals the current one; ``False`` when
            they differ. On mismatch the stored fingerprint is left untouched,
            so later checks keep returning ``False`` until the fingerprint
            file is removed or replaced.
        """
        fp_file = self.fp_dir / f"{site_key}.json"
        current_fp = self.compute_fingerprint(soup, selectors)

        if fp_file.exists():
            stored = json.loads(fp_file.read_text(encoding="utf-8"))
            # A matching fingerprint needs no rewrite: the selectors are part
            # of the fingerprint input, so the stored record is already current.
            return stored["fingerprint"] == current_fp

        # First sighting of this site: record the baseline.
        fp_file.write_text(
            json.dumps({
                "fingerprint": current_fp,
                "selectors": selectors,
            }),
            encoding="utf-8",
        )
        return True
|
|
744
|
+
```
|
|
745
|
+
|
|
746
|
+
### Stale Content Detection
|
|
747
|
+
|
|
748
|
+
```python
|
|
749
|
+
import hashlib
|
|
750
|
+
from datetime import datetime
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
class ContentFreshnessChecker:
    """Detect when scraped content hasn't actually changed.

    Hashes are kept in memory only, keyed by URL, together with the time the
    hash was last stored.
    """

    def __init__(self):
        # url -> (content hash, time the hash was stored)
        self._hashes: dict[str, tuple[str, datetime]] = {}

    def is_stale(self, url: str, content: str) -> bool:
        """Return True if ``content`` is identical to the last scrape of ``url``.

        The first call for a URL, and any call with changed content, stores
        the new hash and returns False.
        """
        # SHA-256 matches StructureValidator and, unlike MD5, is available on
        # FIPS-restricted Python builds.
        content_hash = hashlib.sha256(content.encode()).hexdigest()

        previous = self._hashes.get(url)
        if previous is not None and previous[0] == content_hash:
            return True

        self._hashes[url] = (content_hash, datetime.now())
        return False
|
|
769
|
+
```
|
|
770
|
+
|
|
771
|
+
---
|
|
772
|
+
|
|
773
|
+
## 6. Testing
|
|
774
|
+
|
|
775
|
+
### Snapshot Testing for Parser Outputs
|
|
776
|
+
|
|
777
|
+
```python
|
|
778
|
+
import json
|
|
779
|
+
from pathlib import Path
|
|
780
|
+
|
|
781
|
+
import pytest
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
|
785
|
+
SNAPSHOTS_DIR = Path(__file__).parent / "snapshots"
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def load_fixture(name: str) -> str:
    """Read ``<name>.html`` from the fixtures directory as UTF-8 text."""
    fixture_path = FIXTURES_DIR / f"{name}.html"
    return fixture_path.read_text(encoding="utf-8")
|
|
791
|
+
|
|
792
|
+
|
|
793
|
+
def assert_snapshot(name: str, data: list[dict]) -> None:
    """Compare parser output against a stored snapshot.

    ``data`` is serialized as indented JSON with sorted keys and non-ASCII
    characters kept as-is. If ``<name>.json`` does not exist in the snapshots
    directory it is created and the test is skipped; otherwise the serialized
    text must equal the stored text exactly.

    Raises:
        AssertionError: When the serialized data differs from the snapshot.
    """
    snapshot_file = SNAPSHOTS_DIR / f"{name}.json"
    serialized = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)

    if not snapshot_file.exists():
        # First run: create snapshot
        snapshot_file.parent.mkdir(parents=True, exist_ok=True)
        # Write as UTF-8 explicitly: the snapshot holds non-ASCII text and is
        # read back as UTF-8 below, so the platform default must not be used.
        snapshot_file.write_text(serialized, encoding="utf-8")
        pytest.skip(f"Snapshot created: {snapshot_file}")

    expected = snapshot_file.read_text(encoding="utf-8")
    assert serialized == expected, (
        f"Snapshot mismatch for {name}. "
        f"Run with --update-snapshots to update."
    )
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
# Usage in tests
|
|
812
|
+
def test_parse_qc_inspection_table():
    """Parse the recorded fixture page and compare the result to its snapshot."""
    soup = BeautifulSoup(load_fixture("qc_inspection_2024"), "html.parser")
    parsed = parse_inspection_table(soup.find("table"))
    assert_snapshot("qc_inspection_2024", [row.model_dump() for row in parsed])
|
|
818
|
+
```
|
|
819
|
+
|
|
820
|
+
### Mock HTML Fixtures for Unit Tests
|
|
821
|
+
|
|
822
|
+
```python
|
|
823
|
+
# tests/fixtures/simple_table.html
|
|
824
|
+
SIMPLE_TABLE_HTML = """
|
|
825
|
+
<html>
|
|
826
|
+
<body>
|
|
827
|
+
<table id="result-table">
|
|
828
|
+
<thead>
|
|
829
|
+
<tr><th>항목명</th><th>검사일자</th><th>결과</th></tr>
|
|
830
|
+
</thead>
|
|
831
|
+
<tbody>
|
|
832
|
+
<tr><td>수질검사</td><td>2024-01-15</td><td>적합</td></tr>
|
|
833
|
+
<tr><td>대기질검사</td><td>2024-01-16</td><td>부적합</td></tr>
|
|
834
|
+
</tbody>
|
|
835
|
+
</table>
|
|
836
|
+
</body>
|
|
837
|
+
</html>
|
|
838
|
+
"""
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
def test_parse_simple_table():
    """Both data rows of the inline fixture parse into validated models."""
    document = BeautifulSoup(SIMPLE_TABLE_HTML, "html.parser")
    parsed = parse_inspection_table(document.find("table", id="result-table"))

    assert len(parsed) == 2
    first, second = parsed
    assert first.item_name == "수질검사"
    assert first.result == "적합"
    assert second.result == "부적합"
|
|
849
|
+
```
|
|
850
|
+
|
|
851
|
+
### VCR-Style Recording for Integration Tests
|
|
852
|
+
|
|
853
|
+
```python
|
|
854
|
+
import json
|
|
855
|
+
import hashlib
|
|
856
|
+
from pathlib import Path
|
|
857
|
+
|
|
858
|
+
import httpx
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
class ResponseRecorder:
    """Record and replay HTTP responses for deterministic testing.

    Each response is stored as a JSON "cassette" file named after the MD5 of
    ``"<METHOD>:<url>"`` inside the cassette directory.
    """

    def __init__(self, cassette_dir: str = "tests/cassettes"):
        self.cassette_dir = Path(cassette_dir)
        self.cassette_dir.mkdir(parents=True, exist_ok=True)

    def _cassette_path(self, url: str, method: str = "GET") -> Path:
        """Return the cassette file path for a method/URL pair."""
        # MD5 is used only to derive a stable file name.
        key = hashlib.md5(f"{method}:{url}".encode()).hexdigest()
        return self.cassette_dir / f"{key}.json"

    async def get(
        self,
        url: str,
        client: httpx.AsyncClient,
        *,
        record: bool = False,
    ) -> httpx.Response:
        """GET ``url``, replaying a stored cassette when one is available.

        Args:
            url: URL to fetch.
            client: Client used when a real request has to be made.
            record: When True, always perform the request and overwrite the
                cassette. When False, replay the cassette if it exists and
                otherwise perform the request and record it.

        Returns:
            The replayed or freshly fetched response.
        """
        cassette = self._cassette_path(url)

        if not record and cassette.exists():
            # Replay mode
            data = json.loads(cassette.read_text(encoding="utf-8"))
            # The body is stored as already-decoded text, so headers that
            # describe the original wire encoding no longer apply to it.
            wire_only = {"content-encoding", "content-length", "transfer-encoding"}
            headers = {
                name: value
                for name, value in data["headers"].items()
                if name.lower() not in wire_only
            }
            return httpx.Response(
                status_code=data["status_code"],
                headers=headers,
                content=data["body"].encode("utf-8"),
            )

        # Record mode
        resp = await client.get(url)
        # ensure_ascii=False keeps non-ASCII text readable in the cassette,
        # which requires writing the file as UTF-8 explicitly.
        cassette.write_text(
            json.dumps({
                "url": url,
                "status_code": resp.status_code,
                "headers": dict(resp.headers),
                "body": resp.text,
            }, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        return resp
|
|
899
|
+
|
|
900
|
+
|
|
901
|
+
# Usage
|
|
902
|
+
recorder = ResponseRecorder()
|
|
903
|
+
|
|
904
|
+
async def test_fetch_gov_data():
    """Fetch the sample endpoint through the recorder and expect HTTP 200."""
    target_url = "https://data.go.kr/api/sample"
    async with httpx.AsyncClient() as client:
        # record=False replays an existing cassette; when none exists yet the
        # recorder performs the request and records it. Pass record=True to
        # force a fresh recording.
        resp = await recorder.get(target_url, client, record=False)
        assert resp.status_code == 200
|
|
912
|
+
```
|
|
913
|
+
|
|
914
|
+
---
|
|
915
|
+
|
|
916
|
+
## Quick Reference
|
|
917
|
+
|
|
918
|
+
| Task | Tool | Key Pattern |
|
|
919
|
+
|------|------|-------------|
|
|
920
|
+
| Static HTML parsing | BeautifulSoup | `parse_table()`, `select()` |
|
|
921
|
+
| JS-rendered content | Playwright | `wait_for_selector()`, `evaluate()` |
|
|
922
|
+
| Korean encoding | httpx + CP949 | `fetch_with_encoding()` |
|
|
923
|
+
| Gov site login | Playwright contexts | `authenticated_gov_scrape()` |
|
|
924
|
+
| Parallel scraping | Playwright + asyncio | `Semaphore(3)` per domain |
|
|
925
|
+
| Layout change detection | Structural fingerprint | `StructureValidator.check()` |
|
|
926
|
+
| Test reproducibility | VCR cassettes | `ResponseRecorder` |
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Web Scraping Guide
|
|
2
|
+
|
|
3
|
+
metadata:
|
|
4
|
+
name: web-scraping
|
|
5
|
+
description: BeautifulSoup and Playwright patterns for reliable web scraping and government site parsing
|
|
6
|
+
|
|
7
|
+
source:
|
|
8
|
+
type: internal
|
|
9
|
+
|
|
10
|
+
topics:
|
|
11
|
+
- beautifulsoup-patterns
|
|
12
|
+
- playwright-navigation
|
|
13
|
+
- government-site-patterns
|
|
14
|
+
- smart-parser-design
|
|
15
|
+
- error-handling
|
|
16
|
+
- testing
|
|
17
|
+
|
|
18
|
+
used_by:
|
|
19
|
+
- lang-python-expert
|
package/templates/manifest.json
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
{
|
|
2
|
-
"version": "0.
|
|
2
|
+
"version": "0.48.1",
|
|
3
3
|
"lastUpdated": "2026-03-16T00:00:00.000Z",
|
|
4
4
|
"components": [
|
|
5
5
|
{
|
|
@@ -18,13 +18,13 @@
|
|
|
18
18
|
"name": "skills",
|
|
19
19
|
"path": ".claude/skills",
|
|
20
20
|
"description": "Reusable skill modules (includes slash commands)",
|
|
21
|
-
"files":
|
|
21
|
+
"files": 84
|
|
22
22
|
},
|
|
23
23
|
{
|
|
24
24
|
"name": "guides",
|
|
25
25
|
"path": "guides",
|
|
26
26
|
"description": "Reference documentation",
|
|
27
|
-
"files":
|
|
27
|
+
"files": 29
|
|
28
28
|
},
|
|
29
29
|
{
|
|
30
30
|
"name": "hooks",
|