ccgx-workflow 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (212) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +469 -0
  3. package/README.zh-CN.md +466 -0
  4. package/bin/ccg.mjs +2 -0
  5. package/dist/cli.d.mts +1 -0
  6. package/dist/cli.d.ts +1 -0
  7. package/dist/cli.mjs +173 -0
  8. package/dist/index.d.mts +1774 -0
  9. package/dist/index.d.ts +1774 -0
  10. package/dist/index.mjs +2029 -0
  11. package/dist/shared/ccgx-workflow.WgUzkiC3.mjs +5248 -0
  12. package/package.json +129 -0
  13. package/templates/commands/agents/assumptions-analyzer.md +129 -0
  14. package/templates/commands/agents/code-fixer.md +292 -0
  15. package/templates/commands/agents/codebase-mapper.md +152 -0
  16. package/templates/commands/agents/debug-session-manager.md +247 -0
  17. package/templates/commands/agents/debugger.md +111 -0
  18. package/templates/commands/agents/eval-auditor.md +171 -0
  19. package/templates/commands/agents/framework-selector.md +152 -0
  20. package/templates/commands/agents/get-current-datetime.md +29 -0
  21. package/templates/commands/agents/init-architect.md +114 -0
  22. package/templates/commands/agents/integration-checker.md +163 -0
  23. package/templates/commands/agents/interface-auditor.md +170 -0
  24. package/templates/commands/agents/nyquist-auditor.md +131 -0
  25. package/templates/commands/agents/pattern-mapper.md +111 -0
  26. package/templates/commands/agents/phase-runner.md +321 -0
  27. package/templates/commands/agents/plan-checker.md +255 -0
  28. package/templates/commands/agents/planner.md +320 -0
  29. package/templates/commands/agents/team-architect.md +186 -0
  30. package/templates/commands/agents/team-qa.md +121 -0
  31. package/templates/commands/agents/team-reviewer.md +157 -0
  32. package/templates/commands/agents/ui-ux-designer.md +573 -0
  33. package/templates/commands/agents/verifier.md +274 -0
  34. package/templates/commands/analyze.md +210 -0
  35. package/templates/commands/autonomous.md +792 -0
  36. package/templates/commands/cancel.md +132 -0
  37. package/templates/commands/clean-branches.md +117 -0
  38. package/templates/commands/codex-exec.md +404 -0
  39. package/templates/commands/commit.md +151 -0
  40. package/templates/commands/context.md +332 -0
  41. package/templates/commands/debate.md +165 -0
  42. package/templates/commands/debug.md +226 -0
  43. package/templates/commands/enhance.md +64 -0
  44. package/templates/commands/execute.md +380 -0
  45. package/templates/commands/init.md +123 -0
  46. package/templates/commands/optimize.md +217 -0
  47. package/templates/commands/plan.md +373 -0
  48. package/templates/commands/result.md +106 -0
  49. package/templates/commands/review.md +338 -0
  50. package/templates/commands/rollback.md +116 -0
  51. package/templates/commands/spec-impl.md +139 -0
  52. package/templates/commands/spec-init.md +101 -0
  53. package/templates/commands/spec-plan.md +210 -0
  54. package/templates/commands/spec-research.md +152 -0
  55. package/templates/commands/spec-review.md +120 -0
  56. package/templates/commands/status.md +206 -0
  57. package/templates/commands/team-exec.md +265 -0
  58. package/templates/commands/test.md +236 -0
  59. package/templates/commands/verify-work.md +338 -0
  60. package/templates/commands/verify.md +66 -0
  61. package/templates/commands/workflow.md +190 -0
  62. package/templates/commands/worktree.md +128 -0
  63. package/templates/hooks/ccg-context-monitor.js +159 -0
  64. package/templates/hooks/ccg-session-state.cjs +510 -0
  65. package/templates/hooks/ccg-statusline.js +142 -0
  66. package/templates/output-styles/abyss-command.md +56 -0
  67. package/templates/output-styles/abyss-concise.md +89 -0
  68. package/templates/output-styles/abyss-cultivator.md +302 -0
  69. package/templates/output-styles/abyss-ritual.md +70 -0
  70. package/templates/output-styles/engineer-professional.md +89 -0
  71. package/templates/output-styles/laowang-engineer.md +127 -0
  72. package/templates/output-styles/nekomata-engineer.md +120 -0
  73. package/templates/output-styles/ojousama-engineer.md +121 -0
  74. package/templates/prompts/claude/analyzer.md +59 -0
  75. package/templates/prompts/claude/architect.md +54 -0
  76. package/templates/prompts/claude/debugger.md +71 -0
  77. package/templates/prompts/claude/optimizer.md +73 -0
  78. package/templates/prompts/claude/reviewer.md +63 -0
  79. package/templates/prompts/claude/tester.md +69 -0
  80. package/templates/prompts/codex/analyzer.md +58 -0
  81. package/templates/prompts/codex/architect.md +54 -0
  82. package/templates/prompts/codex/debugger.md +74 -0
  83. package/templates/prompts/codex/optimizer.md +81 -0
  84. package/templates/prompts/codex/reviewer.md +73 -0
  85. package/templates/prompts/codex/tester.md +62 -0
  86. package/templates/prompts/gemini/analyzer.md +61 -0
  87. package/templates/prompts/gemini/architect.md +55 -0
  88. package/templates/prompts/gemini/debugger.md +78 -0
  89. package/templates/prompts/gemini/frontend.md +64 -0
  90. package/templates/prompts/gemini/optimizer.md +84 -0
  91. package/templates/prompts/gemini/reviewer.md +80 -0
  92. package/templates/prompts/gemini/tester.md +68 -0
  93. package/templates/rules/ccg-skill-routing.md +83 -0
  94. package/templates/rules/ccg-skills.md +71 -0
  95. package/templates/scripts/ccg-phase-runner-launcher.mjs +467 -0
  96. package/templates/scripts/invoke-model.mjs +949 -0
  97. package/templates/scripts/repatch-gemini-plugin.mjs +194 -0
  98. package/templates/skills/SKILL.md +92 -0
  99. package/templates/skills/domains/ai/SKILL.md +35 -0
  100. package/templates/skills/domains/ai/agent-dev.md +242 -0
  101. package/templates/skills/domains/ai/llm-security.md +288 -0
  102. package/templates/skills/domains/ai/prompt-and-eval.md +279 -0
  103. package/templates/skills/domains/ai/rag-system.md +542 -0
  104. package/templates/skills/domains/architecture/SKILL.md +43 -0
  105. package/templates/skills/domains/architecture/api-design.md +225 -0
  106. package/templates/skills/domains/architecture/caching.md +299 -0
  107. package/templates/skills/domains/architecture/cloud-native.md +285 -0
  108. package/templates/skills/domains/architecture/message-queue.md +329 -0
  109. package/templates/skills/domains/architecture/security-arch.md +297 -0
  110. package/templates/skills/domains/data-engineering/SKILL.md +208 -0
  111. package/templates/skills/domains/development/SKILL.md +47 -0
  112. package/templates/skills/domains/development/cpp.md +246 -0
  113. package/templates/skills/domains/development/go.md +323 -0
  114. package/templates/skills/domains/development/java.md +277 -0
  115. package/templates/skills/domains/development/python.md +288 -0
  116. package/templates/skills/domains/development/rust.md +313 -0
  117. package/templates/skills/domains/development/shell.md +313 -0
  118. package/templates/skills/domains/development/typescript.md +277 -0
  119. package/templates/skills/domains/devops/SKILL.md +40 -0
  120. package/templates/skills/domains/devops/cost-optimization.md +272 -0
  121. package/templates/skills/domains/devops/database.md +217 -0
  122. package/templates/skills/domains/devops/devsecops.md +198 -0
  123. package/templates/skills/domains/devops/git-workflow.md +181 -0
  124. package/templates/skills/domains/devops/observability.md +280 -0
  125. package/templates/skills/domains/devops/performance.md +336 -0
  126. package/templates/skills/domains/devops/testing.md +283 -0
  127. package/templates/skills/domains/frontend-design/SKILL.md +244 -0
  128. package/templates/skills/domains/frontend-design/agents/openai.yaml +4 -0
  129. package/templates/skills/domains/frontend-design/claymorphism/SKILL.md +121 -0
  130. package/templates/skills/domains/frontend-design/claymorphism/references/tokens.css +52 -0
  131. package/templates/skills/domains/frontend-design/component-patterns.md +202 -0
  132. package/templates/skills/domains/frontend-design/engineering.md +287 -0
  133. package/templates/skills/domains/frontend-design/glassmorphism/SKILL.md +142 -0
  134. package/templates/skills/domains/frontend-design/glassmorphism/references/tokens.css +32 -0
  135. package/templates/skills/domains/frontend-design/liquid-glass/SKILL.md +139 -0
  136. package/templates/skills/domains/frontend-design/liquid-glass/references/tokens.css +81 -0
  137. package/templates/skills/domains/frontend-design/neubrutalism/SKILL.md +145 -0
  138. package/templates/skills/domains/frontend-design/neubrutalism/references/tokens.css +44 -0
  139. package/templates/skills/domains/frontend-design/reference/color-and-contrast.md +132 -0
  140. package/templates/skills/domains/frontend-design/reference/interaction-design.md +195 -0
  141. package/templates/skills/domains/frontend-design/reference/motion-design.md +99 -0
  142. package/templates/skills/domains/frontend-design/reference/responsive-design.md +114 -0
  143. package/templates/skills/domains/frontend-design/reference/spatial-design.md +100 -0
  144. package/templates/skills/domains/frontend-design/reference/typography.md +133 -0
  145. package/templates/skills/domains/frontend-design/reference/ux-writing.md +107 -0
  146. package/templates/skills/domains/frontend-design/state-management.md +680 -0
  147. package/templates/skills/domains/frontend-design/ui-aesthetics.md +110 -0
  148. package/templates/skills/domains/frontend-design/ux-principles.md +156 -0
  149. package/templates/skills/domains/infrastructure/SKILL.md +201 -0
  150. package/templates/skills/domains/mobile/SKILL.md +225 -0
  151. package/templates/skills/domains/orchestration/SKILL.md +30 -0
  152. package/templates/skills/domains/orchestration/multi-agent.md +263 -0
  153. package/templates/skills/domains/security/SKILL.md +73 -0
  154. package/templates/skills/domains/security/blue-team.md +436 -0
  155. package/templates/skills/domains/security/code-audit.md +265 -0
  156. package/templates/skills/domains/security/pentest.md +226 -0
  157. package/templates/skills/domains/security/red-team.md +374 -0
  158. package/templates/skills/domains/security/threat-intel.md +372 -0
  159. package/templates/skills/domains/security/vuln-research.md +369 -0
  160. package/templates/skills/impeccable/adapt/SKILL.md +201 -0
  161. package/templates/skills/impeccable/animate/SKILL.md +176 -0
  162. package/templates/skills/impeccable/arrange/SKILL.md +126 -0
  163. package/templates/skills/impeccable/audit/SKILL.md +149 -0
  164. package/templates/skills/impeccable/bolder/SKILL.md +118 -0
  165. package/templates/skills/impeccable/clarify/SKILL.md +185 -0
  166. package/templates/skills/impeccable/colorize/SKILL.md +144 -0
  167. package/templates/skills/impeccable/critique/SKILL.md +203 -0
  168. package/templates/skills/impeccable/critique/reference/cognitive-load.md +106 -0
  169. package/templates/skills/impeccable/critique/reference/heuristics-scoring.md +234 -0
  170. package/templates/skills/impeccable/critique/reference/personas.md +178 -0
  171. package/templates/skills/impeccable/delight/SKILL.md +305 -0
  172. package/templates/skills/impeccable/distill/SKILL.md +123 -0
  173. package/templates/skills/impeccable/extract/SKILL.md +94 -0
  174. package/templates/skills/impeccable/harden/SKILL.md +357 -0
  175. package/templates/skills/impeccable/normalize/SKILL.md +72 -0
  176. package/templates/skills/impeccable/onboard/SKILL.md +248 -0
  177. package/templates/skills/impeccable/optimize/SKILL.md +268 -0
  178. package/templates/skills/impeccable/overdrive/SKILL.md +143 -0
  179. package/templates/skills/impeccable/polish/SKILL.md +205 -0
  180. package/templates/skills/impeccable/quieter/SKILL.md +104 -0
  181. package/templates/skills/impeccable/teach-impeccable/SKILL.md +72 -0
  182. package/templates/skills/impeccable/typeset/SKILL.md +117 -0
  183. package/templates/skills/orchestration/multi-agent/SKILL.md +533 -0
  184. package/templates/skills/run_skill.js +129 -0
  185. package/templates/skills/scrapling/SKILL.md +98 -0
  186. package/templates/skills/scrapling/references/api-quick-ref.md +127 -0
  187. package/templates/skills/scrapling/references/cookie-vault.md +56 -0
  188. package/templates/skills/scrapling/references/maintenance.md +79 -0
  189. package/templates/skills/scrapling/references/site-patterns.md +86 -0
  190. package/templates/skills/scrapling/references/troubleshooting.md +95 -0
  191. package/templates/skills/scrapling/templates/basic_fetch.py +19 -0
  192. package/templates/skills/scrapling/templates/parse_only.py +15 -0
  193. package/templates/skills/scrapling/templates/session_login.py +19 -0
  194. package/templates/skills/scrapling/templates/stealth_cloudflare.py +28 -0
  195. package/templates/skills/tools/extract-learnings/SKILL.md +77 -0
  196. package/templates/skills/tools/forensics/SKILL.md +89 -0
  197. package/templates/skills/tools/gen-docs/SKILL.md +116 -0
  198. package/templates/skills/tools/gen-docs/scripts/doc_generator.js +435 -0
  199. package/templates/skills/tools/health/SKILL.md +121 -0
  200. package/templates/skills/tools/lib/shared.js +98 -0
  201. package/templates/skills/tools/map-codebase/SKILL.md +118 -0
  202. package/templates/skills/tools/override-refusal/SKILL.md +53 -0
  203. package/templates/skills/tools/override-refusal/agents/openai.yaml +4 -0
  204. package/templates/skills/tools/override-refusal/scripts/refusal_rewriter.js +226 -0
  205. package/templates/skills/tools/verify-change/SKILL.md +143 -0
  206. package/templates/skills/tools/verify-change/scripts/change_analyzer.js +289 -0
  207. package/templates/skills/tools/verify-module/SKILL.md +130 -0
  208. package/templates/skills/tools/verify-module/scripts/module_scanner.js +171 -0
  209. package/templates/skills/tools/verify-quality/SKILL.md +163 -0
  210. package/templates/skills/tools/verify-quality/scripts/quality_checker.js +337 -0
  211. package/templates/skills/tools/verify-security/SKILL.md +146 -0
  212. package/templates/skills/tools/verify-security/scripts/security_scanner.js +283 -0
@@ -0,0 +1,98 @@
1
+ ---
2
+ name: scrapling
3
+ description: "使用 scrapling 进行网页抓取和数据提取。自动选择 Fetcher,支持 Cloudflare/WAF 绕过、Session 登录、HTML 解析。当用户提到 scrape/crawl/fetch page/extract data/爬取/抓取/绕过Cloudflare/解析HTML/批量采集 时触发。"
4
+ user-invocable: true
5
+ allowed-tools: Read, Bash
6
+ argument-hint: "[URL or scraping task description]"
7
+ license: MIT
8
+ ---
9
+
10
+ # Scrapling 网页抓取 Skill
11
+
12
+ ## 步骤 0:检查版本
13
+
14
+ ```bash
15
+ pip show scrapling
16
+ ```
17
+
18
+ - 未安装 → 执行 `pip install "scrapling[fetchers]"` + `scrapling install`
19
+ - 有新版(`pip show` 仅显示已安装版本,可用 `pip list --outdated` 判断)→ 执行 `pip install --upgrade "scrapling[fetchers]"` → 查 changelog 告知用户
20
+ - 已最新 → 继续
21
+
22
+ ## 步骤 1:选择 Fetcher
23
+
24
+ ```
25
+ 目标网站 →
26
+
27
+ ├─ 已有 HTML 字符串/文件,只需解析?
28
+ │ → Selector(纯解析,无网络请求)
29
+ │ → 模板: templates/parse_only.py
30
+
31
+ ├─ 静态页面,无 JS 渲染,无反爬?
32
+ │ → Fetcher(最快,基于 curl_cffi)
33
+ │ → 模板: templates/basic_fetch.py
34
+
35
+ ├─ 需要登录(HTTP 表单,非 JS 登录)?
36
+ │ → FetcherSession(保持会话 cookie)
37
+ │ → 模板: templates/session_login.py
38
+
39
+ ├─ 有 Cloudflare / WAF 保护?
40
+ │ → StealthyFetcher(Camoufox 浏览器,自动过 CF)
41
+ │ → 模板: templates/stealth_cloudflare.py
42
+
43
+ ├─ SPA 应用(React/Vue),需要 JS 渲染?
44
+ │ → DynamicFetcher(Playwright 浏览器)
45
+ │ → 无专用模板:参考现有模板与 references/api-quick-ref.md 即时生成
46
+
47
+ └─ 不确定?
48
+ → 先用 Fetcher 试,403/空内容 → 升级到 StealthyFetcher
49
+ ```
50
+
51
+ ## 步骤 2:执行工作流
52
+
53
+ ```
54
+ 1. 检查版本(步骤 0)
55
+ 2. 查阅 references/site-patterns.md — 匹配已有模式则直接复用
56
+ 3. 无匹配 → 用决策树选择 Fetcher
57
+ 4. 读取对应模板 → 替换参数 → 生成完整脚本
58
+ 5. 执行脚本 → 返回结果
59
+ 6. **沉淀经验(必做)**:
60
+ - 新站点 → 追加到 site-patterns.md
61
+ - 新 cookie / 用户提供了 cookie → 保存到 cookie-vault.local.md(cookie-vault.md 的本地副本,含真实值,勿提交到版本控制)
62
+ - **完成抓取后必须检查**:是否有新的 cookie 或 site pattern 需要保存
63
+ ```
64
+
65
+ ## Cookie 格式速查
66
+
67
+ | Fetcher 类型 | Cookie 格式 | 示例 |
68
+ |-------------|-------------|------|
69
+ | Fetcher / FetcherSession | `dict` | `{'name': 'value', 'token': 'abc'}` |
70
+ | StealthyFetcher / DynamicFetcher | `list[dict]` | `[{'name': 'n', 'value': 'v', 'domain': '.site.com', 'path': '/'}]` |
71
+
72
+ **浏览器 Fetcher cookie 必填字段**: `name`, `value`, `domain`, `path`
73
+
74
+ ## 超时单位速查
75
+
76
+ | Fetcher 类型 | 超时单位 | 示例 |
77
+ |-------------|---------|------|
78
+ | Fetcher / FetcherSession | 秒 | `timeout=30` |
79
+ | StealthyFetcher / DynamicFetcher | 毫秒 | `timeout=60000` |
80
+
81
+ ## 模板索引
82
+
83
+ | 模板 | 文件 | 何时读取 |
84
+ |------|------|---------|
85
+ | 基础 HTTP 抓取 | `templates/basic_fetch.py` | 目标为静态页面,无反爬 |
86
+ | Cloudflare 绕过 | `templates/stealth_cloudflare.py` | 目标有 CF/WAF 保护 |
87
+ | Session 登录 | `templates/session_login.py` | 需 HTTP 表单登录后抓取 |
88
+ | 纯 HTML 解析 | `templates/parse_only.py` | 已有 HTML 字符串,只需提取数据 |
89
+
90
+ ## References 索引
91
+
92
+ | 文件 | 何时读取 |
93
+ |------|---------|
94
+ | `references/site-patterns.md` | **每次抓取前先查阅** — 检查目标站点是否有已记录的模式 |
95
+ | `references/api-quick-ref.md` | 生成脚本时查阅 — Fetcher/Selector 方法签名和参数 |
96
+ | `references/troubleshooting.md` | 执行报错时查阅 — 按错误信息查找原因和解决方案 |
97
+ | `references/cookie-vault.md` | 需要登录 cookie 时查阅 — 检查是否有历史记录可复用 |
98
+ | `references/maintenance.md` | 安装/升级/依赖问题时查阅 — 安装层级和验证命令 |
@@ -0,0 +1,127 @@
1
+ # Scrapling API 速查卡
2
+
3
+ ## Fetcher(基于 curl_cffi,最快)
4
+
5
+ ```python
6
+ from scrapling.fetchers import Fetcher
7
+
8
+ # GET 请求
9
+ page = Fetcher.get(url, impersonate='chrome', timeout=30, headers=None, cookies=None)
10
+
11
+ # POST 请求
12
+ page = Fetcher.post(url, data=None, json=None, impersonate='chrome', timeout=30)
13
+ ```
14
+
15
+ **Cookie 格式**: `dict` — `{'name': 'value'}`
16
+ **超时单位**: 秒
17
+
18
+ ## FetcherSession(保持会话 cookie)
19
+
20
+ ```python
21
+ from scrapling.fetchers import FetcherSession
22
+
23
+ with FetcherSession(impersonate='chrome') as s:
24
+ s.post(login_url, data={'user': '...', 'pass': '...'})
25
+ page = s.get(target_url)
26
+ ```
27
+
28
+ ## StealthyFetcher(Camoufox,绕过反爬)
29
+
30
+ ```python
31
+ from scrapling.fetchers import StealthyFetcher
32
+
33
+ page = StealthyFetcher.fetch(
34
+ url,
35
+ headless=True, # 无头模式
36
+ solve_cloudflare=True, # 自动过 Cloudflare
37
+ cookies=None, # list[dict] 格式
38
+ timeout=60000, # 毫秒
39
+ network_idle=True, # 等待网络空闲
40
+ hide_canvas=True, # 隐藏 canvas 指纹
41
+ block_webrtc=True, # 阻止 WebRTC 泄露 IP
42
+ disable_resources=False, # 设为 True 可禁用图片/字体以加速
43
+ )
44
+ ```
45
+
46
+ **Cookie 格式**: `list[dict]` — `[{'name': 'n', 'value': 'v', 'domain': '.site.com', 'path': '/'}]`
47
+ **超时单位**: 毫秒
48
+
49
+ ## DynamicFetcher(Playwright,JS 渲染)
50
+
51
+ ```python
52
+ from scrapling.fetchers import DynamicFetcher
53
+
54
+ page = DynamicFetcher.fetch(
55
+ url,
56
+ headless=True,
57
+ cookies=None, # list[dict] 格式
58
+ timeout=30000, # 毫秒
59
+ network_idle=True, # 等待网络空闲
60
+ wait_selector=None, # 等待特定元素出现
61
+ disable_resources=True, # 跳过图片/字体/CSS 加速
62
+ )
63
+ ```
64
+
65
+ **Cookie 格式**: `list[dict]`
66
+ **超时单位**: 毫秒
67
+
68
+ ## Selector(纯 HTML 解析,无网络请求)
69
+
70
+ ```python
71
+ from scrapling.parser import Selector
72
+
73
+ page = Selector(html_string, url='https://base-url.com')
74
+ ```
75
+
76
+ ## Response 常用属性
77
+
78
+ ```python
79
+ page.status # HTTP 状态码 (int)
80
+ page.text # 原始 HTML/文本内容 (str)
81
+ page.url # 最终 URL(可能经过重定向)
82
+ page.cookies # 响应 cookie
83
+ page.headers # 响应头
84
+ ```
85
+
86
+ ## 选择器方法
87
+
88
+ ```python
89
+ # CSS 选择器
90
+ page.css('div.content') # 返回元素列表
91
+ page.css_first('h1') # 返回第一个匹配元素
92
+
93
+ # XPath 选择器
94
+ page.xpath('//div[@class="content"]')
95
+
96
+ # 文本提取伪元素
97
+ page.css('h1::text') # 提取文本内容
98
+ page.css('a::attr(href)') # 提取属性值
99
+
100
+ # 获取所有匹配结果的文本
101
+ results = page.css('h1::text').getall() # list[str]
102
+
103
+ # 获取第一个匹配结果的文本
104
+ result = page.css('h1::text').get() # str | None
105
+ ```
106
+
107
+ ## 元素方法
108
+
109
+ ```python
110
+ element = page.css_first('div.post')
111
+
112
+ element.text # 直接子文本
113
+ element.get_all_text(strip=True) # 递归获取所有文本
114
+ element.attrib # 属性字典
115
+ element.attrib.get('href') # 获取单个属性
116
+ element.css('span.author::text') # 在子树中继续选择
117
+ element.parent # 父元素
118
+ element.children # 子元素列表
119
+ ```
120
+
121
+ ## 正则提取
122
+
123
+ ```python
124
+ # 从文本中提取匹配
125
+ page.re(r'price: \$(\d+\.\d+)') # list[str] — 所有匹配
126
+ page.re_first(r'price: \$(\d+\.\d+)') # str | None — 第一个匹配
127
+ ```
@@ -0,0 +1,56 @@
1
+ # Cookie 保险库
2
+
3
+ 按站点分区记录历史 cookie,供抓取时快速查找使用。
4
+
5
+ > **安全提示**: 此文件存储敏感 cookie 值,请勿提交到版本控制或分享给他人。
6
+ > 实际使用时,请将此文件复制为 `cookie-vault.local.md` 并填入真实值。
7
+
8
+ ---
9
+
10
+ ## 示例站点 (example.com)
11
+
12
+ **最后更新**: YYYY-MM-DD
13
+ **状态**: 有效 / 可能已过期
14
+ **登录 cookie 字段**: `session_id`, `auth_token`
15
+ **Fetcher 类型**: StealthyFetcher
16
+
17
+ ### Playwright 格式(StealthyFetcher/DynamicFetcher 用)
18
+
19
+ ```python
20
+ cookies = [
21
+ {'name': 'session_id', 'value': '<YOUR_SESSION_ID>', 'domain': '.example.com', 'path': '/'},
22
+ {'name': 'auth_token', 'value': '<YOUR_AUTH_TOKEN>', 'domain': '.example.com', 'path': '/'},
23
+ ]
24
+ ```
25
+
26
+ ### 备注
27
+
28
+ - 从浏览器 DevTools > Application > Cookies 获取真实值
29
+ - cookie 有效期取决于站点设置,过期后需重新获取
30
+
31
+ ---
32
+
33
+ ## 模板:添加新站点
34
+
35
+ 复制以下模板,替换具体内容后追加到此文件:
36
+
37
+ ````markdown
38
+ ## 站点名称 (域名)
39
+
40
+ **最后更新**: YYYY-MM-DD
41
+ **状态**: 有效 / 可能已过期
42
+ **登录 cookie 字段**: `field1`, `field2`
43
+ **Fetcher 类型**: Fetcher / StealthyFetcher / DynamicFetcher
44
+
45
+ ### Playwright 格式
46
+
47
+ ```python
48
+ cookies = [
49
+ {'name': 'field1', 'value': '...', 'domain': '.example.com', 'path': '/'},
50
+ ]
51
+ ```
52
+
53
+ ### 备注
54
+
55
+ - 相关注意事项
56
+ ````
@@ -0,0 +1,79 @@
1
+ # Scrapling 安装与维护
2
+
3
+ ## 安装层级
4
+
5
+ | 安装命令 | 包含内容 |
6
+ |---------|---------|
7
+ | `pip install scrapling` | 仅核心解析器(Selector),无网络抓取能力 |
8
+ | `pip install "scrapling[fetchers]"` | + Fetcher/StealthyFetcher/DynamicFetcher(curl_cffi, Playwright, Camoufox) |
9
+ | `pip install "scrapling[ai]"` | + AI 功能(transformers) |
10
+ | `pip install "scrapling[shell]"` | + 交互式 shell |
11
+ | `pip install "scrapling[all]"` | 全部功能 |
12
+
13
+ **推荐**: 大多数场景使用 `scrapling[fetchers]` 即可。
14
+
15
+ ## 检查安装状态
16
+
17
+ ```bash
18
+ # 查看版本
19
+ pip show scrapling
20
+
21
+ # 验证基础包可用
22
+ python -c "from scrapling.parser import Selector; print('Parser OK')"
23
+
24
+ # 验证 Fetcher 可用(需要 [fetchers])
25
+ python -c "from scrapling.fetchers import Fetcher; print('Fetcher OK')"
26
+
27
+ # 验证 StealthyFetcher 可用
28
+ python -c "from scrapling.fetchers import StealthyFetcher; print('StealthyFetcher OK')"
29
+
30
+ # 验证 DynamicFetcher 可用
31
+ python -c "from scrapling.fetchers import DynamicFetcher; print('DynamicFetcher OK')"
32
+ ```
33
+
34
+ ## 安装浏览器依赖
35
+
36
+ StealthyFetcher 和 DynamicFetcher 需要浏览器引擎,安装后需执行:
37
+
38
+ ```bash
39
+ # 方式 1: 直接命令(PATH 包含 Scripts 目录时)
40
+ scrapling install
41
+
42
+ # 方式 2: 通过 Python 调用(推荐,避免 PATH 问题)
43
+ python -c "from scrapling.cli import main; main(['install'])"
44
+ ```
45
+
46
+ ## 升级
47
+
48
+ ```bash
49
+ pip install --upgrade "scrapling[fetchers]"
50
+ ```
51
+
52
+ 升级后建议重新验证三个 Fetcher 是否可用(见上方检查命令)。
53
+
54
+ ## 三 Fetcher 完整验证脚本
55
+
56
+ ```python
57
+ #!/usr/bin/env python3
58
+ """验证 scrapling 三个 Fetcher 均可正常使用"""
59
+ import scrapling
60
+
61
+ print(f"scrapling version: {scrapling.__version__}")
62
+
63
+ # 1. Fetcher (curl_cffi)
64
+ from scrapling.fetchers import Fetcher
65
+ page = Fetcher.get("https://httpbin.org/get", impersonate='chrome', timeout=15)
66
+ print(f"Fetcher: status={page.status}")
67
+
68
+ # 2. StealthyFetcher (Camoufox)
69
+ from scrapling.fetchers import StealthyFetcher
70
+ page = StealthyFetcher.fetch("https://httpbin.org/get", headless=True, timeout=30000)
71
+ print(f"StealthyFetcher: status={page.status}")
72
+
73
+ # 3. DynamicFetcher (Playwright)
74
+ from scrapling.fetchers import DynamicFetcher
75
+ page = DynamicFetcher.fetch("https://httpbin.org/get", headless=True, timeout=30000)
76
+ print(f"DynamicFetcher: status={page.status}")
77
+
78
+ print("\nAll Fetchers verified successfully")
79
+ ```
@@ -0,0 +1,86 @@
1
+ # 站点抓取模式经验库
2
+
3
+ 每次成功抓取新类型站点后,Agent 应提示用户是否将经验追加到此文件。
4
+
5
+ ---
6
+
7
+ ## Discourse 论坛 (linux.do, meta.discourse.org 等)
8
+
9
+ **站点特征**: Cloudflare 保护 + Ember.js SPA + 登录态区分
10
+ **推荐 Fetcher**: StealthyFetcher
11
+ **关键参数**:
12
+ - `solve_cloudflare=True` — 必须
13
+ - `network_idle=True` — 等待 Ember 渲染完成
14
+ - `timeout=60000` — CF 验证耗时长,至少 60 秒(毫秒单位)
15
+ **登录 cookie 字段**: `_forum_session`, `_t`
16
+ **不需要**: `cf_clearance`(StealthyFetcher 自动获取)
17
+ **JSON API**: `/t/topic/{id}.json`(需过 CF 后才可用)
18
+ **选择器参考**:
19
+ - 帖子列表: `.topic-post`
20
+ - 作者: `[data-user-card]::attr(data-user-card)`
21
+ - 内容: `.cooked` → `.get_all_text(strip=True)`
22
+
23
+ ---
24
+
25
+ ## 静态博客/文档站 (GitHub Pages, Hugo, Jekyll)
26
+
27
+ **站点特征**: 纯静态 HTML,无 JS 渲染依赖,无反爬
28
+ **推荐 Fetcher**: Fetcher(最快)
29
+ **关键参数**: `impersonate='chrome'`, `timeout=30`
30
+ **选择器参考**: `article`, `.content`, `.post-body`
31
+
32
+ ---
33
+
34
+ ## SPA 应用 (React/Vue/Next.js)
35
+
36
+ **站点特征**: JS 渲染,内容不在初始 HTML 中
37
+ **推荐 Fetcher**: DynamicFetcher
38
+ **关键参数**:
39
+ - `network_idle=True` — 等待 API 请求完成
40
+ - `wait_selector='.content-loaded'` — 等待关键元素(按实际调整)
41
+ - `disable_resources=True` — 跳过字体/图片加速
42
+ **备注**: 优先检查是否有 API 端点可直接用 Fetcher 请求(更快更稳定)
43
+
44
+ ---
45
+
46
+ ## API 端点 (REST/GraphQL)
47
+
48
+ **站点特征**: 返回 JSON,无需解析 HTML
49
+ **推荐 Fetcher**: Fetcher
50
+ **关键参数**: `impersonate='chrome'`, 自定义 `headers`
51
+ **处理方式**: `page.text` 获取 JSON → `json.loads()` 解析
52
+ **备注**: 如果 API 有反爬,可能需要带 Referer/Origin 等 header
53
+
54
+ ---
55
+
56
+ ## TAPD 项目管理 (tapd.cn)
57
+
58
+ **站点特征**: React SPA + 企业登录态 + 分页懒加载("展开更多"按钮)
59
+ **推荐方案**: Playwright 直接控制(非 scrapling Fetcher)
60
+ **原因**: DynamicFetcher 可渲染首屏但无法点击交互;scrapling Fetcher 调 API 时 `page.text` 始终为空;curl 可达 API 但返回 500(需浏览器环境的 CSRF 校验)
61
+ **关键流程**:
62
+ 1. Playwright + cookies 加载页面,`wait_until='networkidle'`
63
+ 2. 循环点击"展开更多"按钮加载全部数据
64
+ 3. `page.inner_text('body')` 提取纯文本,按行解析
65
+ **Cookie 格式**: `list[dict]`,必填 `name/value/domain/path`,domain 为 `.tapd.cn`
66
+ **API 端点**(参考,浏览器内部使用): `POST /api/my_worktable/my_worktable/get_my_worktable_by_page`
67
+ **CSRF**: cookie `dsc-token` 的值需作为 `DSC-TOKEN` header 发送(由 axios interceptor 自动添加)
68
+ **已知限制**: scrapling Fetcher 对 TAPD API 返回空响应(`page.text` 为空),需改用 Playwright(curl 虽可达 API,但未通过 CSRF 校验时返回 500)
69
+ **数据结构**: 文本按行排列,类型前缀(P/E/PROGRAM/TEST/BUG) → 标题 → 状态 → 优先级 → ...
70
+
71
+ ---
72
+
73
+ ## 模板:添加新站点模式
74
+
75
+ 复制以下模板,替换具体内容后追加到此文件:
76
+
77
+ ```markdown
78
+ ## 站点名称/类型 (代表域名)
79
+
80
+ **站点特征**: 描述
81
+ **推荐 Fetcher**: Fetcher / StealthyFetcher / DynamicFetcher
82
+ **关键参数**:
83
+ - `参数名=值` — 说明
84
+ **选择器参考**: CSS 选择器示例
85
+ **备注**: 踩坑经验
86
+ ```
@@ -0,0 +1,95 @@
1
+ # Scrapling 踩坑记录与解决方案
2
+
3
+ ## ModuleNotFoundError: curl_cffi
4
+
5
+ **错误信息**: `ModuleNotFoundError: No module named 'curl_cffi'`
6
+ **原因**: 安装了基础包 `pip install scrapling`,不含抓取依赖
7
+ **解决方案**:
8
+ ```bash
9
+ pip install "scrapling[fetchers]"
10
+ ```
11
+
12
+ ## Cloudflare 403 + "Just a moment"
13
+
14
+ **错误信息**: 返回 403,页面内容包含 "Just a moment" 或 "Checking your browser"
15
+ **原因**: Fetcher(curl_cffi)无法通过 Cloudflare 验证
16
+ **解决方案**: 换用 StealthyFetcher + `solve_cloudflare=True`
17
+ ```python
18
+ from scrapling.fetchers import StealthyFetcher
19
+ page = StealthyFetcher.fetch(url, headless=True, solve_cloudflare=True, timeout=60000)
20
+ ```
21
+
22
+ ## cf_clearance cookie 无效
23
+
24
+ **错误信息**: 手动传入 `cf_clearance` cookie 但仍被 Cloudflare 拦截
25
+ **原因**: `cf_clearance` 绑定浏览器指纹(TLS/JA3/UA),不可跨客户端复用
26
+ **解决方案**: 不要手动传 `cf_clearance`,让 StealthyFetcher 自己通过 Cloudflare 获取
27
+
28
+ ## Expected array, got object at $.cookies
29
+
30
+ **错误信息**: `Expected array, got object` at `$.cookies`
31
+ **原因**: 浏览器 Fetcher(StealthyFetcher/DynamicFetcher)cookie 必须是 `list[dict]`,不能是 `dict`
32
+ **解决方案**:
33
+ ```python
34
+ # ❌ 错误
35
+ cookies = {'name': 'value'}
36
+
37
+ # ✅ 正确
38
+ cookies = [{'name': 'cookie_name', 'value': 'cookie_value', 'domain': '.site.com', 'path': '/'}]
39
+ ```
40
+
41
+ ## Cookie should have a url or a domain/path pair
42
+
43
+ **错误信息**: `Cookie should have a url or a domain/path pair`
44
+ **原因**: cookie dict 缺少 `domain` 和 `path` 字段
45
+ **解决方案**: 每个 cookie dict 必须包含 `domain`(以 `.` 开头)和 `path`(通常 `/`)
46
+ ```python
47
+ cookies = [
48
+ {'name': 'token', 'value': 'abc', 'domain': '.example.com', 'path': '/'},
49
+ ]
50
+ ```
51
+
52
+ ## 404 "page is private"
53
+
54
+ **错误信息**: 返回 404,页面提示内容为私有
55
+ **原因**: Cloudflare 已通过,但目标页面需要登录态
56
+ **解决方案**: 带上登录 cookie(从浏览器手动获取),参见 `cookie-vault.md`
57
+ ```python
58
+ page = StealthyFetcher.fetch(
59
+ url,
60
+ solve_cloudflare=True,
61
+ cookies=[{'name': '_session', 'value': '...', 'domain': '.site.com', 'path': '/'}],
62
+ timeout=60000,
63
+ )
64
+ ```
65
+
66
+ ## Cloudflare 多轮 Turnstile
67
+
68
+ **现象**: StealthyFetcher 运行时间很长(30-90 秒),日志显示多次 Turnstile 验证
69
+ **原因**: 正常现象,Cloudflare 有时需要 2-3 轮验证
70
+ **解决方案**: 耐心等待,确保 `timeout` 足够长(至少 60000ms)。如果超时失败,增加到 120000ms 重试
71
+
72
+ ## scrapling: command not found
73
+
74
+ **错误信息**: `scrapling: command not found`
75
+ **原因**: Python Scripts 目录不在 PATH 中
76
+ **解决方案**:
77
+ ```bash
78
+ # 方式 1: 使用 python -c
79
+ python -c "from scrapling.cli import main; main(['install'])"
80
+
81
+ # 方式 2: 使用 python -m(如果支持)
82
+ python -m scrapling install
83
+ ```
84
+
85
+ ## StealthyFetcher/DynamicFetcher 报浏览器未安装
86
+
87
+ **错误信息**: 类似 "browser not found" 或 Playwright/Camoufox 相关错误
88
+ **原因**: 未安装浏览器依赖
89
+ **解决方案**:
90
+ ```bash
91
+ # 安装 scrapling 浏览器依赖
92
+ scrapling install
93
+ # 或
94
+ python -c "from scrapling.cli import main; main(['install'])"
95
+ ```
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env python3
2
+ """基础 HTTP 抓取模板
3
+ 用途: 静态页面抓取,无 JS 渲染,无反爬保护
4
+ 替换: URL, CSS_SELECTOR, 输出处理逻辑
5
+ """
6
+ from scrapling.fetchers import Fetcher
7
+
8
+ URL = "{{URL}}"
9
+ CSS_SELECTOR = "{{CSS_SELECTOR}}" # 如 '.article h1::text'
10
+
11
+ page = Fetcher.get(URL, impersonate='chrome', timeout=30)
12
+ print(f"Status: {page.status}")
13
+
14
+ if CSS_SELECTOR:
15
+ results = page.css(CSS_SELECTOR).getall()
16
+ for r in results:
17
+ print(r)
18
+ else:
19
+ print(page.get_all_text(strip=True)[:2000])
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env python3
2
+ """纯 HTML 解析模板(不需要 fetchers 依赖)
3
+ 用途: 已有 HTML 内容(来自 WebFetch/文件/API),只需解析提取
4
+ 替换: HTML_SOURCE, BASE_URL, CSS_SELECTOR
5
+ """
6
+ from scrapling.parser import Selector
7
+
8
+ HTML_SOURCE = """{{HTML}}"""
9
+ # 或从文件读取: HTML_SOURCE = open('page.html').read()
10
+
11
+ page = Selector(HTML_SOURCE, url='{{BASE_URL}}')
12
+
13
+ results = page.css('{{CSS_SELECTOR}}')
14
+ for item in results:
15
+ print(item.get_all_text(strip=True))
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env python3
2
+ """Session 登录 + 多页抓取模板
3
+ 用途: 需要登录后才能访问的页面,基于 HTTP(无 JS 登录表单)
4
+ 替换: LOGIN_URL, LOGIN_DATA, TARGET_URLS
5
+ """
6
+ from scrapling.fetchers import FetcherSession
7
+
8
+ LOGIN_URL = "{{LOGIN_URL}}"
9
+ LOGIN_DATA = {{LOGIN_DATA}} # {'username': '...', 'password': '...'}
10
+ TARGET_URLS = {{TARGET_URLS}} # ['https://site.com/page1', ...]
11
+
12
+ with FetcherSession(impersonate='chrome') as s:
13
+ login_resp = s.post(LOGIN_URL, data=LOGIN_DATA)
14
+ print(f"Login status: {login_resp.status}")
15
+
16
+ for url in TARGET_URLS:
17
+ page = s.get(url)
18
+ print(f"\n--- {url} (status: {page.status}) ---")
19
+ print(page.get_all_text(strip=True)[:1000])
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env python3
2
+ """Cloudflare 反爬绕过模板
3
+ 用途: 有 Cloudflare/WAF 保护的网站
4
+ 替换: URL, COOKIES(可选), CSS_SELECTOR
5
+ """
6
+ from scrapling.fetchers import StealthyFetcher
7
+
8
+ URL = "{{URL}}"
9
+ COOKIES = {{COOKIES}} # None 或 [{'name': ..., 'value': ..., 'domain': ..., 'path': '/'}]
10
+ CSS_SELECTOR = "{{CSS_SELECTOR}}"
11
+
12
+ page = StealthyFetcher.fetch(
13
+ URL,
14
+ headless=True,
15
+ solve_cloudflare=True,
16
+ cookies=COOKIES,
17
+ timeout=60000,
18
+ network_idle=True,
19
+ )
20
+
21
+ print(f"Status: {page.status}")
22
+
23
+ if CSS_SELECTOR:
24
+ results = page.css(CSS_SELECTOR).getall()
25
+ for r in results:
26
+ print(r)
27
+ else:
28
+ print(page.get_all_text(strip=True)[:2000])