ultimate-pi 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (516) hide show
  1. package/.agents/skills/ck-search/SKILL.md +99 -0
  2. package/.agents/skills/defuddle/SKILL.md +90 -0
  3. package/.agents/skills/find-skills/SKILL.md +142 -0
  4. package/.agents/skills/firecrawl/SKILL.md +150 -0
  5. package/.agents/skills/firecrawl/rules/install.md +82 -0
  6. package/.agents/skills/firecrawl/rules/security.md +26 -0
  7. package/.agents/skills/firecrawl-agent/SKILL.md +57 -0
  8. package/.agents/skills/firecrawl-build-interact/SKILL.md +67 -0
  9. package/.agents/skills/firecrawl-build-onboarding/SKILL.md +102 -0
  10. package/.agents/skills/firecrawl-build-onboarding/references/auth-flow.md +39 -0
  11. package/.agents/skills/firecrawl-build-onboarding/references/project-setup.md +20 -0
  12. package/.agents/skills/firecrawl-build-onboarding/references/sdk-installation.md +17 -0
  13. package/.agents/skills/firecrawl-build-scrape/SKILL.md +68 -0
  14. package/.agents/skills/firecrawl-build-search/SKILL.md +68 -0
  15. package/.agents/skills/firecrawl-crawl/SKILL.md +58 -0
  16. package/.agents/skills/firecrawl-download/SKILL.md +69 -0
  17. package/.agents/skills/firecrawl-interact/SKILL.md +83 -0
  18. package/.agents/skills/firecrawl-map/SKILL.md +50 -0
  19. package/.agents/skills/firecrawl-parse/SKILL.md +61 -0
  20. package/.agents/skills/firecrawl-scrape/SKILL.md +68 -0
  21. package/.agents/skills/firecrawl-search/SKILL.md +59 -0
  22. package/.agents/skills/obsidian-bases/SKILL.md +299 -0
  23. package/.agents/skills/obsidian-markdown/SKILL.md +237 -0
  24. package/.agents/skills/posthog-analyst/SKILL.md +306 -0
  25. package/.agents/skills/posthog-analyst/evals/evals.json +23 -0
  26. package/.agents/skills/wiki/SKILL.md +215 -0
  27. package/.agents/skills/wiki/references/css-snippets.md +122 -0
  28. package/.agents/skills/wiki/references/frontmatter.md +107 -0
  29. package/.agents/skills/wiki/references/git-setup.md +58 -0
  30. package/.agents/skills/wiki/references/mcp-setup.md +149 -0
  31. package/.agents/skills/wiki/references/modes.md +259 -0
  32. package/.agents/skills/wiki/references/plugins.md +96 -0
  33. package/.agents/skills/wiki/references/rest-api.md +124 -0
  34. package/.agents/skills/wiki-autoresearch/SKILL.md +211 -0
  35. package/.agents/skills/wiki-autoresearch/references/program.md +75 -0
  36. package/.agents/skills/wiki-fold/SKILL.md +204 -0
  37. package/.agents/skills/wiki-fold/references/fold-template.md +133 -0
  38. package/.agents/skills/wiki-ingest/SKILL.md +288 -0
  39. package/.agents/skills/wiki-lint/SKILL.md +183 -0
  40. package/.agents/skills/wiki-query/SKILL.md +176 -0
  41. package/.agents/skills/wiki-save/SKILL.md +128 -0
  42. package/.ckignore +41 -0
  43. package/.env.example +9 -0
  44. package/.github/workflows/lint.yml +33 -0
  45. package/.github/workflows/publish-github-packages.yml +35 -0
  46. package/.github/workflows/publish-npm.yml +1 -1
  47. package/.pi/SYSTEM.md +107 -40
  48. package/.pi/agents/pi-pi/agent-expert.md +205 -0
  49. package/.pi/agents/pi-pi/cli-expert.md +47 -0
  50. package/.pi/agents/pi-pi/config-expert.md +67 -0
  51. package/.pi/agents/pi-pi/ext-expert.md +53 -0
  52. package/.pi/agents/pi-pi/keybinding-expert.md +123 -0
  53. package/.pi/agents/pi-pi/pi-orchestrator.md +103 -0
  54. package/.pi/agents/pi-pi/prompt-expert.md +83 -0
  55. package/.pi/agents/pi-pi/skill-expert.md +52 -0
  56. package/.pi/agents/pi-pi/theme-expert.md +46 -0
  57. package/.pi/agents/pi-pi/tui-expert.md +100 -0
  58. package/.pi/agents/rethink.md +140 -0
  59. package/.pi/agents/wiki-ingest.md +67 -0
  60. package/.pi/agents/wiki-lint.md +75 -0
  61. package/.pi/auto-commit.json +20 -0
  62. package/.pi/extensions/banner.png +0 -0
  63. package/.pi/extensions/ck-enforce.ts +216 -0
  64. package/.pi/extensions/custom-footer.ts +308 -0
  65. package/.pi/extensions/custom-header.ts +116 -0
  66. package/.pi/extensions/dotenv-loader.ts +170 -0
  67. package/.pi/internal/cursor-sdk-transcript-parser.ts +59 -0
  68. package/.pi/model-router.json +95 -0
  69. package/.pi/npm/.gitignore +2 -0
  70. package/.pi/prompts/git-sync.md +124 -0
  71. package/.pi/prompts/harness-setup.md +509 -0
  72. package/.pi/prompts/save.md +16 -0
  73. package/.pi/prompts/wiki-autoresearch.md +19 -0
  74. package/.pi/prompts/wiki.md +23 -0
  75. package/.pi/providers/cursor-sdk-provider.test.mjs +476 -0
  76. package/.pi/providers/cursor-sdk-provider.ts +1085 -0
  77. package/.pi/settings.json +14 -4
  78. package/.pi/skills/agent-router/SKILL.md +174 -0
  79. package/.pi/sounds/alert/1-kaching-track.mp3 +0 -0
  80. package/.pi/sounds/error/1-ksi-wth-track.mp3 +0 -0
  81. package/.pi/sounds/error/2-smash-track.mp3 +0 -0
  82. package/.pi/sounds/error/3-buzzer-track.mp3 +0 -0
  83. package/.pi/sounds/notification/1-soft-notification-track.mp3 +0 -0
  84. package/.pi/sounds/project-sounds.json +25 -0
  85. package/.pi/sounds/reminder/1-soft-notification-track.mp3 +0 -0
  86. package/.pi/sounds/success/1-tada-track.mp3 +0 -0
  87. package/.pi/sounds/success/2-jobs-done-track.mp3 +0 -0
  88. package/.pi/sounds/success/3-yay-track.mp3 +0 -0
  89. package/CONTRIBUTING.md +116 -0
  90. package/README.md +32 -39
  91. package/biome.json +34 -0
  92. package/firecrawl/.env.template +58 -0
  93. package/firecrawl/README.md +49 -0
  94. package/firecrawl/docker-compose.yaml +201 -0
  95. package/firecrawl/searxng/searxng.env +3 -0
  96. package/firecrawl/searxng/settings.yml +85 -0
  97. package/lefthook.yml +8 -0
  98. package/package.json +55 -24
  99. package/vault/AGENTS.md +37 -0
  100. package/vault/wiki/_templates/comparison.md +39 -0
  101. package/vault/wiki/_templates/concept.md +40 -0
  102. package/vault/wiki/_templates/decision.md +21 -0
  103. package/vault/wiki/_templates/entity.md +32 -0
  104. package/vault/wiki/_templates/flow.md +14 -0
  105. package/vault/wiki/_templates/module.md +18 -0
  106. package/vault/wiki/_templates/question.md +31 -0
  107. package/vault/wiki/_templates/source.md +39 -0
  108. package/vault/wiki/concepts/AST-Aware Code Chunking.md +44 -0
  109. package/vault/wiki/concepts/Build-Time Prompt Compilation.md +107 -0
  110. package/vault/wiki/concepts/Context Engine (AI Coding).md +47 -0
  111. package/vault/wiki/concepts/Context-Aware System Reminders.md +61 -0
  112. package/vault/wiki/concepts/Contextualized Text Embedding.md +42 -0
  113. package/vault/wiki/concepts/Contractor vs Employee AI Model.md +55 -0
  114. package/vault/wiki/concepts/Dual-Model Agent Architecture.md +65 -0
  115. package/vault/wiki/concepts/Late Chunking vs Early Chunking.md +43 -0
  116. package/vault/wiki/concepts/Majority Vote Ensembling.md +68 -0
  117. package/vault/wiki/concepts/Meta-Harness.md +16 -0
  118. package/vault/wiki/concepts/Multi-Agent AI Coding Architecture.md +75 -0
  119. package/vault/wiki/concepts/Prompt Enhancement.md +90 -0
  120. package/vault/wiki/concepts/Prompt Renderer.md +89 -0
  121. package/vault/wiki/concepts/Semantic Codebase Indexing.md +67 -0
  122. package/vault/wiki/concepts/additive-config-hierarchy.md +16 -0
  123. package/vault/wiki/concepts/agent-artifacts-verifiable-deliverables.md +71 -0
  124. package/vault/wiki/concepts/agent-browser-browser-automation.md +99 -0
  125. package/vault/wiki/concepts/agent-codebase-interface.md +43 -0
  126. package/vault/wiki/concepts/agent-harness-architecture.md +67 -0
  127. package/vault/wiki/concepts/agent-loop-detection-patterns.md +133 -0
  128. package/vault/wiki/concepts/agent-search-enforcement.md +126 -0
  129. package/vault/wiki/concepts/agent-skills-ecosystem.md +74 -0
  130. package/vault/wiki/concepts/agent-skills-pattern.md +68 -0
  131. package/vault/wiki/concepts/agentic-harness-context-enforcement.md +91 -0
  132. package/vault/wiki/concepts/agentic-harness.md +34 -0
  133. package/vault/wiki/concepts/agentic-orchestration-pipeline.md +56 -0
  134. package/vault/wiki/concepts/agentic-search-no-embeddings.md +18 -0
  135. package/vault/wiki/concepts/anthropic-context-engineering.md +13 -0
  136. package/vault/wiki/concepts/antigravity-agent-first-architecture.md +61 -0
  137. package/vault/wiki/concepts/ast-compression.md +19 -0
  138. package/vault/wiki/concepts/ast-truncation.md +66 -0
  139. package/vault/wiki/concepts/barrel-files.md +37 -0
  140. package/vault/wiki/concepts/browser-harness-agent.md +41 -0
  141. package/vault/wiki/concepts/browser-subagent-visual-verification.md +82 -0
  142. package/vault/wiki/concepts/codebase-intelligence-ecosystem-comparison.md +192 -0
  143. package/vault/wiki/concepts/codebase-intelligence-harness-integration.md +161 -0
  144. package/vault/wiki/concepts/codebase-to-context-ingestion.md +46 -0
  145. package/vault/wiki/concepts/codex-harness-innovations.md +147 -0
  146. package/vault/wiki/concepts/consensus-debate-flow.md +17 -0
  147. package/vault/wiki/concepts/consensus-debate.md +206 -0
  148. package/vault/wiki/concepts/content-addressed-spec-identity.md +166 -0
  149. package/vault/wiki/concepts/context-anxiety.md +57 -0
  150. package/vault/wiki/concepts/context-compression-techniques.md +19 -0
  151. package/vault/wiki/concepts/context-continuity.md +22 -0
  152. package/vault/wiki/concepts/context-drift-in-agents.md +106 -0
  153. package/vault/wiki/concepts/context-engineering.md +62 -0
  154. package/vault/wiki/concepts/context-folding.md +67 -0
  155. package/vault/wiki/concepts/context-mode.md +38 -0
  156. package/vault/wiki/concepts/cursor-harness-innovations.md +107 -0
  157. package/vault/wiki/concepts/deterministic-session-compaction.md +79 -0
  158. package/vault/wiki/concepts/drift-detection-unified.md +296 -0
  159. package/vault/wiki/concepts/execution-feedback-loop.md +46 -0
  160. package/vault/wiki/concepts/feedforward-feedback-harness.md +60 -0
  161. package/vault/wiki/concepts/five-root-cause-metrics-sentrux.md +40 -0
  162. package/vault/wiki/concepts/fork-safe-spec-storage.md +89 -0
  163. package/vault/wiki/concepts/fts5-sandbox.md +19 -0
  164. package/vault/wiki/concepts/fuzzy-edit-matching.md +71 -0
  165. package/vault/wiki/concepts/gemini-cli-architecture.md +104 -0
  166. package/vault/wiki/concepts/generator-evaluator-architecture.md +64 -0
  167. package/vault/wiki/concepts/guardian-agent-pattern.md +67 -0
  168. package/vault/wiki/concepts/harness-configuration-layers.md +89 -0
  169. package/vault/wiki/concepts/harness-control-frameworks.md +155 -0
  170. package/vault/wiki/concepts/harness-engineering-first-principles.md +90 -0
  171. package/vault/wiki/concepts/harness-h-formalism.md +53 -0
  172. package/vault/wiki/concepts/hybrid-code-search.md +61 -0
  173. package/vault/wiki/concepts/inline-post-edit-validation.md +112 -0
  174. package/vault/wiki/concepts/legendary-engineering-patterns-harness.md +110 -0
  175. package/vault/wiki/concepts/lifecycle-hooks.md +94 -0
  176. package/vault/wiki/concepts/mcp-tool-routing.md +102 -0
  177. package/vault/wiki/concepts/memory-system-of-record-vs-ephemeral-cache.md +47 -0
  178. package/vault/wiki/concepts/meta-agent-context-pruning.md +151 -0
  179. package/vault/wiki/concepts/model-adaptive-harness.md +122 -0
  180. package/vault/wiki/concepts/model-routing-agents.md +101 -0
  181. package/vault/wiki/concepts/monorepo-architecture.md +45 -0
  182. package/vault/wiki/concepts/multi-agent-specialization.md +61 -0
  183. package/vault/wiki/concepts/permission-subsystem.md +16 -0
  184. package/vault/wiki/concepts/pi-messenger-analysis.md +243 -0
  185. package/vault/wiki/concepts/pi-vscode-extension-landscape.md +37 -0
  186. package/vault/wiki/concepts/policy-engine-pattern.md +78 -0
  187. package/vault/wiki/concepts/progressive-disclosure-agents.md +53 -0
  188. package/vault/wiki/concepts/progressive-skill-disclosure.md +17 -0
  189. package/vault/wiki/concepts/provider-native-prompting.md +203 -0
  190. package/vault/wiki/concepts/quality-signal-sentrux.md +37 -0
  191. package/vault/wiki/concepts/repo-map-ranking.md +42 -0
  192. package/vault/wiki/concepts/result-monad-error-handling.md +47 -0
  193. package/vault/wiki/concepts/safety-defense-in-depth.md +83 -0
  194. package/vault/wiki/concepts/sandbox-os-enforcement.md +18 -0
  195. package/vault/wiki/concepts/selective-debate-routing.md +70 -0
  196. package/vault/wiki/concepts/self-evolving-harness.md +60 -0
  197. package/vault/wiki/concepts/sentrux-mcp-integration.md +36 -0
  198. package/vault/wiki/concepts/sentrux-rules-engine.md +49 -0
  199. package/vault/wiki/concepts/shell-pattern-compression.md +24 -0
  200. package/vault/wiki/concepts/skill-first-architecture.md +166 -0
  201. package/vault/wiki/concepts/structured-compaction.md +78 -0
  202. package/vault/wiki/concepts/subagent-orchestration.md +17 -0
  203. package/vault/wiki/concepts/subagent-worktree-isolation.md +68 -0
  204. package/vault/wiki/concepts/superpowers-methodology.md +78 -0
  205. package/vault/wiki/concepts/think-in-code.md +73 -0
  206. package/vault/wiki/concepts/ts-execution-layer.md +100 -0
  207. package/vault/wiki/concepts/typescript-strict-mode.md +37 -0
  208. package/vault/wiki/concepts/vcc-conversation-compaction-for-pi.md +51 -0
  209. package/vault/wiki/concepts/verification-drift-detection.md +19 -0
  210. package/vault/wiki/consensus/consensus-records.md +58 -0
  211. package/vault/wiki/decisions/2026-04-30-pi-lean-ctx-native.md +122 -0
  212. package/vault/wiki/decisions/adr-008.md +40 -0
  213. package/vault/wiki/decisions/adr-009.md +46 -0
  214. package/vault/wiki/decisions/adr-010.md +55 -0
  215. package/vault/wiki/decisions/adr-011.md +165 -0
  216. package/vault/wiki/decisions/adr-012.md +102 -0
  217. package/vault/wiki/decisions/adr-013.md +59 -0
  218. package/vault/wiki/decisions/adr-014.md +73 -0
  219. package/vault/wiki/decisions/adr-015.md +81 -0
  220. package/vault/wiki/decisions/adr-016.md +91 -0
  221. package/vault/wiki/decisions/adr-017.md +79 -0
  222. package/vault/wiki/decisions/adr-018.md +100 -0
  223. package/vault/wiki/decisions/adr-019.md +75 -0
  224. package/vault/wiki/decisions/adr-020.md +106 -0
  225. package/vault/wiki/decisions/adr-021.md +86 -0
  226. package/vault/wiki/decisions/adr-022.md +113 -0
  227. package/vault/wiki/decisions/adr-023.md +113 -0
  228. package/vault/wiki/decisions/adr-024.md +73 -0
  229. package/vault/wiki/decisions/adr-025.md +130 -0
  230. package/vault/wiki/decisions/adr-026.md +56 -0
  231. package/vault/wiki/decisions/colocate-wiki.md +34 -0
  232. package/vault/wiki/entities/Anders Hejlsberg.md +29 -0
  233. package/vault/wiki/entities/Anthropic.md +17 -0
  234. package/vault/wiki/entities/Augment Code.md +49 -0
  235. package/vault/wiki/entities/Bjarne Stroustrup.md +26 -0
  236. package/vault/wiki/entities/Bolt.new (StackBlitz).md +39 -0
  237. package/vault/wiki/entities/Boris Cherny.md +11 -0
  238. package/vault/wiki/entities/Claude Code.md +19 -0
  239. package/vault/wiki/entities/Dennis Ritchie.md +26 -0
  240. package/vault/wiki/entities/Emergent Labs.md +32 -0
  241. package/vault/wiki/entities/Google Cloud.md +16 -0
  242. package/vault/wiki/entities/Guido van Rossum.md +28 -0
  243. package/vault/wiki/entities/Ken Thompson.md +28 -0
  244. package/vault/wiki/entities/Lee et al.md +16 -0
  245. package/vault/wiki/entities/Linus Torvalds.md +28 -0
  246. package/vault/wiki/entities/Lovable (company).md +40 -0
  247. package/vault/wiki/entities/Martin Fowler.md +16 -0
  248. package/vault/wiki/entities/Meng et al.md +16 -0
  249. package/vault/wiki/entities/OpenAI.md +16 -0
  250. package/vault/wiki/entities/Rocket.new.md +38 -0
  251. package/vault/wiki/entities/VILA-Lab.md +15 -0
  252. package/vault/wiki/entities/autodev-codebase.md +18 -0
  253. package/vault/wiki/entities/ck-tool.md +59 -0
  254. package/vault/wiki/entities/codesearch.md +18 -0
  255. package/vault/wiki/entities/disler-indydevdan.md +33 -0
  256. package/vault/wiki/entities/gsd-get-shit-done.md +56 -0
  257. package/vault/wiki/entities/javascript-runtimes.md +48 -0
  258. package/vault/wiki/entities/jesse-vincent.md +38 -0
  259. package/vault/wiki/entities/lean-ctx.md +32 -0
  260. package/vault/wiki/entities/opendev.md +41 -0
  261. package/vault/wiki/entities/ops-codegraph-tool.md +18 -0
  262. package/vault/wiki/entities/pi-coding-agent.md +53 -0
  263. package/vault/wiki/entities/sentrux.md +54 -0
  264. package/vault/wiki/entities/vgrep-tool.md +57 -0
  265. package/vault/wiki/entities/vitest.md +41 -0
  266. package/vault/wiki/flows/harness-wiki-pipeline.md +204 -0
  267. package/vault/wiki/hot.md +932 -0
  268. package/vault/wiki/index.md +437 -0
  269. package/vault/wiki/log.md +418 -0
  270. package/vault/wiki/meta/dashboard.md +30 -0
  271. package/vault/wiki/meta/lint-report-2026-04-30.md +86 -0
  272. package/vault/wiki/meta/lint-report-2026-05-02.md +251 -0
  273. package/vault/wiki/meta/overview.canvas +43 -0
  274. package/vault/wiki/modules/adversarial-verification.md +57 -0
  275. package/vault/wiki/modules/automated-observability.md +54 -0
  276. package/vault/wiki/modules/bench.md +20 -0
  277. package/vault/wiki/modules/extensions.md +23 -0
  278. package/vault/wiki/modules/grounding-checkpoints.md +62 -0
  279. package/vault/wiki/modules/harness-implementation-plan.md +345 -0
  280. package/vault/wiki/modules/harness-wiki-skill-mapping.md +135 -0
  281. package/vault/wiki/modules/harness.md +86 -0
  282. package/vault/wiki/modules/persistent-memory.md +85 -0
  283. package/vault/wiki/modules/schema-orchestration.md +68 -0
  284. package/vault/wiki/modules/skills.md +27 -0
  285. package/vault/wiki/modules/spec-hardening.md +58 -0
  286. package/vault/wiki/modules/structured-planning.md +53 -0
  287. package/vault/wiki/modules/think-in-code-enforcement.md +153 -0
  288. package/vault/wiki/modules/wiki-query-interface.md +64 -0
  289. package/vault/wiki/overview.md +51 -0
  290. package/vault/wiki/questions/Research-pi-vs-claude-code-agentic-orchestration-pipeline.md +87 -0
  291. package/vault/wiki/questions/Research-sentrux-dev.md +123 -0
  292. package/vault/wiki/questions/Research-superpowers-skill-for-agentic-coding-agents.md +164 -0
  293. package/vault/wiki/questions/Research: Augment Code Context Engine.md +244 -0
  294. package/vault/wiki/questions/Research: Automating Software Engineering - Lovable, Bolt, Emergent, Rocket.md +112 -0
  295. package/vault/wiki/questions/Research: Claude Code State-of-the-Art Harness Improvements.md +209 -0
  296. package/vault/wiki/questions/Research: Codex State-of-the-Art Harness Improvements.md +99 -0
  297. package/vault/wiki/questions/Research: Engineering Workflows of Legendary Programmers and AI Harness Mapping.md +107 -0
  298. package/vault/wiki/questions/Research: Fallow Codebase Intelligence Harness Integration.md +72 -0
  299. package/vault/wiki/questions/Research: Gemini CLI SOTA Harness Integration.md +166 -0
  300. package/vault/wiki/questions/Research: GitHub Issues as Harness Spec Storage.md +188 -0
  301. package/vault/wiki/questions/Research: Google Antigravity Harness Integration.md +120 -0
  302. package/vault/wiki/questions/Research: Meta-Agent Context Drift Detection.md +236 -0
  303. package/vault/wiki/questions/Research: Model-Adaptive Agent Harness Design.md +95 -0
  304. package/vault/wiki/questions/Research: Model-Specific Prompting Guides.md +165 -0
  305. package/vault/wiki/questions/Research: Prompt Renderer for Multi-Model Agent Harness.md +216 -0
  306. package/vault/wiki/questions/Research: Skill-First Harness Architecture.md +91 -0
  307. package/vault/wiki/questions/Research: TypeScript Best Practices and Codebase Structure.md +88 -0
  308. package/vault/wiki/questions/Research: TypeScript Execution Layer for Agent Tool Calling.md +81 -0
  309. package/vault/wiki/questions/Research: claude-mem over Obsidian for Harness Layer.md +71 -0
  310. package/vault/wiki/questions/Research: claude-mem over obsidian wiki as the knowledge base for our agentic harness pipeline. think from first principles. does this replace or complement our current setup? no hard feelings about previous decisions. gimme accurate points.md +80 -0
  311. package/vault/wiki/questions/Research: context-mode vs lean-ctx.md +72 -0
  312. package/vault/wiki/questions/Research: cursor.sh Harness Innovations.md +92 -0
  313. package/vault/wiki/questions/Research: executor.sh Harness Integration.md +170 -0
  314. package/vault/wiki/questions/Research: how GSD fits into our coding harness setup.md +97 -0
  315. package/vault/wiki/questions/Research: how claude-mem fits into our workflow. and whether it should replace obsidian in the codebase. no hard feelings about previous actions, rethink from first principles always.md +80 -0
  316. package/vault/wiki/questions/Research: pi-vcc.md +113 -0
  317. package/vault/wiki/questions/Research: semantic code search tools.md +69 -0
  318. package/vault/wiki/questions/Research: vcc extension for pi coding agent.md +73 -0
  319. package/vault/wiki/questions/how-to-enable-semantic-code-search-now.md +111 -0
  320. package/vault/wiki/questions/mvp-implementation-blueprint.md +552 -0
  321. package/vault/wiki/questions/research-agent-first-codebase-exploration.md +199 -0
  322. package/vault/wiki/questions/research-agentic-coding-harness-latest-papers.md +142 -0
  323. package/vault/wiki/questions/research-gitingest-gitreverse-integration.md +100 -0
  324. package/vault/wiki/questions/research-wozcode-token-reduction.md +67 -0
  325. package/vault/wiki/questions/resolved-context-pruning-inplace-vs-restart.md +95 -0
  326. package/vault/wiki/questions/resolved-context-window-economics.md +167 -0
  327. package/vault/wiki/questions/resolved-imad-debate-gating-transfer.md +126 -0
  328. package/vault/wiki/questions/resolved-mcp-tool-preference.md +112 -0
  329. package/vault/wiki/questions/resolved-small-model-meta-agents.md +107 -0
  330. package/vault/wiki/questions/resolved-treesitter-dynamic-languages.md +95 -0
  331. package/vault/wiki/sources/Auggie Context MCP Server.md +63 -0
  332. package/vault/wiki/sources/Augment Code Codacy AI Giants.md +61 -0
  333. package/vault/wiki/sources/Augment Code MCP SiliconAngle.md +49 -0
  334. package/vault/wiki/sources/Augment Code WorkOS ERC 2025.md +55 -0
  335. package/vault/wiki/sources/Augment Context Engine Official.md +71 -0
  336. package/vault/wiki/sources/Augment SWE-bench Agent GitHub.md +74 -0
  337. package/vault/wiki/sources/Augment SWE-bench Pro Blog.md +58 -0
  338. package/vault/wiki/sources/Source: AgentBus Jinja2 Prompt Pipelines.md +75 -0
  339. package/vault/wiki/sources/Source: Arxiv — Don't Break the Cache.md +85 -0
  340. package/vault/wiki/sources/Source: Augment - Harness Engineering for AI Coding Agents.md +58 -0
  341. package/vault/wiki/sources/Source: Blake Crosley Agent Architecture Guide.md +100 -0
  342. package/vault/wiki/sources/Source: Bolt.new Architecture & Case Study.md +75 -0
  343. package/vault/wiki/sources/Source: Build-Time Prompt Compilation Architecture.md +107 -0
  344. package/vault/wiki/sources/Source: Claude API Agent Skills Overview.md +70 -0
  345. package/vault/wiki/sources/Source: Gemini CLI Changelogs.md +88 -0
  346. package/vault/wiki/sources/Source: Google Blog - Gemini CLI Announcement.md +57 -0
  347. package/vault/wiki/sources/Source: Google Gemini CLI Architecture Docs.md +53 -0
  348. package/vault/wiki/sources/Source: LangChain - Anatomy of Agent Harness.md +65 -0
  349. package/vault/wiki/sources/Source: Lovable Architecture & Clone Analysis.md +83 -0
  350. package/vault/wiki/sources/Source: Martin Fowler - Harness Engineering.md +70 -0
  351. package/vault/wiki/sources/Source: OpenAI Harness Engineering Five Principles.md +58 -0
  352. package/vault/wiki/sources/Source: OpenAI Harness Engineering — 0 Lines of Human Code.md +101 -0
  353. package/vault/wiki/sources/Source: OpenDev — Building AI Coding Agents for the Terminal.md +100 -0
  354. package/vault/wiki/sources/Source: Render AI Coding Agents Benchmark 2025.md +53 -0
  355. package/vault/wiki/sources/Source: Rocket.new — Vibe Solutioning Platform.md +70 -0
  356. package/vault/wiki/sources/Source: SwirlAI Agent Skills Progressive Disclosure.md +71 -0
  357. package/vault/wiki/sources/Source: TianPan Prompt Caching Architecture.md +89 -0
  358. package/vault/wiki/sources/Source: Vercel Labs agent-browser.md +155 -0
  359. package/vault/wiki/sources/Source: browser-harness CDP Harness.md +126 -0
  360. package/vault/wiki/sources/agent-drift-academic-paper.md +79 -0
  361. package/vault/wiki/sources/aider-repomap-tree-sitter.md +42 -0
  362. package/vault/wiki/sources/anthropic-compaction-api.md +58 -0
  363. package/vault/wiki/sources/anthropic-effective-harnesses.md +42 -0
  364. package/vault/wiki/sources/anthropic-prompt-best-practices.md +100 -0
  365. package/vault/wiki/sources/anthropic2026-harness-design.md +63 -0
  366. package/vault/wiki/sources/barrel-files-tkdodo.md +38 -0
  367. package/vault/wiki/sources/birth-of-unix-kernighan-interview.md +57 -0
  368. package/vault/wiki/sources/bockeler2026-harness-engineering.md +69 -0
  369. package/vault/wiki/sources/cast-code-chunking-paper.md +50 -0
  370. package/vault/wiki/sources/ck-semantic-search.md +78 -0
  371. package/vault/wiki/sources/claude-code-architecture-karaxai-2026.md +71 -0
  372. package/vault/wiki/sources/claude-code-architecture-qubytes-2026.md +50 -0
  373. package/vault/wiki/sources/claude-code-architecture-vila-lab-2026.md +64 -0
  374. package/vault/wiki/sources/claude-code-security-architecture-penligent-2026.md +70 -0
  375. package/vault/wiki/sources/claude-context-editing-docs.md +13 -0
  376. package/vault/wiki/sources/cloudflare-codemode.md +63 -0
  377. package/vault/wiki/sources/code-chunk-library-supermemory.md +63 -0
  378. package/vault/wiki/sources/codeact-apple-2024.md +62 -0
  379. package/vault/wiki/sources/codex-dsc-rfc-8573.md +41 -0
  380. package/vault/wiki/sources/codex-open-source-agent-2026.md +110 -0
  381. package/vault/wiki/sources/coir-code-retrieval-benchmark.md +51 -0
  382. package/vault/wiki/sources/colinmcnamara-context-optimization-codemode.md +48 -0
  383. package/vault/wiki/sources/context-folding-paper.md +61 -0
  384. package/vault/wiki/sources/context-mode-website.md +63 -0
  385. package/vault/wiki/sources/cursor-agent-best-practices-2026.md +62 -0
  386. package/vault/wiki/sources/cursor-fork-29b-2025.md +50 -0
  387. package/vault/wiki/sources/cursor-harness-april-2026.md +76 -0
  388. package/vault/wiki/sources/cursor-instant-apply-2024.md +45 -0
  389. package/vault/wiki/sources/cursor-shadow-workspace-2024.md +52 -0
  390. package/vault/wiki/sources/cursor-shipped-coding-agent-2026.md +53 -0
  391. package/vault/wiki/sources/cursor-vs-antigravity-2026.md +51 -0
  392. package/vault/wiki/sources/disler-pi-vs-claude-code.md +69 -0
  393. package/vault/wiki/sources/distill-deterministic-context-compression.md +53 -0
  394. package/vault/wiki/sources/embedding-models-benchmark-supermemory-2025.md +48 -0
  395. package/vault/wiki/sources/executor-rhyssullivan.md +122 -0
  396. package/vault/wiki/sources/fallow-rs-codebase-intelligence.md +125 -0
  397. package/vault/wiki/sources/fan2025-imad.md +60 -0
  398. package/vault/wiki/sources/forgecode-gpt5-agent-improvements.md +63 -0
  399. package/vault/wiki/sources/gemini-3-prompting-guide.md +78 -0
  400. package/vault/wiki/sources/gh-cli-sub-issue-rfc.md +50 -0
  401. package/vault/wiki/sources/gh-sub-issue-extension.md +72 -0
  402. package/vault/wiki/sources/github-fork-issues-discussion.md +44 -0
  403. package/vault/wiki/sources/github-issue-dependencies-docs.md +49 -0
  404. package/vault/wiki/sources/github-sub-issues-docs.md +51 -0
  405. package/vault/wiki/sources/gitingest.md +91 -0
  406. package/vault/wiki/sources/gitreverse.md +63 -0
  407. package/vault/wiki/sources/google-antigravity-official-blog.md +47 -0
  408. package/vault/wiki/sources/google-antigravity-wikipedia.md +53 -0
  409. package/vault/wiki/sources/gsd-codecentric-deep-dive.md +57 -0
  410. package/vault/wiki/sources/gsd-github-repo.md +51 -0
  411. package/vault/wiki/sources/gsd-hn-discussion.md +59 -0
  412. package/vault/wiki/sources/guido-python-design-philosophy.md +56 -0
  413. package/vault/wiki/sources/hejlsberg-7-learnings.md +48 -0
  414. package/vault/wiki/sources/ironclaw-drift-monitor.md +80 -0
  415. package/vault/wiki/sources/langsight-loop-detection.md +80 -0
  416. package/vault/wiki/sources/leanctx-website.md +69 -0
  417. package/vault/wiki/sources/lee2026-meta-harness.md +59 -0
  418. package/vault/wiki/sources/linux-kernel-coding-workflow.md +50 -0
  419. package/vault/wiki/sources/lou2026-autoharness.md +53 -0
  420. package/vault/wiki/sources/martin-fowler-harness-engineering.md +73 -0
  421. package/vault/wiki/sources/mcp-architecture-docs.md +13 -0
  422. package/vault/wiki/sources/meng2026-agent-harness-survey.md +79 -0
  423. package/vault/wiki/sources/mindstudio-four-agent-types.md +68 -0
  424. package/vault/wiki/sources/ms-chat-history-management.md +13 -0
  425. package/vault/wiki/sources/openai-prompt-guidance.md +104 -0
  426. package/vault/wiki/sources/openclaw-session-pruning.md +13 -0
  427. package/vault/wiki/sources/opencode-dcp.md +13 -0
  428. package/vault/wiki/sources/opendev-arxiv-2603.05344v1.md +79 -0
  429. package/vault/wiki/sources/openhands-platform.md +39 -0
  430. package/vault/wiki/sources/oss-guide-codebase-exploration.md +53 -0
  431. package/vault/wiki/sources/pi-compaction-extensions-ecosystem.md +102 -0
  432. package/vault/wiki/sources/pi-context-prune-github-repo.md +38 -0
  433. package/vault/wiki/sources/pi-mono-compaction-docs.md +38 -0
  434. package/vault/wiki/sources/pi-omni-compact-github-repo.md +50 -0
  435. package/vault/wiki/sources/pi-rtk-optimizer-github-repo.md +45 -0
  436. package/vault/wiki/sources/pi-vcc-github-repo.md +69 -0
  437. package/vault/wiki/sources/pi-vscode-marketplace.md +41 -0
  438. package/vault/wiki/sources/pi-vscode-model-provider-marketplace.md +39 -0
  439. package/vault/wiki/sources/py-tree-sitter.md +13 -0
  440. package/vault/wiki/sources/sentrux-dev-landing.md +40 -0
  441. package/vault/wiki/sources/sentrux-docs-pro-architecture.md +75 -0
  442. package/vault/wiki/sources/sentrux-docs-quality-signal.md +46 -0
  443. package/vault/wiki/sources/sentrux-docs-root-cause-metrics.md +57 -0
  444. package/vault/wiki/sources/sentrux-docs-rules-engine.md +58 -0
  445. package/vault/wiki/sources/sentrux-github-repo.md +56 -0
  446. package/vault/wiki/sources/superpowers-github-repo.md +56 -0
  447. package/vault/wiki/sources/superpowers-release-blog.md +54 -0
  448. package/vault/wiki/sources/superpowers-termdock-analysis.md +45 -0
  449. package/vault/wiki/sources/swe-agent-aci.md +42 -0
  450. package/vault/wiki/sources/swe-bench.md +45 -0
  451. package/vault/wiki/sources/swe-pruner-context-pruning.md +13 -0
  452. package/vault/wiki/sources/think-in-code-blog.md +48 -0
  453. package/vault/wiki/sources/tree-sitter-docs.md +13 -0
  454. package/vault/wiki/sources/ts-best-practices-2025-devto.md +42 -0
  455. package/vault/wiki/sources/ts-folder-structure-mingyang.md +58 -0
  456. package/vault/wiki/sources/ts-monorepo-koerselman.md +44 -0
  457. package/vault/wiki/sources/ts-result-error-handling-kkalamarski.md +52 -0
  458. package/vault/wiki/sources/ts-runtimes-comparison-betterstack.md +42 -0
  459. package/vault/wiki/sources/ts-strict-mode-rishikc.md +43 -0
  460. package/vault/wiki/sources/unix-philosophy.md +48 -0
  461. package/vault/wiki/sources/vectara-chunking-vs-embedding-naacl2025.md +39 -0
  462. package/vault/wiki/sources/vectara-guardian-agents.md +79 -0
  463. package/vault/wiki/sources/vgrep-semantic-search.md +76 -0
  464. package/vault/wiki/sources/vitest-official.md +41 -0
  465. package/vault/wiki/sources/vscode-pi-community-extension.md +40 -0
  466. package/vault/wiki/sources/wozcode.md +79 -0
  467. package/.agents/skills/compress/SKILL.md +0 -111
  468. package/.agents/skills/compress/scripts/__init__.py +0 -9
  469. package/.agents/skills/compress/scripts/__main__.py +0 -3
  470. package/.agents/skills/compress/scripts/benchmark.py +0 -78
  471. package/.agents/skills/compress/scripts/cli.py +0 -73
  472. package/.agents/skills/compress/scripts/compress.py +0 -227
  473. package/.agents/skills/compress/scripts/detect.py +0 -121
  474. package/.agents/skills/compress/scripts/validate.py +0 -189
  475. package/.agents/skills/emil-design-eng/SKILL.md +0 -679
  476. package/.agents/skills/lean-ctx/SKILL.md +0 -149
  477. package/.agents/skills/lean-ctx/scripts/install.sh +0 -95
  478. package/.agents/skills/scrapling-official/LICENSE.txt +0 -28
  479. package/.agents/skills/scrapling-official/SKILL.md +0 -390
  480. package/.agents/skills/scrapling-official/examples/01_fetcher_session.py +0 -26
  481. package/.agents/skills/scrapling-official/examples/02_dynamic_session.py +0 -26
  482. package/.agents/skills/scrapling-official/examples/03_stealthy_session.py +0 -26
  483. package/.agents/skills/scrapling-official/examples/04_spider.py +0 -58
  484. package/.agents/skills/scrapling-official/examples/README.md +0 -45
  485. package/.agents/skills/scrapling-official/references/fetching/choosing.md +0 -78
  486. package/.agents/skills/scrapling-official/references/fetching/dynamic.md +0 -352
  487. package/.agents/skills/scrapling-official/references/fetching/static.md +0 -432
  488. package/.agents/skills/scrapling-official/references/fetching/stealthy.md +0 -255
  489. package/.agents/skills/scrapling-official/references/mcp-server.md +0 -214
  490. package/.agents/skills/scrapling-official/references/migrating_from_beautifulsoup.md +0 -86
  491. package/.agents/skills/scrapling-official/references/parsing/adaptive.md +0 -212
  492. package/.agents/skills/scrapling-official/references/parsing/main_classes.md +0 -586
  493. package/.agents/skills/scrapling-official/references/parsing/selection.md +0 -494
  494. package/.agents/skills/scrapling-official/references/spiders/advanced.md +0 -344
  495. package/.agents/skills/scrapling-official/references/spiders/architecture.md +0 -94
  496. package/.agents/skills/scrapling-official/references/spiders/getting-started.md +0 -164
  497. package/.agents/skills/scrapling-official/references/spiders/proxy-blocking.md +0 -235
  498. package/.agents/skills/scrapling-official/references/spiders/requests-responses.md +0 -196
  499. package/.agents/skills/scrapling-official/references/spiders/sessions.md +0 -205
  500. package/PLAN.md +0 -11
  501. package/extensions/lean-ctx-enforce.ts +0 -166
  502. package/skills-lock.json +0 -35
  503. package/wiki/README.md +0 -19
  504. package/wiki/decisions/0001-establish-project-wiki-and-decision-record-format.md +0 -25
  505. package/wiki/decisions/0002-add-project-banner-to-readme.md +0 -26
  506. package/wiki/decisions/0003-remove-redundant-readme-title-heading.md +0 -26
  507. package/wiki/decisions/0004-publish-package-to-npm-as-ultimate-pi.md +0 -26
  508. package/wiki/decisions/0005-automate-npm-publish-with-github-actions.md +0 -27
  509. package/wiki/decisions/0006-switch-to-npm-trusted-publishing.md +0 -26
  510. package/wiki/decisions/0007-use-absolute-banner-url-for-npm-readme-rendering.md +0 -26
  511. package/wiki/decisions/0008-rename-banner-asset-for-cache-busting.md +0 -26
  512. package/wiki/decisions/0009-force-oidc-path-by-clearing-node-auth-token-in-publish-step.md +0 -25
  513. package/wiki/decisions/0010-simplify-setup-node-for-npm-trusted-publishing.md +0 -26
  514. package/wiki/decisions/0011-add-noop-workflow-change-to-force-fresh-publish-run.md +0 -25
  515. package/wiki/decisions/0012-align-workflow-runtime-with-npm-trusted-publishing-requirements.md +0 -26
  516. package/wiki/decisions/0013-add-package-repository-url-for-provenance-validation.md +0 -25
@@ -1,235 +0,0 @@
1
- # Proxy Management and Handling Blocks
2
-
3
- Scrapling's `ProxyRotator` manages proxy rotation across requests. It works with all session types and integrates with the spider's blocked request retry system.
4
-
5
- ## ProxyRotator
6
-
7
- The `ProxyRotator` class manages a list of proxies and rotates through them automatically. Pass it to any session type via the `proxy_rotator` parameter:
8
-
9
- ```python
10
- from scrapling.spiders import Spider, Response
11
- from scrapling.fetchers import FetcherSession, ProxyRotator
12
-
13
- class MySpider(Spider):
14
- name = "my_spider"
15
- start_urls = ["https://example.com"]
16
-
17
- def configure_sessions(self, manager):
18
- rotator = ProxyRotator([
19
- "http://proxy1:8080",
20
- "http://proxy2:8080",
21
- "http://user:pass@proxy3:8080",
22
- ])
23
- manager.add("default", FetcherSession(proxy_rotator=rotator))
24
-
25
- async def parse(self, response: Response):
26
- # Check which proxy was used
27
- print(f"Proxy used: {response.meta.get('proxy')}")
28
- yield {"title": response.css("title::text").get("")}
29
- ```
30
-
31
- Each request automatically gets the next proxy in the rotation. The proxy used is stored in `response.meta["proxy"]` so you can track which proxy fetched which page.
32
-
33
-
34
- Browser sessions support both string and dict proxy formats:
35
-
36
- ```python
37
- from scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, ProxyRotator
38
-
39
- # String proxies work for all session types
40
- rotator = ProxyRotator([
41
- "http://proxy1:8080",
42
- "http://proxy2:8080",
43
- ])
44
-
45
- # Dict proxies (Playwright format) work for browser sessions
46
- rotator = ProxyRotator([
47
- {"server": "http://proxy1:8080", "username": "user", "password": "pass"},
48
- {"server": "http://proxy2:8080"},
49
- ])
50
-
51
- # Then inside the spider
52
- def configure_sessions(self, manager):
53
- rotator = ProxyRotator(["http://proxy1:8080", "http://proxy2:8080"])
54
- manager.add("browser", AsyncStealthySession(proxy_rotator=rotator))
55
- ```
56
-
57
- **Important:**
58
-
59
- 1. You cannot use the `proxy_rotator` argument together with the static `proxy` or `proxies` parameters on the same session. Pick one approach when configuring the session, and override it per request later if needed.
60
- 2. By default, all browser-based sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed.
61
-
62
- ## Custom Rotation Strategies
63
-
64
- By default, `ProxyRotator` uses cyclic rotation - it iterates through proxies sequentially, wrapping around at the end.
65
-
66
- You can provide a custom strategy function to change this behavior, but it must match the signature below:
67
-
68
- ```python
69
- from scrapling.core._types import ProxyType
70
-
71
- def my_strategy(proxies: list, current_index: int) -> tuple[ProxyType, int]:
72
- ...
73
- ```
74
-
75
- It receives the list of proxies and the current index, and must return the chosen proxy and the next index.
76
-
77
- Below are some examples of custom rotation strategies you can use.
78
-
79
- ### Random Rotation
80
-
81
- ```python
82
- import random
83
- from scrapling.fetchers import ProxyRotator
84
-
85
- def random_strategy(proxies, current_index):
86
- idx = random.randint(0, len(proxies) - 1)
87
- return proxies[idx], idx
88
-
89
- rotator = ProxyRotator(
90
- ["http://proxy1:8080", "http://proxy2:8080", "http://proxy3:8080"],
91
- strategy=random_strategy,
92
- )
93
- ```
94
-
95
- ### Weighted Rotation
96
-
97
- ```python
98
- import random
99
-
100
- def weighted_strategy(proxies, current_index):
101
- # First proxy gets 60% of traffic, others split the rest
102
- weights = [60] + [40 // (len(proxies) - 1)] * (len(proxies) - 1)
103
- proxy = random.choices(proxies, weights=weights, k=1)[0]
104
- return proxy, current_index # Index doesn't matter for weighted
105
-
106
- rotator = ProxyRotator(proxies, strategy=weighted_strategy)
107
- ```
108
-
109
-
110
- ## Per-Request Proxy Override
111
-
112
- You can override the rotator for individual requests by passing `proxy=` as a keyword argument:
113
-
114
- ```python
115
- async def parse(self, response: Response):
116
- # This request uses the rotator's next proxy
117
- yield response.follow("/page1", callback=self.parse_page)
118
-
119
- # This request uses a specific proxy, bypassing the rotator
120
- yield response.follow(
121
- "/special-page",
122
- callback=self.parse_page,
123
- proxy="http://special-proxy:8080",
124
- )
125
- ```
126
-
127
- This is useful when certain pages require a specific proxy (e.g., a geo-located proxy for region-specific content).
128
-
129
- ## Blocked Request Handling
130
-
131
- The spider has built-in blocked request detection and retry. By default, it considers the following HTTP status codes blocked: `401`, `403`, `407`, `429`, `444`, `500`, `502`, `503`, `504`.
132
-
133
- The retry system works like this:
134
-
135
- 1. After a response comes back, the spider calls the `is_blocked(response)` method.
136
- 2. If blocked, it copies the request and calls the `retry_blocked_request()` method so you can modify it before retrying.
137
- 3. The retried request is re-queued with `dont_filter=True` (bypassing deduplication) and lower priority, so it's not retried right away.
138
- 4. This repeats up to `max_blocked_retries` times (default: 3).
139
-
140
- **Tip:**
141
-
142
- 1. On retry, the previous `proxy`/`proxies` kwargs are cleared from the request automatically, so the rotator assigns a fresh proxy.
143
- 2. The `max_blocked_retries` attribute is separate from the session's own retries; the two do not share a counter.
144
-
145
- ### Custom Block Detection
146
-
147
- Override `is_blocked()` to add your own detection logic:
148
-
149
- ```python
150
- class MySpider(Spider):
151
- name = "my_spider"
152
- start_urls = ["https://example.com"]
153
-
154
- async def is_blocked(self, response: Response) -> bool:
155
- # Check status codes (default behavior)
156
- if response.status in {403, 429, 503}:
157
- return True
158
-
159
- # Check response content
160
- body = response.body.decode("utf-8", errors="ignore")
161
- if "access denied" in body.lower() or "rate limit" in body.lower():
162
- return True
163
-
164
- return False
165
-
166
- async def parse(self, response: Response):
167
- yield {"title": response.css("title::text").get("")}
168
- ```
169
-
170
- ### Customizing Retries
171
-
172
- Override `retry_blocked_request()` to modify the request before retrying. The `max_blocked_retries` attribute controls how many times a blocked request is retried (default: 3):
173
-
174
- ```python
175
- from scrapling.spiders import Spider, SessionManager, Request, Response
176
- from scrapling.fetchers import FetcherSession, AsyncStealthySession
177
-
178
-
179
- class MySpider(Spider):
180
- name = "my_spider"
181
- start_urls = ["https://example.com"]
182
- max_blocked_retries = 5
183
-
184
- def configure_sessions(self, manager: SessionManager) -> None:
185
- manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari']))
186
- manager.add('stealth', AsyncStealthySession(block_webrtc=True), lazy=True)
187
-
188
- async def retry_blocked_request(self, request: Request, response: Response) -> Request:
189
- request.sid = "stealth"
190
- self.logger.info(f"Retrying blocked request: {request.url}")
191
- return request
192
-
193
- async def parse(self, response: Response):
194
- yield {"title": response.css("title::text").get("")}
195
- ```
196
-
197
- What happened above is that I left the blocking detection logic unchanged and had the spider mainly use requests until it got blocked, then switch to the stealthy browser.
198
-
199
-
200
- Putting it all together:
201
-
202
- ```python
203
- from scrapling.spiders import Spider, SessionManager, Request, Response
204
- from scrapling.fetchers import FetcherSession, AsyncStealthySession, ProxyRotator
205
-
206
-
207
- cheap_proxies = ProxyRotator([ "http://proxy1:8080", "http://proxy2:8080"])
208
-
209
- # A format acceptable by the browser
210
- expensive_proxies = ProxyRotator([
211
- {"server": "http://residential_proxy1:8080", "username": "user", "password": "pass"},
212
- {"server": "http://residential_proxy2:8080", "username": "user", "password": "pass"},
213
- {"server": "http://mobile_proxy1:8080", "username": "user", "password": "pass"},
214
- {"server": "http://mobile_proxy2:8080", "username": "user", "password": "pass"},
215
- ])
216
-
217
-
218
- class MySpider(Spider):
219
- name = "my_spider"
220
- start_urls = ["https://example.com"]
221
- max_blocked_retries = 5
222
-
223
- def configure_sessions(self, manager: SessionManager) -> None:
224
- manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'], proxy_rotator=cheap_proxies))
225
- manager.add('stealth', AsyncStealthySession(block_webrtc=True, proxy_rotator=expensive_proxies), lazy=True)
226
-
227
- async def retry_blocked_request(self, request: Request, response: Response) -> Request:
228
- request.sid = "stealth"
229
- self.logger.info(f"Retrying blocked request: {request.url}")
230
- return request
231
-
232
- async def parse(self, response: Response):
233
- yield {"title": response.css("title::text").get("")}
234
- ```
235
- The above logic is: requests are made with cheap proxies, such as datacenter proxies, until they are blocked, then retried with higher-quality proxies, such as residential or mobile proxies.
@@ -1,196 +0,0 @@
1
- # Requests & Responses
2
-
3
- This page covers the `Request` object in detail: how to construct requests, pass data between callbacks, control priority and deduplication, and use `response.follow()` for link-following.
4
-
5
- ## The Request Object
6
-
7
- A `Request` represents a URL to be fetched. You create requests either directly or via `response.follow()`:
8
-
9
- ```python
10
- from scrapling.spiders import Request
11
-
12
- # Direct construction
13
- request = Request(
14
- "https://example.com/page",
15
- callback=self.parse_page,
16
- priority=5,
17
- )
18
-
19
- # Via response.follow (preferred in callbacks)
20
- request = response.follow("/page", callback=self.parse_page)
21
- ```
22
-
23
- Here are all the arguments you can pass to `Request`:
24
-
25
- | Argument | Type | Default | Description |
26
- |---------------|------------|------------|-------------------------------------------------------------------------------------------------------|
27
- | `url` | `str` | *required* | The URL to fetch |
28
- | `sid` | `str` | `""` | Session ID - routes the request to a specific session (see [Sessions](sessions.md)) |
29
- | `callback` | `callable` | `None` | Async generator method to process the response. Defaults to `parse()` |
30
- | `priority` | `int` | `0` | Higher values are processed first |
31
- | `dont_filter` | `bool` | `False` | If `True`, skip deduplication (allow duplicate requests) |
32
- | `meta` | `dict` | `{}` | Arbitrary metadata passed through to the response |
33
- | `**kwargs` | | | Additional keyword arguments passed to the session's fetch method (e.g., `headers`, `method`, `data`) |
34
-
35
- Any extra keyword arguments are forwarded directly to the underlying session. For example, to make a POST request:
36
-
37
- ```python
38
- yield Request(
39
- "https://example.com/api",
40
- method="POST",
41
- data={"key": "value"},
42
- callback=self.parse_result,
43
- )
44
- ```
45
-
46
- ## Response.follow()
47
-
48
- `response.follow()` is the recommended way to create follow-up requests inside callbacks. It offers several advantages over constructing `Request` objects directly:
49
-
50
- - **Relative URLs** are resolved automatically against the current page URL
51
- - **Referer header** is set to the current page URL by default
52
- - **Session kwargs** from the original request are inherited (headers, proxy settings, etc.)
53
- - **Callback, session ID, and priority** are inherited from the original request if not specified
54
-
55
- ```python
56
- async def parse(self, response: Response):
57
- # Minimal - inherits callback, sid, priority from current request
58
- yield response.follow("/next-page")
59
-
60
- # Override specific fields
61
- yield response.follow(
62
- "/product/123",
63
- callback=self.parse_product,
64
- priority=10,
65
- )
66
-
67
- # Pass additional metadata to the callback
68
- yield response.follow(
69
- "/details",
70
- callback=self.parse_details,
71
- meta={"category": "electronics"},
72
- )
73
- ```
74
-
75
- | Argument | Type | Default | Description |
76
- |--------------------|------------|------------|------------------------------------------------------------|
77
- | `url` | `str` | *required* | URL to follow (absolute or relative) |
78
- | `sid` | `str` | `""` | Session ID (inherits from original request if empty) |
79
- | `callback` | `callable` | `None` | Callback method (inherits from original request if `None`) |
80
- | `priority` | `int` | `None` | Priority (inherits from original request if `None`) |
81
- | `dont_filter` | `bool` | `False` | Skip deduplication |
82
- | `meta` | `dict` | `None` | Metadata (merged with existing response meta) |
83
- | **`referer_flow`** | `bool` | `True` | Set current URL as Referer header |
84
- | `**kwargs` | | | Merged with original request's session kwargs |
85
-
86
- ### Disabling Referer Flow
87
-
88
- By default, `response.follow()` sets the `Referer` header to the current page URL. To disable this:
89
-
90
- ```python
91
- yield response.follow("/page", referer_flow=False)
92
- ```
93
-
94
- ## Callbacks
95
-
96
- Callbacks are async generator methods on your spider that process responses. They must `yield` one of three types:
97
-
98
- - **`dict`**: A scraped item, added to the results
99
- - **`Request`**: A follow-up request, added to the queue
100
- - **`None`**: Silently ignored
101
-
102
- ```python
103
- class MySpider(Spider):
104
- name = "my_spider"
105
- start_urls = ["https://example.com"]
106
-
107
- async def parse(self, response: Response):
108
- # Yield items (dicts)
109
- yield {"url": response.url, "title": response.css("title::text").get("")}
110
-
111
- # Yield follow-up requests
112
- for link in response.css("a::attr(href)").getall():
113
- yield response.follow(link, callback=self.parse_page)
114
-
115
- async def parse_page(self, response: Response):
116
- yield {"content": response.css("article::text").get("")}
117
- ```
118
-
119
- **Note:** All callback methods must be `async def` and use `yield` (not `return`). Even if a callback only yields items with no follow-up requests, it must still be an async generator.
120
-
121
- ## Request Priority
122
-
123
- Requests with higher priority values are processed first. This is useful when some pages should be processed before others:
124
-
125
- ```python
126
- async def parse(self, response: Response):
127
- # High priority - process product pages first
128
- for link in response.css("a.product::attr(href)").getall():
129
- yield response.follow(link, callback=self.parse_product, priority=10)
130
-
131
- # Low priority - pagination links processed after products
132
- next_page = response.css("a.next::attr(href)").get()
133
- if next_page:
134
- yield response.follow(next_page, callback=self.parse, priority=0)
135
- ```
136
-
137
- When using `response.follow()`, the priority is inherited from the original request unless you specify a new one.
138
-
139
- ## Deduplication
140
-
141
- The spider automatically deduplicates requests based on a fingerprint computed from the URL, HTTP method, request body, and session ID. If two requests produce the same fingerprint, the second one is silently dropped.
142
-
143
- To allow duplicate requests (e.g., re-visiting a page after login), set `dont_filter=True`:
144
-
145
- ```python
146
- yield Request("https://example.com/dashboard", dont_filter=True, callback=self.parse_dashboard)
147
-
148
- # Or with response.follow
149
- yield response.follow("/dashboard", dont_filter=True, callback=self.parse_dashboard)
150
- ```
151
-
152
- You can fine-tune what goes into the fingerprint using class attributes on your spider:
153
-
154
- | Attribute | Default | Effect |
155
- |----------------------|---------|-----------------------------------------------------------------------------------------------------------------|
156
- | `fp_include_kwargs` | `False` | Include extra request kwargs (arguments you passed to the session fetch, like headers, etc.) in the fingerprint |
157
- | `fp_keep_fragments` | `False` | Keep URL fragments (`#section`) when computing fingerprints |
158
- | `fp_include_headers` | `False` | Include request headers in the fingerprint |
159
-
160
- For example, if you need to treat `https://example.com/page#section1` and `https://example.com/page#section2` as different URLs:
161
-
162
- ```python
163
- class MySpider(Spider):
164
- name = "my_spider"
165
- fp_keep_fragments = True
166
- # ...
167
- ```
168
-
169
- ## Request Meta
170
-
171
- The `meta` dictionary lets you pass arbitrary data between callbacks. This is useful when you need context from one page to process another:
172
-
173
- ```python
174
- async def parse(self, response: Response):
175
- for product in response.css("div.product"):
176
- category = product.css("span.category::text").get("")
177
- link = product.css("a::attr(href)").get()
178
- if link:
179
- yield response.follow(
180
- link,
181
- callback=self.parse_product,
182
- meta={"category": category},
183
- )
184
-
185
- async def parse_product(self, response: Response):
186
- yield {
187
- "name": response.css("h1::text").get(""),
188
- "price": response.css(".price::text").get(""),
189
- # Access meta from the request
190
- "category": response.meta.get("category", ""),
191
- }
192
- ```
193
-
194
- When using `response.follow()`, the meta from the current response is merged with the new meta you provide (new values take precedence).
195
-
196
- The spider system also automatically stores some metadata. For example, the proxy used for a request is available as `response.meta["proxy"]` when proxy rotation is enabled.
@@ -1,205 +0,0 @@
1
- # Spider sessions
2
-
3
- A spider can use multiple fetcher sessions simultaneously. For example, a fast HTTP session for simple pages and a stealth browser session for protected pages.
4
-
5
- ## What are Sessions?
6
-
7
- A session is a pre-configured fetcher instance that stays alive for the duration of the crawl. Instead of creating a new connection or browser for every request, the spider reuses sessions, which is faster and more resource-efficient.
8
-
9
- By default, every spider creates a single [FetcherSession](../fetching/static.md). You can add more sessions or swap the default by overriding the `configure_sessions()` method, but you must use only the async version of each session, as shown in the table below:
10
-
11
-
12
- | Session Type | Use Case |
13
- |-------------------------------------------------|------------------------------------------|
14
- | [FetcherSession](../fetching/static.md) | Fast HTTP requests, no JavaScript |
15
- | [AsyncDynamicSession](../fetching/dynamic.md) | Browser automation, JavaScript rendering |
16
- | [AsyncStealthySession](../fetching/stealthy.md) | Anti-bot bypass, Cloudflare, etc. |
17
-
18
-
19
- ## Configuring Sessions
20
-
21
- Override `configure_sessions()` on your spider to set up sessions. The `manager` parameter is a `SessionManager` instance - use `manager.add()` to register sessions:
22
-
23
- ```python
24
- from scrapling.spiders import Spider, Response
25
- from scrapling.fetchers import FetcherSession
26
-
27
- class MySpider(Spider):
28
- name = "my_spider"
29
- start_urls = ["https://example.com"]
30
-
31
- def configure_sessions(self, manager):
32
- manager.add("default", FetcherSession())
33
-
34
- async def parse(self, response: Response):
35
- yield {"title": response.css("title::text").get("")}
36
- ```
37
-
38
- The `manager.add()` method takes:
39
-
40
- | Argument | Type | Default | Description |
41
- |--------------|-----------|------------|----------------------------------------------|
42
- | `session_id` | `str` | *required* | A name to reference this session in requests |
43
- | `session` | `Session` | *required* | The session instance |
44
- | `default` | `bool` | `False` | Make this the default session |
45
- | `lazy` | `bool` | `False` | Start the session only when first used |
46
-
47
- **Notes:**
48
-
49
- 1. In all requests, if you don't specify which session to use, the default session is used. The default session is determined in one of two ways:
50
- 1. The first session you add to the manager becomes the default automatically.
51
- 2. The session that is added to the manager with `default=True`.
52
- 2. The session instances you pass don't have to be started beforehand; the spider checks each session and starts any that are not already running.
53
- 3. If you want a specific session to start only when it is first used, pass the `lazy` argument when adding that session to the manager. Example: start the browser only when you need it, not when the spider starts.
54
-
55
- ## Multi-Session Spider
56
-
57
- Here's a practical example: use a fast HTTP session for listing pages and a stealth browser for detail pages that have bot protection:
58
-
59
- ```python
60
- from scrapling.spiders import Spider, Response
61
- from scrapling.fetchers import FetcherSession, AsyncStealthySession
62
-
63
- class ProductSpider(Spider):
64
- name = "products"
65
- start_urls = ["https://shop.example.com/products"]
66
-
67
- def configure_sessions(self, manager):
68
- # Fast HTTP for listing pages (default)
69
- manager.add("http", FetcherSession())
70
-
71
- # Stealth browser for protected product pages
72
- manager.add("stealth", AsyncStealthySession(
73
- headless=True,
74
- network_idle=True,
75
- ))
76
-
77
- async def parse(self, response: Response):
78
- for link in response.css("a.product::attr(href)").getall():
79
- # Route product pages through the stealth session
80
- yield response.follow(link, sid="stealth", callback=self.parse_product)
81
-
82
- next_page = response.css("a.next::attr(href)").get()
83
- if next_page:
84
- yield response.follow(next_page)
85
-
86
- async def parse_product(self, response: Response):
87
- yield {
88
- "name": response.css("h1::text").get(""),
89
- "price": response.css(".price::text").get(""),
90
- }
91
- ```
92
-
93
- The key is the `sid` parameter - it tells the spider which session to use for each request. When you call `response.follow()` without `sid`, the session ID from the original request is inherited.
94
-
95
- Sessions can also be different instances of the same class with different configurations:
96
-
97
- ```python
98
- from scrapling.spiders import Spider, Response
99
- from scrapling.fetchers import FetcherSession
100
-
101
- class ProductSpider(Spider):
102
- name = "products"
103
- start_urls = ["https://shop.example.com/products"]
104
-
105
- def configure_sessions(self, manager):
106
- chrome_requests = FetcherSession(impersonate="chrome")
107
- firefox_requests = FetcherSession(impersonate="firefox")
108
-
109
- manager.add("chrome", chrome_requests)
110
- manager.add("firefox", firefox_requests)
111
-
112
- async def parse(self, response: Response):
113
- for link in response.css("a.product::attr(href)").getall():
114
- yield response.follow(link, callback=self.parse_product)
115
-
116
- next_page = response.css("a.next::attr(href)").get()
117
- if next_page:
118
- yield response.follow(next_page, sid="firefox")
119
-
120
- async def parse_product(self, response: Response):
121
- yield {
122
- "name": response.css("h1::text").get(""),
123
- "price": response.css(".price::text").get(""),
124
- }
125
- ```
126
-
127
- ## Session Arguments
128
-
129
- Extra keyword arguments passed to a `Request` (or through `response.follow(**kwargs)`) are forwarded to the session's fetch method. This lets you customize individual requests without changing the session configuration:
130
-
131
- ```python
132
- async def parse(self, response: Response):
133
- # Pass extra headers for this specific request
134
- yield Request(
135
- "https://api.example.com/data",
136
- headers={"Authorization": "Bearer token123"},
137
- callback=self.parse_api,
138
- )
139
-
140
- # Use a different HTTP method
141
- yield Request(
142
- "https://example.com/submit",
143
- method="POST",
144
- data={"field": "value"},
145
- sid="firefox",
146
- callback=self.parse_result,
147
- )
148
- ```
149
-
150
- **Warning:** When using `FetcherSession` in spiders, you cannot use `.get()` and `.post()` methods directly. By default, the request is an HTTP GET request; to use another HTTP method, pass it to the `method` argument as in the above example. This unifies the `Request` interface across all session types.
151
-
152
- For browser sessions (`AsyncDynamicSession`, `AsyncStealthySession`), you can pass browser-specific arguments like `wait_selector`, `page_action`, or `extra_headers`:
153
-
154
- ```python
155
- async def parse(self, response: Response):
156
- # Use Cloudflare solver with the `AsyncStealthySession` we configured above
157
- yield Request(
158
- "https://nopecha.com/demo/cloudflare",
159
- sid="stealth",
160
- callback=self.parse_result,
161
- solve_cloudflare=True,
162
- block_webrtc=True,
163
- hide_canvas=True,
164
- google_search=True,
165
- )
166
-
167
- yield response.follow(
168
- "/dynamic-page",
169
- sid="browser",
170
- callback=self.parse_dynamic,
171
- wait_selector="div.loaded",
172
- network_idle=True,
173
- )
174
- ```
175
-
176
- **Warning:** Session arguments (`**kwargs`) passed from the original request are inherited by `response.follow()`. New kwargs take precedence over inherited ones.
177
-
178
- ```python
179
- from scrapling.spiders import Spider, Response
180
- from scrapling.fetchers import FetcherSession
181
-
182
- class ProductSpider(Spider):
183
- name = "products"
184
- start_urls = ["https://shop.example.com/products"]
185
-
186
- def configure_sessions(self, manager):
187
- manager.add("http", FetcherSession(impersonate='chrome'))
188
-
189
- async def parse(self, response: Response):
190
- # I don't want the follow request to impersonate a desktop Chrome like the previous request, but a mobile one
191
- # so I override it like this
192
- for link in response.css("a.product::attr(href)").getall():
193
- yield response.follow(link, impersonate="chrome131_android", callback=self.parse_product)
194
-
195
- next_page = response.css("a.next::attr(href)").get()
196
- if next_page:
197
- yield Request(next_page)
198
-
199
- async def parse_product(self, response: Response):
200
- yield {
201
- "name": response.css("h1::text").get(""),
202
- "price": response.css(".price::text").get(""),
203
- }
204
- ```
205
- **Note:** Upon spider closure, the manager automatically checks whether any sessions are still running and closes them before closing the spider.
package/PLAN.md DELETED
@@ -1,11 +0,0 @@
1
- # Plan
2
-
3
- ## Harness execution model improvements
4
- 1. Make harness mimic enterprise software engineering team execution.
5
- 2. Require project wiki creation at project start.
6
- 3. Require every design decision to be documented in wiki with rationale.
7
- 4. Before code changes, require referencing relevant wiki design decisions/guidelines to maintain continuity.
8
- 5. Reference https://handbook.gitlab.com/handbook/engineering/ while building the harness architecture.
9
-
10
- ## Tracking note
11
- - New execution-model requirements are now tracked here for implementation.