wegho-agentes 7.0.3 → 7.0.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/.agent/.shared/ui-ux-pro-max/data/charts.csv +26 -0
- package/.agent/.shared/ui-ux-pro-max/data/colors.csv +97 -0
- package/.agent/.shared/ui-ux-pro-max/data/icons.csv +101 -0
- package/.agent/.shared/ui-ux-pro-max/data/landing.csv +31 -0
- package/.agent/.shared/ui-ux-pro-max/data/products.csv +97 -0
- package/.agent/.shared/ui-ux-pro-max/data/prompts.csv +24 -0
- package/.agent/.shared/ui-ux-pro-max/data/react-performance.csv +45 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/flutter.csv +53 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/html-tailwind.csv +56 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/jetpack-compose.csv +53 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/nextjs.csv +53 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/nuxt-ui.csv +51 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/nuxtjs.csv +59 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/react-native.csv +52 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/react.csv +54 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/shadcn.csv +61 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/svelte.csv +54 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/swiftui.csv +51 -0
- package/.agent/.shared/ui-ux-pro-max/data/stacks/vue.csv +50 -0
- package/.agent/.shared/ui-ux-pro-max/data/styles.csv +59 -0
- package/.agent/.shared/ui-ux-pro-max/data/typography.csv +58 -0
- package/.agent/.shared/ui-ux-pro-max/data/ui-reasoning.csv +101 -0
- package/.agent/.shared/ui-ux-pro-max/data/ux-guidelines.csv +100 -0
- package/.agent/.shared/ui-ux-pro-max/data/web-interface.csv +31 -0
- package/.agent/.shared/ui-ux-pro-max/scripts/core.py +258 -0
- package/.agent/.shared/ui-ux-pro-max/scripts/design_system.py +1067 -0
- package/.agent/.shared/ui-ux-pro-max/scripts/search.py +106 -0
- package/.agent/ARCHITECTURE.md +288 -0
- package/.agent/agents/backend-specialist.md +263 -0
- package/.agent/agents/code-archaeologist.md +106 -0
- package/.agent/agents/database-architect.md +226 -0
- package/.agent/agents/debugger.md +225 -0
- package/.agent/agents/devops-engineer.md +242 -0
- package/.agent/agents/documentation-writer.md +104 -0
- package/.agent/agents/explorer-agent.md +73 -0
- package/.agent/agents/frontend-specialist.md +593 -0
- package/.agent/agents/game-developer.md +162 -0
- package/.agent/agents/mobile-developer.md +377 -0
- package/.agent/agents/orchestrator.md +416 -0
- package/.agent/agents/penetration-tester.md +188 -0
- package/.agent/agents/performance-optimizer.md +187 -0
- package/.agent/agents/product-manager.md +112 -0
- package/.agent/agents/product-owner.md +95 -0
- package/.agent/agents/project-planner.md +406 -0
- package/.agent/agents/qa-automation-engineer.md +103 -0
- package/.agent/agents/security-auditor.md +170 -0
- package/.agent/agents/seo-specialist.md +111 -0
- package/.agent/agents/test-engineer.md +158 -0
- package/.agent/mcp_config.json +24 -0
- package/.agent/mcp_config.md +30 -0
- package/.agent/rules/GEMINI.md +308 -0
- package/.agent/scripts/auto_preview.py +148 -0
- package/.agent/scripts/checklist.py +217 -0
- package/.agent/scripts/session_manager.py +120 -0
- package/.agent/scripts/verify_all.py +327 -0
- package/.agent/skills/api-patterns/SKILL.md +81 -0
- package/.agent/skills/api-patterns/api-style.md +42 -0
- package/.agent/skills/api-patterns/auth.md +24 -0
- package/.agent/skills/api-patterns/documentation.md +26 -0
- package/.agent/skills/api-patterns/graphql.md +41 -0
- package/.agent/skills/api-patterns/rate-limiting.md +31 -0
- package/.agent/skills/api-patterns/response.md +37 -0
- package/.agent/skills/api-patterns/rest.md +40 -0
- package/.agent/skills/api-patterns/scripts/api_validator.py +211 -0
- package/.agent/skills/api-patterns/security-testing.md +122 -0
- package/.agent/skills/api-patterns/trpc.md +41 -0
- package/.agent/skills/api-patterns/versioning.md +22 -0
- package/.agent/skills/app-builder/SKILL.md +75 -0
- package/.agent/skills/app-builder/agent-coordination.md +71 -0
- package/.agent/skills/app-builder/feature-building.md +53 -0
- package/.agent/skills/app-builder/project-detection.md +34 -0
- package/.agent/skills/app-builder/scaffolding.md +118 -0
- package/.agent/skills/app-builder/tech-stack.md +41 -0
- package/.agent/skills/app-builder/templates/SKILL.md +39 -0
- package/.agent/skills/app-builder/templates/astro-static/TEMPLATE.md +76 -0
- package/.agent/skills/app-builder/templates/chrome-extension/TEMPLATE.md +92 -0
- package/.agent/skills/app-builder/templates/cli-tool/TEMPLATE.md +88 -0
- package/.agent/skills/app-builder/templates/electron-desktop/TEMPLATE.md +88 -0
- package/.agent/skills/app-builder/templates/express-api/TEMPLATE.md +83 -0
- package/.agent/skills/app-builder/templates/flutter-app/TEMPLATE.md +90 -0
- package/.agent/skills/app-builder/templates/monorepo-turborepo/TEMPLATE.md +90 -0
- package/.agent/skills/app-builder/templates/nextjs-fullstack/TEMPLATE.md +122 -0
- package/.agent/skills/app-builder/templates/nextjs-saas/TEMPLATE.md +122 -0
- package/.agent/skills/app-builder/templates/nextjs-static/TEMPLATE.md +169 -0
- package/.agent/skills/app-builder/templates/nuxt-app/TEMPLATE.md +134 -0
- package/.agent/skills/app-builder/templates/python-fastapi/TEMPLATE.md +83 -0
- package/.agent/skills/app-builder/templates/react-native-app/TEMPLATE.md +119 -0
- package/.agent/skills/architecture/SKILL.md +55 -0
- package/.agent/skills/architecture/context-discovery.md +43 -0
- package/.agent/skills/architecture/examples.md +94 -0
- package/.agent/skills/architecture/pattern-selection.md +68 -0
- package/.agent/skills/architecture/patterns-reference.md +50 -0
- package/.agent/skills/architecture/trade-off-analysis.md +77 -0
- package/.agent/skills/bash-linux/SKILL.md +199 -0
- package/.agent/skills/behavioral-modes/SKILL.md +242 -0
- package/.agent/skills/brainstorming/SKILL.md +163 -0
- package/.agent/skills/brainstorming/dynamic-questioning.md +350 -0
- package/.agent/skills/clean-code/SKILL.md +201 -0
- package/.agent/skills/code-review-checklist/SKILL.md +109 -0
- package/.agent/skills/database-design/SKILL.md +103 -0
- package/.agent/skills/database-design/database-selection.md +43 -0
- package/.agent/skills/database-design/github-benchmarks.md +35 -0
- package/.agent/skills/database-design/indexing.md +39 -0
- package/.agent/skills/database-design/migrations.md +48 -0
- package/.agent/skills/database-design/optimization.md +36 -0
- package/.agent/skills/database-design/orm-selection.md +30 -0
- package/.agent/skills/database-design/schema-design.md +56 -0
- package/.agent/skills/database-design/scripts/schema_validator.py +172 -0
- package/.agent/skills/database-design/supabase-security-egress.md +42 -0
- package/.agent/skills/deployment-procedures/SKILL.md +241 -0
- package/.agent/skills/doc.md +177 -0
- package/.agent/skills/documentation-templates/SKILL.md +194 -0
- package/.agent/skills/file-doc-sync/SKILL.md +44 -0
- package/.agent/skills/file-doc-sync/references/document-structure.md +20 -0
- package/.agent/skills/file-doc-sync/scripts/sync-file-docs.md +45 -0
- package/.agent/skills/file-doc-sync/scripts/sync-file-docs.ts +349 -0
- package/.agent/skills/frontend-design/SKILL.md +452 -0
- package/.agent/skills/frontend-design/animation-guide.md +331 -0
- package/.agent/skills/frontend-design/color-system.md +311 -0
- package/.agent/skills/frontend-design/decision-trees.md +418 -0
- package/.agent/skills/frontend-design/motion-graphics.md +306 -0
- package/.agent/skills/frontend-design/scripts/accessibility_checker.py +183 -0
- package/.agent/skills/frontend-design/scripts/ux_audit.py +722 -0
- package/.agent/skills/frontend-design/typography-system.md +345 -0
- package/.agent/skills/frontend-design/ux-psychology.md +1116 -0
- package/.agent/skills/frontend-design/visual-effects.md +383 -0
- package/.agent/skills/game-development/2d-games/SKILL.md +119 -0
- package/.agent/skills/game-development/3d-games/SKILL.md +135 -0
- package/.agent/skills/game-development/SKILL.md +167 -0
- package/.agent/skills/game-development/game-art/SKILL.md +185 -0
- package/.agent/skills/game-development/game-audio/SKILL.md +190 -0
- package/.agent/skills/game-development/game-design/SKILL.md +129 -0
- package/.agent/skills/game-development/mobile-games/SKILL.md +108 -0
- package/.agent/skills/game-development/multiplayer/SKILL.md +132 -0
- package/.agent/skills/game-development/pc-games/SKILL.md +144 -0
- package/.agent/skills/game-development/vr-ar/SKILL.md +123 -0
- package/.agent/skills/game-development/web-games/SKILL.md +150 -0
- package/.agent/skills/geo-fundamentals/SKILL.md +156 -0
- package/.agent/skills/geo-fundamentals/scripts/geo_checker.py +289 -0
- package/.agent/skills/i18n-localization/SKILL.md +154 -0
- package/.agent/skills/i18n-localization/scripts/i18n_checker.py +241 -0
- package/.agent/skills/intelligent-routing/SKILL.md +335 -0
- package/.agent/skills/lint-and-validate/SKILL.md +45 -0
- package/.agent/skills/lint-and-validate/scripts/lint_runner.py +184 -0
- package/.agent/skills/lint-and-validate/scripts/type_coverage.py +173 -0
- package/.agent/skills/mcp-builder/SKILL.md +176 -0
- package/.agent/skills/mobile-design/SKILL.md +394 -0
- package/.agent/skills/mobile-design/decision-trees.md +516 -0
- package/.agent/skills/mobile-design/mobile-backend.md +491 -0
- package/.agent/skills/mobile-design/mobile-color-system.md +420 -0
- package/.agent/skills/mobile-design/mobile-debugging.md +122 -0
- package/.agent/skills/mobile-design/mobile-design-thinking.md +357 -0
- package/.agent/skills/mobile-design/mobile-navigation.md +458 -0
- package/.agent/skills/mobile-design/mobile-performance.md +767 -0
- package/.agent/skills/mobile-design/mobile-testing.md +356 -0
- package/.agent/skills/mobile-design/mobile-typography.md +433 -0
- package/.agent/skills/mobile-design/platform-android.md +666 -0
- package/.agent/skills/mobile-design/platform-ios.md +561 -0
- package/.agent/skills/mobile-design/scripts/mobile_audit.py +670 -0
- package/.agent/skills/mobile-design/touch-psychology.md +537 -0
- package/.agent/skills/nextjs-react-expert/1-async-eliminating-waterfalls.md +351 -0
- package/.agent/skills/nextjs-react-expert/2-bundle-bundle-size-optimization.md +240 -0
- package/.agent/skills/nextjs-react-expert/3-server-server-side-performance.md +490 -0
- package/.agent/skills/nextjs-react-expert/4-client-client-side-data-fetching.md +264 -0
- package/.agent/skills/nextjs-react-expert/5-rerender-re-render-optimization.md +581 -0
- package/.agent/skills/nextjs-react-expert/6-rendering-rendering-performance.md +432 -0
- package/.agent/skills/nextjs-react-expert/7-js-javascript-performance.md +684 -0
- package/.agent/skills/nextjs-react-expert/8-advanced-advanced-patterns.md +150 -0
- package/.agent/skills/nextjs-react-expert/9-cache-components.md +103 -0
- package/.agent/skills/nextjs-react-expert/SKILL.md +293 -0
- package/.agent/skills/nextjs-react-expert/scripts/convert_rules.py +222 -0
- package/.agent/skills/nextjs-react-expert/scripts/react_performance_checker.py +252 -0
- package/.agent/skills/nodejs-best-practices/SKILL.md +333 -0
- package/.agent/skills/parallel-agents/SKILL.md +175 -0
- package/.agent/skills/performance-profiling/SKILL.md +143 -0
- package/.agent/skills/performance-profiling/scripts/lighthouse_audit.py +76 -0
- package/.agent/skills/plan-writing/SKILL.md +152 -0
- package/.agent/skills/powershell-windows/SKILL.md +167 -0
- package/.agent/skills/python-patterns/SKILL.md +441 -0
- package/.agent/skills/red-team-tactics/SKILL.md +199 -0
- package/.agent/skills/rust-pro/SKILL.md +176 -0
- package/.agent/skills/seo-fundamentals/SKILL.md +129 -0
- package/.agent/skills/seo-fundamentals/scripts/seo_checker.py +219 -0
- package/.agent/skills/server-management/SKILL.md +161 -0
- package/.agent/skills/systematic-debugging/SKILL.md +109 -0
- package/.agent/skills/tailwind-patterns/SKILL.md +269 -0
- package/.agent/skills/tdd-workflow/SKILL.md +149 -0
- package/.agent/skills/testing-patterns/SKILL.md +178 -0
- package/.agent/skills/testing-patterns/scripts/test_runner.py +219 -0
- package/.agent/skills/vulnerability-scanner/SKILL.md +322 -0
- package/.agent/skills/vulnerability-scanner/checklists.md +121 -0
- package/.agent/skills/vulnerability-scanner/github-benchmarks.md +32 -0
- package/.agent/skills/vulnerability-scanner/scripts/security_scan.py +458 -0
- package/.agent/skills/web-design-guidelines/SKILL.md +57 -0
- package/.agent/skills/webapp-testing/SKILL.md +187 -0
- package/.agent/skills/webapp-testing/scripts/playwright_runner.py +173 -0
- package/.agent/skills/wegho-global-rules/SKILL.md +36 -6
- package/.agent/workflows/PROMPT_GUIDE.md +16 -203
- package/.agent/workflows/auto-run-orchestrator.md +3 -3
- package/.agent/workflows/brainstorm.md +113 -0
- package/.agent/workflows/create.md +59 -0
- package/.agent/workflows/debug.md +103 -0
- package/.agent/workflows/deploy.md +176 -0
- package/.agent/workflows/enhance.md +63 -0
- package/.agent/workflows/orchestrate.md +237 -0
- package/.agent/workflows/plan.md +89 -0
- package/.agent/workflows/preview.md +81 -0
- package/.agent/workflows/run-agents-workflow.md +15 -186
- package/.agent/workflows/status.md +86 -0
- package/.agent/workflows/test.md +144 -0
- package/.agent/workflows/ui-ux-pro-max.md +296 -0
- package/.agents/AGENT_WORKFLOW.md +36 -528
- package/.agents/CLI.md +42 -222
- package/.agents/README.md +18 -130
- package/.agents/antigravity-preflight.md +36 -0
- package/.agents/antigravity-preflight.ts +21 -80
- package/.agents/auto-runner.md +36 -0
- package/.agents/auto-runner.ts +2 -2
- package/.agents/cli.ts +2 -2
- package/.agents/code-auditor-agent.md +38 -0
- package/.agents/config.md +34 -0
- package/.agents/config.ts +15 -29
- package/.agents/context-loader.md +37 -0
- package/.agents/core/agent-parallelizer.md +33 -0
- package/.agents/core/ai-agents-agent.md +35 -0
- package/.agents/core/architecture-agent.md +39 -0
- package/.agents/core/architecture-agent.ts +1 -1
- package/.agents/core/automation-agent.md +33 -0
- package/.agents/core/backend-agent.md +35 -0
- package/.agents/core/base-agent.md +36 -0
- package/.agents/core/base-agent.ts +132 -337
- package/.agents/core/build-manager.md +35 -0
- package/.agents/core/cache-manager.md +32 -0
- package/.agents/core/checkpoint-manager.md +36 -0
- package/.agents/core/cloud-agent.md +33 -0
- package/.agents/core/cro-agent.md +33 -0
- package/.agents/core/database-agent.md +35 -0
- package/.agents/core/devops-agent.md +33 -0
- package/.agents/core/documentation-agent.md +36 -0
- package/.agents/core/documentation-agent.ts +1 -1
- package/.agents/core/file-generator.md +42 -0
- package/.agents/core/frontend-agent.md +36 -0
- package/.agents/core/frontend-agent.ts +1 -1
- package/.agents/core/nextjs-agent.md +35 -0
- package/.agents/core/pentest-agent.md +35 -0
- package/.agents/core/performance-tracker.md +33 -0
- package/.agents/core/planning-agent.md +33 -0
- package/.agents/core/planning-agent.ts +77 -388
- package/.agents/core/quality-agent.md +36 -0
- package/.agents/core/quality-agent.ts +1 -1
- package/.agents/core/rag-agent.md +36 -0
- package/.agents/core/report-generator.md +35 -0
- package/.agents/core/retry-utility.md +34 -0
- package/.agents/core/security-agent.md +39 -0
- package/.agents/core/security-agent.ts +1 -1
- package/.agents/core/skill-manager.md +36 -0
- package/.agents/core/stack-boundary-agent.md +38 -0
- package/.agents/core/testing-agent.md +33 -0
- package/.agents/core/ui-data-map-agent.md +38 -0
- package/.agents/core/uiux-agent.md +35 -0
- package/.agents/core/workflow-validator.md +33 -0
- package/.agents/core/workflow-validator.ts +98 -158
- package/.agents/domains/README.md +10 -53
- package/.agents/domains/logistics/route-agent.md +34 -0
- package/.agents/domains/logistics/route-agent.ts +1 -1
- package/.agents/domains/news/cms-agent.md +35 -0
- package/.agents/domains/news/cms-agent.ts +1 -1
- package/.agents/domains/news/seo-agent.md +34 -0
- package/.agents/domains/news/seo-agent.ts +1 -1
- package/.agents/domains/production/production-control-agent.md +34 -0
- package/.agents/domains/production/production-control-agent.ts +1 -1
- package/.agents/init.md +40 -0
- package/.agents/init.ts +81 -168
- package/.agents/install.md +34 -0
- package/.agents/install.ts +115 -138
- package/.agents/orchestrator.md +43 -0
- package/.agents/orchestrator.ts +322 -764
- package/.agents/project-discovery-agent.md +40 -0
- package/.agents/reference-repositories.json +5 -0
- package/.agents/reference-repositories.md +30 -0
- package/.agents/security/vulnerability-db.md +42 -0
- package/.agents/sync-docs.md +32 -0
- package/.agents/sync-docs.ts +15 -0
- package/.agents/task-analyzer-agent.md +36 -0
- package/.agents/task-analyzer-agent.ts +122 -478
- package/.agents/validate.md +36 -0
- package/.agents/validate.ts +1 -1
- package/INSTALL.md +18 -300
- package/README.md +20 -332
- package/package.json +19 -3
- package/skills/algorithmic-art/templates/generator_template.md +31 -0
- package/skills/algorithmic-art/templates/viewer.md +31 -0
- package/skills/app-store-optimization/expected_output.md +30 -0
- package/skills/app-store-optimization/sample_input.md +30 -0
- package/skills/cc-skill-continuous-learning/config.md +30 -0
- package/skills/claude-d3js-skill/assets/chart-template.md +32 -0
- package/skills/claude-d3js-skill/assets/interactive-template.md +32 -0
- package/skills/claude-d3js-skill/assets/sample-data.md +30 -0
- package/skills/loki-mode/scripts/take-screenshots.md +33 -0
- package/skills/playwright-skill/lib/helpers.md +34 -0
- package/skills/playwright-skill/package.md +30 -0
- package/skills/playwright-skill/run.md +36 -0
- package/skills/postgres-best-practices/metadata.md +31 -0
- package/skills/pptx-official/scripts/html2pptx.md +38 -0
- package/skills/react-best-practices/metadata.md +30 -0
- package/skills/remotion-best-practices/rules/assets/charts-bar-chart.md +33 -0
- package/skills/remotion-best-practices/rules/assets/text-animations-typewriter.md +32 -0
- package/skills/remotion-best-practices/rules/assets/text-animations-word-highlight.md +34 -0
- package/skills/systematic-debugging/condition-based-waiting-example.md +34 -0
- package/skills/typescript-expert/references/tsconfig-strict.md +30 -0
- package/skills/typescript-expert/references/utility-types.md +32 -0
- package/skills/writing-skills/render-graphs.md +32 -0
- package/.agents/AI_COMPATIBILITY.md +0 -333
- package/.agents/core/feedback-collector.ts +0 -207
- package/.agents/core/inventory-agent.ts +0 -757
- package/.agents/core/memory-system.ts +0 -429
- package/.agents/memory/ai-agents-agent/failures.json +0 -1
- package/.agents/memory/ai-agents-agent/learnings.json +0 -1
- package/.agents/memory/ai-agents-agent/specialty.md +0 -3
- package/.agents/memory/ai-agents-agent/successes.json +0 -1
- package/.agents/memory/architecture-agent/failures.json +0 -1
- package/.agents/memory/architecture-agent/learnings.json +0 -1
- package/.agents/memory/architecture-agent/specialty.md +0 -31
- package/.agents/memory/architecture-agent/successes.json +0 -1
- package/.agents/memory/automation-agent/failures.json +0 -1
- package/.agents/memory/automation-agent/learnings.json +0 -1
- package/.agents/memory/automation-agent/specialty.md +0 -3
- package/.agents/memory/automation-agent/successes.json +0 -1
- package/.agents/memory/backend-agent/failures.json +0 -1
- package/.agents/memory/backend-agent/learnings.json +0 -1
- package/.agents/memory/backend-agent/specialty.md +0 -3
- package/.agents/memory/backend-agent/successes.json +0 -1
- package/.agents/memory/cloud-agent/failures.json +0 -1
- package/.agents/memory/cloud-agent/learnings.json +0 -1
- package/.agents/memory/cloud-agent/specialty.md +0 -3
- package/.agents/memory/cloud-agent/successes.json +0 -1
- package/.agents/memory/cms-agent/failures.json +0 -1
- package/.agents/memory/cms-agent/learnings.json +0 -1
- package/.agents/memory/cms-agent/specialty.md +0 -30
- package/.agents/memory/cms-agent/successes.json +0 -1
- package/.agents/memory/cro-agent/failures.json +0 -1
- package/.agents/memory/cro-agent/learnings.json +0 -1
- package/.agents/memory/cro-agent/specialty.md +0 -3
- package/.agents/memory/cro-agent/successes.json +0 -1
- package/.agents/memory/database-agent/failures.json +0 -1
- package/.agents/memory/database-agent/learnings.json +0 -1
- package/.agents/memory/database-agent/specialty.md +0 -3
- package/.agents/memory/database-agent/successes.json +0 -1
- package/.agents/memory/devops-agent/failures.json +0 -1
- package/.agents/memory/devops-agent/learnings.json +0 -1
- package/.agents/memory/devops-agent/specialty.md +0 -3
- package/.agents/memory/devops-agent/successes.json +0 -1
- package/.agents/memory/documentation-agent/failures.json +0 -1
- package/.agents/memory/documentation-agent/learnings.json +0 -1
- package/.agents/memory/documentation-agent/specialty.md +0 -33
- package/.agents/memory/documentation-agent/successes.json +0 -1
- package/.agents/memory/frontend-agent/failures.json +0 -1
- package/.agents/memory/frontend-agent/learnings.json +0 -1
- package/.agents/memory/frontend-agent/specialty.md +0 -30
- package/.agents/memory/frontend-agent/successes.json +0 -1
- package/.agents/memory/inventory-agent/failures.json +0 -1
- package/.agents/memory/inventory-agent/inventory/index.json +0 -1
- package/.agents/memory/inventory-agent/inventory/types.json +0 -1
- package/.agents/memory/inventory-agent/inventory/variables.json +0 -1
- package/.agents/memory/inventory-agent/learnings.json +0 -1
- package/.agents/memory/inventory-agent/specialty.md +0 -129
- package/.agents/memory/inventory-agent/successes.json +0 -1
- package/.agents/memory/nextjs-agent/failures.json +0 -1
- package/.agents/memory/nextjs-agent/learnings.json +0 -1
- package/.agents/memory/nextjs-agent/specialty.md +0 -3
- package/.agents/memory/nextjs-agent/successes.json +0 -1
- package/.agents/memory/pentest-agent/failures.json +0 -1
- package/.agents/memory/pentest-agent/learnings.json +0 -1
- package/.agents/memory/pentest-agent/specialty.md +0 -3
- package/.agents/memory/pentest-agent/successes.json +0 -1
- package/.agents/memory/planning-agent/specialty.md +0 -13
- package/.agents/memory/production-control-agent/failures.json +0 -1
- package/.agents/memory/production-control-agent/learnings.json +0 -1
- package/.agents/memory/production-control-agent/specialty.md +0 -29
- package/.agents/memory/production-control-agent/successes.json +0 -1
- package/.agents/memory/quality-agent/failures.json +0 -1
- package/.agents/memory/quality-agent/learnings.json +0 -1
- package/.agents/memory/quality-agent/specialty.md +0 -31
- package/.agents/memory/quality-agent/successes.json +0 -1
- package/.agents/memory/rag-agent/failures.json +0 -1
- package/.agents/memory/rag-agent/learnings.json +0 -1
- package/.agents/memory/rag-agent/specialty.md +0 -3
- package/.agents/memory/rag-agent/successes.json +0 -1
- package/.agents/memory/reference-repositories.json +0 -271
- package/.agents/memory/route-agent/failures.json +0 -1
- package/.agents/memory/route-agent/learnings.json +0 -1
- package/.agents/memory/route-agent/specialty.md +0 -29
- package/.agents/memory/route-agent/successes.json +0 -1
- package/.agents/memory/security-agent/failures.json +0 -1
- package/.agents/memory/security-agent/learnings.json +0 -1
- package/.agents/memory/security-agent/specialty.md +0 -31
- package/.agents/memory/security-agent/successes.json +0 -1
- package/.agents/memory/seo-agent/failures.json +0 -1
- package/.agents/memory/seo-agent/learnings.json +0 -1
- package/.agents/memory/seo-agent/specialty.md +0 -31
- package/.agents/memory/seo-agent/successes.json +0 -1
- package/.agents/memory/stack-boundary-agent/failures.json +0 -1
- package/.agents/memory/stack-boundary-agent/learnings.json +0 -1
- package/.agents/memory/stack-boundary-agent/specialty.md +0 -3
- package/.agents/memory/stack-boundary-agent/successes.json +0 -1
- package/.agents/memory/testing-agent/failures.json +0 -1
- package/.agents/memory/testing-agent/learnings.json +0 -1
- package/.agents/memory/testing-agent/specialty.md +0 -3
- package/.agents/memory/testing-agent/successes.json +0 -1
- package/.agents/memory/ui-data-map-agent/failures.json +0 -1
- package/.agents/memory/ui-data-map-agent/learnings.json +0 -1
- package/.agents/memory/ui-data-map-agent/specialty.md +0 -3
- package/.agents/memory/ui-data-map-agent/successes.json +0 -1
- package/.agents/memory/uiux-agent/failures.json +0 -1
- package/.agents/memory/uiux-agent/learnings.json +0 -1
- package/.agents/memory/uiux-agent/specialty.md +0 -3
- package/.agents/memory/uiux-agent/successes.json +0 -1
- package/docs/LEARNING_SYSTEM.md +0 -326
- package/docs/SYSTEM_FLOW_AUDIT.md +0 -115
- package/skills/loki-mode/.github/workflows/claude-code-review.yml +0 -57
- package/skills/loki-mode/.github/workflows/claude.yml +0 -50
- package/skills/loki-mode/.github/workflows/release.yml +0 -128
- package/skills/loki-mode/autonomy/.loki/dashboard/index.html +0 -497
- package/skills/loki-mode/benchmarks/datasets/humaneval.jsonl +0 -164
- package/skills/loki-mode/benchmarks/datasets/swebench-lite.json +0 -10
- package/skills/loki-mode/benchmarks/prepare-submission.sh +0 -215
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-23-56/SUMMARY.md +0 -48
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-23-56/humaneval-results.json +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-23-56/swebench-results.json +0 -10
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/SUMMARY.md +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-results.json +0 -1000
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/0.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/1.py +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/10.py +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/100.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/101.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/102.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/103.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/104.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/105.py +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/106.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/107.py +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/108.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/109.py +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/11.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/110.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/111.py +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/112.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/113.py +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/114.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/115.py +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/116.py +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/117.py +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/118.py +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/119.py +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/12.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/120.py +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/121.py +0 -10
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/122.py +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/123.py +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/124.py +0 -56
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/125.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/126.py +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/127.py +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/128.py +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/129.py +0 -61
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/13.py +0 -10
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/130.py +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/131.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/132.py +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/133.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/134.py +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/135.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/136.py +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/137.py +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/138.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/139.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/14.py +0 -9
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/140.py +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/141.py +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/142.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/143.py +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/144.py +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/145.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/146.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/147.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/148.py +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/149.py +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/15.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/150.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/151.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/152.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/153.py +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/154.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/155.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/156.py +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/157.py +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/158.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/159.py +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/16.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/160.py +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/161.py +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/162.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/163.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/17.py +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/18.py +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/19.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/2.py +0 -10
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/20.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/21.py +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/22.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/23.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/24.py +0 -9
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/25.py +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/26.py +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/27.py +0 -6
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/28.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/29.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/3.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/30.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/31.py +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/32.py +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/33.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/34.py +0 -6
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/35.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/36.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/37.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/38.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/39.py +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/4.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/40.py +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/41.py +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/42.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/43.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/44.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/45.py +0 -6
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/46.py +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/47.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/48.py +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/49.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/5.py +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/50.py +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/51.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/52.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/53.py +0 -8
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/54.py +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/55.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/56.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/57.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/58.py +0 -9
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/59.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/6.py +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/60.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/61.py +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/62.py +0 -10
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/63.py +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/64.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/65.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/66.py +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/67.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/68.py +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/69.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/7.py +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/70.py +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/71.py +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/72.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/73.py +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/74.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/75.py +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/76.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/77.py +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/78.py +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/79.py +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/8.py +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/80.py +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/81.py +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/82.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/83.py +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/84.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/85.py +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/86.py +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/87.py +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/88.py +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/89.py +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/9.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/90.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/91.py +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/92.py +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/93.py +0 -34
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/94.py +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/95.py +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/96.py +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/97.py +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/98.py +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-00-49-17/humaneval-solutions/99.py +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/SUMMARY.md +0 -48
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/astropy__astropy-12907.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/astropy__astropy-14182.patch +0 -59
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/astropy__astropy-14365.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/astropy__astropy-14995.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/astropy__astropy-6938.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/astropy__astropy-7746.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-10914.patch +0 -72
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-10924.patch +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11001.patch +0 -80
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11019.patch +0 -489
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11039.patch +0 -87
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11049.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11099.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11133.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11179.patch +0 -49
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11283.patch +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11422.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11564.patch +0 -285
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11583.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11620.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11630.patch +0 -45
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11742.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11797.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11815.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11848.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11905.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11910.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11964.patch +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-11999.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12113.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12125.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12184.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12284.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12286.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12308.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12453.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12470.patch +0 -53
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12497.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12589.patch +0 -157
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12700.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12708.patch +0 -64
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12747.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12856.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12908.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12915.patch +0 -82
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-12983.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13028.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13033.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13158.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13220.patch +0 -57
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13230.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13265.patch +0 -44
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13315.patch +0 -53
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13321.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13401.patch +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13447.patch +0 -40
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13448.patch +0 -203
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13551.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13590.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13658.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13660.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13710.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13757.patch +0 -51
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13768.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13925.patch +0 -116
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13933.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-13964.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14016.patch +0 -138
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14017.patch +0 -88
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14155.patch +0 -74
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14238.patch +0 -132
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14382.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14411.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14534.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14580.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14608.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14667.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14672.patch +0 -59
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14730.patch +0 -80
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14752.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14787.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14855.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14915.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14997.patch +0 -40
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-14999.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15061.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15202.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15213.patch +0 -70
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15252.patch +0 -63
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15320.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15347.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15388.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15400.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15498.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15695.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15738.patch +0 -251
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15781.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15789.patch +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15790.patch +0 -34
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15814.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15819.patch +0 -127
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15851.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15902.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-15996.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16041.patch +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16046.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16139.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16229.patch +0 -142
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16255.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16379.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16400.patch +0 -34
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16408.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16527.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16595.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16816.patch +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16820.patch +0 -188
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-16873.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-17051.patch +0 -51
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/django__django-17087.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-18869.patch +0 -75
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-22711.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-22835.patch +0 -58
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23299.patch +0 -92
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23314.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23476.patch +0 -73
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23562.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23563.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23913.patch +0 -207
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23964.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-23987.patch +0 -43
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-24149.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-24265.patch +0 -76
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-24334.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-24970.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-25079.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-25311.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-25332.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-25433.patch +0 -240
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-25442.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-25498.patch +0 -79
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-26011.patch +0 -90
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/matplotlib__matplotlib-26020.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/mwaskom__seaborn-2848.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/mwaskom__seaborn-3010.patch +0 -60
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/mwaskom__seaborn-3190.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/mwaskom__seaborn-3407.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pallets__flask-4045.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pallets__flask-4992.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pallets__flask-5063.patch +0 -99
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/psf__requests-1963.patch +0 -117
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/psf__requests-2148.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/psf__requests-2317.patch +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/psf__requests-2674.patch +0 -157
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/psf__requests-3362.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/psf__requests-863.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pydata__xarray-3364.patch +0 -392
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pydata__xarray-4094.patch +0 -40
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pydata__xarray-4248.patch +0 -124
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pydata__xarray-4493.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pydata__xarray-5131.patch +0 -45
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pylint-dev__pylint-5859.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pylint-dev__pylint-6506.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pylint-dev__pylint-7080.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pylint-dev__pylint-7114.patch +0 -161
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pylint-dev__pylint-7228.patch +0 -34
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pylint-dev__pylint-7993.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-11143.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-11148.patch +0 -57
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-5103.patch +0 -345
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-5221.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-5227.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-5413.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-5495.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-5692.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-6116.patch +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-7168.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-7220.patch +0 -535
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-7373.patch +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-7432.patch +0 -76
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-7490.patch +0 -61
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-8365.patch +0 -123
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-8906.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/pytest-dev__pytest-9359.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-10297.patch +0 -53
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-10508.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-10949.patch +0 -94
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-11040.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-11281.patch +0 -65
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-12471.patch +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13142.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13241.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13439.patch +0 -34
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13496.patch +0 -59
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13497.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13584.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-13779.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-14087.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-14092.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-14894.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-14983.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-15512.patch +0 -77
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-15535.patch +0 -44
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-25500.patch +0 -64
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-25570.patch +0 -96
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-25638.patch +0 -52
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/scikit-learn__scikit-learn-25747.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-10325.patch +0 -185
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-10451.patch +0 -129
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-11445.patch +0 -119
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-7686.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-7738.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-7975.patch +0 -89
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8273.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8282.patch +0 -95
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8435.patch +0 -104
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8474.patch +0 -73
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8506.patch +0 -49
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8595.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8627.patch +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8713.patch +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8721.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sphinx-doc__sphinx-8801.patch +0 -73
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-11400.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-11870.patch +0 -96
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-11897.patch +0 -134
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-12171.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-12236.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-12419.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-12454.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-12481.patch +0 -68
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13031.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13043.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13146.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13177.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13437.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13471.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13480.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13647.patch +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13773.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13895.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13915.patch +0 -97
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-13971.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-14024.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-14308.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-14317.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-14396.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-14774.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-14817.patch +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-15011.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-15308.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-15345.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-15346.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-15609.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-15678.patch +0 -92
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-16106.patch +0 -78
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-16281.patch +0 -141
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-16503.patch +0 -69
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-16792.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-16988.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-17022.patch +0 -65
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-17139.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-17630.patch +0 -90
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-17655.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18057.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18087.patch +0 -81
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18189.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18199.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18532.patch +0 -130
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18621.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18698.patch +0 -105
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-18835.patch +0 -30
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-19007.patch +0 -66
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-19254.patch +0 -72
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-19487.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20049.patch +0 -125
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20154.patch +0 -46
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20212.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20322.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20442.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20590.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-20639.patch +0 -66
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21055.patch +0 -56
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21171.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21379.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21612.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21614.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21627.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-21847.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-22005.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-22714.patch +0 -486
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-22840.patch +0 -76
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-23117.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-23191.patch +0 -302
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-23262.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-24066.patch +0 -66
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-24102.patch +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-24152.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-24213.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-patches/sympy__sympy-24909.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-predictions.json +0 -1502
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-24-17/swebench-results.json +0 -1516
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/SUMMARY.md +0 -48
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/astropy__astropy-12907.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/astropy__astropy-14182.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/astropy__astropy-14365.patch +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/astropy__astropy-14995.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/astropy__astropy-6938.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/astropy__astropy-7746.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-10914.patch +0 -76
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-10924.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11001.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11019.patch +0 -90
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11039.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11049.patch +0 -10
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11099.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11133.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11179.patch +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11283.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11422.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11564.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11583.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11620.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11630.patch +0 -79
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11742.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11797.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11815.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11848.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11905.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11910.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11964.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-11999.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12113.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12125.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12184.patch +0 -83
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12284.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12286.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12308.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12453.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12470.patch +0 -137
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12497.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12589.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12700.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12708.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12747.patch +0 -43
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12856.patch +0 -61
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12908.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12915.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-12983.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-13028.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-13033.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-13158.patch +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-patches/django__django-13220.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-predictions.json +0 -252
- package/skills/loki-mode/benchmarks/results/2026-01-05-01-35-39/swebench-results.json +0 -266
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/SUMMARY.md +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/astropy__astropy-12907.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/astropy__astropy-14182.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/astropy__astropy-14365.patch +0 -44
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/astropy__astropy-14995.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/astropy__astropy-6938.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/astropy__astropy-7746.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-10914.patch +0 -76
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-10924.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11001.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11019.patch +0 -158
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11039.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11049.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11099.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11133.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11179.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11283.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11422.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11564.patch +0 -140
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11583.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11620.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11630.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11742.patch +0 -49
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11797.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11815.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11848.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11905.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11910.patch +0 -100
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11964.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-11999.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12113.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12125.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12184.patch +0 -82
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12284.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12286.patch +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12308.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12453.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12470.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12497.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12589.patch +0 -56
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12700.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12708.patch +0 -61
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12747.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12856.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12908.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12915.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-12983.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13028.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13033.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13158.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13220.patch +0 -126
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13230.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13265.patch +0 -87
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13315.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13321.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13401.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13447.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13448.patch +0 -94
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13551.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13590.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13658.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13660.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13710.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13757.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13768.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13925.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13933.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-13964.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14016.patch +0 -103
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14017.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14155.patch +0 -121
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14238.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14382.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14411.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14534.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14580.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14608.patch +0 -34
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14667.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14672.patch +0 -12
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14730.patch +0 -43
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14752.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14787.patch +0 -35
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14855.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14915.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14997.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-14999.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15061.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15202.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15213.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15252.patch +0 -59
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15320.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15347.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15388.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15400.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15498.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15695.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15738.patch +0 -185
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15781.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15789.patch +0 -69
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15790.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15814.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15819.patch +0 -101
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15851.patch +0 -40
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15902.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-15996.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16041.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16046.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16139.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16229.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16255.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16379.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16400.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16408.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16527.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16595.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16816.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16820.patch +0 -98
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16873.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-16910.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-17051.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/django__django-17087.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-18869.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-22711.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-22835.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23299.patch +0 -65
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23314.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23476.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23562.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23563.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23913.patch +0 -95
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23964.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-23987.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-24149.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-24265.patch +0 -52
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-24334.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-24970.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-25079.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-25311.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-25332.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-25433.patch +0 -126
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-25442.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-25498.patch +0 -67
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-26011.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/matplotlib__matplotlib-26020.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/mwaskom__seaborn-2848.patch +0 -95
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/mwaskom__seaborn-3010.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/mwaskom__seaborn-3190.patch +0 -73
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/mwaskom__seaborn-3407.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pallets__flask-4045.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pallets__flask-4992.patch +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pallets__flask-5063.patch +0 -90
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/psf__requests-1963.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/psf__requests-2148.patch +0 -79
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/psf__requests-2317.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/psf__requests-2674.patch +0 -58
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/psf__requests-3362.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/psf__requests-863.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pydata__xarray-3364.patch +0 -159
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pydata__xarray-4094.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pydata__xarray-4248.patch +0 -134
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pydata__xarray-4493.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pydata__xarray-5131.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pylint-dev__pylint-5859.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pylint-dev__pylint-6506.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pylint-dev__pylint-7080.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pylint-dev__pylint-7114.patch +0 -51
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pylint-dev__pylint-7228.patch +0 -80
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pylint-dev__pylint-7993.patch +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-11143.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-11148.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-5103.patch +0 -350
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-5221.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-5227.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-5413.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-5495.patch +0 -44
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-5692.patch +0 -69
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-6116.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-7168.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-7220.patch +0 -391
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-7373.patch +0 -48
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-7432.patch +0 -99
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-7490.patch +0 -4
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-8365.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-8906.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/pytest-dev__pytest-9359.patch +0 -89
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-10297.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-10508.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-10949.patch +0 -66
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-11040.patch +0 -147
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-11281.patch +0 -107
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-12471.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13142.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13241.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13439.patch +0 -29
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13496.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13497.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13584.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-13779.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-14087.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-14092.patch +0 -61
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-14894.patch +0 -45
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-14983.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-15512.patch +0 -143
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-15535.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-25500.patch +0 -79
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-25570.patch +0 -71
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-25638.patch +0 -70
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/scikit-learn__scikit-learn-25747.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-10325.patch +0 -153
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-10451.patch +0 -99
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-11445.patch +0 -67
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-7686.patch +0 -50
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-7738.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-7975.patch +0 -92
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8273.patch +0 -103
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8282.patch +0 -45
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8435.patch +0 -56
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8474.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8506.patch +0 -33
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8595.patch +0 -45
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8627.patch +0 -62
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8713.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8721.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sphinx-doc__sphinx-8801.patch +0 -101
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-11400.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-11870.patch +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-11897.patch +0 -152
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-12171.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-12236.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-12419.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-12454.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-12481.patch +0 -44
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13031.patch +0 -71
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13043.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13146.patch +0 -54
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13177.patch +0 -14
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13437.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13471.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13480.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13647.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13773.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13895.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13915.patch +0 -70
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-13971.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-14024.patch +0 -56
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-14308.patch +0 -189
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-14317.patch +0 -41
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-14396.patch +0 -32
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-14774.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-14817.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-15011.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-15308.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-15345.patch +0 -13
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-15346.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-15609.patch +0 -11
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-15678.patch +0 -87
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-16106.patch +0 -66
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-16281.patch +0 -88
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-16503.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-16792.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-16988.patch +0 -22
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-17022.patch +0 -38
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-17139.patch +0 -48
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-17630.patch +0 -116
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-17655.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18057.patch +0 -31
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18087.patch +0 -55
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18189.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18199.patch +0 -25
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18532.patch +0 -84
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18621.patch +0 -21
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18698.patch +0 -60
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-18835.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-19007.patch +0 -143
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-19254.patch +0 -79
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-19487.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20049.patch +0 -37
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20154.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20212.patch +0 -15
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20322.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20442.patch +0 -73
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20590.patch +0 -16
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-20639.patch +0 -20
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21055.patch +0 -47
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21171.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21379.patch +0 -27
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21612.patch +0 -46
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21614.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21627.patch +0 -28
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-21847.patch +0 -24
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-22005.patch +0 -36
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-22714.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-22840.patch +0 -19
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-23117.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-23191.patch +0 -42
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-23262.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-24066.patch +0 -26
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-24102.patch +0 -17
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-24152.patch +0 -23
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-24213.patch +0 -18
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-patches/sympy__sympy-24909.patch +0 -39
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-predictions.json +0 -1802
- package/skills/loki-mode/benchmarks/results/2026-01-05-10-37-54/swebench-loki-results.json +0 -1816
- package/skills/loki-mode/benchmarks/results/SUMMARY.md +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-results.json +0 -1001
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/0.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/1.py +0 -36
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/10.py +0 -30
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/100.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/101.py +0 -17
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/102.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/103.py +0 -22
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/104.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/105.py +0 -34
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/106.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/107.py +0 -40
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/108.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/109.py +0 -53
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/11.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/110.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/111.py +0 -34
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/112.py +0 -20
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/113.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/114.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/115.py +0 -41
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/116.py +0 -17
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/117.py +0 -30
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/118.py +0 -31
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/119.py +0 -35
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/12.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/120.py +0 -33
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/121.py +0 -15
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/122.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/123.py +0 -35
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/124.py +0 -58
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/125.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/126.py +0 -34
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/127.py +0 -41
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/128.py +0 -31
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/129.py +0 -62
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/13.py +0 -17
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/130.py +0 -35
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/131.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/132.py +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/133.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/134.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/135.py +0 -20
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/136.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/137.py +0 -31
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/138.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/139.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/14.py +0 -14
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/140.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/141.py +0 -42
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/142.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/143.py +0 -40
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/144.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/145.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/146.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/147.py +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/148.py +0 -33
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/149.py +0 -22
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/15.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/150.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/151.py +0 -22
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/152.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/153.py +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/154.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/155.py +0 -20
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/156.py +0 -39
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/157.py +0 -28
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/158.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/159.py +0 -36
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/16.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/160.py +0 -34
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/161.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/162.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/163.py +0 -18
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/17.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/18.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/19.py +0 -34
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/2.py +0 -15
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/20.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/21.py +0 -18
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/22.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/23.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/24.py +0 -14
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/25.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/26.py +0 -17
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/27.py +0 -11
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/28.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/29.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/3.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/30.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/31.py +0 -34
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/32.py +0 -37
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/33.py +0 -19
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/34.py +0 -11
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/35.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/36.py +0 -19
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/37.py +0 -19
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/38.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/39.py +0 -40
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/4.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/40.py +0 -43
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/41.py +0 -18
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/42.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/43.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/44.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/45.py +0 -11
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/46.py +0 -35
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/47.py +0 -19
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/48.py +0 -18
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/49.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/5.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/50.py +0 -10
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/51.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/52.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/53.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/54.py +0 -22
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/55.py +0 -20
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/56.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/57.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/58.py +0 -14
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/59.py +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/6.py +0 -33
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/60.py +0 -19
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/61.py +0 -27
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/62.py +0 -15
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/63.py +0 -30
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/64.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/65.py +0 -18
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/66.py +0 -23
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/67.py +0 -22
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/68.py +0 -55
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/69.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/7.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/70.py +0 -35
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/71.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/72.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/73.py +0 -22
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/74.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/75.py +0 -42
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/76.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/77.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/78.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/79.py +0 -18
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/8.py +0 -24
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/80.py +0 -26
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/81.py +0 -61
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/82.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/83.py +0 -13
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/84.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/85.py +0 -17
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/86.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/87.py +0 -31
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/88.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/89.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/9.py +0 -25
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/90.py +0 -21
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/91.py +0 -31
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/92.py +0 -29
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/93.py +0 -30
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/94.py +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/95.py +0 -32
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/96.py +0 -38
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/97.py +0 -16
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/98.py +0 -20
- package/skills/loki-mode/benchmarks/results/humaneval-loki-solutions/99.py +0 -36
- package/skills/loki-mode/benchmarks/run-benchmarks.sh +0 -1948
- package/skills/loki-mode/benchmarks/submission-template/README.md +0 -111
- package/skills/loki-mode/benchmarks/submission-template/metadata.yaml +0 -76
- package/skills/loki-mode/demo/README.md +0 -137
- package/skills/loki-mode/demo/loki-demo.gif +0 -0
- package/skills/loki-mode/demo/record-demo.sh +0 -69
- package/skills/loki-mode/demo/record-full-demo.sh +0 -208
- package/skills/loki-mode/demo/recordings/loki-demo.cast +0 -93
- package/skills/loki-mode/demo/run-demo-auto.sh +0 -293
- package/skills/loki-mode/demo/run-demo.sh +0 -323
- package/skills/loki-mode/demo/vhs-tape.tape +0 -223
- package/skills/loki-mode/demo/voice-over-script.md +0 -246
- package/skills/loki-mode/examples/api-only.md +0 -79
- package/skills/loki-mode/examples/full-stack-demo.md +0 -123
- package/skills/loki-mode/examples/simple-todo-app.md +0 -60
- package/skills/loki-mode/examples/static-landing-page.md +0 -73
- package/skills/loki-mode/examples/todo-app-generated/.loki/CONTINUITY.md +0 -59
- package/skills/loki-mode/examples/todo-app-generated/.loki/queue/completed.json +0 -1
- package/skills/loki-mode/examples/todo-app-generated/.loki/queue/dead-letter.json +0 -1
- package/skills/loki-mode/examples/todo-app-generated/.loki/queue/failed.json +0 -1
- package/skills/loki-mode/examples/todo-app-generated/.loki/queue/in-progress.json +0 -1
- package/skills/loki-mode/examples/todo-app-generated/.loki/queue/pending.json +0 -382
- package/skills/loki-mode/examples/todo-app-generated/.loki/state/orchestrator.json +0 -41
- package/skills/loki-mode/examples/todo-app-generated/E2E_VERIFICATION_REPORT.md +0 -668
- package/skills/loki-mode/examples/todo-app-generated/PRD.md +0 -60
- package/skills/loki-mode/examples/todo-app-generated/TASK_018_COMPLETION.md +0 -229
- package/skills/loki-mode/examples/todo-app-generated/TESTING_DOCUMENTATION.md +0 -327
- package/skills/loki-mode/examples/todo-app-generated/TEST_REPORT.md +0 -201
- package/skills/loki-mode/examples/todo-app-generated/VERIFICATION_SUMMARY.txt +0 -362
- package/skills/loki-mode/examples/todo-app-generated/backend/package-lock.json +0 -2698
- package/skills/loki-mode/examples/todo-app-generated/backend/package.json +0 -26
- package/skills/loki-mode/examples/todo-app-generated/backend/src/db/database.ts +0 -24
- package/skills/loki-mode/examples/todo-app-generated/backend/src/db/db.ts +0 -35
- package/skills/loki-mode/examples/todo-app-generated/backend/src/db/index.ts +0 -2
- package/skills/loki-mode/examples/todo-app-generated/backend/src/db/migrations.ts +0 -31
- package/skills/loki-mode/examples/todo-app-generated/backend/src/db/schema.sql +0 -8
- package/skills/loki-mode/examples/todo-app-generated/backend/src/index.ts +0 -44
- package/skills/loki-mode/examples/todo-app-generated/backend/src/routes/todos.ts +0 -155
- package/skills/loki-mode/examples/todo-app-generated/backend/src/types/index.ts +0 -35
- package/skills/loki-mode/examples/todo-app-generated/backend/todos.db-shm +0 -0
- package/skills/loki-mode/examples/todo-app-generated/backend/todos.db-wal +0 -0
- package/skills/loki-mode/examples/todo-app-generated/backend/tsconfig.json +0 -30
- package/skills/loki-mode/examples/todo-app-generated/frontend/index.html +0 -13
- package/skills/loki-mode/examples/todo-app-generated/frontend/package-lock.json +0 -2014
- package/skills/loki-mode/examples/todo-app-generated/frontend/package.json +0 -26
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/App.css +0 -384
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/App.tsx +0 -81
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/api/todos.ts +0 -57
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/components/ConfirmDialog.tsx +0 -26
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/components/EmptyState.tsx +0 -8
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/components/TodoForm.tsx +0 -43
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/components/TodoItem.tsx +0 -36
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/components/TodoList.tsx +0 -27
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/hooks/useTodos.ts +0 -81
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/index.css +0 -48
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/main.tsx +0 -10
- package/skills/loki-mode/examples/todo-app-generated/frontend/src/vite-env.d.ts +0 -1
- package/skills/loki-mode/examples/todo-app-generated/frontend/tsconfig.json +0 -23
- package/skills/loki-mode/examples/todo-app-generated/frontend/tsconfig.node.json +0 -10
- package/skills/loki-mode/examples/todo-app-generated/frontend/vite.config.ts +0 -15
- package/skills/theme-factory/theme-showcase.pdf +0 -0
- package/skills/ui-ux-pro-max/scripts/__pycache__/core.cpython-314.pyc +0 -0
- package/skills/ui-ux-pro-max/scripts/__pycache__/design_system.cpython-314.pyc +0 -0
- package/skills/web-artifacts-builder/scripts/shadcn-components.tar.gz +0 -0
|
@@ -1,1948 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
#===============================================================================
|
|
3
|
-
# Loki Mode Benchmark Runner
|
|
4
|
-
# Run HumanEval and SWE-bench benchmarks to validate multi-agent performance
|
|
5
|
-
#
|
|
6
|
-
# Usage:
|
|
7
|
-
# ./benchmarks/run-benchmarks.sh [benchmark] [options]
|
|
8
|
-
# ./benchmarks/run-benchmarks.sh humaneval # Setup only
|
|
9
|
-
# ./benchmarks/run-benchmarks.sh humaneval --execute # Direct Claude (baseline)
|
|
10
|
-
# ./benchmarks/run-benchmarks.sh humaneval --execute --loki # Multi-agent Loki Mode
|
|
11
|
-
# ./benchmarks/run-benchmarks.sh humaneval --execute --limit 10 # First 10 problems
|
|
12
|
-
# ./benchmarks/run-benchmarks.sh swebench --execute # Run SWE-bench
|
|
13
|
-
# ./benchmarks/run-benchmarks.sh all --execute # Run all benchmarks
|
|
14
|
-
#
|
|
15
|
-
# Options:
|
|
16
|
-
# --execute Actually run problems through Claude (vs just setup)
|
|
17
|
-
# --loki Use Loki Mode multi-agent system (Architect->Engineer->QA->Reviewer)
|
|
18
|
-
# --limit N Only run first N problems (useful for testing)
|
|
19
|
-
# --parallel N Run N problems in parallel (default: 1)
|
|
20
|
-
# --model MODEL Claude model to use (default: sonnet)
|
|
21
|
-
# --timeout N Timeout per problem in seconds (default: 120)
|
|
22
|
-
# --retries N Max RARV retry attempts for --loki mode (default: 3)
|
|
23
|
-
#
|
|
24
|
-
# Prerequisites:
|
|
25
|
-
# - Python 3.8+
|
|
26
|
-
# - Claude Code CLI
|
|
27
|
-
# - Git
|
|
28
|
-
#
|
|
29
|
-
# Results are saved to:
|
|
30
|
-
# ./benchmarks/results/YYYY-MM-DD-HH-MM-SS/
|
|
31
|
-
#===============================================================================
|
|
32
|
-
|
|
33
|
-
set -uo pipefail

# Resolve locations relative to this script so it can be started from any
# working directory. Abort early if resolution fails: an empty SCRIPT_DIR would
# otherwise turn later paths into "/datasets", "/results", ...
if ! SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"; then
    echo "[FAIL] Cannot resolve the benchmark script directory" >&2
    exit 1
fi
if ! PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"; then
    echo "[FAIL] Cannot resolve the project directory above $SCRIPT_DIR" >&2
    exit 1
fi
RESULTS_DIR="$SCRIPT_DIR/results/$(date +%Y-%m-%d-%H-%M-%S)"

# The embedded Python programs read these paths with os.environ.get(), so they
# must be part of the environment and not just shell variables.
export SCRIPT_DIR PROJECT_DIR RESULTS_DIR

# Configuration (defaults; overridden by the command-line options in parse_args)
EXECUTE_MODE=false
LOKI_MODE=false       # Use multi-agent Loki Mode vs direct Claude
PROBLEM_LIMIT=0       # 0 = all problems
PARALLEL_COUNT=1
CLAUDE_MODEL="sonnet"
PROBLEM_TIMEOUT=120
MAX_RETRIES=3         # RARV retry attempts

# Colors (ANSI escapes, interpreted by `echo -e` in the log_* helpers)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BLUE='\033[0;34m'
MAGENTA='\033[0;35m'
NC='\033[0m'
|
|
56
|
-
|
|
57
|
-
# Logging helpers. Each prints a single colour-tagged line to stdout:
#   $1 - message text (optional; escape sequences in it are interpreted by `echo -e`)
# `${1-}` instead of `$1` keeps a call without a message from aborting the
# script with "unbound variable" under `set -u`.
log_info()     { echo -e "${CYAN}[INFO]${NC} ${1-}"; }
log_success()  { echo -e "${GREEN}[PASS]${NC} ${1-}"; }
log_warning()  { echo -e "${YELLOW}[WARN]${NC} ${1-}"; }
log_error()    { echo -e "${RED}[FAIL]${NC} ${1-}"; }
log_progress() { echo -e "${BLUE}[PROG]${NC} ${1-}"; }
|
|
62
|
-
|
|
63
|
-
#===============================================================================
|
|
64
|
-
# Argument Parsing
|
|
65
|
-
#===============================================================================
|
|
66
|
-
|
|
67
|
-
#######################################
# Parse the command line into the global configuration variables.
# Globals written:
#   EXECUTE_MODE, LOKI_MODE, PROBLEM_LIMIT, PARALLEL_COUNT, CLAUDE_MODEL,
#   PROBLEM_TIMEOUT, MAX_RETRIES, BENCHMARK
# Arguments:
#   "$@" - the script's arguments: an optional benchmark name plus options
# Behaviour:
#   Exits with status 1 (after log_error) on an unknown option, on an option
#   that is missing its value, or on a non-numeric value for a numeric option.
#   BENCHMARK is the first positional argument, or "all" when none is given.
#######################################
parse_args() {
    local positional=()
    local opt

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --execute)
                EXECUTE_MODE=true
                shift
                ;;
            --loki)
                LOKI_MODE=true
                shift
                ;;
            --limit|--parallel|--model|--timeout|--retries)
                opt="$1"
                # Without this guard "$2" is unbound under `set -u`, and
                # without `-u` the failing `shift 2` would loop forever.
                if [[ $# -lt 2 ]]; then
                    log_error "Option $opt requires a value"
                    exit 1
                fi
                # Every value option except --model is later converted with
                # int() by the embedded Python; reject bad input up front.
                if [[ "$opt" != "--model" && ! "$2" =~ ^[0-9]+$ ]]; then
                    log_error "Option $opt expects a non-negative integer, got: $2"
                    exit 1
                fi
                case "$opt" in
                    --limit)    PROBLEM_LIMIT="$2" ;;
                    --parallel) PARALLEL_COUNT="$2" ;;
                    --model)    CLAUDE_MODEL="$2" ;;
                    --timeout)  PROBLEM_TIMEOUT="$2" ;;
                    --retries)  MAX_RETRIES="$2" ;;
                esac
                shift 2
                ;;
            -*)
                log_error "Unknown option: $1"
                exit 1
                ;;
            *)
                positional+=("$1")
                shift
                ;;
        esac
    done

    # Read the first positional directly: expanding "${positional[@]}" of an
    # empty array is an "unbound variable" error under `set -u` on bash < 4.4.
    BENCHMARK="${positional[0]:-all}"
}
|
|
115
|
-
|
|
116
|
-
#===============================================================================
|
|
117
|
-
# Setup
|
|
118
|
-
#===============================================================================
|
|
119
|
-
|
|
120
|
-
#######################################
# Prepare directories, check prerequisites and set up the Python environment.
# Globals read: RESULTS_DIR, SCRIPT_DIR
# Behaviour:
#   Exits with status 1 when the working directories cannot be created or when
#   python3 / claude are not on PATH.
#   Problems with the virtual environment or with `pip install` are reported
#   as warnings and do not stop the run (the previous behaviour was to continue
#   silently); "Environment ready" is only logged when every step succeeded.
#######################################
setup_environment() {
    local degraded=false

    log_info "Setting up benchmark environment..."

    if ! mkdir -p "$RESULTS_DIR" "$SCRIPT_DIR/datasets" "$SCRIPT_DIR/workspaces"; then
        log_error "Cannot create benchmark directories under $SCRIPT_DIR"
        exit 1
    fi

    # Check prerequisites
    if ! command -v python3 &> /dev/null; then
        log_error "Python 3 is required"
        exit 1
    fi

    if ! command -v claude &> /dev/null; then
        log_error "Claude Code CLI is required"
        exit 1
    fi

    # Install benchmark dependencies if needed
    if [ ! -d "$SCRIPT_DIR/venv" ]; then
        log_info "Creating virtual environment..."
        if ! python3 -m venv "$SCRIPT_DIR/venv"; then
            log_warning "Could not create virtual environment at $SCRIPT_DIR/venv"
            degraded=true
        fi
    fi

    if [ -f "$SCRIPT_DIR/venv/bin/activate" ]; then
        # shellcheck disable=SC1091
        if ! source "$SCRIPT_DIR/venv/bin/activate"; then
            log_warning "Could not activate virtual environment at $SCRIPT_DIR/venv"
            degraded=true
        fi
    else
        log_warning "Virtual environment has no bin/activate; continuing without it"
        degraded=true
    fi

    if ! pip install -q requests tqdm; then
        log_warning "Could not install benchmark dependencies (requests, tqdm)"
        degraded=true
    fi

    if [ "$degraded" = true ]; then
        log_warning "Environment ready with warnings (see above)"
    else
        log_success "Environment ready"
    fi
}
|
|
149
|
-
|
|
150
|
-
#===============================================================================
|
|
151
|
-
# HumanEval Benchmark
|
|
152
|
-
#===============================================================================
|
|
153
|
-
|
|
154
|
-
#######################################
# Fetch the HumanEval dataset into $SCRIPT_DIR/datasets/humaneval.jsonl unless
# a non-empty copy is already there.
# Globals read: SCRIPT_DIR
# Returns:
#   0 when the dataset is available, 1 when the download failed.
# The download is written to a temporary file and moved into place only after
# both pipeline stages succeeded, so a failed attempt never leaves behind a
# truncated file that a later run would mistake for a finished download.
#######################################
download_humaneval() {
    local dataset_file="$SCRIPT_DIR/datasets/humaneval.jsonl"
    local url="https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
    local tmp_file
    local -a stage_status

    # -s (not -f): an empty file left by an earlier failed run does not count.
    if [ -s "$dataset_file" ]; then
        log_info "HumanEval dataset already downloaded"
        return 0
    fi

    log_info "Downloading HumanEval dataset..."

    if ! tmp_file="$(mktemp "${dataset_file}.XXXXXX")"; then
        log_error "Cannot create a temporary file next to $dataset_file"
        return 1
    fi

    # -f makes curl fail on HTTP errors instead of piping an error page on.
    curl -fsSL "$url" | gunzip > "$tmp_file"
    stage_status=("${PIPESTATUS[@]}")   # must be read right after the pipeline

    if [ "${stage_status[0]}" -ne 0 ] || [ "${stage_status[1]}" -ne 0 ] \
        || [ ! -s "$tmp_file" ]; then
        rm -f -- "$tmp_file"
        log_error "Failed to download HumanEval dataset from $url"
        return 1
    fi

    if ! mv -f -- "$tmp_file" "$dataset_file"; then
        rm -f -- "$tmp_file"
        log_error "Cannot write $dataset_file"
        return 1
    fi

    log_success "HumanEval dataset downloaded (164 problems)"
}
|
|
168
|
-
|
|
169
|
-
#######################################
# Entry point for the HumanEval benchmark: make sure the dataset is present,
# then dispatch on the selected mode.
# Globals read: SCRIPT_DIR, EXECUTE_MODE, LOKI_MODE
# Dispatch:
#   EXECUTE_MODE=true,  LOKI_MODE=true   -> run_humaneval_loki
#   EXECUTE_MODE=true,  LOKI_MODE=false  -> run_humaneval_execute
#   EXECUTE_MODE=false                   -> run_humaneval_setup
# Returns:
#   1 when the dataset is missing or empty after the download step, otherwise
#   the status of the selected runner.
#######################################
run_humaneval() {
    local dataset_file="$SCRIPT_DIR/datasets/humaneval.jsonl"

    log_info "Running HumanEval benchmark..."

    # Stop here instead of letting the runners fail later on a missing or
    # empty dataset file.
    if ! download_humaneval || [ ! -s "$dataset_file" ]; then
        log_error "HumanEval dataset is unavailable: $dataset_file"
        return 1
    fi

    if [ "$EXECUTE_MODE" = true ]; then
        if [ "$LOKI_MODE" = true ]; then
            run_humaneval_loki
        else
            run_humaneval_execute
        fi
    else
        run_humaneval_setup
    fi
}
|
|
184
|
-
|
|
185
|
-
#######################################
# Setup-only mode: load the HumanEval dataset and write a small
# "infrastructure ready" report to $RESULTS_DIR/humaneval-results.json.
# No problem is sent to Claude.
# Globals read: SCRIPT_DIR, RESULTS_DIR
# Returns:
#   0 on success, 1 when the results directory cannot be created or the
#   embedded Python program fails (e.g. dataset missing or malformed).
#######################################
run_humaneval_setup() {
    if ! mkdir -p "$RESULTS_DIR"; then
        log_error "Cannot create results directory: $RESULTS_DIR"
        return 1
    fi

    # The paths are handed over explicitly on the command's environment so the
    # Python side does not silently fall back to "." when the caller has not
    # exported them.
    if ! SCRIPT_DIR="$SCRIPT_DIR" RESULTS_DIR="$RESULTS_DIR" python3 << 'HUMANEVAL_SETUP'
import json
import os
from datetime import datetime

SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')

dataset_file = f"{SCRIPT_DIR}/datasets/humaneval.jsonl"
results_file = f"{RESULTS_DIR}/humaneval-results.json"

# One JSON object per line; blank lines (e.g. a trailing newline) are skipped.
problems = []
with open(dataset_file, 'r') as f:
    for line in f:
        if line.strip():
            problems.append(json.loads(line))

print(f"Loaded {len(problems)} HumanEval problems")

results = {
    "benchmark": "HumanEval",
    "version": "1.0",
    "timestamp": datetime.now().isoformat(),
    "total_problems": len(problems),
    "status": "INFRASTRUCTURE_READY",
    "note": "Run with --execute to run actual tests.",
    "sample_problems": [p["task_id"] for p in problems[:5]]
}

with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {results_file}")
print("\nTo run actual benchmarks:")
print("  ./benchmarks/run-benchmarks.sh humaneval --execute")
print("  ./benchmarks/run-benchmarks.sh humaneval --execute --limit 10")
HUMANEVAL_SETUP
    then
        log_error "HumanEval setup failed"
        return 1
    fi

    log_success "HumanEval benchmark infrastructure ready"
    log_info "Results: $RESULTS_DIR/humaneval-results.json"
}
|
|
229
|
-
|
|
230
|
-
#######################################
# Baseline run: send every HumanEval problem directly to Claude, execute the
# returned solution against the problem's tests and write a JSON report.
# Globals read:
#   SCRIPT_DIR, RESULTS_DIR, PROBLEM_LIMIT, PROBLEM_TIMEOUT, CLAUDE_MODEL
#   (all of them are exported for the embedded Python program)
# Outputs:
#   $RESULTS_DIR/humaneval-results.json      - report, rewritten after each problem
#   $RESULTS_DIR/humaneval-solutions/<N>.py  - one file per obtained solution
# Returns:
#   0 when the run completed (regardless of the pass rate), 1 when the output
#   directory cannot be created or the Python program itself failed.
#######################################
run_humaneval_execute() {
    local results_file="$RESULTS_DIR/humaneval-results.json"
    local solutions_dir="$RESULTS_DIR/humaneval-solutions"
    local limit_label="all"

    if ! mkdir -p "$solutions_dir"; then
        log_error "Cannot create solutions directory: $solutions_dir"
        return 1
    fi

    # PROBLEM_LIMIT=0 means "no limit"; show that as "all" in the banner.
    if [ "$PROBLEM_LIMIT" -gt 0 ] 2> /dev/null; then
        limit_label="$PROBLEM_LIMIT"
    fi

    log_info "Executing HumanEval benchmark with Claude..."
    log_info "Model: $CLAUDE_MODEL | Timeout: ${PROBLEM_TIMEOUT}s | Limit: $limit_label"

    # Export variables for Python (the paths too: the program reads them from
    # the environment and would otherwise fall back to the current directory).
    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL SCRIPT_DIR RESULTS_DIR

    if ! python3 << 'HUMANEVAL_EXECUTE'
import json
import os
import subprocess
import tempfile
import time
from datetime import datetime

SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
PROBLEM_LIMIT = int(os.environ.get('PROBLEM_LIMIT', '0'))
PROBLEM_TIMEOUT = int(os.environ.get('PROBLEM_TIMEOUT', '120'))
CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')

dataset_file = f"{SCRIPT_DIR}/datasets/humaneval.jsonl"
results_file = f"{RESULTS_DIR}/humaneval-results.json"
solutions_dir = f"{RESULTS_DIR}/humaneval-solutions"
os.makedirs(solutions_dir, exist_ok=True)

# Load problems (one JSON object per line; blank lines are skipped)
problems = []
with open(dataset_file, 'r') as f:
    for line in f:
        if line.strip():
            problems.append(json.loads(line))

if PROBLEM_LIMIT > 0:
    problems = problems[:PROBLEM_LIMIT]

print(f"\n{'='*60}")
print(f"  HumanEval Benchmark Execution")
print(f"  Problems: {len(problems)} | Model: {CLAUDE_MODEL}")
print(f"{'='*60}\n")


def _no_solution(task_id, error):
    """Result record for a problem for which no solution was obtained."""
    return {
        "task_id": task_id,
        "solution": None,
        "solution_body": None,
        "error": error
    }


def solve_problem(problem):
    """Send a HumanEval problem to Claude and get solution.

    Returns a dict with "task_id", "solution", "solution_body" and "error";
    "error" is None on success and a short description otherwise.
    """
    task_id = problem["task_id"]
    prompt = problem["prompt"]
    entry_point = problem["entry_point"]

    # Create prompt for Claude - ask for COMPLETE function to avoid indentation issues
    claude_prompt = f'''You are solving a HumanEval coding problem. Complete the Python function below.

{prompt}

INSTRUCTIONS:
1. Output the COMPLETE function including the signature and docstring shown above
2. Fill in the implementation after the docstring
3. Use proper 4-space indentation for the function body
4. Output ONLY the Python code - no markdown, no explanation, no ```python blocks
5. The function must be syntactically valid Python

Output the complete function now:'''

    try:
        # Call Claude
        result = subprocess.run(
            ['claude', '-p', claude_prompt, '--model', CLAUDE_MODEL],
            capture_output=True,
            text=True,
            timeout=PROBLEM_TIMEOUT
        )
    except subprocess.TimeoutExpired:
        return _no_solution(task_id, "TIMEOUT")
    except Exception as e:
        return _no_solution(task_id, str(e))

    solution = result.stdout.strip()

    # Clean up solution - remove markdown code blocks if present
    if solution.startswith("```python"):
        solution = solution[9:]
    if solution.startswith("```"):
        solution = solution[3:]
    if solution.endswith("```"):
        solution = solution[:-3]
    solution = solution.strip()

    # A failed CLI call or an empty answer is an error, not a solution to test.
    if result.returncode != 0 or not solution:
        detail = result.stderr.strip()[:200] or "empty response"
        return _no_solution(task_id, f"CLAUDE_FAILED (exit {result.returncode}): {detail}")

    # Verify solution contains the function definition
    if f"def {entry_point}" not in solution:
        # Claude didn't include function signature, prepend it
        # Indent the body properly
        lines = solution.split('\n')
        indented_lines = ['    ' + line if line.strip() and not line.startswith('    ') else line for line in lines]
        solution = prompt + '\n'.join(indented_lines)

    return {
        "task_id": task_id,
        "solution": solution,
        "solution_body": solution,
        "error": None
    }


def test_solution(problem, solution):
    """Execute the solution against HumanEval test cases.

    Returns a dict with "task_id", "passed" and "error" (plus truncated
    "stdout"/"stderr" when the test program actually ran).
    """
    task_id = problem["task_id"]
    test = problem["test"]
    entry_point = problem["entry_point"]

    if solution is None:
        return {"task_id": task_id, "passed": False, "error": "No solution"}

    # Create test file
    test_code = f'''
{solution}

{test}

# Run the check function
check({entry_point})
print("PASSED")
'''

    test_file = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(test_code)
            test_file = f.name

        result = subprocess.run(
            ['python3', test_file],
            capture_output=True,
            text=True,
            timeout=30
        )

        # Require a clean exit as well as the marker: a solution that merely
        # prints "PASSED" and then fails its assertions must not count.
        passed = result.returncode == 0 and "PASSED" in result.stdout
        return {
            "task_id": task_id,
            "passed": passed,
            "stdout": result.stdout[:500],
            "stderr": result.stderr[:500] if not passed else "",
            "error": None
        }
    except subprocess.TimeoutExpired:
        return {"task_id": task_id, "passed": False, "error": "TEST_TIMEOUT"}
    except Exception as e:
        return {"task_id": task_id, "passed": False, "error": str(e)}
    finally:
        # Remove the temporary file on every path, including timeouts.
        if test_file is not None:
            try:
                os.unlink(test_file)
            except OSError:
                pass


# Run benchmark
results = {
    "benchmark": "HumanEval",
    "version": "1.0",
    "timestamp": datetime.now().isoformat(),
    "model": CLAUDE_MODEL,
    "timeout_per_problem": PROBLEM_TIMEOUT,
    "total_problems": len(problems),
    "status": "RUNNING",
    "problems": []
}

passed_count = 0
failed_count = 0
error_count = 0
start_time = time.time()

for i, problem in enumerate(problems):
    task_id = problem["task_id"]
    task_num = task_id.split("/")[1]

    print(f"[{i+1}/{len(problems)}] {task_id}...", end=" ", flush=True)

    # Get solution from Claude
    solution_result = solve_problem(problem)

    if solution_result["error"]:
        print(f"\033[0;31mERROR: {solution_result['error']}\033[0m")
        error_count += 1
        problem_result = {
            "task_id": task_id,
            "passed": False,
            "error": solution_result["error"],
            "solution": None
        }
    else:
        # Save solution
        solution_file = f"{solutions_dir}/{task_num}.py"
        with open(solution_file, 'w') as f:
            f.write(solution_result["solution"])

        # Test solution
        test_result = test_solution(problem, solution_result["solution"])

        if test_result["passed"]:
            print(f"\033[0;32mPASSED\033[0m")
            passed_count += 1
        else:
            print(f"\033[0;31mFAILED\033[0m")
            failed_count += 1

        problem_result = {
            "task_id": task_id,
            "passed": test_result["passed"],
            "error": test_result.get("error"),
            "solution_file": solution_file
        }

    results["problems"].append(problem_result)

    # Save intermediate results
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2)

# Final results
elapsed_time = time.time() - start_time
pass_rate = (passed_count / len(problems)) * 100 if problems else 0

results["status"] = "COMPLETED"
results["passed"] = passed_count
results["failed"] = failed_count
results["errors"] = error_count
results["pass_rate"] = round(pass_rate, 2)
results["elapsed_seconds"] = round(elapsed_time, 2)

with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n{'='*60}")
print(f"  RESULTS")
print(f"{'='*60}")
print(f"  Passed:     {passed_count}/{len(problems)}")
print(f"  Failed:     {failed_count}/{len(problems)}")
print(f"  Errors:     {error_count}/{len(problems)}")
print(f"  Pass Rate:  {pass_rate:.1f}%")
print(f"  Time:       {elapsed_time:.1f}s")
print(f"{'='*60}\n")

# Compare to competitors
print("  Competitor Comparison:")
print(f"  - MetaGPT:    85.9-87.7%")
print(f"  - Loki Mode:  {pass_rate:.1f}%")
if pass_rate >= 85:
    print(f"  Status: \033[0;32mCOMPETITIVE\033[0m")
elif pass_rate >= 70:
    print(f"  Status: \033[0;33mGOOD\033[0m")
else:
    print(f"  Status: \033[0;31mNEEDS IMPROVEMENT\033[0m")
print(f"{'='*60}\n")
HUMANEVAL_EXECUTE
    then
        log_error "HumanEval benchmark execution failed"
        return 1
    fi

    log_success "HumanEval benchmark execution complete"
    log_info "Results: $results_file"
    log_info "Solutions: $solutions_dir/"
}
|
|
501
|
-
|
|
502
|
-
#===============================================================================
|
|
503
|
-
# Loki Mode Multi-Agent HumanEval Benchmark
|
|
504
|
-
# Uses: Architect -> Engineer -> QA -> Reviewer with RARV cycle
|
|
505
|
-
#===============================================================================
|
|
506
|
-
|
|
507
|
-
# Run the HumanEval benchmark through the Loki Mode multi-agent pipeline.
#
# Pipeline per problem: Architect -> Engineer -> QA -> [Reviewer -> Engineer-Fix]*
# The QA step executes the generated code against the HumanEval tests; the
# Reviewer/Engineer-Fix loop runs until the tests pass or MAX_RETRIES QA
# attempts have been made.
#
# Globals read: SCRIPT_DIR, RESULTS_DIR, CLAUDE_MODEL, MAX_RETRIES,
#               PROBLEM_LIMIT, PROBLEM_TIMEOUT
# Outputs:      $RESULTS_DIR/humaneval-loki-results.json
#               $RESULTS_DIR/humaneval-loki-solutions/<task number>.py
# Returns:      0 on success, 1 if the dataset is missing, otherwise the
#               exit status of the embedded Python program.
run_humaneval_loki() {
    local dataset_file="$SCRIPT_DIR/datasets/humaneval.jsonl"
    local results_file="$RESULTS_DIR/humaneval-loki-results.json"
    local solutions_dir="$RESULTS_DIR/humaneval-loki-solutions"
    local py_status=0

    # Fail early with a readable message instead of a Python traceback.
    if [ ! -f "$dataset_file" ]; then
        log_warning "HumanEval dataset not found: $dataset_file"
        return 1
    fi

    mkdir -p "$solutions_dir"

    log_info "Executing HumanEval with Loki Mode Multi-Agent System..."
    log_info "Model: $CLAUDE_MODEL | Retries: $MAX_RETRIES | Limit: ${PROBLEM_LIMIT:-all}"
    log_info "Agents: Architect -> Engineer -> QA -> Reviewer (RARV cycle)"

    # Export variables for Python
    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL MAX_RETRIES

    # SCRIPT_DIR/RESULTS_DIR are passed explicitly so the Python side always
    # uses the same paths as the shell side, whether or not they are exported.
    SCRIPT_DIR="$SCRIPT_DIR" RESULTS_DIR="$RESULTS_DIR" python3 << 'HUMANEVAL_LOKI' || py_status=$?
import json
import os
import subprocess
import sys
import tempfile
import time
from datetime import datetime


def env_int(name, default):
    """Read an integer setting from the environment.

    An unset or empty value yields `default`; a non-numeric value is reported
    on stderr and also yields `default`, so an exported-but-empty shell
    variable cannot abort the run.
    """
    raw = (os.environ.get(name) or '').strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"Ignoring non-numeric {name}={raw!r}; using {default}", file=sys.stderr)
        return default


SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
PROBLEM_LIMIT = env_int('PROBLEM_LIMIT', 0)  # 0 (or less) means "all problems"
PROBLEM_TIMEOUT = env_int('PROBLEM_TIMEOUT', 120)
CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')
MAX_RETRIES = env_int('MAX_RETRIES', 3)

dataset_file = f"{SCRIPT_DIR}/datasets/humaneval.jsonl"
results_file = f"{RESULTS_DIR}/humaneval-loki-results.json"
solutions_dir = f"{RESULTS_DIR}/humaneval-loki-solutions"

# Load problems (one JSON object per line; blank lines are skipped)
problems = []
with open(dataset_file, 'r') as f:
    for line in f:
        if line.strip():
            problems.append(json.loads(line))

if PROBLEM_LIMIT > 0:
    problems = problems[:PROBLEM_LIMIT]

print(f"\n{'='*70}")
print(f" LOKI MODE Multi-Agent HumanEval Benchmark")
print(f" Problems: {len(problems)} | Model: {CLAUDE_MODEL} | Max Retries: {MAX_RETRIES}")
print(f" Agent Pipeline: Architect -> Engineer -> QA -> Reviewer")
print(f"{'='*70}\n")


def call_agent(agent_name, prompt, timeout=PROBLEM_TIMEOUT):
    """Call a Loki Mode agent with a specific role.

    Runs `claude -p <prompt> --model <CLAUDE_MODEL>` and returns a tuple
    (output, error). Exactly one of the two is None, except that a successful
    call may legitimately return an empty string as output.
    """
    try:
        result = subprocess.run(
            ['claude', '-p', prompt, '--model', CLAUDE_MODEL],
            capture_output=True,
            text=True,
            timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return None, "TIMEOUT"
    except Exception as e:
        return None, str(e)

    output = result.stdout.strip()
    # A non-zero exit with nothing on stdout is a CLI failure, not an answer.
    if result.returncode != 0 and not output:
        detail = result.stderr.strip() or f"exit code {result.returncode}"
        return None, f"{agent_name} call error: {detail}"
    return output, None


def architect_agent(problem):
    """Architect: Analyze problem and design approach. Returns (analysis, error)."""
    prompt = f'''You are the ARCHITECT AGENT in a multi-agent coding system.

TASK: Analyze this HumanEval problem and design the solution approach.

PROBLEM:
{problem["prompt"]}

Your job:
1. Understand what the function should do
2. Identify edge cases and constraints
3. Design the algorithm/approach
4. Note any potential pitfalls

Output a brief analysis (3-5 lines) with:
- What the function does
- Key algorithm/approach
- Edge cases to handle

Keep it concise - the Engineer agent will implement based on your analysis.'''

    return call_agent("Architect", prompt, timeout=30)


def engineer_agent(problem, architect_analysis):
    """Engineer: Implement the solution based on architect's design. Returns (code, error)."""
    prompt = f'''You are the ENGINEER AGENT in a multi-agent coding system.

TASK: Implement the solution based on the Architect's analysis.

PROBLEM:
{problem["prompt"]}

ARCHITECT'S ANALYSIS:
{architect_analysis}

INSTRUCTIONS:
1. Output the COMPLETE function including signature and docstring
2. Implement based on the architect's approach
3. Use proper 4-space indentation
4. Handle the edge cases identified
5. Output ONLY Python code - no markdown, no explanation

Output the complete function now:'''

    return call_agent("Engineer", prompt)


def qa_agent(problem, solution):
    """QA: Run the HumanEval tests against `solution`.

    Returns a dict with keys "passed" (bool), "output" (str) and "error"
    (None when the tests passed, otherwise the failure text or "TIMEOUT").

    NOTE: this executes model-generated code in a subprocess with a 10 second
    timeout and no further sandboxing.
    """
    test = problem["test"]
    entry_point = problem["entry_point"]

    # First, actually run the tests
    test_code = f'''
{solution}

{test}

check({entry_point})
print("ALL_TESTS_PASSED")
'''

    temp_file = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
            f.write(test_code)
            temp_file = f.name

        result = subprocess.run(
            ['python3', temp_file],
            capture_output=True,
            text=True,
            timeout=10
        )

        if "ALL_TESTS_PASSED" in result.stdout:
            return {"passed": True, "output": "All tests passed", "error": None}
        error_msg = result.stderr or result.stdout or "Unknown error"
        return {"passed": False, "output": error_msg, "error": error_msg}
    except subprocess.TimeoutExpired:
        return {"passed": False, "output": "Test timeout", "error": "TIMEOUT"}
    except Exception as e:
        return {"passed": False, "output": str(e), "error": str(e)}
    finally:
        # Remove the scratch file on every exit path, including errors.
        if temp_file and os.path.exists(temp_file):
            os.unlink(temp_file)


def reviewer_agent(problem, solution, qa_result):
    """Reviewer: Review solution quality and suggest improvements if tests failed."""
    if qa_result["passed"]:
        return {"approved": True, "feedback": "Solution passes all tests"}

    prompt = f'''You are the CODE REVIEWER AGENT in a multi-agent coding system.

The QA agent found issues with this solution. Analyze and suggest fixes.

PROBLEM:
{problem["prompt"]}

CURRENT SOLUTION:
{solution}

TEST ERROR:
{qa_result["error"]}

Analyze the error and provide:
1. What went wrong (1 line)
2. How to fix it (1-2 lines)

Keep feedback concise - the Engineer will use it to fix the code.'''

    feedback, error = call_agent("Reviewer", prompt, timeout=30)
    return {"approved": False, "feedback": feedback or "No feedback", "error": error}


def engineer_fix_agent(problem, solution, feedback, attempt):
    """Engineer: Fix the solution based on reviewer feedback. Returns (code, error)."""
    prompt = f'''You are the ENGINEER AGENT. Your previous solution failed tests.

PROBLEM:
{problem["prompt"]}

PREVIOUS SOLUTION:
{solution}

REVIEWER FEEDBACK:
{feedback}

ATTEMPT: {attempt}/{MAX_RETRIES}

Fix the solution based on the feedback.
Output the COMPLETE corrected function - no explanations, just code.'''

    return call_agent("Engineer-Fix", prompt)


def normalize_solution(raw, problem):
    """Turn a raw agent reply into runnable source for the problem.

    Strips a surrounding markdown code fence, and if the reply does not
    contain the `def <entry_point>` signature it is treated as a bare function
    body: unindented lines are indented by four spaces (matching the HumanEval
    prompt body) and the result is appended to the problem prompt.
    """
    code = raw
    if code.startswith("```python"):
        code = code[9:]
    if code.startswith("```"):
        code = code[3:]
    if code.endswith("```"):
        code = code[:-3]
    code = code.strip()

    # Ensure function signature is present
    if f"def {problem['entry_point']}" not in code:
        body_lines = [
            '    ' + line if line.strip() and not line.startswith(' ') else line
            for line in code.split('\n')
        ]
        code = problem["prompt"] + '\n'.join(body_lines)
    return code


def solve_with_loki_mode(problem):
    """
    Solve a HumanEval problem using Loki Mode multi-agent system.

    Pipeline: Architect -> Engineer -> QA -> [Reviewer -> Engineer-Fix]* -> Pass/Fail

    Returns a dict with "task_id", "solution", "passed", "error", "attempts",
    "agent_trace" and "outcome". "outcome" is "passed", "failed" (tests never
    passed) or "error" (an agent call itself failed) and is what the summary
    counters are based on.
    """
    task_id = problem["task_id"]

    agent_trace = []

    # Step 1: Architect analyzes the problem
    architect_analysis, error = architect_agent(problem)
    agent_trace.append({"agent": "Architect", "output": architect_analysis, "error": error})

    if error:
        return {
            "task_id": task_id,
            "solution": None,
            "passed": False,
            "error": f"Architect failed: {error}",
            "attempts": 1,
            "agent_trace": agent_trace,
            "outcome": "error"
        }

    # Step 2: Engineer implements solution
    solution, error = engineer_agent(problem, architect_analysis)
    agent_trace.append({"agent": "Engineer", "output": solution[:200] if solution else None, "error": error})

    if error or not solution:
        return {
            "task_id": task_id,
            "solution": None,
            "passed": False,
            "error": f"Engineer failed: {error}",
            "attempts": 1,
            "agent_trace": agent_trace,
            "outcome": "error"
        }

    # Clean up solution
    solution = normalize_solution(solution, problem)

    # RARV Loop: QA -> Reviewer -> Engineer-Fix
    for attempt in range(1, MAX_RETRIES + 1):
        # Step 3: QA tests the solution
        qa_result = qa_agent(problem, solution)
        agent_trace.append({"agent": "QA", "passed": qa_result["passed"], "error": qa_result.get("error")})

        if qa_result["passed"]:
            return {
                "task_id": task_id,
                "solution": solution,
                "passed": True,
                "error": None,
                "attempts": attempt,
                "agent_trace": agent_trace,
                "outcome": "passed"
            }

        if attempt >= MAX_RETRIES:
            break

        # Step 4: Reviewer analyzes failure
        review = reviewer_agent(problem, solution, qa_result)
        agent_trace.append({"agent": "Reviewer", "feedback": review["feedback"][:200] if review["feedback"] else None})

        # Step 5: Engineer fixes based on feedback
        new_solution, error = engineer_fix_agent(problem, solution, review["feedback"], attempt + 1)
        agent_trace.append({"agent": f"Engineer-Fix-{attempt+1}", "output": new_solution[:200] if new_solution else None, "error": error})

        # Keep the previous solution when the fix call produced nothing usable.
        if new_solution and not error:
            solution = normalize_solution(new_solution, problem)

    return {
        "task_id": task_id,
        "solution": solution,
        "passed": False,
        "error": f"Failed after {MAX_RETRIES} RARV attempts",
        "attempts": MAX_RETRIES,
        "agent_trace": agent_trace,
        "outcome": "failed"
    }


# Run benchmark
results = {
    "benchmark": "HumanEval-LokiMode",
    "mode": "multi-agent",
    "version": "1.0",
    "timestamp": datetime.now().isoformat(),
    "model": CLAUDE_MODEL,
    "max_retries": MAX_RETRIES,
    "total_problems": len(problems),
    "problems": []
}

start_time = time.time()
passed_count = 0
failed_count = 0
error_count = 0
total_attempts = 0

for i, problem in enumerate(problems):
    task_id = problem["task_id"]
    task_num = int(task_id.split("/")[1])

    print(f"[{i+1}/{len(problems)}] {task_id}...", end=" ", flush=True)

    problem_result = solve_with_loki_mode(problem)

    # Save solution
    solution_file = f"{solutions_dir}/{task_num}.py"
    with open(solution_file, 'w') as f:
        f.write(f"# {task_id}\n")
        f.write(f"# Loki Mode Multi-Agent Solution\n")
        f.write(f"# Attempts: {problem_result['attempts']}\n")
        f.write(f"# Passed: {problem_result['passed']}\n\n")
        if problem_result["solution"]:
            f.write(problem_result["solution"])

    # Track results
    total_attempts += problem_result["attempts"]

    # Classify by the explicit outcome rather than by searching the error
    # text: the test-failure message itself contains the word "Failed".
    if problem_result["passed"]:
        passed_count += 1
        attempts_str = f"(attempt {problem_result['attempts']})" if problem_result['attempts'] > 1 else ""
        print(f"\033[0;32mPASSED\033[0m {attempts_str}")
    elif problem_result["outcome"] == "error":
        error_count += 1
        print(f"\033[0;31mERROR\033[0m - {problem_result['error'][:50]}")
    else:
        failed_count += 1
        print(f"\033[0;33mFAILED\033[0m after {problem_result['attempts']} attempts")

    # Store result (without full trace to save space)
    results["problems"].append({
        "task_id": task_id,
        "passed": problem_result["passed"],
        "attempts": problem_result["attempts"],
        "error": problem_result.get("error")
    })

elapsed_time = time.time() - start_time

# Final results
results["passed"] = passed_count
results["failed"] = failed_count
results["errors"] = error_count
results["pass_rate"] = (passed_count / len(problems)) * 100 if problems else 0
results["avg_attempts"] = total_attempts / len(problems) if problems else 0
results["elapsed_time"] = elapsed_time

with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)

pass_rate = results["pass_rate"]
avg_attempts = results["avg_attempts"]
# Guarded so an empty problem list does not raise ZeroDivisionError.
avg_time = elapsed_time / len(problems) if problems else 0.0

print(f"\n{'='*70}")
print(f" LOKI MODE RESULTS")
print(f"{'='*70}")
print(f" Passed: {passed_count}/{len(problems)} ({pass_rate:.1f}%)")
print(f" Failed: {failed_count}/{len(problems)}")
print(f" Errors: {error_count}/{len(problems)}")
print(f" Avg Attempts: {avg_attempts:.2f}")
print(f" Time: {elapsed_time:.1f}s ({avg_time:.1f}s avg)")
print(f"{'='*70}")
print(f"\n Comparison (baseline: MetaGPT 85.9-87.7%):")
print(f" - MetaGPT (multi-agent): 85.9-87.7%")
print(f" - Direct Claude: 98.17% (from previous run)")
print(f" - Loki Mode (multi-agent): {pass_rate:.1f}%")
if pass_rate >= 98:
    print(f" Status: \033[0;32mEXCELLENT - Beats both!\033[0m")
elif pass_rate >= 90:
    print(f" Status: \033[0;32mGREAT - Beats MetaGPT\033[0m")
elif pass_rate >= 85:
    print(f" Status: \033[0;33mCOMPETITIVE with MetaGPT\033[0m")
else:
    print(f" Status: \033[0;31mBELOW MetaGPT baseline\033[0m")
print(f"{'='*70}\n")
HUMANEVAL_LOKI

    # Do not report success when the Python program aborted.
    if [ "$py_status" -ne 0 ]; then
        log_warning "Loki Mode HumanEval benchmark failed (python3 exit status $py_status)"
        return "$py_status"
    fi

    log_success "Loki Mode HumanEval benchmark complete"
    log_info "Results: $results_file"
    log_info "Solutions: $solutions_dir/"
}
|
|
915
|
-
|
|
916
|
-
#===============================================================================
|
|
917
|
-
# SWE-bench Benchmark
|
|
918
|
-
#===============================================================================
|
|
919
|
-
|
|
920
|
-
# Ensure the SWE-bench Lite metadata file exists.
#
# No dataset is actually downloaded here: a small placeholder JSON document
# describing SWE-bench Lite (status "PLACEHOLDER") is written to
# $SCRIPT_DIR/datasets/swebench-lite.json. If that file already exists the
# function logs a message and does nothing.
#
# Globals read: SCRIPT_DIR
# Returns:      0 on success or when the file already exists, otherwise the
#               exit status of the embedded Python program.
download_swebench() {
    local datasets_dir="$SCRIPT_DIR/datasets"
    local dataset_file="$datasets_dir/swebench-lite.json"
    local py_status=0

    if [ -f "$dataset_file" ]; then
        log_info "SWE-bench Lite dataset already downloaded"
        return
    fi

    log_info "Downloading SWE-bench Lite dataset..."

    # The target directory may not exist yet on a fresh checkout.
    mkdir -p "$datasets_dir"

    # SCRIPT_DIR is passed explicitly so Python writes exactly the file the
    # shell checks for above, whether or not the variable is exported.
    SCRIPT_DIR="$SCRIPT_DIR" python3 << 'SWEBENCH_DOWNLOAD' || py_status=$?
import json
import os

SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')

# Create placeholder dataset structure
dataset = {
    "name": "SWE-bench Lite",
    "version": "1.0",
    "description": "300 real-world GitHub issues for evaluation",
    "source": "https://github.com/SWE-bench/SWE-bench",
    "problems": 300,
    "status": "PLACEHOLDER",
    "install_command": "pip install swebench",
    "run_command": "python -m swebench.harness.run_evaluation"
}

with open(f"{SCRIPT_DIR}/datasets/swebench-lite.json", 'w') as f:
    json.dump(dataset, f, indent=2)

print("SWE-bench Lite metadata saved")
SWEBENCH_DOWNLOAD

    if [ "$py_status" -ne 0 ]; then
        log_warning "Could not write SWE-bench Lite metadata (python3 exit status $py_status)"
        return "$py_status"
    fi

    log_success "SWE-bench Lite dataset metadata ready"
}
|
|
956
|
-
|
|
957
|
-
# Entry point for the SWE-bench Lite benchmark.
#
# Makes sure the dataset metadata is present, then dispatches:
#   EXECUTE_MODE != true               -> run_swebench_setup
#   EXECUTE_MODE = true, LOKI_MODE = true -> run_swebench_loki
#   EXECUTE_MODE = true, otherwise     -> run_swebench_execute
#
# Globals read: EXECUTE_MODE, LOKI_MODE
run_swebench() {
    log_info "Running SWE-bench Lite benchmark..."

    download_swebench

    # Anything other than the literal string "true" means setup only.
    if [ "$EXECUTE_MODE" != true ]; then
        run_swebench_setup
        return
    fi

    case "$LOKI_MODE" in
        true) run_swebench_loki ;;
        *) run_swebench_execute ;;
    esac
}
|
|
972
|
-
|
|
973
|
-
# Write a placeholder SWE-bench results file without running anything.
#
# Produces $RESULTS_DIR/swebench-results.json with status
# "INFRASTRUCTURE_READY" plus the install/evaluation commands to use for a
# full run.
#
# Globals read: RESULTS_DIR
# Returns:      0 on success, otherwise the exit status of the embedded
#               Python program.
run_swebench_setup() {
    local results_file="$RESULTS_DIR/swebench-results.json"
    local py_status=0

    # The results directory may not exist yet.
    mkdir -p "$RESULTS_DIR"

    # RESULTS_DIR is passed explicitly so Python writes to the same location
    # that is logged below, whether or not the variable is exported.
    RESULTS_DIR="$RESULTS_DIR" python3 << 'SWEBENCH_SETUP' || py_status=$?
import json
import os
from datetime import datetime

RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')

results = {
    "benchmark": "SWE-bench Lite",
    "version": "1.0",
    "timestamp": datetime.now().isoformat(),
    "total_problems": 300,
    "status": "INFRASTRUCTURE_READY",
    "note": "Install swebench package for full evaluation.",
    "install": "pip install swebench",
    "evaluation": "python -m swebench.harness.run_evaluation --predictions predictions.json"
}

with open(f"{RESULTS_DIR}/swebench-results.json", 'w') as f:
    json.dump(results, f, indent=2)

print(f"Results saved to {RESULTS_DIR}/swebench-results.json")
SWEBENCH_SETUP

    if [ "$py_status" -ne 0 ]; then
        log_warning "Could not write SWE-bench results file (python3 exit status $py_status)"
        return "$py_status"
    fi

    log_success "SWE-bench benchmark infrastructure ready"
    log_info "Results: $results_file"
}
|
|
1003
|
-
|
|
1004
|
-
# Generate SWE-bench Lite patches with a single direct Claude call per issue.
#
# For each selected problem the issue text (and hints) are sent to the
# `claude` CLI, the reply is stored as a patch file, and a predictions file in
# the format expected by the SWE-bench evaluator is written. The patches are
# NOT evaluated here; the summary prints the evaluator command to run next.
#
# Globals read: SCRIPT_DIR, RESULTS_DIR, PROBLEM_LIMIT, PROBLEM_TIMEOUT,
#               CLAUDE_MODEL
# Outputs:      $RESULTS_DIR/swebench-results.json
#               $RESULTS_DIR/swebench-predictions.json
#               $RESULTS_DIR/swebench-patches/<instance id>.patch
# Returns:      0 on success, non-zero if dependencies cannot be installed or
#               the embedded Python program fails (e.g. dataset not loadable).
run_swebench_execute() {
    local py_status=0

    log_info "Executing SWE-bench Lite benchmark..."

    # Check if swebench is installed
    if ! python3 -c "import swebench" 2>/dev/null; then
        log_warning "SWE-bench package not installed. Installing..."
        # Install via the same interpreter that runs the benchmark, so the
        # package lands where `python3` can import it.
        if ! python3 -m pip install -q swebench datasets; then
            log_warning "Could not install swebench/datasets"
            return 1
        fi
    fi

    export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL

    mkdir -p "$RESULTS_DIR"

    # SCRIPT_DIR/RESULTS_DIR are passed explicitly so the Python side uses the
    # same paths that are logged below, whether or not they are exported.
    SCRIPT_DIR="$SCRIPT_DIR" RESULTS_DIR="$RESULTS_DIR" python3 << 'SWEBENCH_EXECUTE' || py_status=$?
import json
import os
import subprocess
import sys
import time
from datetime import datetime

try:
    from datasets import load_dataset
except ImportError:
    print("Installing SWE-bench dependencies...")
    subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'swebench', 'datasets'])
    from datasets import load_dataset


def env_int(name, default):
    """Read an integer setting from the environment.

    An unset or empty value yields `default`; a non-numeric value is reported
    on stderr and also yields `default`.
    """
    raw = (os.environ.get(name) or '').strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"Ignoring non-numeric {name}={raw!r}; using {default}", file=sys.stderr)
        return default


DEFAULT_PROBLEM_LIMIT = 10  # Default to 10 for SWE-bench

SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
PROBLEM_LIMIT = env_int('PROBLEM_LIMIT', DEFAULT_PROBLEM_LIMIT)
# A non-positive limit would slice the dataset down to nothing; treat it like
# "no explicit limit" and fall back to the SWE-bench default instead.
if PROBLEM_LIMIT <= 0:
    PROBLEM_LIMIT = DEFAULT_PROBLEM_LIMIT
PROBLEM_TIMEOUT = env_int('PROBLEM_TIMEOUT', 300)
CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')

results_file = f"{RESULTS_DIR}/swebench-results.json"
patches_dir = f"{RESULTS_DIR}/swebench-patches"
os.makedirs(patches_dir, exist_ok=True)

print(f"\n{'='*60}")
print(f" SWE-bench Lite Benchmark Execution")
print(f" Limit: {PROBLEM_LIMIT} | Model: {CLAUDE_MODEL}")
print(f"{'='*60}\n")

# Load SWE-bench Lite dataset
print("Loading SWE-bench Lite dataset...")
try:
    dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
    problems = list(dataset)[:PROBLEM_LIMIT]
    print(f"Loaded {len(problems)} problems")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Using placeholder results...")
    results = {
        "benchmark": "SWE-bench Lite",
        "version": "1.0",
        "timestamp": datetime.now().isoformat(),
        "status": "DATASET_ERROR",
        "error": str(e),
        "note": "Could not load SWE-bench dataset. Check network and try again."
    }
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2)
    sys.exit(1)


def solve_swebench_problem(problem):
    """Generate a patch for a SWE-bench problem using Claude.

    Returns a dict with "instance_id", "model_patch" (str, or None on error)
    and "error" (None on success, otherwise a short description).
    """
    instance_id = problem["instance_id"]
    repo = problem["repo"]
    problem_statement = problem["problem_statement"]
    hints = problem.get("hints_text", "")

    # Create prompt for Claude
    prompt = f'''You are solving a real GitHub issue from the {repo} repository.

## Problem Statement
{problem_statement}

## Hints
{hints if hints else "No hints available."}

## Task
Generate a git patch (unified diff format) that fixes this issue.

Output ONLY the patch content in unified diff format. Example format:
--- a/file.py
+++ b/file.py
@@ -10,6 +10,7 @@
 existing line
+new line
 existing line

Do not include any explanation or markdown code blocks. Just the raw patch.'''

    try:
        result = subprocess.run(
            ['claude', '-p', prompt, '--model', CLAUDE_MODEL],
            capture_output=True,
            text=True,
            timeout=PROBLEM_TIMEOUT
        )
    except subprocess.TimeoutExpired:
        return {"instance_id": instance_id, "model_patch": None, "error": "TIMEOUT"}
    except Exception as e:
        return {"instance_id": instance_id, "model_patch": None, "error": str(e)}

    patch = result.stdout.strip()

    # An empty reply is not a patch; report why instead of counting it as
    # generated.
    if not patch:
        if result.returncode != 0:
            detail = result.stderr.strip() or f"exit code {result.returncode}"
        else:
            detail = "EMPTY_OUTPUT"
        return {"instance_id": instance_id, "model_patch": None, "error": detail}

    # Clean up patch if wrapped in markdown
    if patch.startswith("```"):
        lines = patch.split("\n")
        patch = "\n".join(lines[1:-1] if lines[-1] == "```" else lines[1:])

    return {
        "instance_id": instance_id,
        "model_patch": patch,
        "error": None
    }


# Run benchmark
results = {
    "benchmark": "SWE-bench Lite",
    "version": "1.0",
    "timestamp": datetime.now().isoformat(),
    "model": CLAUDE_MODEL,
    "timeout_per_problem": PROBLEM_TIMEOUT,
    "total_problems": len(problems),
    "status": "RUNNING",
    "predictions": []
}

generated_count = 0
error_count = 0
start_time = time.time()

for i, problem in enumerate(problems):
    instance_id = problem["instance_id"]

    print(f"[{i+1}/{len(problems)}] {instance_id}...", end=" ", flush=True)

    solution = solve_swebench_problem(problem)

    if solution["error"]:
        print(f"\033[0;31mERROR: {solution['error']}\033[0m")
        error_count += 1
    else:
        print(f"\033[0;32mGENERATED\033[0m")
        generated_count += 1

        # Save patch (only when one exists; model_patch is None on error)
        patch_file = f"{patches_dir}/{instance_id.replace('/', '_')}.patch"
        with open(patch_file, 'w') as f:
            f.write(solution["model_patch"])

    # Add to predictions (format required by SWE-bench evaluator)
    results["predictions"].append({
        "instance_id": instance_id,
        "model_patch": solution["model_patch"] or "",
        "model_name_or_path": f"loki-mode-{CLAUDE_MODEL}"
    })

    # Save intermediate results
    with open(results_file, 'w') as f:
        json.dump(results, f, indent=2)

# Save predictions file for SWE-bench evaluator
predictions_file = f"{RESULTS_DIR}/swebench-predictions.json"
with open(predictions_file, 'w') as f:
    json.dump(results["predictions"], f, indent=2)

elapsed_time = time.time() - start_time

results["status"] = "PATCHES_GENERATED"
results["generated"] = generated_count
results["errors"] = error_count
results["elapsed_seconds"] = round(elapsed_time, 2)
results["predictions_file"] = predictions_file
results["next_step"] = "Run: python -m swebench.harness.run_evaluation --predictions " + predictions_file

with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n{'='*60}")
print(f" RESULTS")
print(f"{'='*60}")
print(f" Generated: {generated_count}/{len(problems)}")
print(f" Errors: {error_count}/{len(problems)}")
print(f" Time: {elapsed_time:.1f}s")
print(f"{'='*60}")
print(f"\n Next Step: Run SWE-bench evaluator")
print(f" python -m swebench.harness.run_evaluation \\")
print(f" --predictions {predictions_file} \\")
print(f" --max_workers 4")
print(f"{'='*60}\n")
SWEBENCH_EXECUTE

    # Do not report success when the Python program aborted.
    if [ "$py_status" -ne 0 ]; then
        log_warning "SWE-bench patch generation failed (python3 exit status $py_status)"
        return "$py_status"
    fi

    log_success "SWE-bench patch generation complete"
    log_info "Results: $RESULTS_DIR/swebench-results.json"
    log_info "Predictions: $RESULTS_DIR/swebench-predictions.json"
}
|
|
1205
|
-
|
|
1206
|
-
#===============================================================================
|
|
1207
|
-
# Loki Mode Multi-Agent SWE-bench Benchmark
|
|
1208
|
-
# Uses: Architect -> Engineer -> QA -> Reviewer with RARV cycle
|
|
1209
|
-
#===============================================================================
|
|
1210
|
-
|
|
1211
|
-
run_swebench_loki() {
|
|
1212
|
-
log_info "Executing SWE-bench Lite with Loki Mode Multi-Agent System..."
|
|
1213
|
-
log_info "Model: $CLAUDE_MODEL | Retries: $MAX_RETRIES | Limit: ${PROBLEM_LIMIT:-all}"
|
|
1214
|
-
log_info "Agents: Architect -> Engineer -> QA -> Reviewer (RARV cycle)"
|
|
1215
|
-
log_info "Trajectory logging: ENABLED (for official submission)"
|
|
1216
|
-
|
|
1217
|
-
# Check if swebench is installed
|
|
1218
|
-
if ! python3 -c "import swebench" 2>/dev/null; then
|
|
1219
|
-
log_warning "SWE-bench package not installed. Installing..."
|
|
1220
|
-
pip install -q swebench datasets
|
|
1221
|
-
fi
|
|
1222
|
-
|
|
1223
|
-
export PROBLEM_LIMIT PROBLEM_TIMEOUT CLAUDE_MODEL MAX_RETRIES
|
|
1224
|
-
|
|
1225
|
-
python3 << 'SWEBENCH_LOKI'
|
|
1226
|
-
import json
|
|
1227
|
-
import subprocess
|
|
1228
|
-
import os
|
|
1229
|
-
import sys
|
|
1230
|
-
import time
|
|
1231
|
-
import re
|
|
1232
|
-
from datetime import datetime
|
|
1233
|
-
|
|
1234
|
-
try:
|
|
1235
|
-
from datasets import load_dataset
|
|
1236
|
-
except ImportError:
|
|
1237
|
-
subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', 'swebench', 'datasets'])
|
|
1238
|
-
from datasets import load_dataset
|
|
1239
|
-
|
|
1240
|
-
SCRIPT_DIR = os.environ.get('SCRIPT_DIR', '.')
|
|
1241
|
-
RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')
|
|
1242
|
-
PROBLEM_LIMIT = int(os.environ.get('PROBLEM_LIMIT', '0'))
|
|
1243
|
-
PROBLEM_TIMEOUT = int(os.environ.get('PROBLEM_TIMEOUT', '300'))
|
|
1244
|
-
CLAUDE_MODEL = os.environ.get('CLAUDE_MODEL', 'sonnet')
|
|
1245
|
-
MAX_RETRIES = int(os.environ.get('MAX_RETRIES', '3'))
|
|
1246
|
-
|
|
1247
|
-
results_file = f"{RESULTS_DIR}/swebench-loki-results.json"
|
|
1248
|
-
patches_dir = f"{RESULTS_DIR}/swebench-loki-patches"
|
|
1249
|
-
trajs_dir = f"{RESULTS_DIR}/trajs" # Trajectory logs for official submission
|
|
1250
|
-
logs_dir = f"{RESULTS_DIR}/logs" # Execution logs for official submission
|
|
1251
|
-
os.makedirs(patches_dir, exist_ok=True)
|
|
1252
|
-
os.makedirs(trajs_dir, exist_ok=True)
|
|
1253
|
-
os.makedirs(logs_dir, exist_ok=True)
|
|
1254
|
-
|
|
1255
|
-
print(f"\n{'='*70}")
|
|
1256
|
-
print(f" LOKI MODE Multi-Agent SWE-bench Lite Benchmark")
|
|
1257
|
-
print(f" Limit: {PROBLEM_LIMIT if PROBLEM_LIMIT > 0 else 'all'} | Model: {CLAUDE_MODEL} | Max Retries: {MAX_RETRIES}")
|
|
1258
|
-
print(f" Agent Pipeline: Architect -> Engineer -> QA -> Reviewer")
|
|
1259
|
-
print(f"{'='*70}\n")
|
|
1260
|
-
|
|
1261
|
-
# Load dataset
|
|
1262
|
-
print("Loading SWE-bench Lite dataset...")
|
|
1263
|
-
try:
|
|
1264
|
-
dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
|
|
1265
|
-
problems = list(dataset)
|
|
1266
|
-
if PROBLEM_LIMIT > 0:
|
|
1267
|
-
problems = problems[:PROBLEM_LIMIT]
|
|
1268
|
-
print(f"Loaded {len(problems)} problems")
|
|
1269
|
-
except Exception as e:
|
|
1270
|
-
print(f"Error loading dataset: {e}")
|
|
1271
|
-
sys.exit(1)
|
|
1272
|
-
|
|
1273
|
-
def call_agent(agent_name, prompt, timeout=PROBLEM_TIMEOUT):
|
|
1274
|
-
"""Call a Loki Mode agent with a specific role. Returns (output, error, metadata)."""
|
|
1275
|
-
start_time = time.time()
|
|
1276
|
-
try:
|
|
1277
|
-
result = subprocess.run(
|
|
1278
|
-
['claude', '-p', prompt, '--model', CLAUDE_MODEL],
|
|
1279
|
-
capture_output=True,
|
|
1280
|
-
text=True,
|
|
1281
|
-
timeout=timeout
|
|
1282
|
-
)
|
|
1283
|
-
elapsed = time.time() - start_time
|
|
1284
|
-
return result.stdout.strip(), None, {
|
|
1285
|
-
"agent": agent_name,
|
|
1286
|
-
"model": CLAUDE_MODEL,
|
|
1287
|
-
"elapsed_seconds": round(elapsed, 2),
|
|
1288
|
-
"prompt_length": len(prompt),
|
|
1289
|
-
"output_length": len(result.stdout),
|
|
1290
|
-
"timestamp": datetime.now().isoformat()
|
|
1291
|
-
}
|
|
1292
|
-
except subprocess.TimeoutExpired:
|
|
1293
|
-
elapsed = time.time() - start_time
|
|
1294
|
-
return None, "TIMEOUT", {
|
|
1295
|
-
"agent": agent_name,
|
|
1296
|
-
"model": CLAUDE_MODEL,
|
|
1297
|
-
"elapsed_seconds": round(elapsed, 2),
|
|
1298
|
-
"error": "TIMEOUT",
|
|
1299
|
-
"timestamp": datetime.now().isoformat()
|
|
1300
|
-
}
|
|
1301
|
-
except Exception as e:
|
|
1302
|
-
return None, str(e), {
|
|
1303
|
-
"agent": agent_name,
|
|
1304
|
-
"error": str(e),
|
|
1305
|
-
"timestamp": datetime.now().isoformat()
|
|
1306
|
-
}
|
|
1307
|
-
|
|
1308
|
-
def architect_agent(problem):
    """Architect: analyze the issue and design the fix approach.

    Args:
        problem: Mapping with "repo" and "problem_statement" keys and an
            optional "hints_text" key.

    Returns:
        The ``(output, error, metadata)`` tuple from ``call_agent``, with the
        prompt and the output added to ``metadata``.
    """
    repo = problem["repo"]
    issue = problem["problem_statement"]
    # dict.get only applies its default when the key is missing; a present but
    # empty hints_text would leave the HINTS section blank, so fall back on any
    # falsy value.
    hints = problem.get("hints_text") or "No hints available."

    prompt = f'''You are the ARCHITECT AGENT analyzing a GitHub issue.

REPOSITORY: {repo}
ISSUE:
{issue}

HINTS:
{hints}

Your job:
1. Understand what the issue is about
2. Identify which file(s) likely need to be changed
3. Describe the fix approach (2-3 sentences)
4. Note any edge cases

Output a brief analysis (5-7 lines max) with:
- What the bug/issue is
- Files likely affected
- Fix strategy

Keep it concise - the Engineer agent will generate the patch.'''

    # The Architect gets a fixed 120-second budget rather than the default timeout.
    output, error, metadata = call_agent("Architect", prompt, timeout=120)
    metadata["prompt"] = prompt
    metadata["output"] = output
    return output, error, metadata
|
|
1336
|
-
|
|
1337
|
-
def engineer_agent(problem, architect_analysis):
    """Engineer: ask the model for a unified-diff patch guided by the Architect.

    Returns the ``(output, error, metadata)`` tuple from ``call_agent`` with
    the prompt and the output recorded in ``metadata``.
    """
    repo = problem["repo"]
    issue = problem["problem_statement"]

    # The prompt is assembled line by line; joining with "\n" yields the same
    # text as a single multi-line template.
    prompt = "\n".join([
        "You are the ENGINEER AGENT generating a patch for a GitHub issue.",
        "",
        f"REPOSITORY: {repo}",
        "ISSUE:",
        f"{issue}",
        "",
        "ARCHITECT'S ANALYSIS:",
        f"{architect_analysis}",
        "",
        "Generate a git patch (unified diff format) that fixes this issue.",
        "",
        "IMPORTANT:",
        "1. Output ONLY the patch in unified diff format",
        "2. Include proper file paths with a/ and b/ prefixes",
        "3. Include @@ line numbers",
        "4. No explanations, no markdown code blocks, just raw patch",
        "",
        "Example format:",
        "--- a/path/to/file.py",
        "+++ b/path/to/file.py",
        "@@ -10,6 +10,7 @@",
        "existing line",
        "+new line",
        "existing line",
        "",
        "Generate the patch now:",
    ])

    response = call_agent("Engineer", prompt)
    output, error, metadata = response
    metadata.update({"prompt": prompt, "output": output})
    return output, error, metadata
|
|
1370
|
-
|
|
1371
|
-
def qa_agent(patch):
    """QA: validate the patch format. Returns validation result with metadata.

    The checks are line-based so that file headers are not mistaken for
    changes and arbitrary paths are not mistaken for a/ and b/ prefixes.

    Args:
        patch: Patch text to validate (may be None or empty).

    Returns:
        A dict with "valid", "error" (None when valid), "checks" (list of
        ``{"check": name, "passed": bool}``), "elapsed_seconds" and
        "timestamp". The first failing check, in the order markdown wrap,
        diff headers, hunk headers, changes, path prefixes, sets "error".
    """
    start_time = time.time()
    checks = []

    def _verdict(valid, error):
        # Single place that builds the result so every path has the same keys.
        return {
            "valid": valid,
            "error": error,
            "checks": checks,
            "elapsed_seconds": round(time.time() - start_time, 2),
            "timestamp": datetime.now().isoformat(),
        }

    if not patch:
        return _verdict(False, "Empty patch")

    lines = patch.splitlines()
    old_headers = [line for line in lines if line.startswith("--- ")]
    new_headers = [line for line in lines if line.startswith("+++ ")]

    # Check for basic patch structure
    has_diff_header = bool(old_headers and new_headers)
    checks.append({"check": "diff_headers", "passed": has_diff_header})

    has_hunk_header = any(line.startswith("@@") for line in lines)
    checks.append({"check": "hunk_headers", "passed": has_hunk_header})

    # A change is an added/removed line that is not a file header. A bare
    # substring test would always pass because "+++"/"---" contain "+"/"-".
    # Limitation: a removed line whose content starts with "-- " renders as
    # "--- ..." and is treated as a header here.
    has_changes = any(
        line[:1] in ("+", "-") and not line.startswith(("--- ", "+++ "))
        for line in lines
    )
    checks.append({"check": "has_changes", "passed": has_changes})

    # Check for markdown wrapping (common error)
    is_wrapped = patch.startswith("```")
    checks.append({"check": "no_markdown_wrap", "passed": not is_wrapped})

    # Check for proper file paths; /dev/null is accepted so that patches
    # creating or deleting a file are not rejected.
    has_path_prefixes = (
        any(h.startswith(("--- a/", "--- /dev/null")) for h in old_headers)
        and any(h.startswith(("+++ b/", "+++ /dev/null")) for h in new_headers)
    )
    checks.append({"check": "path_prefixes", "passed": has_path_prefixes})

    if is_wrapped:
        return _verdict(False, "Patch wrapped in markdown code blocks")

    if not has_diff_header:
        return _verdict(False, "Missing diff headers (--- and +++)")

    if not has_hunk_header:
        return _verdict(False, "Missing hunk headers (@@)")

    if not has_changes:
        return _verdict(False, "No actual changes in patch")

    if not has_path_prefixes:
        return _verdict(False, "Missing a/ or b/ path prefixes")

    return _verdict(True, None)
|
|
1416
|
-
|
|
1417
|
-
def reviewer_agent(problem, patch, qa_result):
    """Reviewer: turn a QA format failure into short fix-it feedback.

    When QA already passed, no model call is made and an approved result with
    "skipped" metadata is returned. Otherwise the Reviewer agent is asked for
    feedback; if it produces none, the QA error text is used as the feedback.
    """
    if qa_result["valid"]:
        skipped_meta = {"agent": "Reviewer", "skipped": True, "timestamp": datetime.now().isoformat()}
        return {"approved": True, "feedback": "Patch format is valid", "metadata": skipped_meta}

    # Only excerpts of the issue and the patch are sent to the reviewer.
    issue_excerpt = problem["problem_statement"][:500]
    patch_excerpt = patch[:1000] if patch else "Empty"
    format_error = qa_result["error"]

    prompt = "\n".join([
        "You are the CODE REVIEWER AGENT. The generated patch has format issues.",
        "",
        "ISSUE:",
        f"{issue_excerpt}",
        "",
        "CURRENT PATCH:",
        f"{patch_excerpt}",
        "",
        "FORMAT ERROR:",
        f"{format_error}",
        "",
        "Provide brief feedback (2-3 lines) on how to fix the patch format:",
        "- What's wrong",
        "- How to fix it",
    ])

    feedback, error, metadata = call_agent("Reviewer", prompt, timeout=60)
    metadata.update({"prompt": prompt, "output": feedback})
    return {
        "approved": False,
        "feedback": feedback or qa_result["error"],
        "error": error,
        "metadata": metadata,
    }
|
|
1441
|
-
|
|
1442
|
-
def engineer_fix_agent(problem, patch, feedback, attempt):
    """Engineer: regenerate the patch using the reviewer's feedback.

    Returns the ``(output, error, metadata)`` tuple from ``call_agent`` with
    the prompt, the output and the attempt number recorded in ``metadata``.
    """
    issue_excerpt = problem["problem_statement"][:500]
    previous_patch = patch[:1000] if patch else "Empty"

    prompt = "\n".join([
        "You are the ENGINEER AGENT. Your previous patch had format issues.",
        "",
        "ISSUE:",
        f"{issue_excerpt}",
        "",
        "PREVIOUS PATCH:",
        f"{previous_patch}",
        "",
        "REVIEWER FEEDBACK:",
        f"{feedback}",
        "",
        f"ATTEMPT: {attempt}/{MAX_RETRIES}",
        "",
        "Generate a CORRECTED patch in proper unified diff format.",
        "Output ONLY the raw patch - no explanations, no markdown.",
        "",
        "--- a/path/to/file.py",
        "+++ b/path/to/file.py",
        "@@ -line,count +line,count @@",
        "...",
    ])

    output, error, metadata = call_agent("Engineer-Fix", prompt)
    metadata.update({"prompt": prompt, "output": output, "attempt": attempt})
    return output, error, metadata
|
|
1470
|
-
|
|
1471
|
-
def clean_patch(patch):
    """Strip a surrounding markdown code fence from a patch, then trim it.

    Falsy input (None or "") is returned unchanged. A fence is removed only
    when the text starts with three backticks: the opening fence line is
    dropped, and the last line is dropped too if it is a bare closing fence.
    """
    if not patch:
        return patch

    if not patch.startswith("```"):
        return patch.strip()

    # The first line is the opening fence (possibly with a language tag).
    _opening_fence, *body = patch.split("\n")
    if body and body[-1].strip() == "```":
        body.pop()
    return "\n".join(body).strip()
|
|
1486
|
-
|
|
1487
|
-
def save_trajectory(instance_id, trajectory_steps):
    """Save the full reasoning trajectory to a file for official submission.

    Args:
        instance_id: Problem identifier; "/" and ":" are replaced with "_" to
            form the file name.
        trajectory_steps: List of step dicts. Each needs an "agent" key and
            may carry "timestamp", "elapsed_seconds", "prompt", "output",
            "error" and "checks".

    Returns:
        Path of the Markdown file written under ``trajs_dir``.
    """
    safe_id = instance_id.replace("/", "_").replace(":", "_")
    traj_file = f"{trajs_dir}/{safe_id}.md"

    # Prompts and model output can contain non-ASCII text; write UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(traj_file, 'w', encoding='utf-8') as f:
        f.write(f"# Trajectory: {instance_id}\n\n")
        f.write(f"**Generated by:** Loki Mode Multi-Agent System\n")
        f.write(f"**Model:** {CLAUDE_MODEL}\n")
        f.write(f"**Timestamp:** {datetime.now().isoformat()}\n\n")
        f.write("---\n\n")

        for i, step in enumerate(trajectory_steps, 1):
            f.write(f"## Step {i}: {step['agent']}\n\n")
            f.write(f"**Timestamp:** {step.get('timestamp', 'N/A')}\n")
            f.write(f"**Duration:** {step.get('elapsed_seconds', 'N/A')}s\n\n")

            if step.get('prompt'):
                # Prompts are capped at 2000 characters to keep the file readable.
                f.write("### Prompt\n\n```\n")
                f.write(step['prompt'][:2000])
                if len(step.get('prompt', '')) > 2000:
                    f.write("\n... (truncated)")
                f.write("\n```\n\n")

            if step.get('output'):
                f.write("### Output\n\n```\n")
                f.write(step['output'])
                f.write("\n```\n\n")

            if step.get('error'):
                f.write(f"### Error\n\n`{step['error']}`\n\n")

            if step.get('checks'):
                f.write("### Validation Checks\n\n")
                for check in step['checks']:
                    status = "PASS" if check['passed'] else "FAIL"
                    f.write(f"- {check['check']}: {status}\n")
                f.write("\n")

            f.write("---\n\n")

    return traj_file
|
|
1529
|
-
|
|
1530
|
-
def save_logs(instance_id, patch, result):
    """Save execution logs for official submission.

    Creates ``<logs_dir>/<safe instance id>/`` and writes three files into it:
    ``patch.diff``, ``report.json`` and a placeholder ``test_output.txt``.

    Args:
        instance_id: Problem identifier; "/" and ":" are replaced with "_".
        patch: Patch text, or None/"" when no patch was produced.
        result: Result dict; its "attempts" and "error" keys are read.

    Returns:
        Path of the per-instance log directory.
    """
    safe_id = instance_id.replace("/", "_").replace(":", "_")
    log_dir = f"{logs_dir}/{safe_id}"
    os.makedirs(log_dir, exist_ok=True)

    # Save patch.diff. Patches can contain non-ASCII source text, so write
    # UTF-8 explicitly instead of relying on the platform default encoding.
    patch_file = f"{log_dir}/patch.diff"
    with open(patch_file, 'w', encoding='utf-8') as f:
        f.write(patch or "")

    # Save report.json
    report_file = f"{log_dir}/report.json"
    report = {
        "instance_id": instance_id,
        "model_name_or_path": f"loki-mode-{CLAUDE_MODEL}",
        "model_patch": patch or "",
        "attempts": result.get("attempts", 1),
        "success": result.get("error") is None,
        "error": result.get("error"),
        "timestamp": datetime.now().isoformat()
    }
    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    # Save test_output.txt (placeholder - would be filled by actual test run)
    test_file = f"{log_dir}/test_output.txt"
    with open(test_file, 'w', encoding='utf-8') as f:
        f.write(f"# Test output for {instance_id}\n")
        f.write(f"# Generated by Loki Mode\n")
        f.write(f"# Note: Run SWE-bench harness for actual test results\n\n")
        f.write(f"Patch generated: {'Yes' if patch else 'No'}\n")
        f.write(f"Attempts: {result.get('attempts', 1)}\n")
        f.write(f"Error: {result.get('error', 'None')}\n")

    return log_dir
|
|
1566
|
-
|
|
1567
|
-
def solve_with_loki_mode(problem):
    """Run the Architect -> Engineer -> QA -> Reviewer pipeline for one problem.

    Every exit path writes the trajectory and the log files before returning.
    The returned dict has "instance_id", "model_patch", "error", "attempts"
    and "agent_trace". If the patch still fails QA after MAX_RETRIES rounds it
    is returned anyway, with a "Format issues ..." error.
    """
    instance_id = problem["instance_id"]
    trajectory_steps = []  # Full trajectory for official submission
    agent_trace = []  # Summary trace for results JSON

    def preview(text):
        # The summary trace keeps only the first 200 characters of agent text.
        return text[:200] if text else None

    def finish(model_patch, error, attempts):
        # Build the result, persist trajectory and logs, and hand it back.
        result = {
            "instance_id": instance_id,
            "model_patch": model_patch,
            "error": error,
            "attempts": attempts,
            "agent_trace": agent_trace
        }
        save_trajectory(instance_id, trajectory_steps)
        save_logs(instance_id, model_patch, result)
        return result

    # Step 1: Architect analyzes the issue
    architect_analysis, error, arch_meta = architect_agent(problem)
    trajectory_steps.append(arch_meta)
    agent_trace.append({"agent": "Architect", "output": preview(architect_analysis), "error": error})
    if error:
        return finish(None, f"Architect failed: {error}", 1)

    # Step 2: Engineer generates patch
    patch, error, eng_meta = engineer_agent(problem, architect_analysis)
    trajectory_steps.append(eng_meta)
    agent_trace.append({"agent": "Engineer", "output": preview(patch), "error": error})
    if error or not patch:
        return finish(None, f"Engineer failed: {error}", 1)

    patch = clean_patch(patch)

    # RARV Loop: QA -> Reviewer -> Engineer-Fix
    for attempt in range(1, MAX_RETRIES + 1):
        # Step 3: QA validates patch format
        qa_result = qa_agent(patch)
        qa_error = qa_result.get("error")
        trajectory_steps.append({
            "agent": "QA",
            "timestamp": qa_result.get("timestamp"),
            "elapsed_seconds": qa_result.get("elapsed_seconds"),
            "output": f"Valid: {qa_result['valid']}, Error: {qa_error}",
            "checks": qa_result.get("checks", [])
        })
        agent_trace.append({"agent": "QA", "valid": qa_result["valid"], "error": qa_error})

        if qa_result["valid"]:
            return finish(patch, None, attempt)

        # The last round only validates; there is no further fix attempt.
        if attempt >= MAX_RETRIES:
            break

        # Step 4: Reviewer analyzes issues
        review = reviewer_agent(problem, patch, qa_result)
        if review.get("metadata"):
            trajectory_steps.append(review["metadata"])
        agent_trace.append({"agent": "Reviewer", "feedback": preview(review.get("feedback"))})

        # Step 5: Engineer fixes patch
        next_attempt = attempt + 1
        new_patch, error, fix_meta = engineer_fix_agent(problem, patch, review["feedback"], next_attempt)
        trajectory_steps.append(fix_meta)
        agent_trace.append({"agent": f"Engineer-Fix-{next_attempt}", "output": preview(new_patch), "error": error})

        # A failed fix attempt keeps the previous patch for the next QA round.
        if new_patch and not error:
            patch = clean_patch(new_patch)

    # Return even if format isn't perfect - let SWE-bench evaluator handle it
    return finish(patch, f"Format issues after {MAX_RETRIES} attempts", MAX_RETRIES)
|
|
1662
|
-
|
|
1663
|
-
# Run benchmark: solve every problem, write its patch file, print a one-line
# status and collect the prediction entry.
results = {
    "benchmark": "SWE-bench-LokiMode",
    "mode": "multi-agent",
    "version": "1.0",
    "timestamp": datetime.now().isoformat(),
    "model": CLAUDE_MODEL,
    "max_retries": MAX_RETRIES,
    "total_problems": len(problems),
    "predictions": []
}

start_time = time.time()
generated_count = fixed_by_rarv = error_count = total_attempts = 0

for position, problem in enumerate(problems, start=1):
    instance_id = problem["instance_id"]

    print(f"[{position}/{len(problems)}] {instance_id}...", end=" ", flush=True)

    result = solve_with_loki_mode(problem)
    attempts = result["attempts"]
    model_patch = result["model_patch"]
    total_attempts += attempts

    # Save patch
    patch_file = f"{patches_dir}/{instance_id.replace('/', '_')}.patch"
    with open(patch_file, 'w') as f:
        f.write(f"# {instance_id}\n")
        f.write(f"# Loki Mode Multi-Agent Patch\n")
        f.write(f"# Attempts: {attempts}\n\n")
        if model_patch:
            f.write(model_patch)

    # A patch that never passed QA is still counted as generated, but is
    # reported separately from clean patches.
    has_format_issues = (result.get("error") or "").startswith("Format")
    if not model_patch:
        error_count += 1
        print(f"\033[0;31mERROR\033[0m - {result.get('error', 'Unknown')[:40]}")
    elif has_format_issues:
        generated_count += 1
        print(f"\033[0;33mGENERATED\033[0m (format issues)")
    else:
        generated_count += 1
        if attempts > 1:
            fixed_by_rarv += 1
            print(f"\033[0;32mGENERATED\033[0m (fixed on attempt {attempts})")
        else:
            print(f"\033[0;32mGENERATED\033[0m")

    # Add to predictions
    results["predictions"].append({
        "instance_id": instance_id,
        "model_patch": model_patch or "",
        "model_name_or_path": f"loki-mode-{CLAUDE_MODEL}",
        "attempts": attempts
    })
|
|
1719
|
-
|
|
1720
|
-
elapsed_time = time.time() - start_time

# Save results
results["generated"] = generated_count
results["fixed_by_rarv"] = fixed_by_rarv
results["errors"] = error_count
results["avg_attempts"] = total_attempts / len(problems) if problems else 0
results["elapsed_time"] = elapsed_time

with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)

# Save predictions for SWE-bench evaluator
predictions_file = f"{RESULTS_DIR}/swebench-loki-predictions.json"
with open(predictions_file, 'w') as f:
    json.dump(results["predictions"], f, indent=2)

gen_rate = (generated_count / len(problems)) * 100 if problems else 0
# Guard the per-problem average the same way as the other rates so an empty
# problem list cannot raise ZeroDivisionError in the report.
avg_time = elapsed_time / len(problems) if problems else 0

print(f"\n{'='*70}")
print(f" LOKI MODE SWE-BENCH RESULTS")
print(f"{'='*70}")
print(f" Generated: {generated_count}/{len(problems)} ({gen_rate:.1f}%)")
print(f" Fixed by RARV: {fixed_by_rarv}")
print(f" Errors: {error_count}/{len(problems)}")
print(f" Avg Attempts: {results['avg_attempts']:.2f}")
print(f" Time: {elapsed_time:.1f}s ({avg_time:.1f}s avg)")
print(f"{'='*70}")
print(f"\n Output Files (for official submission):")
print(f" - Predictions: {predictions_file}")
print(f" - Trajectories: {trajs_dir}/ ({len(os.listdir(trajs_dir))} files)")
print(f" - Logs: {logs_dir}/ ({len(os.listdir(logs_dir))} dirs)")
print(f"{'='*70}")
print(f"\n Comparison:")
print(f" - Direct Claude: 99.67% patch gen")
print(f" - Loki Mode (multi-agent): {gen_rate:.1f}% patch gen")
print(f"{'='*70}")
print(f"\n Next Step: Run SWE-bench evaluator")
print(f" python -m swebench.harness.run_evaluation \\")
print(f" --predictions {predictions_file}")
print(f"{'='*70}\n")
|
|
1761
|
-
SWEBENCH_LOKI
|
|
1762
|
-
|
|
1763
|
-
log_success "Loki Mode SWE-bench patch generation complete"
|
|
1764
|
-
log_info "Results: $RESULTS_DIR/swebench-loki-results.json"
|
|
1765
|
-
log_info "Predictions: $RESULTS_DIR/swebench-loki-predictions.json"
|
|
1766
|
-
}
|
|
1767
|
-
|
|
1768
|
-
#===============================================================================
|
|
1769
|
-
# Summary Report
|
|
1770
|
-
#===============================================================================
|
|
1771
|
-
|
|
1772
|
-
#######################################
# Build SUMMARY.md from whichever benchmark result files exist.
# Globals:   RESULTS_DIR (read; falls back to ./results when unset or empty)
# Arguments: none
# Outputs:   writes <results dir>/SUMMARY.md; progress via log_* helpers
# Returns:   0 on success, 1 if the Python generator fails
#######################################
generate_summary() {
  local results_dir="${RESULTS_DIR:-./results}"

  log_info "Generating benchmark summary..."

  # The heredoc delimiter is quoted so the shell passes the Python source
  # through untouched: backticks need no escaping, and "\\" reaches Python as
  # an escaped backslash, so the rendered command keeps its line
  # continuations. RESULTS_DIR is passed explicitly so the function does not
  # depend on an earlier export.
  if ! RESULTS_DIR="$results_dir" python3 << 'SUMMARY_GEN'
import json
import os
from datetime import datetime

RESULTS_DIR = os.environ.get('RESULTS_DIR', './results')

summary = f"""# Loki Mode Benchmark Results

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Overview

This directory contains benchmark results for Loki Mode multi-agent system.

"""

# HumanEval results
humaneval_file = f"{RESULTS_DIR}/humaneval-results.json"
if os.path.exists(humaneval_file):
    with open(humaneval_file) as f:
        he = json.load(f)

    if he.get("status") == "COMPLETED":
        summary += f"""## HumanEval Results

| Metric | Value |
|--------|-------|
| Problems | {he.get('total_problems', 'N/A')} |
| Passed | {he.get('passed', 'N/A')} |
| Failed | {he.get('failed', 'N/A')} |
| **Pass Rate** | **{he.get('pass_rate', 'N/A')}%** |
| Model | {he.get('model', 'N/A')} |
| Time | {he.get('elapsed_seconds', 'N/A')}s |

### Competitor Comparison

| System | Pass@1 |
|--------|--------|
| MetaGPT | 85.9-87.7% |
| **Loki Mode** | **{he.get('pass_rate', 'N/A')}%** |

"""
    else:
        summary += f"""## HumanEval

Status: {he.get('status', 'UNKNOWN')}

To run: `./benchmarks/run-benchmarks.sh humaneval --execute`

"""

# SWE-bench results
swebench_file = f"{RESULTS_DIR}/swebench-results.json"
if os.path.exists(swebench_file):
    with open(swebench_file) as f:
        sb = json.load(f)

    if sb.get("status") == "PATCHES_GENERATED":
        summary += f"""## SWE-bench Lite Results

| Metric | Value |
|--------|-------|
| Problems | {sb.get('total_problems', 'N/A')} |
| Patches Generated | {sb.get('generated', 'N/A')} |
| Errors | {sb.get('errors', 'N/A')} |
| Model | {sb.get('model', 'N/A')} |
| Time | {sb.get('elapsed_seconds', 'N/A')}s |

**Next Step:** Run the SWE-bench evaluator to validate patches:

```bash
python -m swebench.harness.run_evaluation \\
--predictions {sb.get('predictions_file', 'swebench-predictions.json')} \\
--max_workers 4
```

"""
    else:
        summary += f"""## SWE-bench Lite

Status: {sb.get('status', 'UNKNOWN')}

To run: `./benchmarks/run-benchmarks.sh swebench --execute`

"""

summary += """## Methodology

Loki Mode uses its multi-agent architecture to solve each problem:
1. **Architect Agent** analyzes the problem
2. **Engineer Agent** implements the solution
3. **QA Agent** validates with test cases
4. **Review Agent** checks code quality

This mirrors real-world software development more accurately than single-agent approaches.

## Running Benchmarks

```bash
# Setup only (download datasets)
./benchmarks/run-benchmarks.sh all

# Execute with Claude
./benchmarks/run-benchmarks.sh humaneval --execute
./benchmarks/run-benchmarks.sh humaneval --execute --limit 10 # First 10 only
./benchmarks/run-benchmarks.sh swebench --execute --limit 5 # First 5 only

# Use different model
./benchmarks/run-benchmarks.sh humaneval --execute --model opus
```
"""

with open(f"{RESULTS_DIR}/SUMMARY.md", 'w') as f:
    f.write(summary)

print(f"Summary saved to {RESULTS_DIR}/SUMMARY.md")
SUMMARY_GEN
  then
    # Do not report success when the generator itself failed.
    log_error "Summary generation failed: $results_dir/SUMMARY.md"
    return 1
  fi

  log_success "Summary generated: $results_dir/SUMMARY.md"
}
|
|
1899
|
-
|
|
1900
|
-
#===============================================================================
|
|
1901
|
-
# Main
|
|
1902
|
-
#===============================================================================
|
|
1903
|
-
|
|
1904
|
-
#######################################
# Entry point: parse arguments, print the banner, run the selected
# benchmark(s) and generate the summary report.
# Globals:   EXECUTE_MODE, BENCHMARK (read); SCRIPT_DIR, RESULTS_DIR,
#            PROJECT_DIR (exported)
# Arguments: the script's command-line arguments
# Returns:   exits 1 on an unknown benchmark name
#######################################
main() {
  parse_args "$@"

  local rule="========================================"
  local mode_line=" Mode: SETUP"
  if [ "$EXECUTE_MODE" = true ]; then
    mode_line=" Mode: EXECUTE"
  fi

  # Banner: blank line, rule, title, mode, rule, blank line.
  printf '\n%s\n%s\n%s\n%s\n\n' \
    "$rule" " Loki Mode Benchmark Runner" "$mode_line" "$rule"

  export SCRIPT_DIR RESULTS_DIR PROJECT_DIR

  setup_environment

  case "$BENCHMARK" in
    humaneval)
      run_humaneval
      ;;
    swebench)
      run_swebench
      ;;
    all)
      run_humaneval
      run_swebench
      ;;
    *)
      log_error "Unknown benchmark: $BENCHMARK"
      echo "Usage: $0 [humaneval|swebench|all] [--execute] [--limit N]"
      exit 1
      ;;
  esac

  generate_summary

  echo
  log_success "Benchmarks complete!"
  log_info "Results directory: $RESULTS_DIR"
  echo
}
|
|
1947
|
-
|
|
1948
|
-
main "$@"
|