agentic-qe 2.1.2 → 2.2.1
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- package/.claude/skills/agentic-quality-engineering/SKILL.md +4 -4
- package/.claude/skills/cicd-pipeline-qe-orchestrator/README.md +14 -11
- package/.claude/skills/skills-manifest.json +2 -2
- package/CHANGELOG.md +138 -0
- package/README.md +92 -214
- package/dist/agents/BaseAgent.d.ts +5 -1
- package/dist/agents/BaseAgent.d.ts.map +1 -1
- package/dist/agents/BaseAgent.js +32 -17
- package/dist/agents/BaseAgent.js.map +1 -1
- package/dist/agents/index.d.ts.map +1 -1
- package/dist/agents/index.js +5 -1
- package/dist/agents/index.js.map +1 -1
- package/dist/cli/commands/improve/index.d.ts +8 -1
- package/dist/cli/commands/improve/index.d.ts.map +1 -1
- package/dist/cli/commands/improve/index.js +18 -16
- package/dist/cli/commands/improve/index.js.map +1 -1
- package/dist/cli/commands/learn/index.d.ts +10 -2
- package/dist/cli/commands/learn/index.d.ts.map +1 -1
- package/dist/cli/commands/learn/index.js +99 -63
- package/dist/cli/commands/learn/index.js.map +1 -1
- package/dist/cli/commands/patterns/index.d.ts +8 -1
- package/dist/cli/commands/patterns/index.d.ts.map +1 -1
- package/dist/cli/commands/patterns/index.js +79 -45
- package/dist/cli/commands/patterns/index.js.map +1 -1
- package/dist/cli/commands/routing/index.d.ts +5 -0
- package/dist/cli/commands/routing/index.d.ts.map +1 -1
- package/dist/cli/commands/routing/index.js +11 -10
- package/dist/cli/commands/routing/index.js.map +1 -1
- package/dist/cli/init/agents.d.ts +1 -1
- package/dist/cli/init/agents.js +2 -2
- package/dist/cli/init/database-init.d.ts +7 -0
- package/dist/cli/init/database-init.d.ts.map +1 -1
- package/dist/cli/init/database-init.js +29 -48
- package/dist/cli/init/database-init.js.map +1 -1
- package/dist/core/di/AgentDependencies.d.ts +127 -0
- package/dist/core/di/AgentDependencies.d.ts.map +1 -0
- package/dist/core/di/AgentDependencies.js +251 -0
- package/dist/core/di/AgentDependencies.js.map +1 -0
- package/dist/core/di/DIContainer.d.ts +149 -0
- package/dist/core/di/DIContainer.d.ts.map +1 -0
- package/dist/core/di/DIContainer.js +333 -0
- package/dist/core/di/DIContainer.js.map +1 -0
- package/dist/core/di/index.d.ts +11 -0
- package/dist/core/di/index.d.ts.map +1 -0
- package/dist/core/di/index.js +22 -0
- package/dist/core/di/index.js.map +1 -0
- package/dist/core/index.d.ts +1 -0
- package/dist/core/index.d.ts.map +1 -1
- package/dist/core/index.js +11 -1
- package/dist/core/index.js.map +1 -1
- package/dist/core/memory/HNSWVectorMemory.d.ts +261 -0
- package/dist/core/memory/HNSWVectorMemory.d.ts.map +1 -0
- package/dist/core/memory/HNSWVectorMemory.js +647 -0
- package/dist/core/memory/HNSWVectorMemory.js.map +1 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts +7 -0
- package/dist/core/memory/SwarmMemoryManager.d.ts.map +1 -1
- package/dist/core/memory/SwarmMemoryManager.js +9 -0
- package/dist/core/memory/SwarmMemoryManager.js.map +1 -1
- package/dist/core/memory/index.d.ts +2 -0
- package/dist/core/memory/index.d.ts.map +1 -1
- package/dist/core/memory/index.js +11 -1
- package/dist/core/memory/index.js.map +1 -1
- package/dist/learning/ExperienceSharingProtocol.d.ts +243 -0
- package/dist/learning/ExperienceSharingProtocol.d.ts.map +1 -0
- package/dist/learning/ExperienceSharingProtocol.js +538 -0
- package/dist/learning/ExperienceSharingProtocol.js.map +1 -0
- package/dist/learning/ExplainableLearning.d.ts +191 -0
- package/dist/learning/ExplainableLearning.d.ts.map +1 -0
- package/dist/learning/ExplainableLearning.js +441 -0
- package/dist/learning/ExplainableLearning.js.map +1 -0
- package/dist/learning/GossipPatternSharingProtocol.d.ts +228 -0
- package/dist/learning/GossipPatternSharingProtocol.d.ts.map +1 -0
- package/dist/learning/GossipPatternSharingProtocol.js +590 -0
- package/dist/learning/GossipPatternSharingProtocol.js.map +1 -0
- package/dist/learning/LearningEngine.d.ts +104 -4
- package/dist/learning/LearningEngine.d.ts.map +1 -1
- package/dist/learning/LearningEngine.js +350 -16
- package/dist/learning/LearningEngine.js.map +1 -1
- package/dist/learning/PerformanceOptimizer.d.ts +268 -0
- package/dist/learning/PerformanceOptimizer.d.ts.map +1 -0
- package/dist/learning/PerformanceOptimizer.js +552 -0
- package/dist/learning/PerformanceOptimizer.js.map +1 -0
- package/dist/learning/PrivacyManager.d.ts +197 -0
- package/dist/learning/PrivacyManager.d.ts.map +1 -0
- package/dist/learning/PrivacyManager.js +551 -0
- package/dist/learning/PrivacyManager.js.map +1 -0
- package/dist/learning/QLearning.d.ts +38 -125
- package/dist/learning/QLearning.d.ts.map +1 -1
- package/dist/learning/QLearning.js +46 -267
- package/dist/learning/QLearning.js.map +1 -1
- package/dist/learning/QLearningLegacy.d.ts +154 -0
- package/dist/learning/QLearningLegacy.d.ts.map +1 -0
- package/dist/learning/QLearningLegacy.js +337 -0
- package/dist/learning/QLearningLegacy.js.map +1 -0
- package/dist/learning/TransferLearningManager.d.ts +212 -0
- package/dist/learning/TransferLearningManager.d.ts.map +1 -0
- package/dist/learning/TransferLearningManager.js +497 -0
- package/dist/learning/TransferLearningManager.js.map +1 -0
- package/dist/learning/algorithms/AbstractRLLearner.d.ts +162 -0
- package/dist/learning/algorithms/AbstractRLLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/AbstractRLLearner.js +300 -0
- package/dist/learning/algorithms/AbstractRLLearner.js.map +1 -0
- package/dist/learning/algorithms/ActorCriticLearner.d.ts +201 -0
- package/dist/learning/algorithms/ActorCriticLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/ActorCriticLearner.js +447 -0
- package/dist/learning/algorithms/ActorCriticLearner.js.map +1 -0
- package/dist/learning/algorithms/MAMLMetaLearner.d.ts +218 -0
- package/dist/learning/algorithms/MAMLMetaLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/MAMLMetaLearner.js +532 -0
- package/dist/learning/algorithms/MAMLMetaLearner.js.map +1 -0
- package/dist/learning/algorithms/PPOLearner.d.ts +207 -0
- package/dist/learning/algorithms/PPOLearner.d.ts.map +1 -0
- package/dist/learning/algorithms/PPOLearner.js +490 -0
- package/dist/learning/algorithms/PPOLearner.js.map +1 -0
- package/dist/learning/algorithms/QLearning.d.ts +68 -0
- package/dist/learning/algorithms/QLearning.d.ts.map +1 -0
- package/dist/learning/algorithms/QLearning.js +116 -0
- package/dist/learning/algorithms/QLearning.js.map +1 -0
- package/dist/learning/algorithms/SARSALearner.d.ts +107 -0
- package/dist/learning/algorithms/SARSALearner.d.ts.map +1 -0
- package/dist/learning/algorithms/SARSALearner.js +252 -0
- package/dist/learning/algorithms/SARSALearner.js.map +1 -0
- package/dist/learning/algorithms/index.d.ts +32 -0
- package/dist/learning/algorithms/index.d.ts.map +1 -0
- package/dist/learning/algorithms/index.js +50 -0
- package/dist/learning/algorithms/index.js.map +1 -0
- package/dist/learning/index.d.ts +11 -0
- package/dist/learning/index.d.ts.map +1 -1
- package/dist/learning/index.js +31 -1
- package/dist/learning/index.js.map +1 -1
- package/dist/learning/types.d.ts +2 -0
- package/dist/learning/types.d.ts.map +1 -1
- package/dist/mcp/server-instructions.d.ts +1 -1
- package/dist/mcp/server-instructions.js +1 -1
- package/dist/memory/DistributedPatternLibrary.d.ts +159 -0
- package/dist/memory/DistributedPatternLibrary.d.ts.map +1 -0
- package/dist/memory/DistributedPatternLibrary.js +370 -0
- package/dist/memory/DistributedPatternLibrary.js.map +1 -0
- package/dist/memory/PatternQualityScorer.d.ts +169 -0
- package/dist/memory/PatternQualityScorer.d.ts.map +1 -0
- package/dist/memory/PatternQualityScorer.js +327 -0
- package/dist/memory/PatternQualityScorer.js.map +1 -0
- package/dist/memory/PatternReplicationService.d.ts +187 -0
- package/dist/memory/PatternReplicationService.d.ts.map +1 -0
- package/dist/memory/PatternReplicationService.js +392 -0
- package/dist/memory/PatternReplicationService.js.map +1 -0
- package/dist/providers/ClaudeProvider.d.ts +98 -0
- package/dist/providers/ClaudeProvider.d.ts.map +1 -0
- package/dist/providers/ClaudeProvider.js +418 -0
- package/dist/providers/ClaudeProvider.js.map +1 -0
- package/dist/providers/HybridRouter.d.ts +217 -0
- package/dist/providers/HybridRouter.d.ts.map +1 -0
- package/dist/providers/HybridRouter.js +679 -0
- package/dist/providers/HybridRouter.js.map +1 -0
- package/dist/providers/ILLMProvider.d.ts +287 -0
- package/dist/providers/ILLMProvider.d.ts.map +1 -0
- package/dist/providers/ILLMProvider.js +33 -0
- package/dist/providers/ILLMProvider.js.map +1 -0
- package/dist/providers/LLMProviderFactory.d.ts +154 -0
- package/dist/providers/LLMProviderFactory.d.ts.map +1 -0
- package/dist/providers/LLMProviderFactory.js +426 -0
- package/dist/providers/LLMProviderFactory.js.map +1 -0
- package/dist/providers/RuvllmProvider.d.ts +107 -0
- package/dist/providers/RuvllmProvider.d.ts.map +1 -0
- package/dist/providers/RuvllmProvider.js +417 -0
- package/dist/providers/RuvllmProvider.js.map +1 -0
- package/dist/providers/index.d.ts +32 -0
- package/dist/providers/index.d.ts.map +1 -0
- package/dist/providers/index.js +75 -0
- package/dist/providers/index.js.map +1 -0
- package/dist/telemetry/LearningTelemetry.d.ts +190 -0
- package/dist/telemetry/LearningTelemetry.d.ts.map +1 -0
- package/dist/telemetry/LearningTelemetry.js +403 -0
- package/dist/telemetry/LearningTelemetry.js.map +1 -0
- package/dist/telemetry/index.d.ts +1 -0
- package/dist/telemetry/index.d.ts.map +1 -1
- package/dist/telemetry/index.js +20 -2
- package/dist/telemetry/index.js.map +1 -1
- package/dist/telemetry/instrumentation/agent.d.ts +1 -1
- package/dist/telemetry/instrumentation/agent.js +1 -1
- package/dist/telemetry/instrumentation/index.d.ts +1 -1
- package/dist/telemetry/instrumentation/index.js +1 -1
- package/dist/utils/math.d.ts +11 -0
- package/dist/utils/math.d.ts.map +1 -0
- package/dist/utils/math.js +16 -0
- package/dist/utils/math.js.map +1 -0
- package/docs/reference/agents.md +1 -1
- package/docs/reference/skills.md +3 -3
- package/docs/reference/usage.md +4 -4
- package/package.json +1 -1
package/dist/learning/algorithms/QLearning.d.ts
@@ -0,0 +1,68 @@
+/**
+ * QLearning - Off-policy TD(0) Reinforcement Learning
+ *
+ * Implements standard Q-learning algorithm for reinforcement learning.
+ * Key differences from SARSA:
+ * - Off-policy: learns optimal Q-values regardless of policy being followed
+ * - Uses max Q-value for next state, not actual next action
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ * - More aggressive than SARSA, finds optimal policy faster
+ */
+import { AbstractRLLearner, RLConfig } from './AbstractRLLearner';
+import { TaskExperience, AgentAction } from '../types';
+/**
+ * Q-learning configuration (extends base RL config)
+ */
+export interface QLearningConfig extends RLConfig {
+}
+/**
+ * QLearning - Standard Q-learning implementation
+ *
+ * Implements the classic Q-learning algorithm with:
+ * - Epsilon-greedy exploration policy
+ * - Off-policy temporal difference (TD) learning
+ * - Q-table for state-action values
+ * - Optional experience replay for stability
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γ·max_a'(Q(s',a')) - Q(s,a)]
+ *
+ * Key characteristics:
+ * - Off-policy: learns about optimal policy while following exploration policy
+ * - Uses max Q-value (greedy) for bootstrapping
+ * - Converges to optimal Q* under certain conditions
+ * - More sample-efficient than on-policy methods
+ */
+export declare class QLearning extends AbstractRLLearner {
+    private readonly defaultConfig;
+    constructor(config?: Partial<QLearningConfig>);
+    /**
+     * Update Q-value using Q-learning update rule
+     * Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+     *
+     * @param experience The transition experience (s, a, r, s')
+     * @param nextAction Ignored in Q-learning (uses max Q-value instead)
+     */
+    update(experience: TaskExperience, nextAction?: AgentAction): void;
+    /**
+     * Get the default exploration rate for this algorithm
+     */
+    protected getDefaultExplorationRate(): number;
+    /**
+     * Get algorithm name
+     */
+    getAlgorithmName(): string;
+    /**
+     * Get algorithm type (off-policy)
+     */
+    getAlgorithmType(): 'on-policy' | 'off-policy';
+    /**
+     * Get detailed statistics including Q-learning-specific metrics
+     */
+    getDetailedStatistics(): {
+        algorithm: string;
+        type: 'on-policy' | 'off-policy';
+        stats: ReturnType<AbstractRLLearner['getStatistics']>;
+    };
+}
+//# sourceMappingURL=QLearning.d.ts.map
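The declaration above pins down the public surface of the new off-policy learner; per transition it performs the plain tabular TD(0) update quoted in its doc comment. A minimal standalone sketch of that arithmetic, using the α = 0.1 and γ = 0.95 defaults that QLearning.js ships below (the helper and its state/action keys are illustrative, not part of the package API):

// Illustrative only: one tabular Q-learning update, independent of the package's classes.
type QTable = Map<string, Map<string, number>>;

function qLearningUpdate(
    q: QTable,
    s: string,
    a: string,
    reward: number,
    sNext: string,
    alpha = 0.1,   // matches the default learningRate below
    gamma = 0.95,  // matches the default discountFactor below
): number {
    const current = q.get(s)?.get(a) ?? 0;
    // Off-policy bootstrap: max over the next state's known actions,
    // regardless of which action the behaviour policy will actually take.
    const next = q.get(sNext);
    const maxNext = next && next.size > 0 ? Math.max(...Array.from(next.values())) : 0;
    const updated = current + alpha * (reward + gamma * maxNext - current);
    if (!q.has(s)) q.set(s, new Map());
    q.get(s)!.set(a, updated);
    return updated;
}

// With Q(s,a) = 0, reward = 1 and an unexplored next state: 0 + 0.1·(1 + 0.95·0 − 0) = 0.1
qLearningUpdate(new Map(), 'coverage-low', 'generate-tests', 1, 'coverage-high'); // 0.1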
package/dist/learning/algorithms/QLearning.js
@@ -0,0 +1,116 @@
+"use strict";
+/**
+ * QLearning - Off-policy TD(0) Reinforcement Learning
+ *
+ * Implements standard Q-learning algorithm for reinforcement learning.
+ * Key differences from SARSA:
+ * - Off-policy: learns optimal Q-values regardless of policy being followed
+ * - Uses max Q-value for next state, not actual next action
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+ * - More aggressive than SARSA, finds optimal policy faster
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.QLearning = void 0;
+const AbstractRLLearner_1 = require("./AbstractRLLearner");
+/**
+ * Default Q-learning configuration
+ */
+const DEFAULT_CONFIG = {
+    learningRate: 0.1,
+    discountFactor: 0.95,
+    explorationRate: 0.3,
+    explorationDecay: 0.995,
+    minExplorationRate: 0.01,
+    useExperienceReplay: true,
+    replayBufferSize: 10000,
+    batchSize: 32
+};
+/**
+ * QLearning - Standard Q-learning implementation
+ *
+ * Implements the classic Q-learning algorithm with:
+ * - Epsilon-greedy exploration policy
+ * - Off-policy temporal difference (TD) learning
+ * - Q-table for state-action values
+ * - Optional experience replay for stability
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γ·max_a'(Q(s',a')) - Q(s,a)]
+ *
+ * Key characteristics:
+ * - Off-policy: learns about optimal policy while following exploration policy
+ * - Uses max Q-value (greedy) for bootstrapping
+ * - Converges to optimal Q* under certain conditions
+ * - More sample-efficient than on-policy methods
+ */
+class QLearning extends AbstractRLLearner_1.AbstractRLLearner {
+    constructor(config = {}) {
+        const fullConfig = { ...DEFAULT_CONFIG, ...config };
+        super(fullConfig);
+        this.defaultConfig = fullConfig;
+        this.logger.info('QLearning initialized with off-policy TD(0)', { config: fullConfig });
+    }
+    /**
+     * Update Q-value using Q-learning update rule
+     * Q(s,a) ← Q(s,a) + α[r + γ·max(Q(s',a')) - Q(s,a)]
+     *
+     * @param experience The transition experience (s, a, r, s')
+     * @param nextAction Ignored in Q-learning (uses max Q-value instead)
+     */
+    update(experience, nextAction) {
+        const stateKey = this.encodeState(experience.state);
+        const actionKey = this.encodeAction(experience.action);
+        const nextStateKey = this.encodeState(experience.nextState);
+        // Get current Q-value Q(s,a)
+        const stateActions = this.qTable.get(stateKey);
+        const currentQ = stateActions?.get(actionKey)?.value ?? 0;
+        // Q-Learning: Get max Q-value for next state (greedy)
+        // This is the key difference from SARSA (which uses actual next action)
+        const nextStateActions = this.qTable.get(nextStateKey);
+        const maxNextQ = nextStateActions && nextStateActions.size > 0
+            ? Math.max(...Array.from(nextStateActions.values()).map(qv => qv.value))
+            : 0;
+        // Q-learning update rule
+        // Q(s,a) = Q(s,a) + α * [r + γ * max(Q(s',a')) - Q(s,a)]
+        const tdTarget = experience.reward + this.config.discountFactor * maxNextQ;
+        const tdError = tdTarget - currentQ;
+        const newQ = currentQ + this.config.learningRate * tdError;
+        // Update Q-value
+        this.setQValue(stateKey, actionKey, newQ);
+        // Add to experience replay buffer if enabled
+        if (this.replayBuffer) {
+            this.replayBuffer.add(experience, Math.abs(tdError)); // Priority based on TD error
+        }
+        this.stepCount++;
+    }
+    /**
+     * Get the default exploration rate for this algorithm
+     */
+    getDefaultExplorationRate() {
+        return this.defaultConfig.explorationRate;
+    }
+    /**
+     * Get algorithm name
+     */
+    getAlgorithmName() {
+        return 'Q-Learning';
+    }
+    /**
+     * Get algorithm type (off-policy)
+     */
+    getAlgorithmType() {
+        return 'off-policy';
+    }
+    /**
+     * Get detailed statistics including Q-learning-specific metrics
+     */
+    getDetailedStatistics() {
+        return {
+            algorithm: this.getAlgorithmName(),
+            type: this.getAlgorithmType(),
+            stats: this.getStatistics()
+        };
+    }
+}
+exports.QLearning = QLearning;
+//# sourceMappingURL=QLearning.js.map
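The compiled class above merges a Partial<QLearningConfig> over DEFAULT_CONFIG, and update() consumes TaskExperience records shaped like the ones SARSALearner.js builds further down. The TaskState and AgentAction payload types are not visible in this diff, and the deep-import path depends on the package's exports map, so treat this as a sketch of the call pattern rather than documented usage:

// Sketch only: the import path and the state/action payloads are assumptions.
import { QLearning } from 'agentic-qe/dist/learning/algorithms/QLearning';

const learner = new QLearning({ learningRate: 0.2, explorationRate: 0.5 }); // overrides merge over DEFAULT_CONFIG

learner.update({
    taskId: 'demo-1',
    taskType: 'test-generation',
    state: { coverage: 0.4 },           // assumed TaskState payload
    action: { type: 'generate-tests' }, // assumed AgentAction payload
    reward: 1,
    nextState: { coverage: 0.7 },
    timestamp: new Date(),
    agentId: 'qe-agent-1',
} as any);

console.log(learner.getDetailedStatistics()); // { algorithm: 'Q-Learning', type: 'off-policy', stats: { ... } }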
package/dist/learning/algorithms/SARSALearner.d.ts
@@ -0,0 +1,107 @@
+/**
+ * SARSALearner - On-policy TD(0) Reinforcement Learning
+ *
+ * Implements SARSA (State-Action-Reward-State-Action) algorithm.
+ * Key differences from Q-Learning:
+ * - On-policy: learns Q-values for the policy being followed (epsilon-greedy)
+ * - Uses actual next action taken, not the max Q-value
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * - More conservative than Q-Learning, safer for exploration
+ */
+import { AbstractRLLearner, RLConfig } from './AbstractRLLearner';
+import { TaskExperience, AgentAction, TaskState } from '../types';
+/**
+ * SARSA configuration (same as base RL config)
+ */
+export type SARSAConfig = RLConfig;
+/**
+ * SARSALearner - On-policy Temporal Difference Learning
+ *
+ * SARSA is an on-policy TD control algorithm that learns the Q-values
+ * for the policy being followed (typically epsilon-greedy).
+ *
+ * Key Characteristics:
+ * - Updates based on (State, Action, Reward, next State, next Action)
+ * - Learns Q-values for the actual policy (including exploration)
+ * - More conservative than Q-Learning
+ * - Better for tasks where exploration is risky
+ * - Converges to optimal policy under certain conditions
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * where a' is the action actually taken in state s' (not necessarily greedy)
+ */
+export declare class SARSALearner extends AbstractRLLearner {
+    private readonly defaultConfig;
+    private lastStateAction?;
+    constructor(config?: Partial<RLConfig>);
+    /**
+     * Update Q-value using SARSA on-policy update rule
+     * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+     *
+     * @param experience The transition experience (s, a, r, s')
+     * @param nextAction The actual action taken in next state (SARSA requires this!)
+     *                   If not provided, selects action using current policy (epsilon-greedy)
+     */
+    update(experience: TaskExperience, nextAction?: AgentAction): void;
+    /**
+     * Calculate expected value for next state under current epsilon-greedy policy
+     * This is used when we don't have the actual next action (e.g., in batch updates)
+     *
+     * Expected SARSA: E[Q(s',a')] = ε/|A| * Σ Q(s',a) + (1-ε) * max Q(s',a)
+     */
+    private getExpectedValue;
+    /**
+     * Select next action and update with SARSA
+     * This is the typical SARSA flow: select action, observe reward, select next action, update
+     *
+     * @param currentState Current state
+     * @param currentAction Action taken in current state
+     * @param reward Reward received
+     * @param nextState Next state observed
+     * @param availableActions Actions available in next state
+     * @returns Next action selected (for continued learning)
+     */
+    selectAndUpdate(currentState: TaskState, currentAction: AgentAction, reward: number, nextState: TaskState, availableActions: AgentAction[]): AgentAction;
+    /**
+     * Learn from a complete episode trajectory
+     * Updates all state-action pairs in the trajectory using SARSA
+     *
+     * @param trajectory Array of (state, action, reward) tuples
+     */
+    learnFromEpisode(trajectory: Array<{
+        state: TaskState;
+        action: AgentAction;
+        reward: number;
+    }>): void;
+    /**
+     * Get the default exploration rate for this algorithm
+     */
+    protected getDefaultExplorationRate(): number;
+    /**
+     * Get algorithm name
+     */
+    getAlgorithmName(): string;
+    /**
+     * Get algorithm type (on-policy)
+     */
+    getAlgorithmType(): 'on-policy' | 'off-policy';
+    /**
+     * Get detailed statistics including SARSA-specific metrics
+     */
+    getDetailedStatistics(): {
+        algorithm: string;
+        type: 'on-policy' | 'off-policy';
+        stats: ReturnType<AbstractRLLearner['getStatistics']>;
+    };
+    /**
+     * Compare performance with expected convergence
+     * SARSA typically converges slower but more safely than Q-Learning
+     */
+    getConvergenceMetrics(): {
+        isConverging: boolean;
+        convergenceRate: number;
+        stability: number;
+    };
+}
+//# sourceMappingURL=SARSALearner.d.ts.map
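The only place this declaration goes beyond textbook SARSA is the Expected SARSA fallback documented on getExpectedValue, used when no concrete next action is available. A quick self-contained check of that formula (names here are illustrative):

// Illustrative check of the Expected SARSA fallback: ε·avg(Q(s',·)) + (1−ε)·max(Q(s',·)).
function expectedSarsaValue(nextQValues: number[], epsilon: number): number {
    if (nextQValues.length === 0) return 0;
    const avg = nextQValues.reduce((sum, v) => sum + v, 0) / nextQValues.length;
    const max = Math.max(...nextQValues);
    return epsilon * avg + (1 - epsilon) * max;
}

// With the default explorationRate of 0.3 and next-state values [0.2, 0.6]:
// 0.3·0.4 + 0.7·0.6 = 0.54, slightly below the pure greedy bootstrap of 0.6.
expectedSarsaValue([0.2, 0.6], 0.3); // 0.54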
package/dist/learning/algorithms/SARSALearner.js
@@ -0,0 +1,252 @@
+"use strict";
+/**
+ * SARSALearner - On-policy TD(0) Reinforcement Learning
+ *
+ * Implements SARSA (State-Action-Reward-State-Action) algorithm.
+ * Key differences from Q-Learning:
+ * - On-policy: learns Q-values for the policy being followed (epsilon-greedy)
+ * - Uses actual next action taken, not the max Q-value
+ * - Update rule: Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * - More conservative than Q-Learning, safer for exploration
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.SARSALearner = void 0;
+const AbstractRLLearner_1 = require("./AbstractRLLearner");
+/**
+ * Default SARSA configuration
+ */
+const DEFAULT_SARSA_CONFIG = {
+    learningRate: 0.1,
+    discountFactor: 0.95,
+    explorationRate: 0.3,
+    explorationDecay: 0.995,
+    minExplorationRate: 0.01,
+    useExperienceReplay: true,
+    replayBufferSize: 10000,
+    batchSize: 32
+};
+/**
+ * SARSALearner - On-policy Temporal Difference Learning
+ *
+ * SARSA is an on-policy TD control algorithm that learns the Q-values
+ * for the policy being followed (typically epsilon-greedy).
+ *
+ * Key Characteristics:
+ * - Updates based on (State, Action, Reward, next State, next Action)
+ * - Learns Q-values for the actual policy (including exploration)
+ * - More conservative than Q-Learning
+ * - Better for tasks where exploration is risky
+ * - Converges to optimal policy under certain conditions
+ *
+ * Update Rule:
+ * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+ * where a' is the action actually taken in state s' (not necessarily greedy)
+ */
+class SARSALearner extends AbstractRLLearner_1.AbstractRLLearner {
+    constructor(config = {}) {
+        const fullConfig = { ...DEFAULT_SARSA_CONFIG, ...config };
+        super(fullConfig);
+        this.defaultConfig = fullConfig;
+        this.logger.info('SARSALearner initialized with on-policy TD(0)', { config: fullConfig });
+    }
+    /**
+     * Update Q-value using SARSA on-policy update rule
+     * Q(s,a) ← Q(s,a) + α[r + γQ(s',a') - Q(s,a)]
+     *
+     * @param experience The transition experience (s, a, r, s')
+     * @param nextAction The actual action taken in next state (SARSA requires this!)
+     *                   If not provided, selects action using current policy (epsilon-greedy)
+     */
+    update(experience, nextAction) {
+        const stateKey = this.encodeState(experience.state);
+        const actionKey = this.encodeAction(experience.action);
+        const nextStateKey = this.encodeState(experience.nextState);
+        // Get current Q-value Q(s,a)
+        const stateActions = this.qTable.get(stateKey);
+        const currentQ = stateActions?.get(actionKey)?.value ?? 0;
+        // SARSA: Get Q-value for next action that will actually be taken
+        // This is the key difference from Q-Learning (which uses max Q-value)
+        let nextQ = 0;
+        if (nextAction) {
+            // Use provided next action (typical in online learning)
+            const nextActionKey = this.encodeAction(nextAction);
+            const nextStateActions = this.qTable.get(nextStateKey);
+            nextQ = nextStateActions?.get(nextActionKey)?.value ?? 0;
+        }
+        else {
+            // If no next action provided, we need to select one using epsilon-greedy
+            // This happens in batch updates from experience replay
+            // We approximate by using a greedy action (conservative estimate)
+            const nextStateActions = this.qTable.get(nextStateKey);
+            if (nextStateActions && nextStateActions.size > 0) {
+                // Use expected SARSA approximation: average over all actions weighted by policy
+                nextQ = this.getExpectedValue(experience.nextState, nextStateActions);
+            }
+        }
+        // SARSA update rule
+        // Q(s,a) = Q(s,a) + α * [r + γ * Q(s',a') - Q(s,a)]
+        const tdTarget = experience.reward + this.config.discountFactor * nextQ;
+        const tdError = tdTarget - currentQ;
+        const newQ = currentQ + this.config.learningRate * tdError;
+        // Update Q-value
+        this.setQValue(stateKey, actionKey, newQ);
+        // Add to experience replay buffer if enabled
+        if (this.replayBuffer) {
+            this.replayBuffer.add(experience, Math.abs(tdError)); // Priority based on TD error
+        }
+        this.stepCount++;
+    }
+    /**
+     * Calculate expected value for next state under current epsilon-greedy policy
+     * This is used when we don't have the actual next action (e.g., in batch updates)
+     *
+     * Expected SARSA: E[Q(s',a')] = ε/|A| * Σ Q(s',a) + (1-ε) * max Q(s',a)
+     */
+    getExpectedValue(nextState, nextStateActions) {
+        if (nextStateActions.size === 0) {
+            return 0;
+        }
+        const epsilon = this.config.explorationRate;
+        const numActions = nextStateActions.size;
+        // Calculate average Q-value (for random exploration)
+        let sumQ = 0;
+        let maxQ = -Infinity;
+        for (const qValue of nextStateActions.values()) {
+            sumQ += qValue.value;
+            maxQ = Math.max(maxQ, qValue.value);
+        }
+        const avgQ = sumQ / numActions;
+        // Expected value under epsilon-greedy policy
+        // ε * (average of all actions) + (1-ε) * (max action)
+        return epsilon * avgQ + (1 - epsilon) * maxQ;
+    }
+    /**
+     * Select next action and update with SARSA
+     * This is the typical SARSA flow: select action, observe reward, select next action, update
+     *
+     * @param currentState Current state
+     * @param currentAction Action taken in current state
+     * @param reward Reward received
+     * @param nextState Next state observed
+     * @param availableActions Actions available in next state
+     * @returns Next action selected (for continued learning)
+     */
+    selectAndUpdate(currentState, currentAction, reward, nextState, availableActions) {
+        // Select next action using epsilon-greedy policy
+        const nextAction = this.selectAction(nextState, availableActions);
+        // Create experience
+        const experience = {
+            taskId: `sarsa-${Date.now()}`,
+            taskType: 'online-learning',
+            state: currentState,
+            action: currentAction,
+            reward,
+            nextState,
+            timestamp: new Date(),
+            agentId: 'sarsa-learner'
+        };
+        // Update Q-value using SARSA rule with actual next action
+        this.update(experience, nextAction);
+        return nextAction;
+    }
+    /**
+     * Learn from a complete episode trajectory
+     * Updates all state-action pairs in the trajectory using SARSA
+     *
+     * @param trajectory Array of (state, action, reward) tuples
+     */
+    learnFromEpisode(trajectory) {
+        // SARSA updates each transition with the next action in the trajectory
+        for (let i = 0; i < trajectory.length - 1; i++) {
+            const current = trajectory[i];
+            const next = trajectory[i + 1];
+            const experience = {
+                taskId: `episode-${Date.now()}-${i}`,
+                taskType: 'episode-learning',
+                state: current.state,
+                action: current.action,
+                reward: current.reward,
+                nextState: next.state,
+                timestamp: new Date(),
+                agentId: 'sarsa-learner'
+            };
+            // Update with the actual next action from trajectory
+            this.update(experience, next.action);
+        }
+        // Handle terminal state (last transition)
+        if (trajectory.length > 0) {
+            const last = trajectory[trajectory.length - 1];
+            const terminalExperience = {
+                taskId: `episode-${Date.now()}-terminal`,
+                taskType: 'episode-learning',
+                state: last.state,
+                action: last.action,
+                reward: last.reward,
+                nextState: last.state, // Terminal state transitions to itself
+                timestamp: new Date(),
+                agentId: 'sarsa-learner'
+            };
+            // Terminal state has no next action, Q(terminal, any) = 0
+            this.update(terminalExperience);
+        }
+        this.endEpisode();
+    }
+    /**
+     * Get the default exploration rate for this algorithm
+     */
+    getDefaultExplorationRate() {
+        return this.defaultConfig.explorationRate;
+    }
+    /**
+     * Get algorithm name
+     */
+    getAlgorithmName() {
+        return 'SARSA';
+    }
+    /**
+     * Get algorithm type (on-policy)
+     */
+    getAlgorithmType() {
+        return 'on-policy';
+    }
+    /**
+     * Get detailed statistics including SARSA-specific metrics
+     */
+    getDetailedStatistics() {
+        return {
+            algorithm: this.getAlgorithmName(),
+            type: this.getAlgorithmType(),
+            stats: this.getStatistics()
+        };
+    }
+    /**
+     * Compare performance with expected convergence
+     * SARSA typically converges slower but more safely than Q-Learning
+     */
+    getConvergenceMetrics() {
+        const stats = this.getStatistics();
+        // Check if Q-values are stabilizing
+        const avgQValue = stats.avgQValue;
+        const qValueRange = stats.maxQValue - stats.minQValue;
+        // Convergence indicators:
+        // 1. Low exploration rate (mostly exploiting)
+        // 2. Reasonable Q-value range (not diverging)
+        // 3. Sufficient episodes for learning
+        const isConverging = stats.explorationRate < 0.1 && // Low exploration
+            qValueRange < 10 && // Bounded Q-values
+            stats.episodes > 20; // Sufficient training
+        const convergenceRate = stats.episodes > 0
+            ? Math.min(1.0, stats.episodes / 100)
+            : 0;
+        const stability = qValueRange > 0
+            ? 1.0 - Math.min(1.0, qValueRange / 20)
+            : 0.5;
+        return {
+            isConverging,
+            convergenceRate,
+            stability
+        };
+    }
+}
+exports.SARSALearner = SARSALearner;
+//# sourceMappingURL=SARSALearner.js.map
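selectAndUpdate() above wraps the classic SARSA loop (act, observe, pick the next action, update toward Q(s',a')) so the caller only has to carry the chosen action forward. A hedged sketch of that loop; the import path, the env object and the state/action payloads are stand-ins, not part of agentic-qe:

// Sketch of the online SARSA loop implied by selectAndUpdate(); import path is an assumption.
import { SARSALearner } from 'agentic-qe/dist/learning/algorithms';

// Hypothetical environment; agentic-qe does not ship this.
declare const env: {
    reset(): unknown;
    actions(): unknown[];
    step(a: unknown): { reward: number; nextState: unknown; done: boolean };
};

const sarsa = new SARSALearner({ explorationRate: 0.2 });

let state = env.reset() as any;       // assumed TaskState payload
let action = env.actions()[0] as any; // seed action; assumed AgentAction payload

for (let i = 0; i < 100; i++) {
    const { reward, nextState, done } = env.step(action);
    // Picks a' epsilon-greedily, applies Q(s,a) += α[r + γ·Q(s',a') − Q(s,a)], and returns a'.
    action = sarsa.selectAndUpdate(state, action, reward, nextState as any, env.actions() as any);
    state = nextState;
    if (done) break;
}

console.log(sarsa.getConvergenceMetrics()); // { isConverging, convergenceRate, stability }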
package/dist/learning/algorithms/index.d.ts
@@ -0,0 +1,32 @@
+/**
+ * Learning Algorithms - Reinforcement Learning Implementations
+ *
+ * This module provides various RL algorithms for agent learning:
+ * - AbstractRLLearner: Base class for all RL algorithms
+ * - QLearning: Off-policy TD(0) algorithm
+ * - SARSALearner: On-policy TD(0) algorithm
+ * - ActorCriticLearner: Advantage Actor-Critic (A2C) algorithm
+ * - PPOLearner: Proximal Policy Optimization (PPO-Clip) algorithm
+ * - MAMLMetaLearner: Model-Agnostic Meta-Learning for fast adaptation
+ */
+import { AbstractRLLearner, RLConfig, QValue } from './AbstractRLLearner';
+import { QLearning, QLearningConfig } from '../QLearning';
+import { SARSALearner, SARSAConfig } from './SARSALearner';
+import { ActorCriticLearner, ActorCriticConfig, createDefaultActorCriticConfig } from './ActorCriticLearner';
+import { PPOLearner, PPOConfig, createDefaultPPOConfig } from './PPOLearner';
+import { MAMLMetaLearner, MAMLConfig, createDefaultMAMLConfig } from './MAMLMetaLearner';
+export { AbstractRLLearner, RLConfig, QValue };
+export { QLearning, QLearningConfig };
+export { SARSALearner, SARSAConfig };
+export { ActorCriticLearner, ActorCriticConfig, createDefaultActorCriticConfig };
+export { PPOLearner, PPOConfig, createDefaultPPOConfig };
+export { MAMLMetaLearner, MAMLConfig, createDefaultMAMLConfig };
+/**
+ * Supported RL algorithm types
+ */
+export type RLAlgorithmType = 'q-learning' | 'sarsa' | 'actor-critic' | 'ppo' | 'maml' | 'legacy';
+/**
+ * Factory function to create RL algorithm instances
+ */
+export declare function createRLAlgorithm(type: RLAlgorithmType, config?: any): AbstractRLLearner;
+//# sourceMappingURL=index.d.ts.map
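The barrel above ends with the createRLAlgorithm factory, which is the simplest entry point into the new algorithm set. A brief sketch of calling it, based only on the exported signature (the deep-import path is an assumption, and the config is limited to fields visible elsewhere in this diff):

// Sketch: choosing an algorithm by name through the exported factory.
import { createRLAlgorithm, RLAlgorithmType } from 'agentic-qe/dist/learning/algorithms';

const types: RLAlgorithmType[] = ['q-learning', 'sarsa', 'actor-critic', 'ppo'];

for (const type of types) {
    const learner = createRLAlgorithm(type, { explorationRate: 0.2 });
    // getStatistics() is declared on AbstractRLLearner; it backs the `stats` field of getDetailedStatistics().
    console.log(type, learner.getStatistics());
}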