synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226)
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,514 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Logger utility for logging all VLM interactions
4
+
5
+ This module provides a centralized logging system for all LLM interactions,
6
+ including input prompts, responses, and metadata. Logs are saved to dated
7
+ files in the llm_logs directory.
8
+ """
9
+
10
+ import os
11
+ import json
12
+ import time
13
+ from datetime import datetime
14
+ from typing import Dict, Any, Optional
15
+ import logging
16
+
17
logger = logging.getLogger(__name__)


class LLMLogger:
    """Logger for all LLM interactions.

    Each event (session start, interaction, error, step boundary, state
    snapshot, action) is appended as one JSON object per line to a
    session-specific ``.jsonl`` file inside ``log_dir``. Token, cost and
    action totals are accumulated in memory in ``cumulative_metrics`` and can
    be saved to / restored from a JSON checkpoint file.
    """

    # Button names recognised when counting actions in an LLM response.
    _VALID_BUTTONS = frozenset(
        ["A", "B", "SELECT", "START", "UP", "DOWN", "LEFT", "RIGHT", "L", "R"]
    )
    # Arrow glyphs that are also counted as directional presses.
    _ARROW_GLYPHS = "↑↓←→"

    def __init__(self, log_dir: str = "llm_logs"):
        """Initialize the LLM logger

        Args:
            log_dir: Directory to store log files
        """
        self.log_dir = log_dir
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.log_file = os.path.join(log_dir, f"llm_log_{self.session_id}.jsonl")

        # Ensure log directory exists
        os.makedirs(log_dir, exist_ok=True)

        # Initialize cumulative metrics
        self.cumulative_metrics = {
            "total_tokens": 0,
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_cost": 0.0,
            "total_actions": 0,
            "start_time": time.time(),
            "total_llm_calls": 0,
        }

        # Model pricing (per 1K tokens) - can be updated based on actual pricing
        self.pricing = {
            "gpt-4o": {"prompt": 0.01, "completion": 0.03},
            "gpt-4o-mini": {"prompt": 0.00015, "completion": 0.0006},
            "o3-mini": {"prompt": 0.0012, "completion": 0.0048},
            "gemini-2.5-flash": {"prompt": 0.000315, "completion": 0.00126},
            "gemini-2.5-pro": {"prompt": 0.00125, "completion": 0.005},
            "default": {"prompt": 0.001, "completion": 0.002},  # Default pricing
        }

        # Initialize log file with session info
        self._log_session_start()

        logger.info(f"LLM Logger initialized: {self.log_file}")

    def _log_session_start(self):
        """Log session start information"""
        session_info = {
            "timestamp": datetime.now().isoformat(),
            "type": "session_start",
            "session_id": self.session_id,
            "log_file": self.log_file,
        }
        self._write_log_entry(session_info)

    def _pricing_for(self, model_name: Optional[str]) -> Dict[str, float]:
        """Return the per-1K-token price table entry for a model name.

        The longest pricing key contained in the (lower-cased) model name
        wins, so "gpt-4o-mini" is not priced as "gpt-4o". The "default" key is
        never used as a pattern; it is only the fallback.

        Args:
            model_name: Model identifier reported by the caller (may be empty)

        Returns:
            Dict with "prompt" and "completion" prices
        """
        name = str(model_name or "").lower()
        matches = [key for key in self.pricing if key != "default" and key in name]
        if matches:
            return self.pricing[max(matches, key=len)]
        # Zero-cost fallback only applies if "default" was removed from the table.
        return self.pricing.get("default") or {"prompt": 0.0, "completion": 0.0}

    @classmethod
    def _count_actions(cls, response: Optional[str]) -> int:
        """Count button presses mentioned in an LLM response.

        First pass: split on spaces/commas and count exact button tokens
        (e.g. "A A B" or "UP, LEFT"). Only if that finds nothing, fall back to
        counting button words delimited by any punctuation, plus arrow glyphs.
        The fallback works on whole words so that letters inside ordinary
        words ("WILL", "PRESS") are not mistaken for L/R/A/B presses.

        Args:
            response: Raw response text from the LLM

        Returns:
            Number of button presses detected
        """
        if not response:
            return 0

        response_upper = response.upper()
        tokens = response_upper.replace(",", " ").split()
        action_count = sum(1 for token in tokens if token in cls._VALID_BUTTONS)

        if action_count == 0:
            # Treat every non-alphanumeric character as a word separator.
            words = "".join(
                ch if ch.isalnum() else " " for ch in response_upper
            ).split()
            action_count += sum(1 for word in words if word in cls._VALID_BUTTONS)
            action_count += sum(response.count(glyph) for glyph in cls._ARROW_GLYPHS)

        return action_count

    def log_interaction(self,
                        interaction_type: str,
                        prompt: str,
                        response: str,
                        metadata: Optional[Dict[str, Any]] = None,
                        duration: Optional[float] = None,
                        model_info: Optional[Dict[str, Any]] = None):
        """Log a complete LLM interaction

        Writes the interaction to the log file, then updates the cumulative
        call, token, cost and action counters.

        Args:
            interaction_type: Type of interaction (e.g., "perception", "planning", "action")
            prompt: The input prompt sent to the LLM
            response: The response received from the LLM
            metadata: Additional metadata about the interaction; a
                "token_usage" dict with "prompt_tokens", "completion_tokens"
                and "total_tokens" is used for token/cost accounting
            duration: Time taken for the interaction in seconds
            model_info: Information about the model used; "model" selects pricing
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "type": "interaction",
            "interaction_type": interaction_type,
            "prompt": prompt,
            "response": response,
            "duration": duration,
            "metadata": metadata or {},
            "model_info": model_info or {},
        }

        self._write_log_entry(log_entry)

        # Update cumulative metrics
        self.cumulative_metrics["total_llm_calls"] += 1

        # Track token usage if available
        token_usage = metadata.get("token_usage") if metadata else None
        if token_usage:
            # "or 0" guards against providers that report a key with a None value.
            prompt_tokens = token_usage.get("prompt_tokens", 0) or 0
            completion_tokens = token_usage.get("completion_tokens", 0) or 0
            total_tokens = token_usage.get("total_tokens", 0) or 0

            self.cumulative_metrics["total_tokens"] += total_tokens
            self.cumulative_metrics["prompt_tokens"] += prompt_tokens
            self.cumulative_metrics["completion_tokens"] += completion_tokens

            # Calculate cost based on model
            model_name = model_info.get("model", "") if model_info else ""
            pricing = self._pricing_for(model_name)
            prompt_cost = (prompt_tokens / 1000) * pricing["prompt"]
            completion_cost = (completion_tokens / 1000) * pricing["completion"]
            self.cumulative_metrics["total_cost"] += prompt_cost + completion_cost

        # Track actions if this is an action interaction
        if "action" in interaction_type.lower():
            action_count = self._count_actions(response)
            if action_count > 0:
                self.cumulative_metrics["total_actions"] += action_count
                logger.debug(f"Counted {action_count} actions in response: {response[:50]}")

        # Also log to console for debugging. duration is optional, so it must
        # not be formatted with ":.2f" unless it was actually supplied.
        if duration is not None:
            logger.info(f"LLM {interaction_type.upper()}: {duration:.2f}s")
        else:
            logger.info(f"LLM {interaction_type.upper()}")
        logger.debug(
            f"Prompt length: {len(prompt or '')} chars, "
            f"Response length: {len(response or '')} chars"
        )

    def log_error(self,
                  interaction_type: str,
                  prompt: str,
                  error: str,
                  metadata: Optional[Dict[str, Any]] = None):
        """Log an LLM interaction error

        Args:
            interaction_type: Type of interaction that failed
            prompt: The input prompt that was sent
            error: The error message
            metadata: Additional metadata about the error
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "type": "error",
            "interaction_type": interaction_type,
            "prompt": prompt,
            "error": error,
            "metadata": metadata or {},
        }

        self._write_log_entry(log_entry)
        logger.error(f"LLM {interaction_type.upper()} ERROR: {error}")

    def log_step_start(self, step: int, step_type: str = "agent_step"):
        """Log the start of an agent step

        Args:
            step: Step number
            step_type: Type of step (e.g., "agent_step", "perception", "planning")
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "type": "step_start",
            "step": step,
            "step_type": step_type,
        }

        self._write_log_entry(log_entry)
        logger.info(f"Starting {step_type} {step}")

    def log_step_end(self, step: int, step_type: str = "agent_step",
                     duration: Optional[float] = None,
                     summary: Optional[str] = None):
        """Log the end of an agent step

        Args:
            step: Step number
            step_type: Type of step
            duration: Time taken for the step
            summary: Summary of what happened in the step
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "type": "step_end",
            "step": step,
            "step_type": step_type,
            "duration": duration,
            "summary": summary,
        }

        self._write_log_entry(log_entry)
        if duration is not None:
            logger.info(f"Completed {step_type} {step} in {duration:.2f}s")
        else:
            logger.info(f"Completed {step_type} {step}")

    def log_state_snapshot(self, state_data: Dict[str, Any], step: int):
        """Log a snapshot of the game state

        Only a small summary is written, to avoid logging too much data.

        Args:
            state_data: The game state data; the "player" and "game"
                sub-dicts are read, and missing or None values are tolerated
            step: Current step number
        """
        player = state_data.get("player") or {}
        game = state_data.get("game") or {}
        dialog_text = game.get("dialog_text")

        state_summary = {
            "step": step,
            "player_location": player.get("location"),
            "player_position": player.get("position"),
            "game_state": game.get("game_state"),
            "is_in_battle": game.get("is_in_battle"),
            "party_size": len(player.get("party") or []),
            "money": game.get("money"),
            # Dialog is truncated to its first 100 characters.
            "dialog_text": dialog_text[:100] + "..." if dialog_text else None,
        }

        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "type": "state_snapshot",
            "step": step,
            "state_summary": state_summary,
        }

        self._write_log_entry(log_entry)

    def log_action(self, action: str, step: int, reasoning: Optional[str] = None):
        """Log an action taken by the agent

        Args:
            action: The action taken
            step: Current step number
            reasoning: Reasoning behind the action
        """
        log_entry = {
            "timestamp": datetime.now().isoformat(),
            "type": "action",
            "step": step,
            "action": action,
            "reasoning": reasoning,
        }

        self._write_log_entry(log_entry)
        logger.info(f"Action {step}: {action}")
        if reasoning:
            logger.debug(f"Reasoning: {reasoning}")

    def _write_log_entry(self, log_entry: Dict[str, Any]):
        """Write a log entry to the log file

        Failures (I/O errors, non-serializable values) are reported through
        the module logger and otherwise ignored: logging is best-effort and
        must not interrupt the caller.

        Args:
            log_entry: The log entry to write
        """
        try:
            with open(self.log_file, 'a', encoding='utf-8') as f:
                f.write(json.dumps(log_entry, ensure_ascii=False) + '\n')
        except Exception as e:
            logger.error(f"Failed to write log entry: {e}")

    def _read_log_entries(self) -> list:
        """Read every well-formed entry from the current log file.

        Lines that are blank, not valid JSON, or not JSON objects are skipped.

        Returns:
            List of entry dicts in file order

        Raises:
            OSError: If the log file cannot be opened
        """
        entries = []
        with open(self.log_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    entry = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                if isinstance(entry, dict):
                    entries.append(entry)
        return entries

    @staticmethod
    def _resolve_checkpoint_path(checkpoint_file: Optional[str]) -> str:
        """Map None / the legacy default file name to the cache-folder path."""
        if checkpoint_file is None or checkpoint_file == "checkpoint_llm.txt":
            return os.path.join(".pokeagent_cache", "checkpoint_llm.txt")
        return checkpoint_file

    def get_cumulative_metrics(self) -> Dict[str, Any]:
        """Get cumulative metrics for the session

        Returns:
            Copy of the cumulative metrics, with "total_run_time" refreshed
        """
        # Update runtime
        self.cumulative_metrics["total_run_time"] = time.time() - self.cumulative_metrics["start_time"]
        return self.cumulative_metrics.copy()

    def get_session_summary(self) -> Dict[str, Any]:
        """Get a summary of the current session

        The summary is computed by re-reading the log file.

        Returns:
            Dictionary with session summary information, or {"error": ...}
            if the log file could not be read
        """
        try:
            interactions = 0
            errors = 0
            total_duration = 0.0

            for entry in self._read_log_entries():
                entry_type = entry.get("type")
                if entry_type == "interaction":
                    interactions += 1
                    if entry.get("duration"):
                        total_duration += entry["duration"]
                elif entry_type == "error":
                    errors += 1

            return {
                "session_id": self.session_id,
                "log_file": self.log_file,
                "total_interactions": interactions,
                "total_errors": errors,
                "total_duration": total_duration,
                "average_duration": total_duration / interactions if interactions > 0 else 0,
            }
        except Exception as e:
            logger.error(f"Failed to get session summary: {e}")
            return {"error": str(e)}

    def save_checkpoint(self, checkpoint_file: Optional[str] = None,
                        agent_step_count: Optional[int] = None):
        """Save current LLM interaction history to checkpoint file

        Errors are logged, not raised.

        Args:
            checkpoint_file: Path to save the checkpoint (defaults to cache folder)
            agent_step_count: Current agent step count for persistence
        """
        try:
            checkpoint_file = self._resolve_checkpoint_path(checkpoint_file)
            # Create the target directory for both default and custom paths.
            parent_dir = os.path.dirname(checkpoint_file)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Read all current log entries
            log_entries = self._read_log_entries() if os.path.exists(self.log_file) else []

            # Update run time in metrics
            self.cumulative_metrics["total_run_time"] = time.time() - self.cumulative_metrics["start_time"]

            # Add checkpoint metadata
            checkpoint_data = {
                "checkpoint_timestamp": datetime.now().isoformat(),
                "session_id": self.session_id,
                "original_log_file": self.log_file,
                "total_entries": len(log_entries),
                "agent_step_count": agent_step_count,  # Save current step count
                "cumulative_metrics": self.cumulative_metrics,  # Save metrics
                "log_entries": log_entries,
            }

            # Add map stitcher data if available via callback. The callback is
            # an optional attribute attached to the instance from outside.
            save_callback = getattr(self, '_map_stitcher_callback', None)
            if save_callback:
                try:
                    save_callback(checkpoint_data)
                except Exception as e:
                    logger.debug(f"Failed to save map stitcher to checkpoint: {e}")

            # Save to checkpoint file
            with open(checkpoint_file, 'w', encoding='utf-8') as f:
                json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)

            logger.info(f"LLM checkpoint saved: {checkpoint_file} ({len(log_entries)} entries)")

        except Exception as e:
            logger.error(f"Failed to save LLM checkpoint: {e}")

    def load_checkpoint(self, checkpoint_file: Optional[str] = None) -> Optional[int]:
        """Load LLM interaction history from checkpoint file

        Restores cumulative metrics and overwrites the current log file with
        the checkpointed entries. Errors are logged, not raised.

        Args:
            checkpoint_file: Path to load the checkpoint from (defaults to cache folder)

        Returns:
            Last agent step count from the checkpoint, or None if not found
        """
        try:
            checkpoint_file = self._resolve_checkpoint_path(checkpoint_file)

            if not os.path.exists(checkpoint_file):
                logger.info(f"No checkpoint file found at {checkpoint_file}")
                return None

            with open(checkpoint_file, 'r', encoding='utf-8') as f:
                checkpoint_data = json.load(f)

            log_entries = checkpoint_data.get("log_entries", [])

            # Restore cumulative metrics if available
            if "cumulative_metrics" in checkpoint_data:
                saved_metrics = checkpoint_data["cumulative_metrics"]
                # Restore all metrics including the original start_time
                self.cumulative_metrics.update(saved_metrics)

                # If the checkpoint has a start_time, use it to preserve the original session start
                if "start_time" in saved_metrics:
                    logger.info(f"Restored original start time from checkpoint: {saved_metrics['start_time']}")
                else:
                    logger.warning("No start_time found in checkpoint, using current time")

            # Restore log entries to current log file
            with open(self.log_file, 'w', encoding='utf-8') as f:
                for entry in log_entries:
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')

            # Try to get step count from checkpoint metadata first
            last_step = checkpoint_data.get("agent_step_count")

            # If not in metadata, find the last agent step from log entries.
            # log_step_start writes the number under "step"; "step_number" is
            # still accepted in case a checkpoint uses that key.
            if last_step is None:
                for entry in reversed(log_entries):
                    if entry.get("type") != "step_start":
                        continue
                    if "step" in entry:
                        last_step = entry["step"]
                        break
                    if "step_number" in entry:
                        last_step = entry["step_number"]
                        break

            logger.info(f"LLM checkpoint loaded: {checkpoint_file} ({len(log_entries)} entries, step {last_step})")

            # Load map stitcher data if available via callback
            load_callback = getattr(self, '_map_stitcher_load_callback', None)
            if load_callback:
                try:
                    load_callback(checkpoint_data)
                except Exception as e:
                    logger.debug(f"Failed to load map stitcher from checkpoint: {e}")

            return last_step

        except Exception as e:
            logger.error(f"Failed to load LLM checkpoint: {e}")
            return None
456
+
457
# Process-wide LLMLogger instance; stays None until get_llm_logger() is called.
_llm_logger = None


def get_llm_logger() -> LLMLogger:
    """Return the shared LLMLogger, constructing it on first use.

    Returns:
        The global LLM logger instance
    """
    global _llm_logger
    if _llm_logger is not None:
        return _llm_logger
    _llm_logger = LLMLogger()
    return _llm_logger
470
+
471
def setup_map_stitcher_checkpoint_integration(memory_reader):
    """Wire the memory reader's map stitcher into LLM checkpoint save/load.

    Installs two callbacks on the global LLM logger. Each one looks up
    ``memory_reader._map_stitcher`` at call time and, if it is set, asks it
    to write to or read from the checkpoint data it is handed.
    """
    llm_logger = get_llm_logger()

    def _persist_stitcher(checkpoint_data):
        # Resolve lazily: the stitcher may not exist yet when this is installed.
        stitcher = getattr(memory_reader, '_map_stitcher', None)
        if stitcher:
            stitcher.save_to_checkpoint(checkpoint_data)

    def _restore_stitcher(checkpoint_data):
        stitcher = getattr(memory_reader, '_map_stitcher', None)
        if stitcher:
            stitcher.load_from_checkpoint(checkpoint_data)

    llm_logger._map_stitcher_callback = _persist_stitcher
    llm_logger._map_stitcher_load_callback = _restore_stitcher
485
+
486
def log_llm_interaction(interaction_type: str, prompt: str, response: str,
                        metadata: Optional[Dict[str, Any]] = None,
                        duration: Optional[float] = None,
                        model_info: Optional[Dict[str, Any]] = None):
    """Record one LLM interaction through the global LLM logger.

    All arguments are forwarded unchanged, in order, to the global logger's
    ``log_interaction`` method.

    Args:
        interaction_type: Type of interaction
        prompt: Input prompt
        response: LLM response
        metadata: Additional metadata
        duration: Time taken
        model_info: Model information
    """
    get_llm_logger().log_interaction(
        interaction_type, prompt, response, metadata, duration, model_info
    )
502
+
503
def log_llm_error(interaction_type: str, prompt: str, error: str,
                  metadata: Optional[Dict[str, Any]] = None):
    """Record a failed LLM interaction through the global LLM logger.

    All arguments are forwarded unchanged, in order, to the global logger's
    ``log_error`` method.

    Args:
        interaction_type: Type of interaction that failed
        prompt: Input prompt
        error: Error message
        metadata: Additional metadata
    """
    get_llm_logger().log_error(interaction_type, prompt, error, metadata)