patchpal 0.3.1__tar.gz → 0.4.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {patchpal-0.3.1/patchpal.egg-info → patchpal-0.4.1}/PKG-INFO +92 -1
- {patchpal-0.3.1 → patchpal-0.4.1}/README.md +91 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/__init__.py +1 -1
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/agent.py +29 -7
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/cli.py +67 -1
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/tools.py +225 -22
- {patchpal-0.3.1 → patchpal-0.4.1/patchpal.egg-info}/PKG-INFO +92 -1
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_agent.py +7 -7
- {patchpal-0.3.1 → patchpal-0.4.1}/LICENSE +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/MANIFEST.in +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/context.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/permissions.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/skills.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal/system_prompt.md +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal.egg-info/SOURCES.txt +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal.egg-info/dependency_links.txt +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal.egg-info/entry_points.txt +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal.egg-info/requires.txt +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/patchpal.egg-info/top_level.txt +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/pyproject.toml +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/setup.cfg +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_cli.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_context.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_guardrails.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_operational_safety.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_skills.py +0 -0
- {patchpal-0.3.1 → patchpal-0.4.1}/tests/test_tools.py +0 -0
{patchpal-0.3.1/patchpal.egg-info → patchpal-0.4.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: patchpal
-Version: 0.3.1
+Version: 0.4.1
 Summary: A lean Claude Code clone in pure Python
 Author: PatchPal Contributors
 License-Expression: Apache-2.0
@@ -905,6 +905,11 @@ You: /status
 # - Token usage breakdown
 # - Visual progress bar
 # - Auto-compaction status
+# - Session statistics:
+#   - Total LLM calls made
+#   - Cumulative input tokens (all requests combined)
+#   - Cumulative output tokens (all responses combined)
+#   - Total tokens (helps estimate API costs)

 # Manually trigger compaction
 You: /compact
@@ -916,6 +921,23 @@ You: /compact
 # Note: Requires at least 5 messages; most effective when context >50% full
 ```

+**Understanding Session Statistics:**
+
+The `/status` command shows cumulative token usage:
+
+- **Cumulative input tokens**: Total tokens sent to the LLM across all calls
+  - Each LLM call resends the entire conversation history
+  - **Note on Anthropic models**: PatchPal uses prompt caching
+    - System prompt and last 2 messages are cached
+    - Cached tokens cost much less than regular input tokens
+    - The displayed token counts show raw totals, not cache-adjusted costs
+
+- **Cumulative output tokens**: Total tokens generated by the LLM
+  - Usually much smaller than input (just the generated responses)
+  - Typically costs more per token than input
+
+**Important**: The token counts shown are raw totals and don't reflect prompt caching discounts. For accurate cost information, check your provider's usage dashboard which shows cache hits and actual billing.
+
 **Configuration:**

 See the [Configuration](https://github.com/amaiya/patchpal?tab=readme-ov-file#configuration) section for context management settings including:
@@ -1004,3 +1026,72 @@ The system ensures you can work for extended periods without hitting context lim
 - Context is automatically managed at 75% capacity through pruning and compaction.
 - **Note:** Token estimation may be slightly inaccurate compared to the model's actual counting. If you see this error despite auto-compaction being enabled, the 75% threshold may need to be lowered further for your workload. You can adjust it with `export PATCHPAL_COMPACT_THRESHOLD=0.70` (or lower).
 - See [Configuration](https://github.com/amaiya/patchpal?tab=readme-ov-file#configuration) for context management settings.
+
+**Reducing API Costs via Token Optimization**
+
+When using cloud LLM providers (Anthropic, OpenAI, etc.), token usage directly impacts costs. PatchPal includes several features to help minimize token consumption:
+
+**1. Use Pruning to Manage Long Sessions**
+- **Automatic pruning** removes old tool outputs while preserving conversation context
+- Configure pruning thresholds to be more aggressive:
+  ```bash
+  export PATCHPAL_PRUNE_PROTECT=20000   # Reduce from 40k to 20k tokens
+  export PATCHPAL_PRUNE_MINIMUM=10000   # Reduce minimum saved from 20k to 10k
+  ```
+- Pruning happens transparently before compaction and is much faster (no LLM call needed)
+
+**2. Monitor Session Token Usage**
+- Use `/status` to see cumulative token usage in real-time
+- **Session Statistics** section shows:
+  - Total LLM calls made
+  - Cumulative input tokens (raw totals, before caching discounts)
+  - Cumulative output tokens
+  - Total tokens for the session
+- Check periodically during long sessions to monitor usage
+- **Important**: Token counts don't reflect prompt caching discounts (Anthropic models)
+  - For actual costs, check your provider's usage dashboard which shows cache-adjusted billing
+
+**3. Manual Compaction for Cost Control**
+- Use `/status` regularly to monitor context window usage
+- Run `/compact` proactively when context grows large (before hitting auto-compact threshold)
+- Manual compaction gives you control over when the summarization LLM call happens
+
+**4. Adjust Auto-Compaction Threshold**
+- Lower threshold = more frequent compaction = smaller context = lower per-request costs
+- Higher threshold = fewer compaction calls = larger context = higher per-request costs
+  ```bash
+  # More aggressive compaction (compact at 60% instead of 75%)
+  export PATCHPAL_COMPACT_THRESHOLD=0.60
+  ```
+- Find the sweet spot for your workload (balance between compaction frequency and context size)
+
+**5. Use Local Models for Zero API Costs**
+- **Best option:** Run vLLM locally to eliminate API costs entirely
+  ```bash
+  export HOSTED_VLLM_API_BASE=http://localhost:8000
+  export HOSTED_VLLM_API_KEY=token-abc123
+  patchpal --model hosted_vllm/openai/gpt-oss-20b
+  ```
+- **Alternative:** Use Ollama (requires `OLLAMA_CONTEXT_LENGTH=32768`)
+- See [Using Local Models](https://github.com/amaiya/patchpal?tab=readme-ov-file#using-local-models-vllm--ollama) for setup
+
+**6. Start Fresh When Appropriate**
+- Use `/clear` command to reset conversation history without restarting PatchPal
+- Exit and restart PatchPal between unrelated tasks to clear context completely
+- Each fresh start begins with minimal tokens (just the system prompt)
+- Better than carrying large conversation history across different tasks
+
+**7. Use Smaller Models for Simple Tasks**
+- Use less expensive models for routine tasks:
+  ```bash
+  patchpal --model anthropic/claude-3-7-sonnet-latest  # Cheaper than claude-sonnet-4-5
+  patchpal --model openai/gpt-4o-mini                  # Cheaper than gpt-4o
+  ```
+- Reserve premium models for complex reasoning tasks
+
+**Cost Monitoring Tips:**
+- Check `/status` before large operations to see current token usage
+- **Anthropic models**: Prompt caching reduces costs (system prompt + last 2 messages cached)
+- Most cloud providers offer usage dashboards showing cache hits and actual charges
+- Set up billing alerts with your provider to avoid surprises
+- Consider local models (vLLM recommended) for high-volume usage or zero API costs
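The pruning and compaction knobs referenced above are plain environment variables. As a minimal sketch of how such settings can be read, following the `os.getenv` pattern used in `patchpal/tools.py` further down and the defaults quoted in the text (40k protect, 20k minimum, 75% threshold) — the parsing below is illustrative, not PatchPal's actual configuration code:

```python
import os

# Illustrative parsing only; the variable names come from the README above,
# and the defaults mirror the figures it quotes (40k / 20k / 0.75).
PRUNE_PROTECT = int(os.getenv("PATCHPAL_PRUNE_PROTECT", 40_000))
PRUNE_MINIMUM = int(os.getenv("PATCHPAL_PRUNE_MINIMUM", 20_000))
COMPACT_THRESHOLD = float(os.getenv("PATCHPAL_COMPACT_THRESHOLD", 0.75))

print(f"prune_protect={PRUNE_PROTECT}, prune_minimum={PRUNE_MINIMUM}, "
      f"compact_threshold={COMPACT_THRESHOLD}")
```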
{patchpal-0.3.1 → patchpal-0.4.1}/README.md

@@ -868,6 +868,11 @@ You: /status
 # - Token usage breakdown
 # - Visual progress bar
 # - Auto-compaction status
+# - Session statistics:
+#   - Total LLM calls made
+#   - Cumulative input tokens (all requests combined)
+#   - Cumulative output tokens (all responses combined)
+#   - Total tokens (helps estimate API costs)

 # Manually trigger compaction
 You: /compact
@@ -879,6 +884,23 @@ You: /compact
 # Note: Requires at least 5 messages; most effective when context >50% full
 ```

+**Understanding Session Statistics:**
+
+The `/status` command shows cumulative token usage:
+
+- **Cumulative input tokens**: Total tokens sent to the LLM across all calls
+  - Each LLM call resends the entire conversation history
+  - **Note on Anthropic models**: PatchPal uses prompt caching
+    - System prompt and last 2 messages are cached
+    - Cached tokens cost much less than regular input tokens
+    - The displayed token counts show raw totals, not cache-adjusted costs
+
+- **Cumulative output tokens**: Total tokens generated by the LLM
+  - Usually much smaller than input (just the generated responses)
+  - Typically costs more per token than input
+
+**Important**: The token counts shown are raw totals and don't reflect prompt caching discounts. For accurate cost information, check your provider's usage dashboard which shows cache hits and actual billing.
+
 **Configuration:**

 See the [Configuration](https://github.com/amaiya/patchpal?tab=readme-ov-file#configuration) section for context management settings including:
@@ -967,3 +989,72 @@ The system ensures you can work for extended periods without hitting context lim
 - Context is automatically managed at 75% capacity through pruning and compaction.
 - **Note:** Token estimation may be slightly inaccurate compared to the model's actual counting. If you see this error despite auto-compaction being enabled, the 75% threshold may need to be lowered further for your workload. You can adjust it with `export PATCHPAL_COMPACT_THRESHOLD=0.70` (or lower).
 - See [Configuration](https://github.com/amaiya/patchpal?tab=readme-ov-file#configuration) for context management settings.
+
+**Reducing API Costs via Token Optimization**
+
+When using cloud LLM providers (Anthropic, OpenAI, etc.), token usage directly impacts costs. PatchPal includes several features to help minimize token consumption:
+
+**1. Use Pruning to Manage Long Sessions**
+- **Automatic pruning** removes old tool outputs while preserving conversation context
+- Configure pruning thresholds to be more aggressive:
+  ```bash
+  export PATCHPAL_PRUNE_PROTECT=20000   # Reduce from 40k to 20k tokens
+  export PATCHPAL_PRUNE_MINIMUM=10000   # Reduce minimum saved from 20k to 10k
+  ```
+- Pruning happens transparently before compaction and is much faster (no LLM call needed)
+
+**2. Monitor Session Token Usage**
+- Use `/status` to see cumulative token usage in real-time
+- **Session Statistics** section shows:
+  - Total LLM calls made
+  - Cumulative input tokens (raw totals, before caching discounts)
+  - Cumulative output tokens
+  - Total tokens for the session
+- Check periodically during long sessions to monitor usage
+- **Important**: Token counts don't reflect prompt caching discounts (Anthropic models)
+  - For actual costs, check your provider's usage dashboard which shows cache-adjusted billing
+
+**3. Manual Compaction for Cost Control**
+- Use `/status` regularly to monitor context window usage
+- Run `/compact` proactively when context grows large (before hitting auto-compact threshold)
+- Manual compaction gives you control over when the summarization LLM call happens
+
+**4. Adjust Auto-Compaction Threshold**
+- Lower threshold = more frequent compaction = smaller context = lower per-request costs
+- Higher threshold = fewer compaction calls = larger context = higher per-request costs
+  ```bash
+  # More aggressive compaction (compact at 60% instead of 75%)
+  export PATCHPAL_COMPACT_THRESHOLD=0.60
+  ```
+- Find the sweet spot for your workload (balance between compaction frequency and context size)
+
+**5. Use Local Models for Zero API Costs**
+- **Best option:** Run vLLM locally to eliminate API costs entirely
+  ```bash
+  export HOSTED_VLLM_API_BASE=http://localhost:8000
+  export HOSTED_VLLM_API_KEY=token-abc123
+  patchpal --model hosted_vllm/openai/gpt-oss-20b
+  ```
+- **Alternative:** Use Ollama (requires `OLLAMA_CONTEXT_LENGTH=32768`)
+- See [Using Local Models](https://github.com/amaiya/patchpal?tab=readme-ov-file#using-local-models-vllm--ollama) for setup
+
+**6. Start Fresh When Appropriate**
+- Use `/clear` command to reset conversation history without restarting PatchPal
+- Exit and restart PatchPal between unrelated tasks to clear context completely
+- Each fresh start begins with minimal tokens (just the system prompt)
+- Better than carrying large conversation history across different tasks
+
+**7. Use Smaller Models for Simple Tasks**
+- Use less expensive models for routine tasks:
+  ```bash
+  patchpal --model anthropic/claude-3-7-sonnet-latest  # Cheaper than claude-sonnet-4-5
+  patchpal --model openai/gpt-4o-mini                  # Cheaper than gpt-4o
+  ```
+- Reserve premium models for complex reasoning tasks
+
+**Cost Monitoring Tips:**
+- Check `/status` before large operations to see current token usage
+- **Anthropic models**: Prompt caching reduces costs (system prompt + last 2 messages cached)
+- Most cloud providers offer usage dashboards showing cache hits and actual charges
+- Set up billing alerts with your provider to avoid surprises
+- Consider local models (vLLM recommended) for high-volume usage or zero API costs
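The "Session Statistics" totals described above are raw, cache-unaware counts, so any cost arithmetic built on them gives an upper bound. A small sketch of that arithmetic, with placeholder per-million-token prices (not PatchPal code, and not any provider's actual rates — substitute your own):

```python
# Illustrative only: rough upper-bound cost from the cumulative totals /status reports.
# The prices below are hypothetical placeholders; cached input tokens are billed lower.

def estimate_session_cost(
    input_tokens: int,
    output_tokens: int,
    input_price_per_mtok: float = 3.00,    # placeholder $ per 1M input tokens
    output_price_per_mtok: float = 15.00,  # placeholder $ per 1M output tokens
) -> float:
    return (
        input_tokens / 1_000_000 * input_price_per_mtok
        + output_tokens / 1_000_000 * output_price_per_mtok
    )

# Example with the kind of numbers /status might show:
print(f"~${estimate_session_cost(250_000, 12_000):.2f} (upper bound, ignores caching)")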
{patchpal-0.3.1 → patchpal-0.4.1}/patchpal/agent.py

@@ -541,7 +541,7 @@ TOOLS = [
         "type": "function",
         "function": {
             "name": "run_shell",
-            "description": "Run a safe shell command in the repository. Privilege escalation (sudo, su) blocked by default unless PATCHPAL_ALLOW_SUDO=true.",
+            "description": "Run a safe shell command in the repository. Commands execute from repository root automatically (no need for 'cd'). Privilege escalation (sudo, su) blocked by default unless PATCHPAL_ALLOW_SUDO=true.",
             "parameters": {
                 "type": "object",
                 "properties": {
@@ -725,9 +725,7 @@ def _apply_prompt_caching(messages: List[Dict[str, Any]], model_id: str) -> List

     Caches:
     - System messages (first 1-2 messages with role="system")
-    - Last 2
-
-    This provides 90% cost reduction on cached content after the first request.
+    - Last 2 non-system messages (recent context, any role except system)

     Args:
         messages: List of message dictionaries
@@ -744,8 +742,8 @@ def _apply_prompt_caching(messages: List[Dict[str, Any]], model_id: str) -> List
         # Bedrock uses cachePoint
         cache_marker = {"cachePoint": {"type": "ephemeral"}}
     else:
-        # Direct Anthropic API uses
-        cache_marker = {"
+        # Direct Anthropic API uses cache_control
+        cache_marker = {"cache_control": {"type": "ephemeral"}}

     # Find system messages (usually at the start)
     system_messages = [i for i, msg in enumerate(messages) if msg.get("role") == "system"]
@@ -818,6 +816,11 @@ class PatchPalAgent:
         # Track last compaction to prevent compaction loops
         self._last_compaction_message_count = 0

+        # Track cumulative token usage across all LLM calls
+        self.total_llm_calls = 0
+        self.cumulative_input_tokens = 0
+        self.cumulative_output_tokens = 0
+
         # LiteLLM settings for models that need parameter dropping
         self.litellm_kwargs = {}
         if self.model_id.startswith("bedrock/"):
@@ -896,12 +899,22 @@ class PatchPalAgent:
             messages = [{"role": "system", "content": SYSTEM_PROMPT}] + msgs
             # Apply prompt caching for supported models
             messages = _apply_prompt_caching(messages, self.model_id)
-
+            response = litellm.completion(
                 model=self.model_id,
                 messages=messages,
                 **self.litellm_kwargs,
             )

+            # Track token usage from compaction call
+            self.total_llm_calls += 1
+            if hasattr(response, "usage") and response.usage:
+                if hasattr(response.usage, "prompt_tokens"):
+                    self.cumulative_input_tokens += response.usage.prompt_tokens
+                if hasattr(response.usage, "completion_tokens"):
+                    self.cumulative_output_tokens += response.usage.completion_tokens
+
+            return response
+
         summary_msg, summary_text = self.context_manager.create_compaction(
             self.messages,
             compaction_completion,
@@ -995,6 +1008,15 @@ class PatchPalAgent:
                 tool_choice="auto",
                 **self.litellm_kwargs,
             )
+
+            # Track token usage from this LLM call
+            self.total_llm_calls += 1
+            if hasattr(response, "usage") and response.usage:
+                if hasattr(response.usage, "prompt_tokens"):
+                    self.cumulative_input_tokens += response.usage.prompt_tokens
+                if hasattr(response.usage, "completion_tokens"):
+                    self.cumulative_output_tokens += response.usage.completion_tokens
+
         except Exception as e:
             return f"Error calling model: {e}"

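The agent.py hunks above read `response.usage` after each `litellm.completion()` call and accumulate the counts on the agent. A self-contained sketch of the same pattern, assuming only LiteLLM's standard `usage.prompt_tokens` / `usage.completion_tokens` fields — the wrapper class here is illustrative, not part of PatchPal:

```python
import litellm


class UsageTracker:
    """Accumulates raw token totals across calls, mirroring the counters added in agent.py."""

    def __init__(self) -> None:
        self.total_llm_calls = 0
        self.cumulative_input_tokens = 0
        self.cumulative_output_tokens = 0

    def completion(self, **kwargs):
        # Forward everything to LiteLLM, then record whatever usage info came back.
        response = litellm.completion(**kwargs)
        self.total_llm_calls += 1
        usage = getattr(response, "usage", None)
        if usage:
            self.cumulative_input_tokens += getattr(usage, "prompt_tokens", 0) or 0
            self.cumulative_output_tokens += getattr(usage, "completion_tokens", 0) or 0
        return response


# Usage (any LiteLLM-supported model id works here):
# tracker = UsageTracker()
# tracker.completion(model="openai/gpt-4o-mini",
#                    messages=[{"role": "user", "content": "hello"}])
# print(tracker.total_llm_calls,
#       tracker.cumulative_input_tokens,
#       tracker.cumulative_output_tokens)
```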
{patchpal-0.3.1 → patchpal-0.4.1}/patchpal/cli.py

@@ -248,7 +248,9 @@ Supported models: Any LiteLLM-supported model
         print(f"\033[1;36m🔧 Using custom system prompt: {custom_prompt_path}\033[0m")

     print("\nType 'exit' to quit.")
-    print(
+    print(
+        "Use '/status' to check context window usage, '/compact' to manually compact, '/clear' to start fresh."
+    )
     print("Use 'list skills' to see available skills or /skillname to invoke skills.")
     print("Press Ctrl-C during agent execution to interrupt the agent.\n")

@@ -360,6 +362,70 @@ Supported models: Any LiteLLM-supported model
                     "\n Auto-compaction: \033[33mDisabled\033[0m (set PATCHPAL_DISABLE_AUTOCOMPACT=false to enable)"
                 )

+            # Show cumulative token usage
+            print("\n\033[1;36mSession Statistics\033[0m")
+            print(f" LLM calls: {agent.total_llm_calls}")
+
+            # Check if usage info is available (if we have LLM calls but no token counts)
+            has_usage_info = (
+                agent.cumulative_input_tokens > 0 or agent.cumulative_output_tokens > 0
+            )
+            if agent.total_llm_calls > 0 and not has_usage_info:
+                print(
+                    " \033[2mToken usage unavailable (model doesn't report usage info)\033[0m"
+                )
+            else:
+                print(f" Cumulative input tokens: {agent.cumulative_input_tokens:,}")
+                print(f" Cumulative output tokens: {agent.cumulative_output_tokens:,}")
+                total_tokens = agent.cumulative_input_tokens + agent.cumulative_output_tokens
+                print(f" Total tokens: {total_tokens:,}")
+
+            print("=" * 70 + "\n")
+            continue
+
+        # Handle /clear command - clear conversation history
+        if user_input.lower() in ["clear", "/clear"]:
+            print("\n" + "=" * 70)
+            print("\033[1;36mClear Context\033[0m")
+            print("=" * 70)
+
+            if not agent.messages:
+                print("\033[1;33m Context is already empty.\033[0m")
+                print("=" * 70 + "\n")
+                continue
+
+            # Show current status
+            stats = agent.context_manager.get_usage_stats(agent.messages)
+            print(
+                f" Current: {len(agent.messages)} messages, {stats['total_tokens']:,} tokens"
+            )
+
+            # Confirm before clearing
+            try:
+                confirm = pt_prompt(
+                    FormattedText(
+                        [
+                            ("ansiyellow", " Clear all context and start fresh? (y/n): "),
+                            ("", ""),
+                        ]
+                    )
+                ).strip()
+                if confirm.lower() not in ["y", "yes"]:
+                    print(" Cancelled.")
+                    print("=" * 70 + "\n")
+                    continue
+            except KeyboardInterrupt:
+                print("\n Cancelled.")
+                print("=" * 70 + "\n")
+                continue
+
+            # Clear conversation history
+            agent.messages = []
+            agent._last_compaction_message_count = 0
+
+            print("\n\033[1;32m✓ Context cleared successfully!\033[0m")
+            print(" Starting fresh with empty conversation history.")
+            print(" All previous context has been removed - ready for a new task.")
             print("=" * 70 + "\n")
             continue

{patchpal-0.3.1 → patchpal-0.4.1}/patchpal/tools.py

@@ -100,6 +100,10 @@ WEB_USER_AGENT = f"PatchPal/{__version__} (AI Code Assistant)"
 # Shell command configuration
 SHELL_TIMEOUT = int(os.getenv("PATCHPAL_SHELL_TIMEOUT", 30))  # 30 seconds default

+# Output filtering configuration - reduce token usage from verbose commands
+ENABLE_OUTPUT_FILTERING = os.getenv("PATCHPAL_FILTER_OUTPUTS", "true").lower() == "true"
+MAX_OUTPUT_LINES = int(os.getenv("PATCHPAL_MAX_OUTPUT_LINES", 500))  # Max lines of output
+
 # Global flag for requiring permission on ALL operations (including reads)
 # Set via CLI flag --require-permission-for-all
 _REQUIRE_PERMISSION_FOR_ALL = False
@@ -195,10 +199,194 @@ class OperationLimiter:
         audit_logger.info(f"Operation {self.operations}/{self.max_operations}: {operation}")

     def reset(self):
-        """Reset operation counter."""
+        """Reset the operation counter (used in tests)."""
         self.operations = 0


+class OutputFilter:
+    """Filter verbose command outputs to reduce token usage.
+
+    This class implements Claude Code's strategy of filtering verbose outputs
+    to show only relevant information (e.g., test failures, error messages).
+    Can save 75% or more on output tokens for verbose commands.
+    """
+
+    @staticmethod
+    def should_filter(cmd: str) -> bool:
+        """Check if a command should have its output filtered.
+
+        Args:
+            cmd: The shell command
+
+        Returns:
+            True if filtering should be applied
+        """
+        if not ENABLE_OUTPUT_FILTERING:
+            return False
+
+        # Test runners - show only failures
+        test_patterns = [
+            "pytest",
+            "npm test",
+            "npm run test",
+            "yarn test",
+            "go test",
+            "cargo test",
+            "mvn test",
+            "gradle test",
+            "ruby -I test",
+            "rspec",
+        ]
+
+        # Version control - limit log output
+        vcs_patterns = [
+            "git log",
+            "git reflog",
+        ]
+
+        # Package managers - show only important info
+        pkg_patterns = [
+            "npm install",
+            "pip install",
+            "cargo build",
+            "go build",
+        ]
+
+        all_patterns = test_patterns + vcs_patterns + pkg_patterns
+        return any(pattern in cmd for pattern in all_patterns)
+
+    @staticmethod
+    def filter_output(cmd: str, output: str) -> str:
+        """Filter command output to reduce token usage.
+
+        Args:
+            cmd: The shell command
+            output: The raw command output
+
+        Returns:
+            Filtered output with only relevant information
+        """
+        if not output or not ENABLE_OUTPUT_FILTERING:
+            return output
+
+        lines = output.split("\n")
+        original_lines = len(lines)
+
+        # Test output - show only failures and summary
+        if any(
+            pattern in cmd
+            for pattern in ["pytest", "npm test", "yarn test", "go test", "cargo test", "rspec"]
+        ):
+            filtered_lines = []
+            in_failure = False
+            failure_context = []
+
+            for line in lines:
+                # Capture failure indicators
+                if any(
+                    keyword in line.upper()
+                    for keyword in ["FAIL", "ERROR", "FAILED", "✗", "✖", "FAILURE"]
+                ):
+                    in_failure = True
+                    failure_context = [line]
+                elif in_failure:
+                    # Capture context after failure (up to 10 lines or until next test/blank line)
+                    failure_context.append(line)
+                    # End failure context on: blank line, next test case, or 10 lines
+                    if (
+                        not line.strip()
+                        or "::" in line
+                        or line.startswith("=")
+                        or len(failure_context) >= 10
+                    ):
+                        filtered_lines.extend(failure_context)
+                        in_failure = False
+                        failure_context = []
+                # Always capture summary lines
+                elif any(
+                    keyword in line.lower()
+                    for keyword in ["passed", "failed", "error", "summary", "total"]
+                ):
+                    filtered_lines.append(line)
+
+            # Add remaining failure context
+            if failure_context:
+                filtered_lines.extend(failure_context)
+
+            # If we filtered significantly, add header
+            if filtered_lines and len(filtered_lines) < original_lines * 0.5:
+                header = f"[Filtered test output - showing failures only ({len(filtered_lines)}/{original_lines} lines)]"
+                return header + "\n" + "\n".join(filtered_lines)
+            else:
+                # Not much to filter, return original but truncated if too long
+                return OutputFilter._truncate_output(output, lines, original_lines)
+
+        # Git log - limit to reasonable number of commits
+        elif "git log" in cmd or "git reflog" in cmd:
+            # Take first 50 lines (typically ~5-10 commits with details)
+            if len(lines) > 50:
+                truncated = "\n".join(lines[:50])
+                footer = f"\n[Output truncated: showing first 50/{original_lines} lines. Use --max-count to limit commits]"
+                return truncated + footer
+            return output
+
+        # Build/install output - show only errors and final status
+        elif any(
+            pattern in cmd for pattern in ["npm install", "pip install", "cargo build", "go build"]
+        ):
+            filtered_lines = []
+
+            for line in lines:
+                # Keep error/warning lines
+                if any(
+                    keyword in line.upper()
+                    for keyword in ["ERROR", "WARN", "FAIL", "SUCCESSFULLY", "COMPLETE"]
+                ):
+                    filtered_lines.append(line)
+                # Keep final summary lines
+                elif any(
+                    keyword in line.lower()
+                    for keyword in ["installed", "built", "compiled", "finished"]
+                ):
+                    filtered_lines.append(line)
+
+            if filtered_lines and len(filtered_lines) < original_lines * 0.3:
+                header = f"[Filtered build output - showing errors and summary only ({len(filtered_lines)}/{original_lines} lines)]"
+                return header + "\n" + "\n".join(filtered_lines)
+            else:
+                return OutputFilter._truncate_output(output, lines, original_lines)
+
+        # Default: truncate if too long
+        return OutputFilter._truncate_output(output, lines, original_lines)
+
+    @staticmethod
+    def _truncate_output(output: str, lines: list, original_lines: int) -> str:
+        """Truncate output if it exceeds maximum lines.
+
+        Args:
+            output: Original output string
+            lines: Split lines
+            original_lines: Count of original lines
+
+        Returns:
+            Truncated output if necessary
+        """
+        if original_lines > MAX_OUTPUT_LINES:
+            # Show first and last portions
+            keep_start = MAX_OUTPUT_LINES // 2
+            keep_end = MAX_OUTPUT_LINES // 2
+
+            truncated_lines = (
+                lines[:keep_start]
+                + ["", f"... [truncated {original_lines - MAX_OUTPUT_LINES} lines] ...", ""]
+                + lines[-keep_end:]
+            )
+
+            return "\n".join(truncated_lines)
+
+        return output
+
+
 # Global operation limiter
 _operation_limiter = OperationLimiter()

@@ -1738,26 +1926,7 @@ def edit_file(path: str, old_string: str, new_string: str) -> str:
             f"💡 Tip: Use read_lines() to see the exact context, or use apply_patch() for multiple changes."
         )

-    #
-    permission_manager = _get_permission_manager()
-
-    # Format colored diff for permission prompt (use the matched string for accurate diff)
-    diff_display = _format_colored_diff(matched_string, new_string, file_path=path)
-
-    # Add warning if writing outside repository
-    outside_repo_warning = ""
-    if not _is_inside_repo(p):
-        outside_repo_warning = "\n ⚠️ WARNING: Writing file outside repository\n"
-
-    description = f" ● Update({path}){outside_repo_warning}\n{diff_display}"
-
-    if not permission_manager.request_permission("edit_file", description, pattern=path):
-        return "Operation cancelled by user."
-
-    # Backup if enabled
-    backup_path = _backup_file(p)
-
-    # Perform replacement using the matched string
+    # Perform indentation adjustment and trailing newline preservation BEFORE showing diff
     # Important: Adjust indentation and preserve trailing newlines to maintain file structure
     adjusted_new_string = new_string

@@ -1803,6 +1972,25 @@ def edit_file(path: str, old_string: str, new_string: str) -> str:
         trailing_newlines = len(matched_string) - len(matched_string.rstrip("\n"))
         adjusted_new_string = adjusted_new_string + ("\n" * trailing_newlines)

+    # Check permission before proceeding (use adjusted_new_string for accurate diff display)
+    permission_manager = _get_permission_manager()
+
+    # Format colored diff for permission prompt (use adjusted_new_string so user sees what will actually be written)
+    diff_display = _format_colored_diff(matched_string, adjusted_new_string, file_path=path)
+
+    # Add warning if writing outside repository
+    outside_repo_warning = ""
+    if not _is_inside_repo(p):
+        outside_repo_warning = "\n ⚠️ WARNING: Writing file outside repository\n"
+
+    description = f" ● Update({path}){outside_repo_warning}\n{diff_display}"
+
+    if not permission_manager.request_permission("edit_file", description, pattern=path):
+        return "Operation cancelled by user."
+
+    # Backup if enabled
+    backup_path = _backup_file(p)
+
     new_content = content.replace(matched_string, adjusted_new_string)

     # Write the new content
@@ -2359,4 +2547,19 @@ def run_shell(cmd: str) -> str:
     stdout = result.stdout.decode("utf-8", errors="replace") if result.stdout else ""
     stderr = result.stderr.decode("utf-8", errors="replace") if result.stderr else ""

-
+    output = stdout + stderr
+
+    # Apply output filtering to reduce token usage
+    if OutputFilter.should_filter(cmd):
+        filtered_output = OutputFilter.filter_output(cmd, output)
+        # Log if we filtered significantly
+        original_lines = len(output.split("\n"))
+        filtered_lines = len(filtered_output.split("\n"))
+        if filtered_lines < original_lines * 0.5:
+            audit_logger.info(
+                f"SHELL_FILTER: Reduced output from {original_lines} to {filtered_lines} lines "
+                f"(~{int((1 - filtered_lines / original_lines) * 100)}% reduction)"
+            )
+        return filtered_output
+
+    return output
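Because `OutputFilter` (added above) depends only on the command string, the raw output, and the `PATCHPAL_FILTER_OUTPUTS` / `PATCHPAL_MAX_OUTPUT_LINES` settings, it can be exercised on synthetic output. A usage sketch with a fabricated pytest-style transcript, assuming the default `PATCHPAL_FILTER_OUTPUTS=true`:

```python
from patchpal.tools import OutputFilter  # OutputFilter is defined at module level in tools.py above

# Fabricated pytest-style output: 40 uninteresting progress lines, one failure block, one summary.
noise = [f"tests/test_module_{i}.py ......                             [{i:3d}%]" for i in range(40)]
raw_output = "\n".join(
    noise
    + [
        "=================================== FAILURES ===================================",
        "E   AssertionError: expected 3, got 2",
        "",
        "========================= 1 failed, 240 passed in 4.21s =======================",
    ]
)

cmd = "pytest -q"
if OutputFilter.should_filter(cmd):  # True: "pytest" matches the test-runner patterns
    print(OutputFilter.filter_output(cmd, raw_output))
    # Prints a "[Filtered test output - showing failures only (4/44 lines)]" header followed by
    # the FAILURES block and the summary line; the 40 progress lines are dropped.
```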
{patchpal-0.3.1 → patchpal-0.4.1/patchpal.egg-info}/PKG-INFO

(The egg-info copy of PKG-INFO carries the same +92/-1 changes shown in the PKG-INFO diff at the top of this listing.)
{patchpal-0.3.1 → patchpal-0.4.1}/tests/test_agent.py

@@ -441,13 +441,13 @@ def test_prompt_caching_application_anthropic():
     # Test with direct Anthropic API
     cached_messages = _apply_prompt_caching(messages.copy(), "anthropic/claude-sonnet-4-5")

-    # System message should have
-    assert "
-    assert cached_messages[0]["
+    # System message should have cache_control
+    assert "cache_control" in cached_messages[0]
+    assert cached_messages[0]["cache_control"] == {"type": "ephemeral"}

-    # Last 2 messages should have
-    assert "
-    assert "
+    # Last 2 messages should have cache_control
+    assert "cache_control" in cached_messages[-1]  # Last user message
+    assert "cache_control" in cached_messages[-2]  # Last assistant message


 def test_prompt_caching_application_bedrock():
@@ -488,7 +488,7 @@ def test_prompt_caching_no_modification_for_unsupported():
     cached_messages = _apply_prompt_caching(messages.copy(), "openai/gpt-4o")

     # Messages should be unchanged
-    assert "
+    assert "cache_control" not in cached_messages[0]
     assert "cachePoint" not in cached_messages[0]
     assert cached_messages == messages

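The test changes above pin down the observable behavior of `_apply_prompt_caching`: Anthropic models get `cache_control` markers on the system message and the last two non-system messages, Bedrock models use `cachePoint`, and other providers are left untouched. A minimal sketch of logic that would satisfy those assertions — an illustration of the contract, not the package's actual implementation:

```python
from typing import Any, Dict, List


def apply_prompt_caching_sketch(messages: List[Dict[str, Any]], model_id: str) -> List[Dict[str, Any]]:
    """Attach ephemeral cache markers the way the tests above expect (illustrative only)."""
    if model_id.startswith("bedrock/"):
        marker = {"cachePoint": {"type": "ephemeral"}}
    elif model_id.startswith("anthropic/"):
        marker = {"cache_control": {"type": "ephemeral"}}
    else:
        return messages  # unsupported providers: leave messages unchanged

    key, value = next(iter(marker.items()))

    # Mark system messages (usually the first one or two entries).
    for msg in messages:
        if msg.get("role") == "system":
            msg[key] = value

    # Mark the last two non-system messages (recent context).
    non_system = [m for m in messages if m.get("role") != "system"]
    for msg in non_system[-2:]:
        msg[key] = value

    return messages
```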