pdd-cli 0.0.90__py3-none-any.whl → 0.0.121__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdd/__init__.py +38 -6
- pdd/agentic_bug.py +323 -0
- pdd/agentic_bug_orchestrator.py +506 -0
- pdd/agentic_change.py +231 -0
- pdd/agentic_change_orchestrator.py +537 -0
- pdd/agentic_common.py +533 -770
- pdd/agentic_crash.py +2 -1
- pdd/agentic_e2e_fix.py +319 -0
- pdd/agentic_e2e_fix_orchestrator.py +582 -0
- pdd/agentic_fix.py +118 -3
- pdd/agentic_update.py +27 -9
- pdd/agentic_verify.py +3 -2
- pdd/architecture_sync.py +565 -0
- pdd/auth_service.py +210 -0
- pdd/auto_deps_main.py +63 -53
- pdd/auto_include.py +236 -3
- pdd/auto_update.py +125 -47
- pdd/bug_main.py +195 -23
- pdd/cmd_test_main.py +345 -197
- pdd/code_generator.py +4 -2
- pdd/code_generator_main.py +118 -32
- pdd/commands/__init__.py +6 -0
- pdd/commands/analysis.py +113 -48
- pdd/commands/auth.py +309 -0
- pdd/commands/connect.py +358 -0
- pdd/commands/fix.py +155 -114
- pdd/commands/generate.py +5 -0
- pdd/commands/maintenance.py +3 -2
- pdd/commands/misc.py +8 -0
- pdd/commands/modify.py +225 -163
- pdd/commands/sessions.py +284 -0
- pdd/commands/utility.py +12 -7
- pdd/construct_paths.py +334 -32
- pdd/context_generator_main.py +167 -170
- pdd/continue_generation.py +6 -3
- pdd/core/__init__.py +33 -0
- pdd/core/cli.py +44 -7
- pdd/core/cloud.py +237 -0
- pdd/core/dump.py +68 -20
- pdd/core/errors.py +4 -0
- pdd/core/remote_session.py +61 -0
- pdd/crash_main.py +219 -23
- pdd/data/llm_model.csv +4 -4
- pdd/docs/prompting_guide.md +864 -0
- pdd/docs/whitepaper_with_benchmarks/data_and_functions/benchmark_analysis.py +495 -0
- pdd/docs/whitepaper_with_benchmarks/data_and_functions/creation_compare.py +528 -0
- pdd/fix_code_loop.py +208 -34
- pdd/fix_code_module_errors.py +6 -2
- pdd/fix_error_loop.py +291 -38
- pdd/fix_main.py +208 -6
- pdd/fix_verification_errors_loop.py +235 -26
- pdd/fix_verification_main.py +269 -83
- pdd/frontend/dist/assets/index-B5DZHykP.css +1 -0
- pdd/frontend/dist/assets/index-CUWd8al1.js +450 -0
- pdd/frontend/dist/index.html +376 -0
- pdd/frontend/dist/logo.svg +33 -0
- pdd/generate_output_paths.py +46 -5
- pdd/generate_test.py +212 -151
- pdd/get_comment.py +19 -44
- pdd/get_extension.py +8 -9
- pdd/get_jwt_token.py +309 -20
- pdd/get_language.py +8 -7
- pdd/get_run_command.py +7 -5
- pdd/insert_includes.py +2 -1
- pdd/llm_invoke.py +531 -97
- pdd/load_prompt_template.py +15 -34
- pdd/operation_log.py +342 -0
- pdd/path_resolution.py +140 -0
- pdd/postprocess.py +122 -97
- pdd/preprocess.py +68 -12
- pdd/preprocess_main.py +33 -1
- pdd/prompts/agentic_bug_step10_pr_LLM.prompt +182 -0
- pdd/prompts/agentic_bug_step1_duplicate_LLM.prompt +73 -0
- pdd/prompts/agentic_bug_step2_docs_LLM.prompt +129 -0
- pdd/prompts/agentic_bug_step3_triage_LLM.prompt +95 -0
- pdd/prompts/agentic_bug_step4_reproduce_LLM.prompt +97 -0
- pdd/prompts/agentic_bug_step5_root_cause_LLM.prompt +123 -0
- pdd/prompts/agentic_bug_step6_test_plan_LLM.prompt +107 -0
- pdd/prompts/agentic_bug_step7_generate_LLM.prompt +172 -0
- pdd/prompts/agentic_bug_step8_verify_LLM.prompt +119 -0
- pdd/prompts/agentic_bug_step9_e2e_test_LLM.prompt +289 -0
- pdd/prompts/agentic_change_step10_identify_issues_LLM.prompt +1006 -0
- pdd/prompts/agentic_change_step11_fix_issues_LLM.prompt +984 -0
- pdd/prompts/agentic_change_step12_create_pr_LLM.prompt +140 -0
- pdd/prompts/agentic_change_step1_duplicate_LLM.prompt +73 -0
- pdd/prompts/agentic_change_step2_docs_LLM.prompt +101 -0
- pdd/prompts/agentic_change_step3_research_LLM.prompt +126 -0
- pdd/prompts/agentic_change_step4_clarify_LLM.prompt +164 -0
- pdd/prompts/agentic_change_step5_docs_change_LLM.prompt +981 -0
- pdd/prompts/agentic_change_step6_devunits_LLM.prompt +1005 -0
- pdd/prompts/agentic_change_step7_architecture_LLM.prompt +1044 -0
- pdd/prompts/agentic_change_step8_analyze_LLM.prompt +1027 -0
- pdd/prompts/agentic_change_step9_implement_LLM.prompt +1077 -0
- pdd/prompts/agentic_e2e_fix_step1_unit_tests_LLM.prompt +90 -0
- pdd/prompts/agentic_e2e_fix_step2_e2e_tests_LLM.prompt +91 -0
- pdd/prompts/agentic_e2e_fix_step3_root_cause_LLM.prompt +89 -0
- pdd/prompts/agentic_e2e_fix_step4_fix_e2e_tests_LLM.prompt +96 -0
- pdd/prompts/agentic_e2e_fix_step5_identify_devunits_LLM.prompt +91 -0
- pdd/prompts/agentic_e2e_fix_step6_create_unit_tests_LLM.prompt +106 -0
- pdd/prompts/agentic_e2e_fix_step7_verify_tests_LLM.prompt +116 -0
- pdd/prompts/agentic_e2e_fix_step8_run_pdd_fix_LLM.prompt +120 -0
- pdd/prompts/agentic_e2e_fix_step9_verify_all_LLM.prompt +146 -0
- pdd/prompts/agentic_fix_primary_LLM.prompt +2 -2
- pdd/prompts/agentic_update_LLM.prompt +192 -338
- pdd/prompts/auto_include_LLM.prompt +22 -0
- pdd/prompts/change_LLM.prompt +3093 -1
- pdd/prompts/detect_change_LLM.prompt +571 -14
- pdd/prompts/fix_code_module_errors_LLM.prompt +8 -0
- pdd/prompts/fix_errors_from_unit_tests_LLM.prompt +1 -0
- pdd/prompts/generate_test_LLM.prompt +19 -1
- pdd/prompts/generate_test_from_example_LLM.prompt +366 -0
- pdd/prompts/insert_includes_LLM.prompt +262 -252
- pdd/prompts/prompt_code_diff_LLM.prompt +123 -0
- pdd/prompts/prompt_diff_LLM.prompt +82 -0
- pdd/remote_session.py +876 -0
- pdd/server/__init__.py +52 -0
- pdd/server/app.py +335 -0
- pdd/server/click_executor.py +587 -0
- pdd/server/executor.py +338 -0
- pdd/server/jobs.py +661 -0
- pdd/server/models.py +241 -0
- pdd/server/routes/__init__.py +31 -0
- pdd/server/routes/architecture.py +451 -0
- pdd/server/routes/auth.py +364 -0
- pdd/server/routes/commands.py +929 -0
- pdd/server/routes/config.py +42 -0
- pdd/server/routes/files.py +603 -0
- pdd/server/routes/prompts.py +1347 -0
- pdd/server/routes/websocket.py +473 -0
- pdd/server/security.py +243 -0
- pdd/server/terminal_spawner.py +217 -0
- pdd/server/token_counter.py +222 -0
- pdd/summarize_directory.py +236 -237
- pdd/sync_animation.py +8 -4
- pdd/sync_determine_operation.py +329 -47
- pdd/sync_main.py +272 -28
- pdd/sync_orchestration.py +289 -211
- pdd/sync_order.py +304 -0
- pdd/template_expander.py +161 -0
- pdd/templates/architecture/architecture_json.prompt +41 -46
- pdd/trace.py +1 -1
- pdd/track_cost.py +0 -13
- pdd/unfinished_prompt.py +2 -1
- pdd/update_main.py +68 -26
- {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/METADATA +15 -10
- pdd_cli-0.0.121.dist-info/RECORD +229 -0
- pdd_cli-0.0.90.dist-info/RECORD +0 -153
- {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/WHEEL +0 -0
- {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/entry_points.txt +0 -0
- {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/licenses/LICENSE +0 -0
- {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/top_level.txt +0 -0
pdd/docs/whitepaper_with_benchmarks/data_and_functions/benchmark_analysis.py
@@ -0,0 +1,495 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.figure
import seaborn as sns
from scipy import stats
import statsmodels.stats.proportion as smp
import os
from typing import List, Tuple, Optional, Dict, Any

# --- Configuration ---
CLAUDE_RESULTS_PATH = 'analysis/claude_results.csv'
PDD_RESULTS_PATH = 'analysis/PDD_results.csv'
OUTPUT_SUBDIR = 'analysis_report'  # Subdirectory within 'analysis/'
OUTPUT_DIR = os.path.join('analysis', OUTPUT_SUBDIR)

# Weights for overall score
W_TIME = 0.3
W_COST = 0.3
W_SUCCESS = 0.4
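
# Illustrative arithmetic (assumed values, editorial example): with the weights above,
# a tool with norm_time = 1.0, norm_cost = 0.8 and success_rate = 0.5 would score
# 0.3*1.0 + 0.3*0.8 + 0.4*0.5 = 0.74 in the weighted scoring used by
# overall_performance_analysis() below.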

# --- Helper Functions ---

def create_output_directory() -> None:
    """Creates the output directory if it doesn't exist."""
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created directory: {OUTPUT_DIR}")

def save_plot(fig: matplotlib.figure.Figure, filename: str, tight_layout: bool = True) -> str:
    """Saves the given matplotlib figure to the output directory."""
    if tight_layout:
        fig.tight_layout()
    path = os.path.join(OUTPUT_DIR, filename)
    fig.savefig(path)
    plt.close(fig)
    return path

def min_max_normalize(series: pd.Series, lower_is_better: bool = True) -> pd.Series:
    """Normalizes a pandas Series using min-max scaling."""
    min_val = series.min()
    max_val = series.max()
    if max_val == min_val:  # Avoid division by zero if all values are the same
        return pd.Series(0.5, index=series.index)  # Neutral score if no variance

    normalized = (series - min_val) / (max_val - min_val)
    if lower_is_better:
        return 1 - normalized
    return normalized
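
# Illustrative example (assumed values): min_max_normalize(pd.Series([10.0, 20.0, 30.0]))
# with lower_is_better=True maps 10 -> 1.0, 20 -> 0.5 and 30 -> 0.0, so the
# cheapest/fastest value receives the highest normalized score.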

def calculate_rank_biserial(U: float, n1: int, n2: int) -> float:
    """Calculates rank-biserial correlation from Mann-Whitney U."""
    # Common formula: r = 1 - (2U / (n1 * n2))
    # U here should be the U statistic for the group that would be expected to have smaller sum of ranks if H0 is false
    # Or, more simply, use the smaller of U1 and U2.
    # scipy.stats.mannwhitneyu returns one U value (typically U2).
    # The formula r = 1 - (2 * U_scipy) / (n1 * n2) is equivalent to (U1 - U2) / (n1 * n2)
    # where U_scipy is the U value returned by scipy.
    if n1 * n2 == 0: return 0.0  # Avoid division by zero if either group is empty
    return 1 - (2 * U) / (n1 * n2)
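
# Illustrative example (assumed values): with n1 = n2 = 5, complete separation of
# the two groups gives U = 0 and r = 1 - 0 = 1.0, while maximal overlap gives
# U = n1 * n2 / 2 = 12.5 and r = 0.0 (no effect).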


def calculate_cramers_v(chi2: float, n: int, contingency_table: pd.DataFrame) -> float:
    """Calculates Cramér's V from Chi-squared test."""
    if n == 0: return 0.0
    phi2 = chi2 / n
    k, r = contingency_table.shape
    if min(k-1, r-1) == 0: return 0.0  # Avoid division by zero
    return np.sqrt(phi2 / min(k - 1, r - 1))
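
# Interpretation note (illustrative): for the 2x2 tool-by-success table built in
# statistical_significance_analysis() below, min(k - 1, r - 1) == 1, so V reduces
# to sqrt(chi2 / n); V ranges from 0 (no association) to 1 (perfect association).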

# --- Main Analysis Functions ---

def load_and_prepare_data() -> Optional[pd.DataFrame]:
    """Loads and prepares the benchmark data."""
    try:
        claude_df = pd.read_csv(CLAUDE_RESULTS_PATH)
        pdd_df = pd.read_csv(PDD_RESULTS_PATH)
    except FileNotFoundError as e:
        print(f"Error: Input CSV file not found. {e}")
        print("Please ensure 'analysis/claude_results.csv' and 'analysis/PDD_results.csv' exist.")
        return None

    claude_df['tool'] = 'Claude'
    pdd_df['tool'] = 'PDD'

    combined_df = pd.concat([claude_df, pdd_df], ignore_index=True)

    # Ensure numeric types for relevant columns
    for col in ['execution_time_seconds', 'api_cost', 'success']:
        combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

    # Handle potential NaNs from coercion or in data
    combined_df.dropna(subset=['execution_time_seconds', 'api_cost', 'success'], inplace=True)

    return combined_df
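
# Expected input schema (inferred from this module): each CSV should provide at least
# 'execution_time_seconds', 'api_cost' and 'success' (0/1), plus 'file_size',
# 'language' and 'edit_type' for the dimension analysis and, optionally,
# 'error_message' for failed tasks.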

def overall_performance_analysis(df: pd.DataFrame) -> Tuple[List[str], pd.DataFrame]:
    """Performs overall performance comparison."""
    report_parts: List[str] = ["## 1. Overall Performance Comparison\n"]

    overall_stats = df.groupby('tool').agg(
        avg_execution_time=('execution_time_seconds', 'mean'),
        avg_api_cost=('api_cost', 'mean'),
        success_rate=('success', 'mean'),
        total_tasks=('success', 'count')
    ).reset_index()

    report_parts.append("### Key Metrics\n")
    report_parts.append(overall_stats.to_markdown(index=False) + "\n")

    # Bar charts
    metrics_to_plot: Dict[str, str] = {
        'avg_execution_time': 'Average Execution Time (s)',
        'avg_api_cost': 'Average API Cost ($)',
        'success_rate': 'Success Rate'
    }
    for metric, title in metrics_to_plot.items():
        fig, ax = plt.subplots()

        plot_data = overall_stats
        y_label = title

        if "time" in metric:
            y_label = "Average Execution Time (seconds)"
        elif "cost" in metric:
            y_label = "Average API Cost (dollars)"
        elif "success" in metric:
            y_label = "Success Rate (%)"
            plot_data = overall_stats.copy()
            plot_data[metric] = plot_data[metric] * 100

        sns.barplot(x='tool', y=metric, data=plot_data, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Tool")
        ax.set_ylabel(y_label)
        img_path = save_plot(fig, f"overall_{metric}.png")
        report_parts.append(f"![{title}]({img_path})\n")

    # Weighted Scoring
    report_parts.append("\n### Weighted Scoring for Best Overall Tool\n")
    report_parts.append(f"The best overall tool is determined using a weighted scoring system. The formula is: \n`Overall Score = {W_TIME} * norm_time + {W_COST} * norm_cost + {W_SUCCESS} * norm_success_rate`\n")
    report_parts.append("Time and cost are normalized using min-max scaling (lower is better, hence `norm_time` and `norm_cost` are already inverted if needed). Success rate is already a [0, 1] metric (higher is better).\n")

    # Normalize across both tools for fair comparison
    all_avg_times = overall_stats['avg_execution_time']
    all_avg_costs = overall_stats['avg_api_cost']

    overall_stats['norm_time'] = min_max_normalize(all_avg_times, lower_is_better=True)
    overall_stats['norm_cost'] = min_max_normalize(all_avg_costs, lower_is_better=True)
    # Success rate is already normalized (0-1), and higher is better.
    overall_stats['norm_success_rate'] = overall_stats['success_rate']

    overall_stats['overall_score'] = (
        W_TIME * overall_stats['norm_time'] +
        W_COST * overall_stats['norm_cost'] +
        W_SUCCESS * overall_stats['norm_success_rate']
    )

    report_parts.append("#### Calculated Scores:\n")
    report_parts.append(overall_stats[['tool', 'norm_time', 'norm_cost', 'norm_success_rate', 'overall_score']].to_markdown(index=False) + "\n")

    best_tool = overall_stats.loc[overall_stats['overall_score'].idxmax()]
    report_parts.append(f"\n**Best Overall Tool (based on weighted score): {best_tool['tool']}** with a score of {best_tool['overall_score']:.3f}\n")

    return report_parts, overall_stats


def dimension_specific_analysis(df: pd.DataFrame) -> List[str]:
    """Performs dimension-specific analysis."""
    report_parts: List[str] = ["\n## 2. Dimension-Specific Analysis\n"]
    dimensions: List[str] = ['file_size', 'language', 'edit_type']

    for dim in dimensions:
        report_parts.append(f"### Performance by {dim.replace('_', ' ').title()}\n")

        dim_stats = df.groupby(['tool', dim]).agg(
            avg_execution_time=('execution_time_seconds', 'mean'),
            avg_api_cost=('api_cost', 'mean'),
            success_rate=('success', 'mean')
        ).reset_index()

        report_parts.append(dim_stats.to_markdown(index=False) + "\n")

        # Grouped bar charts
        metrics_to_plot: Dict[str, str] = {
            'avg_execution_time': f'Average Execution Time by {dim}',
            'avg_api_cost': f'Average API Cost by {dim}',
            'success_rate': f'Success Rate by {dim}'
        }
        for metric, title in metrics_to_plot.items():
            fig, ax = plt.subplots(figsize=(10, 6))

            plot_data = dim_stats
            y_label = metric

            if "time" in metric:
                y_label = "Average Execution Time (seconds)"
            elif "cost" in metric:
                y_label = "Average API Cost (dollars)"
            elif "success" in metric:
                y_label = "Success Rate (%)"
                plot_data = dim_stats.copy()
                plot_data[metric] = plot_data[metric] * 100

            sns.barplot(x=dim, y=metric, hue='tool', data=plot_data, ax=ax)
            ax.set_title(title)
            ax.set_xlabel(dim.replace('_', ' ').title())
            ax.set_ylabel(y_label)
            img_path = save_plot(fig, f"{metric}_by_{dim}.png")
            report_parts.append(f"![{title}]({img_path})\n")

    return report_parts

def cost_efficiency_analysis(df: pd.DataFrame) -> List[str]:
    """Performs cost-efficiency analysis."""
    report_parts: List[str] = ["\n## 3. Cost-Efficiency Analysis\n"]

    # Cost per successful task
    total_costs = df.groupby('tool')['api_cost'].sum()
    successful_tasks = df[df['success'] == 1].groupby('tool')['success'].count()

    cost_efficiency = pd.DataFrame({
        'total_api_cost': total_costs,
        'num_successful_tasks': successful_tasks
    })
    # Ensure num_successful_tasks is present for all tools, fill with 0 if no successful tasks
    cost_efficiency = cost_efficiency.reindex(df['tool'].unique(), fill_value=0)
    cost_efficiency['cost_per_successful_task'] = np.where(
        cost_efficiency['num_successful_tasks'] > 0,
        cost_efficiency['total_api_cost'] / cost_efficiency['num_successful_tasks'],
        np.nan  # Or 0, or some other indicator for no successful tasks
    )
    cost_efficiency = cost_efficiency.reset_index().rename(columns={'index': 'tool'})

    report_parts.append("### Cost Per Successful Task\n")
    report_parts.append(cost_efficiency.to_markdown(index=False) + "\n")

    fig, ax = plt.subplots()
    sns.barplot(x='tool', y='cost_per_successful_task', data=cost_efficiency, ax=ax)
    ax.set_title('Cost Per Successful Task')
    ax.set_ylabel('Cost Per Successful Task (dollars)')
    img_path = save_plot(fig, "cost_per_successful_task.png")
    report_parts.append(f"![Cost Per Successful Task]({img_path})\n")

    # Scatter plot: execution_time vs. api_cost
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x='execution_time_seconds', y='api_cost', hue='tool', data=df, alpha=0.6, ax=ax)
    ax.set_title('Execution Time vs. API Cost')
    ax.set_xlabel('Execution Time (seconds)')
    ax.set_ylabel('API Cost (dollars)')
    img_path = save_plot(fig, "time_vs_cost_scatter.png")
    report_parts.append(f"![Execution Time vs. API Cost]({img_path})\n")

    # Bar chart for total API cost (Required Visualizations #6)
    fig, ax = plt.subplots()
    sns.barplot(x='tool', y='total_api_cost', data=cost_efficiency, ax=ax)  # Re-using cost_efficiency df
    ax.set_title('Total API Cost for All Benchmarks')
    ax.set_ylabel('Total API Cost (dollars)')
    img_path = save_plot(fig, "total_api_cost_comparison.png")
    report_parts.append(f"![Total API Cost for All Benchmarks]({img_path})\n")

    return report_parts

def success_and_error_analysis(df: pd.DataFrame, overall_stats: pd.DataFrame) -> Tuple[List[str], pd.DataFrame]:
    """Performs success and error analysis."""
    report_parts: List[str] = ["\n## 4. Success and Error Analysis\n"]

    # Success rates across dimensions (already plotted in dimension_specific_analysis)
    report_parts.append("Success rates across dimensions are visualized in Section 2.\n")

    # Error message analysis
    failed_tasks = df[df['success'] == 0]
    if 'error_message' in failed_tasks.columns:
        error_summary = failed_tasks.groupby(['tool', 'error_message']).size().reset_index(name='count').sort_values(by=['tool', 'count'], ascending=[True, False])
        report_parts.append("### Common Error Messages for Failed Tasks\n")
        for tool in df['tool'].unique():
            report_parts.append(f"#### {tool}:\n")
            tool_errors = error_summary[error_summary['tool'] == tool]
            if not tool_errors.empty:
                report_parts.append(tool_errors[['error_message', 'count']].to_markdown(index=False) + "\n")
            else:
                report_parts.append("No failed tasks recorded for this tool or error messages not available.\n")
    else:
        report_parts.append("### Common Error Messages for Failed Tasks\n")
        report_parts.append("Column 'error_message' not found in the data for failed tasks.\n")

    # Confidence intervals for overall success rates
    report_parts.append("### Overall Success Rate Confidence Intervals (95%)\n")
    ci_data: List[Dict[str, Any]] = []
    for tool_name in overall_stats['tool'].unique():
        tool_data = overall_stats[overall_stats['tool'] == tool_name].iloc[0]
        n_success = int(tool_data['success_rate'] * tool_data['total_tasks'])
        n_total = int(tool_data['total_tasks'])
        if n_total > 0:
            ci_low, ci_upp = smp.proportion_confint(n_success, n_total, method='wilson')
            ci_data.append({'tool': tool_name, 'success_rate': tool_data['success_rate'], 'ci_lower': ci_low, 'ci_upper': ci_upp})
        else:
            ci_data.append({'tool': tool_name, 'success_rate': np.nan, 'ci_lower': np.nan, 'ci_upper': np.nan})

    ci_df = pd.DataFrame(ci_data)
    report_parts.append(ci_df.to_markdown(index=False) + "\n")

    return report_parts, ci_df


def statistical_significance_analysis(df: pd.DataFrame) -> Tuple[List[str], pd.DataFrame]:
    """Performs statistical significance tests."""
    report_parts: List[str] = ["\n## 5. Statistical Significance Analysis\n"]
    stat_results: List[Dict[str, Any]] = []

    pdd_data = df[df['tool'] == 'PDD']
    claude_data = df[df['tool'] == 'Claude']

    # Mann-Whitney U for continuous metrics
    for metric in ['execution_time_seconds', 'api_cost']:
        pdd_metric_data = pdd_data[metric].dropna()
        claude_metric_data = claude_data[metric].dropna()

        if pdd_metric_data.empty or claude_metric_data.empty:
            report_parts.append(f"Skipping {metric} due to empty data for one or both tools after dropping NaNs.\n")
            stat_results.append({'Metric': metric, 'Test': 'Mann-Whitney U', 'p-value': 'N/A', 'Effect Size (RBC)': 'N/A', 'Significance': 'N/A'})
            continue

        mwu_stat, p_value = stats.mannwhitneyu(pdd_metric_data, claude_metric_data, alternative='two-sided')

        n1, n2 = len(pdd_metric_data), len(claude_metric_data)
        rbc_val = calculate_rank_biserial(mwu_stat, n1, n2)
        rbc_str = f"{rbc_val:.3f}"

        significance = "Yes" if p_value < 0.05 else "No"
        stat_results.append({'Metric': metric, 'Test': 'Mann-Whitney U', 'p-value': f"{p_value:.3g}", 'Effect Size (RBC)': rbc_str, 'Significance': significance})

    # Chi-squared for success metric
    if not pdd_data.empty and not claude_data.empty:
        contingency_table = pd.crosstab(df['tool'], df['success'])
        if contingency_table.shape == (2, 2) and contingency_table.values.min() > 0:  # Check if table is 2x2 and has counts > 0
            chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)

            n_total = contingency_table.sum().sum()
            cramers_v = calculate_cramers_v(chi2, n_total, contingency_table)
            cramers_v_str = f"{cramers_v:.3f}"
            significance = "Yes" if p_value < 0.05 else "No"
        else:
            p_value = 1.0  # Not applicable
            cramers_v_str = "N/A"
            significance = "N/A (contingency table not 2x2 or has zero/low counts)"

        stat_results.append({'Metric': 'success', 'Test': 'Chi-squared', 'p-value': f"{p_value:.3g}", "Effect Size (Cramér's V)": cramers_v_str, 'Significance': significance})
    else:
        stat_results.append({'Metric': 'success', 'Test': 'Chi-squared', 'p-value': 'N/A', "Effect Size (Cramér's V)": 'N/A', 'Significance': 'N/A'})


    stat_results_df = pd.DataFrame(stat_results)
    report_parts.append("### Statistical Test Summary\n")
    report_parts.append(stat_results_df.to_markdown(index=False) + "\n")

    return report_parts, stat_results_df

def generate_markdown_report(
    report_sections: List[List[str]],
    overall_stats_df: pd.DataFrame,
    cost_eff_parts: List[str],  # Specifically for parsing cost per task
    stat_results_df: pd.DataFrame
) -> None:
    """Generates the final Markdown report."""

    report_content: List[str] = ["# Benchmark Analysis Report: PDD vs. Claude\n"]
    report_content.append("This report analyzes and compares the performance of PDD and Claude AI coding assistants based on benchmark data.\n")

    # Executive Summary
    report_content.append("## Executive Summary\n")

    best_tool_overall = overall_stats_df.loc[overall_stats_df['overall_score'].idxmax()]['tool']
    pdd_overall_series = overall_stats_df[overall_stats_df['tool'] == 'PDD']
    claude_overall_series = overall_stats_df[overall_stats_df['tool'] == 'Claude']

    summary_points: List[str] = [
        f"- **Overall Winner (Weighted Score):** {best_tool_overall}"
    ]
    if not pdd_overall_series.empty:
        pdd_overall = pdd_overall_series.iloc[0]
        summary_points.append(f"- **PDD Performance:** Avg Time: {pdd_overall['avg_execution_time']:.2f}s, Avg Cost: ${pdd_overall['avg_api_cost']:.4f}, Success Rate: {pdd_overall['success_rate']:.2%}")
    if not claude_overall_series.empty:
        claude_overall = claude_overall_series.iloc[0]
        summary_points.append(f"- **Claude Performance:** Avg Time: {claude_overall['avg_execution_time']:.2f}s, Avg Cost: ${claude_overall['avg_api_cost']:.4f}, Success Rate: {claude_overall['success_rate']:.2%}")

    # Add cost-efficiency summary by parsing the markdown part from cost_eff_parts
    markdown_table_str = cost_eff_parts[2]  # cost_eff_parts[0] is the section header, [1] the subsection heading, [2] the cost table
    lines = markdown_table_str.strip().split('\n')

    pdd_cost_val_str: Optional[str] = None
    claude_cost_val_str: Optional[str] = None

    # Expecting table rows like: | ToolName | TotalCost | NumSuccess | CostPerSuccess |
    # Indices after split('|'): 0="", 1="ToolName", 2="TotalCost", ..., 4="CostPerSuccess"
    for line_idx in range(2, len(lines)):  # Start from index 2 (skip header and separator)
        cols = lines[line_idx].split('|')
        if len(cols) > 4:  # Ensure enough columns for CostPerSuccess
            tool_name_in_col = cols[1].strip()
            cost_value = cols[4].strip()
            if "PDD" in tool_name_in_col:
                pdd_cost_val_str = cost_value
            elif "Claude" in tool_name_in_col:
                claude_cost_val_str = cost_value

    if pdd_cost_val_str and claude_cost_val_str:
        try:
            pdd_float = float(pdd_cost_val_str)
            claude_float = float(claude_cost_val_str)
            summary_points.append(f"- **Cost per Successful Task:** PDD: ${pdd_float:.4f}, Claude: ${claude_float:.4f}")
        except ValueError:
            summary_points.append("- Cost per Successful Task: Error parsing values from markdown table.")
    else:
        summary_points.append("- Cost per Successful Task: Data not found or table format unexpected in markdown.")

    # Add statistical significance summary
    for _, row in stat_results_df.iterrows():
        if row['Significance'] == 'Yes':
            effect_size_col_name = [col for col in row.index if "Effect Size" in col and pd.notna(row[col])][0]
            effect_size_val = row[effect_size_col_name]
            metric_name = row['Metric']
            p_val = row['p-value']
            summary_points.append(f"- Statistically significant difference in **{metric_name}** (p={p_val}, {effect_size_col_name.split('(')[1][:-1]}={effect_size_val}).")

    report_content.append("\n".join(summary_points) + "\n")

    # Append all main sections
    for section_parts_list in report_sections:
        report_content.extend(section_parts_list)

    # Final Recommendation
    report_content.append("\n## 6. Final Recommendation\n")
    # Note: The recommendation part uses placeholders like "[mention specific areas...]"
    # These would ideally be filled dynamically if more detailed insights from dim_stats_parts etc. were parsed here.
    # For now, it's a template.
    recommendation = f"""
Based on this analysis:

- For tasks where **overall balanced performance (time, cost, success)** is critical, **{best_tool_overall}** is recommended due to its higher weighted score.
- If **minimizing API cost** is the absolute priority, analyze the 'Average API Cost' and 'Cost Per Successful Task' metrics. The tool with lower values here might be preferred, even if slightly slower or less successful.
- If **maximizing success rate** is paramount, the tool with the higher overall success rate and better performance on specific critical dimensions (e.g., specific languages or edit types) should be chosen.
- **PDD** shows strengths in [mention specific areas if evident, e.g., specific languages/file_sizes based on dimensional analysis].
- **Claude** shows strengths in [mention specific areas if evident, e.g., specific languages/file_sizes based on dimensional analysis].

Consider the specific context of your tasks (e.g., dominant language, typical file size, importance of speed vs. cost) when making a final decision.
Further investigation into common error patterns for each tool could lead to improved prompt engineering or identify areas where one tool might need more support.
"""
    report_content.append(recommendation)

    # Write report to file
    report_path = os.path.join(OUTPUT_DIR, 'benchmark_analysis.md')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(report_content))
    print(f"Markdown report saved to: {report_path}")


# --- Main Execution ---
def main() -> None:
    """Main function to run the benchmark analysis."""
    print("Starting benchmark analysis...")
    create_output_directory()

    df = load_and_prepare_data()
    if df is None:
        return

    report_sections_collector: List[List[str]] = []

    overall_parts, overall_stats_df = overall_performance_analysis(df)
    report_sections_collector.append(overall_parts)

    dim_stats_parts = dimension_specific_analysis(df)
    report_sections_collector.append(dim_stats_parts)

    # cost_efficiency_analysis no longer needs overall_stats_df
    cost_eff_parts = cost_efficiency_analysis(df)
    report_sections_collector.append(cost_eff_parts)

    success_err_parts, ci_df = success_and_error_analysis(df, overall_stats_df)
    report_sections_collector.append(success_err_parts)

    stat_sig_parts, stat_results_df = statistical_significance_analysis(df)
    report_sections_collector.append(stat_sig_parts)

    generate_markdown_report(
        report_sections=report_sections_collector,
        overall_stats_df=overall_stats_df,
        cost_eff_parts=cost_eff_parts,  # Pass this specifically for parsing
        stat_results_df=stat_results_df
    )

    print("Benchmark analysis complete. Outputs are in the 'analysis/analysis_report/' directory.")

if __name__ == '__main__':
    main()
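The script above reads its two input CSVs from hard-coded paths and writes the Markdown report and PNG charts to `analysis/analysis_report/`. Below is a minimal sketch of driving it end to end, assuming the module is saved as `benchmark_analysis.py` and using the column names inferred from the code; the two placeholder rows are made-up values for illustration only.

```python
# Illustrative driver (sketch): builds tiny placeholder CSVs with the columns the
# script expects, then calls its main(). The module name and the row values are
# assumptions for this example.
import os
import pandas as pd
import benchmark_analysis  # assumes the file above is importable

os.makedirs('analysis', exist_ok=True)
columns = ['file_size', 'language', 'edit_type',
           'execution_time_seconds', 'api_cost', 'success', 'error_message']
rows = [
    ['small', 'python', 'refactor', 12.5, 0.01, 1, ''],
    ['large', 'python', 'bugfix', 40.0, 0.05, 0, 'timeout'],
]
for name in ('claude_results.csv', 'PDD_results.csv'):
    pd.DataFrame(rows, columns=columns).to_csv(os.path.join('analysis', name), index=False)

benchmark_analysis.main()  # writes analysis/analysis_report/benchmark_analysis.md plus the PNG charts
```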