pdd-cli 0.0.90-py3-none-any.whl → 0.0.121-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151)
  1. pdd/__init__.py +38 -6
  2. pdd/agentic_bug.py +323 -0
  3. pdd/agentic_bug_orchestrator.py +506 -0
  4. pdd/agentic_change.py +231 -0
  5. pdd/agentic_change_orchestrator.py +537 -0
  6. pdd/agentic_common.py +533 -770
  7. pdd/agentic_crash.py +2 -1
  8. pdd/agentic_e2e_fix.py +319 -0
  9. pdd/agentic_e2e_fix_orchestrator.py +582 -0
  10. pdd/agentic_fix.py +118 -3
  11. pdd/agentic_update.py +27 -9
  12. pdd/agentic_verify.py +3 -2
  13. pdd/architecture_sync.py +565 -0
  14. pdd/auth_service.py +210 -0
  15. pdd/auto_deps_main.py +63 -53
  16. pdd/auto_include.py +236 -3
  17. pdd/auto_update.py +125 -47
  18. pdd/bug_main.py +195 -23
  19. pdd/cmd_test_main.py +345 -197
  20. pdd/code_generator.py +4 -2
  21. pdd/code_generator_main.py +118 -32
  22. pdd/commands/__init__.py +6 -0
  23. pdd/commands/analysis.py +113 -48
  24. pdd/commands/auth.py +309 -0
  25. pdd/commands/connect.py +358 -0
  26. pdd/commands/fix.py +155 -114
  27. pdd/commands/generate.py +5 -0
  28. pdd/commands/maintenance.py +3 -2
  29. pdd/commands/misc.py +8 -0
  30. pdd/commands/modify.py +225 -163
  31. pdd/commands/sessions.py +284 -0
  32. pdd/commands/utility.py +12 -7
  33. pdd/construct_paths.py +334 -32
  34. pdd/context_generator_main.py +167 -170
  35. pdd/continue_generation.py +6 -3
  36. pdd/core/__init__.py +33 -0
  37. pdd/core/cli.py +44 -7
  38. pdd/core/cloud.py +237 -0
  39. pdd/core/dump.py +68 -20
  40. pdd/core/errors.py +4 -0
  41. pdd/core/remote_session.py +61 -0
  42. pdd/crash_main.py +219 -23
  43. pdd/data/llm_model.csv +4 -4
  44. pdd/docs/prompting_guide.md +864 -0
  45. pdd/docs/whitepaper_with_benchmarks/data_and_functions/benchmark_analysis.py +495 -0
  46. pdd/docs/whitepaper_with_benchmarks/data_and_functions/creation_compare.py +528 -0
  47. pdd/fix_code_loop.py +208 -34
  48. pdd/fix_code_module_errors.py +6 -2
  49. pdd/fix_error_loop.py +291 -38
  50. pdd/fix_main.py +208 -6
  51. pdd/fix_verification_errors_loop.py +235 -26
  52. pdd/fix_verification_main.py +269 -83
  53. pdd/frontend/dist/assets/index-B5DZHykP.css +1 -0
  54. pdd/frontend/dist/assets/index-CUWd8al1.js +450 -0
  55. pdd/frontend/dist/index.html +376 -0
  56. pdd/frontend/dist/logo.svg +33 -0
  57. pdd/generate_output_paths.py +46 -5
  58. pdd/generate_test.py +212 -151
  59. pdd/get_comment.py +19 -44
  60. pdd/get_extension.py +8 -9
  61. pdd/get_jwt_token.py +309 -20
  62. pdd/get_language.py +8 -7
  63. pdd/get_run_command.py +7 -5
  64. pdd/insert_includes.py +2 -1
  65. pdd/llm_invoke.py +531 -97
  66. pdd/load_prompt_template.py +15 -34
  67. pdd/operation_log.py +342 -0
  68. pdd/path_resolution.py +140 -0
  69. pdd/postprocess.py +122 -97
  70. pdd/preprocess.py +68 -12
  71. pdd/preprocess_main.py +33 -1
  72. pdd/prompts/agentic_bug_step10_pr_LLM.prompt +182 -0
  73. pdd/prompts/agentic_bug_step1_duplicate_LLM.prompt +73 -0
  74. pdd/prompts/agentic_bug_step2_docs_LLM.prompt +129 -0
  75. pdd/prompts/agentic_bug_step3_triage_LLM.prompt +95 -0
  76. pdd/prompts/agentic_bug_step4_reproduce_LLM.prompt +97 -0
  77. pdd/prompts/agentic_bug_step5_root_cause_LLM.prompt +123 -0
  78. pdd/prompts/agentic_bug_step6_test_plan_LLM.prompt +107 -0
  79. pdd/prompts/agentic_bug_step7_generate_LLM.prompt +172 -0
  80. pdd/prompts/agentic_bug_step8_verify_LLM.prompt +119 -0
  81. pdd/prompts/agentic_bug_step9_e2e_test_LLM.prompt +289 -0
  82. pdd/prompts/agentic_change_step10_identify_issues_LLM.prompt +1006 -0
  83. pdd/prompts/agentic_change_step11_fix_issues_LLM.prompt +984 -0
  84. pdd/prompts/agentic_change_step12_create_pr_LLM.prompt +140 -0
  85. pdd/prompts/agentic_change_step1_duplicate_LLM.prompt +73 -0
  86. pdd/prompts/agentic_change_step2_docs_LLM.prompt +101 -0
  87. pdd/prompts/agentic_change_step3_research_LLM.prompt +126 -0
  88. pdd/prompts/agentic_change_step4_clarify_LLM.prompt +164 -0
  89. pdd/prompts/agentic_change_step5_docs_change_LLM.prompt +981 -0
  90. pdd/prompts/agentic_change_step6_devunits_LLM.prompt +1005 -0
  91. pdd/prompts/agentic_change_step7_architecture_LLM.prompt +1044 -0
  92. pdd/prompts/agentic_change_step8_analyze_LLM.prompt +1027 -0
  93. pdd/prompts/agentic_change_step9_implement_LLM.prompt +1077 -0
  94. pdd/prompts/agentic_e2e_fix_step1_unit_tests_LLM.prompt +90 -0
  95. pdd/prompts/agentic_e2e_fix_step2_e2e_tests_LLM.prompt +91 -0
  96. pdd/prompts/agentic_e2e_fix_step3_root_cause_LLM.prompt +89 -0
  97. pdd/prompts/agentic_e2e_fix_step4_fix_e2e_tests_LLM.prompt +96 -0
  98. pdd/prompts/agentic_e2e_fix_step5_identify_devunits_LLM.prompt +91 -0
  99. pdd/prompts/agentic_e2e_fix_step6_create_unit_tests_LLM.prompt +106 -0
  100. pdd/prompts/agentic_e2e_fix_step7_verify_tests_LLM.prompt +116 -0
  101. pdd/prompts/agentic_e2e_fix_step8_run_pdd_fix_LLM.prompt +120 -0
  102. pdd/prompts/agentic_e2e_fix_step9_verify_all_LLM.prompt +146 -0
  103. pdd/prompts/agentic_fix_primary_LLM.prompt +2 -2
  104. pdd/prompts/agentic_update_LLM.prompt +192 -338
  105. pdd/prompts/auto_include_LLM.prompt +22 -0
  106. pdd/prompts/change_LLM.prompt +3093 -1
  107. pdd/prompts/detect_change_LLM.prompt +571 -14
  108. pdd/prompts/fix_code_module_errors_LLM.prompt +8 -0
  109. pdd/prompts/fix_errors_from_unit_tests_LLM.prompt +1 -0
  110. pdd/prompts/generate_test_LLM.prompt +19 -1
  111. pdd/prompts/generate_test_from_example_LLM.prompt +366 -0
  112. pdd/prompts/insert_includes_LLM.prompt +262 -252
  113. pdd/prompts/prompt_code_diff_LLM.prompt +123 -0
  114. pdd/prompts/prompt_diff_LLM.prompt +82 -0
  115. pdd/remote_session.py +876 -0
  116. pdd/server/__init__.py +52 -0
  117. pdd/server/app.py +335 -0
  118. pdd/server/click_executor.py +587 -0
  119. pdd/server/executor.py +338 -0
  120. pdd/server/jobs.py +661 -0
  121. pdd/server/models.py +241 -0
  122. pdd/server/routes/__init__.py +31 -0
  123. pdd/server/routes/architecture.py +451 -0
  124. pdd/server/routes/auth.py +364 -0
  125. pdd/server/routes/commands.py +929 -0
  126. pdd/server/routes/config.py +42 -0
  127. pdd/server/routes/files.py +603 -0
  128. pdd/server/routes/prompts.py +1347 -0
  129. pdd/server/routes/websocket.py +473 -0
  130. pdd/server/security.py +243 -0
  131. pdd/server/terminal_spawner.py +217 -0
  132. pdd/server/token_counter.py +222 -0
  133. pdd/summarize_directory.py +236 -237
  134. pdd/sync_animation.py +8 -4
  135. pdd/sync_determine_operation.py +329 -47
  136. pdd/sync_main.py +272 -28
  137. pdd/sync_orchestration.py +289 -211
  138. pdd/sync_order.py +304 -0
  139. pdd/template_expander.py +161 -0
  140. pdd/templates/architecture/architecture_json.prompt +41 -46
  141. pdd/trace.py +1 -1
  142. pdd/track_cost.py +0 -13
  143. pdd/unfinished_prompt.py +2 -1
  144. pdd/update_main.py +68 -26
  145. {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/METADATA +15 -10
  146. pdd_cli-0.0.121.dist-info/RECORD +229 -0
  147. pdd_cli-0.0.90.dist-info/RECORD +0 -153
  148. {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/WHEEL +0 -0
  149. {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/entry_points.txt +0 -0
  150. {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/licenses/LICENSE +0 -0
  151. {pdd_cli-0.0.90.dist-info → pdd_cli-0.0.121.dist-info}/top_level.txt +0 -0
pdd/docs/whitepaper_with_benchmarks/data_and_functions/benchmark_analysis.py (new file)
@@ -0,0 +1,495 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.figure
+import seaborn as sns
+from scipy import stats
+import statsmodels.stats.proportion as smp
+import os
+from typing import List, Tuple, Optional, Dict, Any
+
+# --- Configuration ---
+CLAUDE_RESULTS_PATH = 'analysis/claude_results.csv'
+PDD_RESULTS_PATH = 'analysis/PDD_results.csv'
+OUTPUT_SUBDIR = 'analysis_report'  # Subdirectory within 'analysis/'
+OUTPUT_DIR = os.path.join('analysis', OUTPUT_SUBDIR)
+
+# Weights for overall score
+W_TIME = 0.3
+W_COST = 0.3
+W_SUCCESS = 0.4
+
+# --- Helper Functions ---
+
+def create_output_directory() -> None:
+    """Creates the output directory if it doesn't exist."""
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
+        print(f"Created directory: {OUTPUT_DIR}")
+
+def save_plot(fig: matplotlib.figure.Figure, filename: str, tight_layout: bool = True) -> str:
+    """Saves the given matplotlib figure to the output directory."""
+    if tight_layout:
+        fig.tight_layout()
+    path = os.path.join(OUTPUT_DIR, filename)
+    fig.savefig(path)
+    plt.close(fig)
+    return path
+
+def min_max_normalize(series: pd.Series, lower_is_better: bool = True) -> pd.Series:
+    """Normalizes a pandas Series using min-max scaling."""
+    min_val = series.min()
+    max_val = series.max()
+    if max_val == min_val:  # Avoid division by zero if all values are the same
+        return pd.Series(0.5, index=series.index)  # Neutral score if no variance
+
+    normalized = (series - min_val) / (max_val - min_val)
+    if lower_is_better:
+        return 1 - normalized
+    return normalized
+
+def calculate_rank_biserial(U: float, n1: int, n2: int) -> float:
+    """Calculates rank-biserial correlation from Mann-Whitney U."""
+    # Common formula: r = 1 - (2U / (n1 * n2))
+    # U here should be the U statistic for the group that would be expected to have smaller sum of ranks if H0 is false
+    # Or, more simply, use the smaller of U1 and U2.
+    # scipy.stats.mannwhitneyu returns one U value (typically U2).
+    # The formula r = 1 - (2 * U_scipy) / (n1 * n2) is equivalent to (U1 - U2) / (n1 * n2)
+    # where U_scipy is the U value returned by scipy.
+    if n1 * n2 == 0: return 0.0  # Avoid division by zero if either group is empty
+    return 1 - (2 * U) / (n1 * n2)
+
+
+def calculate_cramers_v(chi2: float, n: int, contingency_table: pd.DataFrame) -> float:
+    """Calculates Cramér's V from Chi-squared test."""
+    if n == 0: return 0.0
+    phi2 = chi2 / n
+    k, r = contingency_table.shape
+    if min(k - 1, r - 1) == 0: return 0.0  # Avoid division by zero
+    return np.sqrt(phi2 / min(k - 1, r - 1))
+
+# --- Main Analysis Functions ---
+
+def load_and_prepare_data() -> Optional[pd.DataFrame]:
+    """Loads and prepares the benchmark data."""
+    try:
+        claude_df = pd.read_csv(CLAUDE_RESULTS_PATH)
+        pdd_df = pd.read_csv(PDD_RESULTS_PATH)
+    except FileNotFoundError as e:
+        print(f"Error: Input CSV file not found. {e}")
+        print("Please ensure 'analysis/claude_results.csv' and 'analysis/PDD_results.csv' exist.")
+        return None
+
+    claude_df['tool'] = 'Claude'
+    pdd_df['tool'] = 'PDD'
+
+    combined_df = pd.concat([claude_df, pdd_df], ignore_index=True)
+
+    # Ensure numeric types for relevant columns
+    for col in ['execution_time_seconds', 'api_cost', 'success']:
+        combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')
+
+    # Handle potential NaNs from coercion or in data
+    combined_df.dropna(subset=['execution_time_seconds', 'api_cost', 'success'], inplace=True)
+
+    return combined_df
+
+def overall_performance_analysis(df: pd.DataFrame) -> Tuple[List[str], pd.DataFrame]:
+    """Performs overall performance comparison."""
+    report_parts: List[str] = ["## 1. Overall Performance Comparison\n"]
+
+    overall_stats = df.groupby('tool').agg(
+        avg_execution_time=('execution_time_seconds', 'mean'),
+        avg_api_cost=('api_cost', 'mean'),
+        success_rate=('success', 'mean'),
+        total_tasks=('success', 'count')
+    ).reset_index()
+
+    report_parts.append("### Key Metrics\n")
+    report_parts.append(overall_stats.to_markdown(index=False) + "\n")
+
+    # Bar charts
+    metrics_to_plot: Dict[str, str] = {
+        'avg_execution_time': 'Average Execution Time (s)',
+        'avg_api_cost': 'Average API Cost ($)',
+        'success_rate': 'Success Rate'
+    }
+    for metric, title in metrics_to_plot.items():
+        fig, ax = plt.subplots()
+
+        plot_data = overall_stats
+        y_label = title
+
+        if "time" in metric:
+            y_label = "Average Execution Time (seconds)"
+        elif "cost" in metric:
+            y_label = "Average API Cost (dollars)"
+        elif "success" in metric:
+            y_label = "Success Rate (%)"
+            plot_data = overall_stats.copy()
+            plot_data[metric] = plot_data[metric] * 100
+
+        sns.barplot(x='tool', y=metric, data=plot_data, ax=ax)
+        ax.set_title(title)
+        ax.set_xlabel("Tool")
+        ax.set_ylabel(y_label)
+        img_path = save_plot(fig, f"overall_{metric}.png")
+        report_parts.append(f"![{title}]({os.path.basename(img_path)})\n")
+
+    # Weighted Scoring
+    report_parts.append("\n### Weighted Scoring for Best Overall Tool\n")
+    report_parts.append(f"The best overall tool is determined using a weighted scoring system. The formula is: \n`Overall Score = {W_TIME} * norm_time + {W_COST} * norm_cost + {W_SUCCESS} * norm_success_rate`\n")
+    report_parts.append("Time and cost are normalized using min-max scaling (lower is better, hence `norm_time` and `norm_cost` are already inverted if needed). Success rate is already a [0, 1] metric (higher is better).\n")
+
+    # Normalize across both tools for fair comparison
+    all_avg_times = overall_stats['avg_execution_time']
+    all_avg_costs = overall_stats['avg_api_cost']
+
+    overall_stats['norm_time'] = min_max_normalize(all_avg_times, lower_is_better=True)
+    overall_stats['norm_cost'] = min_max_normalize(all_avg_costs, lower_is_better=True)
+    # Success rate is already normalized (0-1), and higher is better.
+    overall_stats['norm_success_rate'] = overall_stats['success_rate']
+
+    overall_stats['overall_score'] = (
+        W_TIME * overall_stats['norm_time'] +
+        W_COST * overall_stats['norm_cost'] +
+        W_SUCCESS * overall_stats['norm_success_rate']
+    )
+
+    report_parts.append("#### Calculated Scores:\n")
+    report_parts.append(overall_stats[['tool', 'norm_time', 'norm_cost', 'norm_success_rate', 'overall_score']].to_markdown(index=False) + "\n")
+
+    best_tool = overall_stats.loc[overall_stats['overall_score'].idxmax()]
+    report_parts.append(f"\n**Best Overall Tool (based on weighted score): {best_tool['tool']}** with a score of {best_tool['overall_score']:.3f}\n")
+
+    return report_parts, overall_stats
+
+
+def dimension_specific_analysis(df: pd.DataFrame) -> List[str]:
+    """Performs dimension-specific analysis."""
+    report_parts: List[str] = ["\n## 2. Dimension-Specific Analysis\n"]
+    dimensions: List[str] = ['file_size', 'language', 'edit_type']
+
+    for dim in dimensions:
+        report_parts.append(f"### Performance by {dim.replace('_', ' ').title()}\n")
+
+        dim_stats = df.groupby(['tool', dim]).agg(
+            avg_execution_time=('execution_time_seconds', 'mean'),
+            avg_api_cost=('api_cost', 'mean'),
+            success_rate=('success', 'mean')
+        ).reset_index()
+
+        report_parts.append(dim_stats.to_markdown(index=False) + "\n")
+
+        # Grouped bar charts
+        metrics_to_plot: Dict[str, str] = {
+            'avg_execution_time': f'Average Execution Time by {dim}',
+            'avg_api_cost': f'Average API Cost by {dim}',
+            'success_rate': f'Success Rate by {dim}'
+        }
+        for metric, title in metrics_to_plot.items():
+            fig, ax = plt.subplots(figsize=(10, 6))
+
+            plot_data = dim_stats
+            y_label = metric
+
+            if "time" in metric:
+                y_label = "Average Execution Time (seconds)"
+            elif "cost" in metric:
+                y_label = "Average API Cost (dollars)"
+            elif "success" in metric:
+                y_label = "Success Rate (%)"
+                plot_data = dim_stats.copy()
+                plot_data[metric] = plot_data[metric] * 100
+
+            sns.barplot(x=dim, y=metric, hue='tool', data=plot_data, ax=ax)
+            ax.set_title(title)
+            ax.set_xlabel(dim.replace('_', ' ').title())
+            ax.set_ylabel(y_label)
+            img_path = save_plot(fig, f"{metric}_by_{dim}.png")
+            report_parts.append(f"![{title}]({os.path.basename(img_path)})\n")
+
+    return report_parts
+
+def cost_efficiency_analysis(df: pd.DataFrame) -> List[str]:
+    """Performs cost-efficiency analysis."""
+    report_parts: List[str] = ["\n## 3. Cost-Efficiency Analysis\n"]
+
+    # Cost per successful task
+    total_costs = df.groupby('tool')['api_cost'].sum()
+    successful_tasks = df[df['success'] == 1].groupby('tool')['success'].count()
+
+    cost_efficiency = pd.DataFrame({
+        'total_api_cost': total_costs,
+        'num_successful_tasks': successful_tasks
+    })
+    # Ensure num_successful_tasks is present for all tools, fill with 0 if no successful tasks
+    cost_efficiency = cost_efficiency.reindex(df['tool'].unique(), fill_value=0)
+    cost_efficiency['cost_per_successful_task'] = np.where(
+        cost_efficiency['num_successful_tasks'] > 0,
+        cost_efficiency['total_api_cost'] / cost_efficiency['num_successful_tasks'],
+        np.nan  # Or 0, or some other indicator for no successful tasks
+    )
+    cost_efficiency = cost_efficiency.reset_index().rename(columns={'index': 'tool'})
+
+    report_parts.append("### Cost Per Successful Task\n")
+    report_parts.append(cost_efficiency.to_markdown(index=False) + "\n")
+
+    fig, ax = plt.subplots()
+    sns.barplot(x='tool', y='cost_per_successful_task', data=cost_efficiency, ax=ax)
+    ax.set_title('Cost Per Successful Task')
+    ax.set_ylabel('Cost Per Successful Task (dollars)')
+    img_path = save_plot(fig, "cost_per_successful_task.png")
+    report_parts.append(f"![Cost Per Successful Task]({os.path.basename(img_path)})\n")
+
+    # Scatter plot: execution_time vs. api_cost
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.scatterplot(x='execution_time_seconds', y='api_cost', hue='tool', data=df, alpha=0.6, ax=ax)
+    ax.set_title('Execution Time vs. API Cost')
+    ax.set_xlabel('Execution Time (seconds)')
+    ax.set_ylabel('API Cost (dollars)')
+    img_path = save_plot(fig, "time_vs_cost_scatter.png")
+    report_parts.append(f"![Time vs. Cost Scatter Plot]({os.path.basename(img_path)})\n")
+
+    # Bar chart for total API cost (Required Visualizations #6)
+    fig, ax = plt.subplots()
+    sns.barplot(x='tool', y='total_api_cost', data=cost_efficiency, ax=ax)  # Re-using cost_efficiency df
+    ax.set_title('Total API Cost for All Benchmarks')
+    ax.set_ylabel('Total API Cost (dollars)')
+    img_path = save_plot(fig, "total_api_cost_comparison.png")
+    report_parts.append(f"![Total API Cost Comparison]({os.path.basename(img_path)})\n")
+
+    return report_parts
+
+def success_and_error_analysis(df: pd.DataFrame, overall_stats: pd.DataFrame) -> Tuple[List[str], pd.DataFrame]:
+    """Performs success and error analysis."""
+    report_parts: List[str] = ["\n## 4. Success and Error Analysis\n"]
+
+    # Success rates across dimensions (already plotted in dimension_specific_analysis)
+    report_parts.append("Success rates across dimensions are visualized in Section 2.\n")
+
+    # Error message analysis
+    failed_tasks = df[df['success'] == 0]
+    if 'error_message' in failed_tasks.columns:
+        error_summary = failed_tasks.groupby(['tool', 'error_message']).size().reset_index(name='count').sort_values(by=['tool', 'count'], ascending=[True, False])
+        report_parts.append("### Common Error Messages for Failed Tasks\n")
+        for tool in df['tool'].unique():
+            report_parts.append(f"#### {tool}:\n")
+            tool_errors = error_summary[error_summary['tool'] == tool]
+            if not tool_errors.empty:
+                report_parts.append(tool_errors[['error_message', 'count']].to_markdown(index=False) + "\n")
+            else:
+                report_parts.append("No failed tasks recorded for this tool or error messages not available.\n")
+    else:
+        report_parts.append("### Common Error Messages for Failed Tasks\n")
+        report_parts.append("Column 'error_message' not found in the data for failed tasks.\n")
+
+    # Confidence intervals for overall success rates
+    report_parts.append("### Overall Success Rate Confidence Intervals (95%)\n")
+    ci_data: List[Dict[str, Any]] = []
+    for tool_name in overall_stats['tool'].unique():
+        tool_data = overall_stats[overall_stats['tool'] == tool_name].iloc[0]
+        n_success = int(tool_data['success_rate'] * tool_data['total_tasks'])
+        n_total = int(tool_data['total_tasks'])
+        if n_total > 0:
+            ci_low, ci_upp = smp.proportion_confint(n_success, n_total, method='wilson')
+            ci_data.append({'tool': tool_name, 'success_rate': tool_data['success_rate'], 'ci_lower': ci_low, 'ci_upper': ci_upp})
+        else:
+            ci_data.append({'tool': tool_name, 'success_rate': np.nan, 'ci_lower': np.nan, 'ci_upper': np.nan})
+
+    ci_df = pd.DataFrame(ci_data)
+    report_parts.append(ci_df.to_markdown(index=False) + "\n")
+
+    return report_parts, ci_df
+
+
+def statistical_significance_analysis(df: pd.DataFrame) -> Tuple[List[str], pd.DataFrame]:
+    """Performs statistical significance tests."""
+    report_parts: List[str] = ["\n## 5. Statistical Significance Analysis\n"]
+    stat_results: List[Dict[str, Any]] = []
+
+    pdd_data = df[df['tool'] == 'PDD']
+    claude_data = df[df['tool'] == 'Claude']
+
+    # Mann-Whitney U for continuous metrics
+    for metric in ['execution_time_seconds', 'api_cost']:
+        pdd_metric_data = pdd_data[metric].dropna()
+        claude_metric_data = claude_data[metric].dropna()
+
+        if pdd_metric_data.empty or claude_metric_data.empty:
+            report_parts.append(f"Skipping {metric} due to empty data for one or both tools after dropping NaNs.\n")
+            stat_results.append({'Metric': metric, 'Test': 'Mann-Whitney U', 'p-value': 'N/A', 'Effect Size (RBC)': 'N/A', 'Significance': 'N/A'})
+            continue
+
+        mwu_stat, p_value = stats.mannwhitneyu(pdd_metric_data, claude_metric_data, alternative='two-sided')
+
+        n1, n2 = len(pdd_metric_data), len(claude_metric_data)
+        rbc_val = calculate_rank_biserial(mwu_stat, n1, n2)
+        rbc_str = f"{rbc_val:.3f}"
+
+        significance = "Yes" if p_value < 0.05 else "No"
+        stat_results.append({'Metric': metric, 'Test': 'Mann-Whitney U', 'p-value': f"{p_value:.3g}", 'Effect Size (RBC)': rbc_str, 'Significance': significance})
+
+    # Chi-squared for success metric
+    if not pdd_data.empty and not claude_data.empty:
+        contingency_table = pd.crosstab(df['tool'], df['success'])
+        if contingency_table.shape == (2, 2) and contingency_table.values.min() > 0:  # Check if table is 2x2 and has counts > 0
+            chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)
+
+            n_total = contingency_table.sum().sum()
+            cramers_v = calculate_cramers_v(chi2, n_total, contingency_table)
+            cramers_v_str = f"{cramers_v:.3f}"
+            significance = "Yes" if p_value < 0.05 else "No"
+        else:
+            p_value = 1.0  # Not applicable
+            cramers_v_str = "N/A"
+            significance = "N/A (contingency table not 2x2 or has zero/low counts)"
+
+        stat_results.append({'Metric': 'success', 'Test': 'Chi-squared', 'p-value': f"{p_value:.3g}", "Effect Size (Cramér's V)": cramers_v_str, 'Significance': significance})
+    else:
+        stat_results.append({'Metric': 'success', 'Test': 'Chi-squared', 'p-value': 'N/A', "Effect Size (Cramér's V)": 'N/A', 'Significance': 'N/A'})
+
+
+    stat_results_df = pd.DataFrame(stat_results)
+    report_parts.append("### Statistical Test Summary\n")
+    report_parts.append(stat_results_df.to_markdown(index=False) + "\n")
+
+    return report_parts, stat_results_df
+
+def generate_markdown_report(
+    report_sections: List[List[str]],
+    overall_stats_df: pd.DataFrame,
+    cost_eff_parts: List[str],  # Specifically for parsing cost per task
+    stat_results_df: pd.DataFrame
+) -> None:
+    """Generates the final Markdown report."""
+
+    report_content: List[str] = ["# Benchmark Analysis Report: PDD vs. Claude\n"]
+    report_content.append("This report analyzes and compares the performance of PDD and Claude AI coding assistants based on benchmark data.\n")
+
+    # Executive Summary
+    report_content.append("## Executive Summary\n")
+
+    best_tool_overall = overall_stats_df.loc[overall_stats_df['overall_score'].idxmax()]['tool']
+    pdd_overall_series = overall_stats_df[overall_stats_df['tool'] == 'PDD']
+    claude_overall_series = overall_stats_df[overall_stats_df['tool'] == 'Claude']
+
+    summary_points: List[str] = [
+        f"- **Overall Winner (Weighted Score):** {best_tool_overall}"
+    ]
+    if not pdd_overall_series.empty:
+        pdd_overall = pdd_overall_series.iloc[0]
+        summary_points.append(f"- **PDD Performance:** Avg Time: {pdd_overall['avg_execution_time']:.2f}s, Avg Cost: ${pdd_overall['avg_api_cost']:.4f}, Success Rate: {pdd_overall['success_rate']:.2%}")
+    if not claude_overall_series.empty:
+        claude_overall = claude_overall_series.iloc[0]
+        summary_points.append(f"- **Claude Performance:** Avg Time: {claude_overall['avg_execution_time']:.2f}s, Avg Cost: ${claude_overall['avg_api_cost']:.4f}, Success Rate: {claude_overall['success_rate']:.2%}")
+
+    # Add cost-efficiency summary by parsing the markdown table from cost_eff_parts
+    markdown_table_str = cost_eff_parts[2]  # cost_eff_parts: [0] section header, [1] subsection header, [2] cost-per-task table
+    lines = markdown_table_str.strip().split('\n')
+
+    pdd_cost_val_str: Optional[str] = None
+    claude_cost_val_str: Optional[str] = None
+
+    # Expecting table rows like: | ToolName | TotalCost | NumSuccess | CostPerSuccess |
+    # Indices after split('|'): 0="", 1="ToolName", 2="TotalCost", ..., 4="CostPerSuccess"
+    for line_idx in range(2, len(lines)):  # Start from index 2 (skip header and separator)
+        cols = lines[line_idx].split('|')
+        if len(cols) > 4:  # Ensure enough columns for CostPerSuccess
+            tool_name_in_col = cols[1].strip()
+            cost_value = cols[4].strip()
+            if "PDD" in tool_name_in_col:
+                pdd_cost_val_str = cost_value
+            elif "Claude" in tool_name_in_col:
+                claude_cost_val_str = cost_value
+
+    if pdd_cost_val_str and claude_cost_val_str:
+        try:
+            pdd_float = float(pdd_cost_val_str)
+            claude_float = float(claude_cost_val_str)
+            summary_points.append(f"- **Cost per Successful Task:** PDD: ${pdd_float:.4f}, Claude: ${claude_float:.4f}")
+        except ValueError:
+            summary_points.append("- Cost per Successful Task: Error parsing values from markdown table.")
+    else:
+        summary_points.append("- Cost per Successful Task: Data not found or table format unexpected in markdown.")
+
+    # Add statistical significance summary
+    for _, row in stat_results_df.iterrows():
+        if row['Significance'] == 'Yes':
+            effect_size_col_name = [col for col in row.index if "Effect Size" in col and pd.notna(row[col])][0]
+            effect_size_val = row[effect_size_col_name]
+            metric_name = row['Metric']
+            p_val = row['p-value']
+            summary_points.append(f"- Statistically significant difference in **{metric_name}** (p={p_val}, {effect_size_col_name.split('(')[1][:-1]}={effect_size_val}).")
+
+    report_content.append("\n".join(summary_points) + "\n")
+
+    # Append all main sections
+    for section_parts_list in report_sections:
+        report_content.extend(section_parts_list)
+
+    # Final Recommendation
+    report_content.append("\n## 6. Final Recommendation\n")
+    # Note: The recommendation part uses placeholders like "[mention specific areas...]"
+    # These would ideally be filled dynamically if more detailed insights from dim_stats_parts etc. were parsed here.
+    # For now, it's a template.
+    recommendation = f"""
+Based on this analysis:
+
+- For tasks where **overall balanced performance (time, cost, success)** is critical, **{best_tool_overall}** is recommended due to its higher weighted score.
+- If **minimizing API cost** is the absolute priority, analyze the 'Average API Cost' and 'Cost Per Successful Task' metrics. The tool with lower values here might be preferred, even if slightly slower or less successful.
+- If **maximizing success rate** is paramount, the tool with the higher overall success rate and better performance on specific critical dimensions (e.g., specific languages or edit types) should be chosen.
+- **PDD** shows strengths in [mention specific areas if evident, e.g., specific languages/file_sizes based on dimensional analysis].
+- **Claude** shows strengths in [mention specific areas if evident, e.g., specific languages/file_sizes based on dimensional analysis].
+
+Consider the specific context of your tasks (e.g., dominant language, typical file size, importance of speed vs. cost) when making a final decision.
+Further investigation into common error patterns for each tool could lead to improved prompt engineering or identify areas where one tool might need more support.
+"""
+    report_content.append(recommendation)
+
+    # Write report to file
+    report_path = os.path.join(OUTPUT_DIR, 'benchmark_analysis.md')
+    with open(report_path, 'w', encoding='utf-8') as f:
+        f.write("\n".join(report_content))
+    print(f"Markdown report saved to: {report_path}")
+
+
+# --- Main Execution ---
+def main() -> None:
+    """Main function to run the benchmark analysis."""
+    print("Starting benchmark analysis...")
+    create_output_directory()
+
+    df = load_and_prepare_data()
+    if df is None:
+        return
+
+    report_sections_collector: List[List[str]] = []
+
+    overall_parts, overall_stats_df = overall_performance_analysis(df)
+    report_sections_collector.append(overall_parts)
+
+    dim_stats_parts = dimension_specific_analysis(df)
+    report_sections_collector.append(dim_stats_parts)
+
+    # cost_efficiency_analysis no longer needs overall_stats_df
+    cost_eff_parts = cost_efficiency_analysis(df)
+    report_sections_collector.append(cost_eff_parts)
+
+    success_err_parts, ci_df = success_and_error_analysis(df, overall_stats_df)
+    report_sections_collector.append(success_err_parts)
+
+    stat_sig_parts, stat_results_df = statistical_significance_analysis(df)
+    report_sections_collector.append(stat_sig_parts)
+
+    generate_markdown_report(
+        report_sections=report_sections_collector,
+        overall_stats_df=overall_stats_df,
+        cost_eff_parts=cost_eff_parts,  # Pass this specifically for parsing
+        stat_results_df=stat_results_df
+    )
+
+    print("Benchmark analysis complete. Outputs are in the 'analysis/analysis_report/' directory.")
+
+if __name__ == '__main__':
+    main()
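
For readers who want to try the newly added benchmark_analysis.py locally, the sketch below illustrates the input shape it expects; it is an editorial reconstruction, not part of the diff. The script reads analysis/claude_results.csv and analysis/PDD_results.csv and uses the columns execution_time_seconds, api_cost, success, file_size, language, and edit_type (error_message is optional); DataFrame.to_markdown also needs the tabulate package installed. The row values and the bare `import benchmark_analysis` path are hypothetical.

import os
import pandas as pd

# Two tiny, made-up result files with the columns the script reads.
os.makedirs('analysis', exist_ok=True)
rows = [
    {'file_size': 'small', 'language': 'python', 'edit_type': 'refactor',
     'execution_time_seconds': 42.0, 'api_cost': 0.12, 'success': 1, 'error_message': ''},
    {'file_size': 'large', 'language': 'typescript', 'edit_type': 'bugfix',
     'execution_time_seconds': 95.0, 'api_cost': 0.30, 'success': 0, 'error_message': 'tests failed'},
]
pd.DataFrame(rows).to_csv('analysis/claude_results.csv', index=False)
pd.DataFrame(rows).to_csv('analysis/PDD_results.csv', index=False)

# Assuming benchmark_analysis.py has been copied somewhere importable from the
# working directory, this writes the report and plots to analysis/analysis_report/.
import benchmark_analysis
benchmark_analysis.main()

With only two rows per tool the statistical tests are not meaningful; the sketch merely exercises the code path end to end.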