deepset-mcp 0.0.2rc1 (deepset_mcp-0.0.2rc1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepset_mcp/__init__.py +0 -0
- deepset_mcp/agents/__init__.py +0 -0
- deepset_mcp/agents/debugging/__init__.py +0 -0
- deepset_mcp/agents/debugging/debugging_agent.py +37 -0
- deepset_mcp/agents/debugging/system_prompt.md +214 -0
- deepset_mcp/agents/generalist/__init__.py +0 -0
- deepset_mcp/agents/generalist/generalist_agent.py +38 -0
- deepset_mcp/agents/generalist/system_prompt.md +241 -0
- deepset_mcp/api/README.md +536 -0
- deepset_mcp/api/__init__.py +0 -0
- deepset_mcp/api/client.py +277 -0
- deepset_mcp/api/custom_components/__init__.py +0 -0
- deepset_mcp/api/custom_components/models.py +25 -0
- deepset_mcp/api/custom_components/protocols.py +17 -0
- deepset_mcp/api/custom_components/resource.py +56 -0
- deepset_mcp/api/exceptions.py +70 -0
- deepset_mcp/api/haystack_service/__init__.py +0 -0
- deepset_mcp/api/haystack_service/protocols.py +13 -0
- deepset_mcp/api/haystack_service/resource.py +55 -0
- deepset_mcp/api/indexes/__init__.py +0 -0
- deepset_mcp/api/indexes/models.py +63 -0
- deepset_mcp/api/indexes/protocols.py +53 -0
- deepset_mcp/api/indexes/resource.py +138 -0
- deepset_mcp/api/integrations/__init__.py +1 -0
- deepset_mcp/api/integrations/models.py +49 -0
- deepset_mcp/api/integrations/protocols.py +27 -0
- deepset_mcp/api/integrations/resource.py +57 -0
- deepset_mcp/api/pipeline/__init__.py +17 -0
- deepset_mcp/api/pipeline/log_level.py +9 -0
- deepset_mcp/api/pipeline/models.py +235 -0
- deepset_mcp/api/pipeline/protocols.py +83 -0
- deepset_mcp/api/pipeline/resource.py +378 -0
- deepset_mcp/api/pipeline_template/__init__.py +0 -0
- deepset_mcp/api/pipeline_template/models.py +56 -0
- deepset_mcp/api/pipeline_template/protocols.py +17 -0
- deepset_mcp/api/pipeline_template/resource.py +88 -0
- deepset_mcp/api/protocols.py +122 -0
- deepset_mcp/api/secrets/__init__.py +0 -0
- deepset_mcp/api/secrets/models.py +16 -0
- deepset_mcp/api/secrets/protocols.py +29 -0
- deepset_mcp/api/secrets/resource.py +112 -0
- deepset_mcp/api/shared_models.py +17 -0
- deepset_mcp/api/transport.py +336 -0
- deepset_mcp/api/user/__init__.py +0 -0
- deepset_mcp/api/user/protocols.py +11 -0
- deepset_mcp/api/user/resource.py +38 -0
- deepset_mcp/api/workspace/__init__.py +7 -0
- deepset_mcp/api/workspace/models.py +23 -0
- deepset_mcp/api/workspace/protocols.py +41 -0
- deepset_mcp/api/workspace/resource.py +94 -0
- deepset_mcp/benchmark/README.md +425 -0
- deepset_mcp/benchmark/__init__.py +1 -0
- deepset_mcp/benchmark/agent_configs/debugging_agent.yml +10 -0
- deepset_mcp/benchmark/agent_configs/generalist_agent.yml +6 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/__init__.py +0 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/eda.ipynb +757 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/prepare_interaction_data.ipynb +167 -0
- deepset_mcp/benchmark/dp_validation_error_analysis/preprocessing_utils.py +213 -0
- deepset_mcp/benchmark/runner/__init__.py +0 -0
- deepset_mcp/benchmark/runner/agent_benchmark_runner.py +561 -0
- deepset_mcp/benchmark/runner/agent_loader.py +110 -0
- deepset_mcp/benchmark/runner/cli.py +39 -0
- deepset_mcp/benchmark/runner/cli_agent.py +373 -0
- deepset_mcp/benchmark/runner/cli_index.py +71 -0
- deepset_mcp/benchmark/runner/cli_pipeline.py +73 -0
- deepset_mcp/benchmark/runner/cli_tests.py +226 -0
- deepset_mcp/benchmark/runner/cli_utils.py +61 -0
- deepset_mcp/benchmark/runner/config.py +73 -0
- deepset_mcp/benchmark/runner/config_loader.py +64 -0
- deepset_mcp/benchmark/runner/interactive.py +140 -0
- deepset_mcp/benchmark/runner/models.py +203 -0
- deepset_mcp/benchmark/runner/repl.py +67 -0
- deepset_mcp/benchmark/runner/setup_actions.py +238 -0
- deepset_mcp/benchmark/runner/streaming.py +360 -0
- deepset_mcp/benchmark/runner/teardown_actions.py +196 -0
- deepset_mcp/benchmark/runner/tracing.py +21 -0
- deepset_mcp/benchmark/tasks/chat_rag_answers_wrong_format.yml +16 -0
- deepset_mcp/benchmark/tasks/documents_output_wrong.yml +13 -0
- deepset_mcp/benchmark/tasks/jinja_str_instead_of_complex_type.yml +11 -0
- deepset_mcp/benchmark/tasks/jinja_syntax_error.yml +11 -0
- deepset_mcp/benchmark/tasks/missing_output_mapping.yml +14 -0
- deepset_mcp/benchmark/tasks/no_query_input.yml +13 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_str.yml +141 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_syntax.yml +141 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_rag_answers_wrong_format.yml +181 -0
- deepset_mcp/benchmark/tasks/pipelines/chat_rag_missing_output_mapping.yml +189 -0
- deepset_mcp/benchmark/tasks/pipelines/rag_documents_wrong_format.yml +193 -0
- deepset_mcp/benchmark/tasks/pipelines/rag_no_query_input.yml +191 -0
- deepset_mcp/benchmark/tasks/pipelines/standard_index.yml +167 -0
- deepset_mcp/initialize_embedding_model.py +12 -0
- deepset_mcp/main.py +133 -0
- deepset_mcp/prompts/deepset_copilot_prompt.md +271 -0
- deepset_mcp/prompts/deepset_debugging_agent.md +214 -0
- deepset_mcp/store.py +5 -0
- deepset_mcp/tool_factory.py +473 -0
- deepset_mcp/tools/__init__.py +0 -0
- deepset_mcp/tools/custom_components.py +52 -0
- deepset_mcp/tools/doc_search.py +83 -0
- deepset_mcp/tools/haystack_service.py +358 -0
- deepset_mcp/tools/haystack_service_models.py +97 -0
- deepset_mcp/tools/indexes.py +129 -0
- deepset_mcp/tools/model_protocol.py +16 -0
- deepset_mcp/tools/pipeline.py +335 -0
- deepset_mcp/tools/pipeline_template.py +116 -0
- deepset_mcp/tools/secrets.py +45 -0
- deepset_mcp/tools/tokonomics/__init__.py +73 -0
- deepset_mcp/tools/tokonomics/decorators.py +396 -0
- deepset_mcp/tools/tokonomics/explorer.py +347 -0
- deepset_mcp/tools/tokonomics/object_store.py +177 -0
- deepset_mcp/tools/workspace.py +61 -0
- deepset_mcp-0.0.2rc1.dist-info/METADATA +292 -0
- deepset_mcp-0.0.2rc1.dist-info/RECORD +114 -0
- deepset_mcp-0.0.2rc1.dist-info/WHEEL +4 -0
- deepset_mcp-0.0.2rc1.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,757 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "import re\n",
+    "\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "import warnings"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "from deepset_mcp.benchmark.dp_validation_error_analysis.preprocessing_utils import add_error_class_column",
+   "id": "35b01b4fc75710c2",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df = pd.read_csv(\"../../../../data/raw/metabase_validation_errors_last_3_months_exported_28-05-2025.csv\")",
+   "id": "bef8e3aac2385ca4",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "len(df)",
+   "id": "7c405c24642c215c",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df = df.drop(columns=[\"Context Library Name\", \"Context Library Version\", \"Event\", \"Received At\", \"UUID Ts\", \"Timestamp\", \"Sent At\", \"Deepset Cloud Version\", \"Is External User\"])",
+   "id": "71e4abde2034523e",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df.head()",
+   "id": "cff0d577c45e2609",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df = df.rename(columns={\"ID\": \"event_id\", \"Pipeline Name\": \"pipeline_name\", \"Source\": \"event_source\", \"User ID\": \"user_id\", \"Event Text\": \"event_text\", \"Organization Name\": \"organization_name\", \"Deepset User\": \"is_deepset_user\", \"Deepset Orga\": \"is_deepset_org\", \"Original Timestamp\": \"event_timestamp\", \"Workspace Name\": \"workspace_name\", \"Workspace ID\": \"workspace_id\", \"Organization ID\": \"organization_id\", \"Error\": \"error_message\", \"Organization Type\": \"organization_type\", \"User Email\": \"user_email\", \"Environment\": \"environment\", \"Pipeline ID\": \"pipeline_id\"})",
+   "id": "be34de3aba586b63",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df.head()",
+   "id": "5a052dc3bf8ab8cd",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df = add_error_class_column(df)",
+   "id": "c289bb3164ff80f0",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df.error_class.value_counts()",
+   "id": "38349fa857cd1ff",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# Add component types\n",
+    "df['component_type'] = df['error_message'].str.extract(\n",
+    "    r\"component of type ['\\\"']([^'\\\"]+)['\\\"']\",\n",
+    "    expand=False\n",
+    ")"
+   ],
+   "id": "4ac0cf0ef85ba777",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df.component_type.value_counts()",
+   "id": "800d8cfdb68e3594",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "df['config_error_message'] = df['error_message'].str.extract(\n",
+    "    r\"Error:\\s*`?([^`\\n]+)`?\",\n",
+    "    flags=re.IGNORECASE,\n",
+    "    expand=False\n",
+    ")"
+   ],
+   "id": "5f7c171c4e691278",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "df.head()",
+   "id": "6fafa6edb4f63848",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "# Set consistent style for professional look\n",
+    "plt.style.use('seaborn-v0_8-whitegrid')\n",
+    "sns.set_context(\"notebook\", font_scale=1.2)\n",
+    "\n",
+    "# Define consistent color palette - professional and accessible\n",
+    "COLORS = {\n",
+    "    'primary': '#1E40AF', # Deep blue\n",
+    "    'secondary': '#F59E0B', # Amber\n",
+    "    'success': '#10B981', # Emerald\n",
+    "    'danger': '#EF4444', # Red\n",
+    "    'purple': '#8B5CF6', # Purple\n",
+    "    'pink': '#EC4899', # Pink\n",
+    "    'cyan': '#06B6D4', # Cyan\n",
+    "    'gray': '#6B7280' # Gray\n",
+    "}\n",
+    "\n",
+    "# ============================================\n",
+    "# LOAD YOUR DATA HERE\n",
+    "# ============================================\n",
+    "# DATA STRUCTURE ASSUMPTIONS:\n",
+    "# error_data: timestamp, error_class, component_type, is_deepset_user, organization_type, organization_name\n",
+    "# activity_data: timestamp, is_deepset_user, organization_type, organization_name (general activity indicator)\n",
+    "\n",
+    "# Replace with your actual data:\n",
+    "error_data = df.copy() # Your error events\n",
+    "activity_data = pd.read_csv(\"../../../../data/processed/created_updated_deployed_events.csv\") # Your general activity events\n",
+    "# Ensure both datasets have consistent timestamp column\n",
+    "error_data['timestamp'] = pd.to_datetime(error_data['event_timestamp'])\n",
+    "activity_data['timestamp'] = pd.to_datetime(activity_data['event_timestamp'])\n",
+    "\n",
+    "# ============================================\n",
+    "# ACTIVITY-ADJUSTED ERROR ANALYSIS FUNCTIONS\n",
+    "# ============================================\n",
+    "\n",
+    "def calculate_activity_adjusted_metrics(error_df, activity_df, group_by_cols):\n",
+    "    \"\"\"\n",
+    "    Calculate activity-adjusted error metrics by grouping both datasets\n",
+    "    Returns errors per 1000 activity events for normalization\n",
+    "    Only use this for segments that can have different activity levels (time, orgs, user types)\n",
+    "    \"\"\"\n",
+    "    error_counts = error_df.groupby(group_by_cols).size().reset_index(name='error_count')\n",
+    "    activity_counts = activity_df.groupby(group_by_cols).size().reset_index(name='activity_count')\n",
+    "\n",
+    "    # Merge to get both counts for each group\n",
+    "    merged = pd.merge(activity_counts, error_counts, on=group_by_cols, how='left')\n",
+    "    merged['error_count'] = merged['error_count'].fillna(0)\n",
+    "\n",
+    "    # Calculate errors per 1000 activity events for easier interpretation\n",
+    "    merged['errors_per_1k_activity'] = (merged['error_count'] / merged['activity_count']) * 1000\n",
+    "    merged['activity_share'] = (merged['activity_count'] / merged['activity_count'].sum()) * 100\n",
+    "\n",
+    "    return merged\n",
+    "\n",
+    "# ============================================\n",
+    "# CALCULATE KEY METRICS\n",
+    "# ============================================\n",
+    "\n",
+    "total_errors = len(error_data)\n",
+    "total_activity = len(activity_data)\n",
+    "overall_error_intensity = (total_errors / total_activity) * 1000 # errors per 1000 activities\n",
+    "\n",
+    "date_range_start = min(error_data['timestamp'].min(), activity_data['timestamp'].min()).strftime('%B %d, %Y')\n",
+    "date_range_end = max(error_data['timestamp'].max(), activity_data['timestamp'].max()).strftime('%B %d, %Y')\n",
+    "\n",
+    "# Weekly aggregation with activity adjustment\n",
+    "error_data['week'] = error_data['timestamp'].dt.to_period('W').dt.to_timestamp()\n",
+    "activity_data['week'] = activity_data['timestamp'].dt.to_period('W').dt.to_timestamp()\n",
+    "\n",
+    "weekly_metrics = calculate_activity_adjusted_metrics(error_data, activity_data, ['week'])\n",
+    "weekly_metrics = weekly_metrics.sort_values('week')\n",
+    "avg_weekly_error_intensity = weekly_metrics['errors_per_1k_activity'].mean()\n",
+    "\n",
+    "# Activity-adjusted metrics by different dimensions\n",
+    "org_metrics = calculate_activity_adjusted_metrics(error_data, activity_data, ['organization_name'])\n",
+    "org_metrics = org_metrics.sort_values('error_count', ascending=False)\n",
+    "\n",
+    "# Error distribution by error-specific dimensions (no activity adjustment needed)\n",
+    "error_class_counts = error_data['error_class'].value_counts()\n",
+    "error_class_distribution = pd.DataFrame({\n",
+    "    'error_class': error_class_counts.index,\n",
+    "    'error_count': error_class_counts.values,\n",
+    "    'error_share': (error_class_counts.values / error_class_counts.sum()) * 100\n",
+    "})\n",
+    "\n",
+    "component_counts = error_data['component_type'].value_counts()\n",
+    "component_distribution = pd.DataFrame({\n",
+    "    'component_type': component_counts.index,\n",
+    "    'error_count': component_counts.values,\n",
+    "    'error_share': (component_counts.values / component_counts.sum()) * 100\n",
+    "})\n",
+    "\n",
+    "# User segmentation with activity adjustment\n",
+    "internal_errors = error_data['is_deepset_user'].sum()\n",
+    "internal_activity = activity_data['is_deepset_user'].sum()\n",
+    "external_errors = len(error_data) - internal_errors\n",
+    "external_activity = len(activity_data) - internal_activity\n",
+    "\n",
+    "internal_intensity = (internal_errors / internal_activity) * 1000 if internal_activity > 0 else 0\n",
+    "external_intensity = (external_errors / external_activity) * 1000 if external_activity > 0 else 0\n",
+    "\n",
+    "# Enterprise vs Free activity-adjusted metrics\n",
+    "enterprise_errors = (error_data['organization_type'] == 'FULL_DEEPSET_CLOUD').sum()\n",
+    "enterprise_activity = (activity_data['organization_type'] == 'FULL_DEEPSET_CLOUD').sum()\n",
+    "free_errors = (error_data['organization_type'] == 'DEEPSET_STUDIO_WITH_LIMITS').sum()\n",
+    "free_activity = (activity_data['organization_type'] == 'DEEPSET_STUDIO_WITH_LIMITS').sum()\n",
+    "\n",
+    "enterprise_intensity = (enterprise_errors / enterprise_activity) * 1000 if enterprise_activity > 0 else 0\n",
+    "free_intensity = (free_errors / free_activity) * 1000 if free_activity > 0 else 0\n",
+    "\n",
+    "# External Enterprise vs External Free activity-adjusted metrics\n",
+    "external_enterprise_errors = ((error_data['is_deepset_user'] == False) &\n",
+    "                              (error_data['organization_type'] == 'FULL_DEEPSET_CLOUD')).sum()\n",
+    "external_enterprise_activity = ((activity_data['is_deepset_user'] == False) &\n",
+    "                                (activity_data['organization_type'] == 'FULL_DEEPSET_CLOUD')).sum()\n",
+    "external_free_errors = ((error_data['is_deepset_user'] == False) &\n",
+    "                        (error_data['organization_type'] == 'DEEPSET_STUDIO_WITH_LIMITS')).sum()\n",
+    "external_free_activity = ((activity_data['is_deepset_user'] == False) &\n",
+    "                          (activity_data['organization_type'] == 'DEEPSET_STUDIO_WITH_LIMITS')).sum()\n",
+    "\n",
+    "external_enterprise_intensity = (external_enterprise_errors / external_enterprise_activity) * 1000 if external_enterprise_activity > 0 else 0\n",
+    "external_free_intensity = (external_free_errors / external_free_activity) * 1000 if external_free_activity > 0 else 0\n",
+    "\n",
+    "# ============================================\n",
+    "# PLOT 1: KEY METRICS SUMMARY\n",
+    "# ============================================\n",
+    "fig, axes = plt.subplots(2, 3, figsize=(15, 8), facecolor='white')\n",
+    "fig.suptitle('Activity-Adjusted Error Analysis - Key Metrics', fontsize=18, fontweight='bold', y=0.98)\n",
+    "\n",
+    "axes = axes.flatten()\n",
+    "\n",
+    "# Define metrics\n",
+    "metrics = [\n",
+    "    {'value': f'{overall_error_intensity:.1f}', 'label': 'Errors per 1K Activity', 'sublabel': f'{total_errors:,} errors / {total_activity:,} activities'},\n",
+    "    {'value': f'{avg_weekly_error_intensity:.1f}', 'label': 'Avg Weekly Intensity', 'sublabel': 'errors per 1K activities/week'},\n",
+    "    {'value': f'{error_class_distribution.iloc[0][\"error_share\"]:.1f}%',\n",
+    "     'label': 'Top Error Class Share', 'sublabel': error_class_distribution.iloc[0]['error_class'].replace('_', ' ').title()},\n",
+    "    {'value': f'{component_distribution.iloc[0][\"error_share\"]:.1f}%',\n",
+    "     'label': 'Top Component Share', 'sublabel': component_distribution.iloc[0]['component_type']},\n",
+    "    {'value': f'{external_intensity:.1f}', 'label': 'External User Intensity', 'sublabel': f'{external_errors:,} errors / {external_activity:,} activities'},\n",
+    "    {'value': f'{enterprise_intensity:.1f}', 'label': 'Enterprise Intensity', 'sublabel': f'{enterprise_errors:,} errors / {enterprise_activity:,} activities'}\n",
+    "]\n",
+    "\n",
+    "# Create metric cards\n",
+    "for ax, metric, color in zip(axes, metrics, [COLORS['primary'], COLORS['secondary'],\n",
+    "                                             COLORS['danger'], COLORS['pink'],\n",
+    "                                             COLORS['success'], COLORS['cyan']]):\n",
+    "    ax.text(0.5, 0.5, metric['value'], ha='center', va='center',\n",
+    "            fontsize=32, fontweight='bold', color=color, transform=ax.transAxes)\n",
+    "    ax.text(0.5, 0.15, metric['label'], ha='center', va='center',\n",
+    "            fontsize=14, color=COLORS['gray'], transform=ax.transAxes)\n",
+    "    ax.text(0.5, 0.85, metric['sublabel'], ha='center', va='center',\n",
+    "            fontsize=11, color='#9CA3AF', transform=ax.transAxes)\n",
+    "\n",
+    "    ax.set_xlim(0, 1)\n",
+    "    ax.set_ylim(0, 1)\n",
+    "    ax.axis('off')\n",
+    "\n",
+    "    # Add subtle border\n",
+    "    for spine in ['top', 'right', 'bottom', 'left']:\n",
+    "        ax.spines[spine].set_visible(True)\n",
+    "        ax.spines[spine].set_color('#E5E7EB')\n",
+    "        ax.spines[spine].set_linewidth(1)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.savefig('activity_adjusted_summary.png', dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "plt.show()\n",
+    "\n",
+    "# ============================================\n",
+    "# PLOT 2: WEEKLY ERROR INTENSITY TREND\n",
+    "# ============================================\n",
+    "fig, ax1 = plt.subplots(figsize=(14, 8), facecolor='white')\n",
+    "\n",
+    "# Plot error intensity (primary y-axis)\n",
+    "color1 = COLORS['primary']\n",
+    "ax1.set_xlabel('Week', fontsize=13)\n",
+    "ax1.set_ylabel('Errors per 1K Activities', color=color1, fontsize=13)\n",
+    "line1 = ax1.plot(weekly_metrics['week'], weekly_metrics['errors_per_1k_activity'],\n",
+    "                 color=color1, linewidth=3, marker='o', markersize=6,\n",
+    "                 label='Error Intensity')\n",
+    "ax1.tick_params(axis='y', labelcolor=color1)\n",
+    "ax1.grid(True, alpha=0.3, linestyle=':')\n",
+    "\n",
+    "# Add rolling average\n",
+    "weekly_metrics['intensity_rolling'] = weekly_metrics['errors_per_1k_activity'].rolling(window=3, center=True, min_periods=1).mean()\n",
+    "ax1.plot(weekly_metrics['week'], weekly_metrics['intensity_rolling'],\n",
+    "         color=color1, linewidth=2, linestyle='--', alpha=0.7,\n",
+    "         label='3-week moving avg')\n",
+    "\n",
+    "# Add average line\n",
+    "ax1.axhline(y=avg_weekly_error_intensity, color=color1, linestyle=':', alpha=0.7, linewidth=2)\n",
+    "avg_text_y = avg_weekly_error_intensity + weekly_metrics['errors_per_1k_activity'].std()*0.1\n",
+    "ax1.text(weekly_metrics['week'].iloc[0], avg_text_y,\n",
+    "         f'Avg: {avg_weekly_error_intensity:.1f}', fontsize=11, color=color1,\n",
+    "         bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))\n",
+    "\n",
+    "# Create secondary y-axis for activity volume\n",
+    "ax2 = ax1.twinx()\n",
+    "color2 = COLORS['secondary']\n",
+    "ax2.set_ylabel('Activity Volume', color=color2, fontsize=13)\n",
+    "bars = ax2.bar(weekly_metrics['week'], weekly_metrics['activity_count'],\n",
+    "               alpha=0.3, color=color2, label='Total Activities', width=5)\n",
+    "ax2.tick_params(axis='y', labelcolor=color2)\n",
+    "\n",
+    "# Combine legends\n",
+    "lines1, labels1 = ax1.get_legend_handles_labels()\n",
+    "lines2, labels2 = ax2.get_legend_handles_labels()\n",
+    "ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left',\n",
+    "           frameon=True, fancybox=True, shadow=True)\n",
+    "\n",
+    "plt.title('Weekly Error Intensity Trend (Activity-Adjusted)', fontsize=16, fontweight='bold', pad=20)\n",
+    "\n",
+    "ax1.spines['top'].set_visible(False)\n",
+    "ax2.spines['top'].set_visible(False)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.savefig('weekly_error_intensity_trend.png', dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "plt.show()\n",
+    "\n",
+    "# ============================================\n",
+    "# PLOT 3: ERROR CLASS DISTRIBUTION\n",
+    "# ============================================\n",
+    "plt.figure(figsize=(12, 8), facecolor='white')\n",
+    "\n",
+    "top_error_classes = error_class_distribution.head(8)\n",
+    "colors = [COLORS['danger'], COLORS['secondary'], COLORS['purple'], COLORS['cyan'],\n",
+    "          COLORS['success'], COLORS['pink'], COLORS['primary'], COLORS['gray']]\n",
+    "\n",
+    "# Error share (percentage of all errors)\n",
+    "bars = plt.barh(range(8), top_error_classes['error_share'], color=colors, alpha=0.8, height=0.7)\n",
+    "plt.yticks(range(8), [err.replace('_', ' ').title() for err in top_error_classes['error_class']], fontsize=12)\n",
+    "plt.xlabel('Share of Total Errors (%)', fontsize=13)\n",
+    "plt.title('Top 8 Error Classes Distribution', fontsize=16, fontweight='bold', pad=20)\n",
+    "plt.grid(True, alpha=0.3, axis='x', linestyle=':')\n",
+    "\n",
+    "for i, (bar, share, count) in enumerate(zip(bars, top_error_classes['error_share'],\n",
+    "                                            top_error_classes['error_count'])):\n",
+    "    plt.text(share + top_error_classes['error_share'].max()*0.02,\n",
+    "             bar.get_y() + bar.get_height()/2,\n",
+    "             f'{share:.1f}% ({count:.0f})', va='center', fontsize=11,\n",
+    "             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))\n",
+    "\n",
+    "ax = plt.gca()\n",
+    "ax.spines['top'].set_visible(False)\n",
+    "ax.spines['right'].set_visible(False)\n",
+    "ax.spines['left'].set_visible(False)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.savefig('error_class_distribution.png', dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "plt.show()\n",
+    "\n",
+    "# ============================================\n",
+    "# PLOT 4: COMPONENT DISTRIBUTION\n",
+    "# ============================================\n",
+    "plt.figure(figsize=(12, 8), facecolor='white')\n",
+    "\n",
+    "top_components = component_distribution.head(8)\n",
+    "colors = [COLORS['pink'], COLORS['purple'], COLORS['secondary'], COLORS['cyan'],\n",
+    "          COLORS['success'], COLORS['danger'], COLORS['primary'], COLORS['gray']]\n",
+    "\n",
+    "# Component error share\n",
+    "bars = plt.barh(range(8), top_components['error_share'], color=colors, alpha=0.8, height=0.7)\n",
+    "plt.yticks(range(8), top_components['component_type'], fontsize=12)\n",
+    "plt.xlabel('Share of Total Errors (%)', fontsize=13)\n",
+    "plt.title('Top 8 Component Types Distribution', fontsize=16, fontweight='bold', pad=20)\n",
+    "plt.grid(True, alpha=0.3, axis='x', linestyle=':')\n",
+    "\n",
+    "for i, (bar, share, count) in enumerate(zip(bars, top_components['error_share'],\n",
+    "                                            top_components['error_count'])):\n",
+    "    plt.text(share + top_components['error_share'].max()*0.02,\n",
+    "             bar.get_y() + bar.get_height()/2,\n",
+    "             f'{share:.1f}% ({count:.0f})', va='center', fontsize=11,\n",
+    "             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))\n",
+    "\n",
+    "ax = plt.gca()\n",
+    "ax.spines['top'].set_visible(False)\n",
+    "ax.spines['right'].set_visible(False)\n",
+    "ax.spines['left'].set_visible(False)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.savefig('component_error_distribution.png', dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "plt.show()\n",
+    "\n",
+    "# ============================================\n",
+    "# PLOT 5: ORGANIZATION ANALYSIS\n",
+    "# ============================================\n",
+    "plt.figure(figsize=(12, 8), facecolor='white')\n",
+    "\n",
+    "top_orgs = org_metrics.head(8)\n",
+    "\n",
+    "# Organization error intensity\n",
+    "bars = plt.barh(range(8), top_orgs['errors_per_1k_activity'], color=COLORS['primary'], alpha=0.8, height=0.7)\n",
+    "for i, bar in enumerate(bars):\n",
+    "    bar.set_alpha(0.9 - i*0.1)\n",
+    "\n",
+    "plt.yticks(range(8), top_orgs['organization_name'], fontsize=12)\n",
+    "plt.xlabel('Errors per 1K Activities', fontsize=13)\n",
+    "plt.title('Top 8 Organizations by Error Intensity', fontsize=16, fontweight='bold', pad=20)\n",
+    "plt.grid(True, alpha=0.3, axis='x', linestyle=':')\n",
+    "\n",
+    "for i, (bar, intensity, errors, activities) in enumerate(zip(bars, top_orgs['errors_per_1k_activity'],\n",
+    "                                                             top_orgs['error_count'], top_orgs['activity_count'])):\n",
+    "    plt.text(intensity + top_orgs['errors_per_1k_activity'].max()*0.02,\n",
+    "             bar.get_y() + bar.get_height()/2,\n",
+    "             f'{intensity:.1f} ({errors:.0f}/{activities:.0f})', va='center', fontsize=10,\n",
+    "             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))\n",
+    "\n",
+    "ax = plt.gca()\n",
+    "ax.spines['top'].set_visible(False)\n",
+    "ax.spines['right'].set_visible(False)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.savefig('organization_intensity_analysis.png', dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "plt.show()\n",
+    "\n",
+    "# ============================================\n",
+    "# PLOT 6: USER SEGMENT COMPARISON\n",
+    "# ============================================\n",
+    "fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10), facecolor='white')\n",
+    "\n",
+    "# Internal vs External Error Intensity\n",
+    "categories1 = ['Internal\\n(Deepset)', 'External\\n(Customers)']\n",
+    "intensities1 = [internal_intensity, external_intensity]\n",
+    "activity_counts1 = [internal_activity, external_activity]\n",
+    "colors1 = [COLORS['primary'], COLORS['secondary']]\n",
+    "\n",
+    "bars1 = ax1.bar(categories1, intensities1, color=colors1, alpha=0.8, width=0.6)\n",
+    "for bar, intensity, activities in zip(bars1, intensities1, activity_counts1):\n",
+    "    height = bar.get_height()\n",
+    "    ax1.text(bar.get_x() + bar.get_width()/2., height + max(intensities1)*0.02,\n",
+    "             f'{intensity:.1f}\\n({activities:,} activities)', ha='center', va='bottom', fontsize=11,\n",
+    "             bbox=dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.9))\n",
+    "\n",
+    "ax1.set_title('Error Intensity: Internal vs External', fontsize=12, fontweight='bold', pad=15)\n",
+    "ax1.set_ylabel('Errors per 1K Activities', fontsize=11)\n",
+    "ax1.grid(True, alpha=0.3, axis='y', linestyle=':')\n",
+    "ax1.spines['top'].set_visible(False)\n",
+    "ax1.spines['right'].set_visible(False)\n",
+    "\n",
+    "# External Enterprise vs External Free Error Intensity\n",
+    "categories2 = ['External\\nEnterprise', 'External\\nFree']\n",
+    "intensities2 = [external_enterprise_intensity, external_free_intensity]\n",
+    "activity_counts2 = [external_enterprise_activity, external_free_activity]\n",
+    "colors2 = [COLORS['success'], COLORS['cyan']]\n",
+    "\n",
+    "bars2 = ax2.bar(categories2, intensities2, color=colors2, alpha=0.8, width=0.6)\n",
+    "for bar, intensity, activities in zip(bars2, intensities2, activity_counts2):\n",
+    "    height = bar.get_height()\n",
+    "    ax2.text(bar.get_x() + bar.get_width()/2., height + max(intensities2)*0.02,\n",
+    "             f'{intensity:.1f}\\n({activities:,} activities)', ha='center', va='bottom', fontsize=11,\n",
+    "             bbox=dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.9))\n",
+    "\n",
+    "ax2.set_title('Error Intensity: External Enterprise vs Free', fontsize=12, fontweight='bold', pad=15)\n",
+    "ax2.set_ylabel('Errors per 1K Activities', fontsize=11)\n",
+    "ax2.grid(True, alpha=0.3, axis='y', linestyle=':')\n",
+    "ax2.spines['top'].set_visible(False)\n",
+    "ax2.spines['right'].set_visible(False)\n",
+    "\n",
+    "# Activity Distribution - Internal vs External\n",
+    "bars3 = ax3.bar(categories1, activity_counts1, color=colors1, alpha=0.8, width=0.6)\n",
+    "for bar, count in zip(bars3, activity_counts1):\n",
+    "    height = bar.get_height()\n",
+    "    ax3.text(bar.get_x() + bar.get_width()/2., height + max(activity_counts1)*0.02,\n",
+    "             f'{count:,}', ha='center', va='bottom', fontsize=11,\n",
+    "             bbox=dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.9))\n",
+    "\n",
+    "ax3.set_title('Activity Volume: Internal vs External', fontsize=12, fontweight='bold', pad=15)\n",
+    "ax3.set_ylabel('Total Activities', fontsize=11)\n",
+    "ax3.grid(True, alpha=0.3, axis='y', linestyle=':')\n",
+    "ax3.spines['top'].set_visible(False)\n",
+    "ax3.spines['right'].set_visible(False)\n",
+    "\n",
+    "# Activity Distribution - External Enterprise vs External Free\n",
+    "bars4 = ax4.bar(categories2, activity_counts2, color=colors2, alpha=0.8, width=0.6)\n",
+    "for bar, count in zip(bars4, activity_counts2):\n",
+    "    height = bar.get_height()\n",
+    "    ax4.text(bar.get_x() + bar.get_width()/2., height + max(activity_counts2)*0.02,\n",
+    "             f'{count:,}', ha='center', va='bottom', fontsize=11,\n",
+    "             bbox=dict(boxstyle='round,pad=0.4', facecolor='white', alpha=0.9))\n",
+    "\n",
+    "ax4.set_title('Activity Volume: External Enterprise vs Free', fontsize=12, fontweight='bold', pad=15)\n",
+    "ax4.set_ylabel('Total Activities', fontsize=11)\n",
+    "ax4.grid(True, alpha=0.3, axis='y', linestyle=':')\n",
+    "ax4.spines['top'].set_visible(False)\n",
+    "ax4.spines['right'].set_visible(False)\n",
+    "\n",
+    "plt.tight_layout()\n",
+    "plt.savefig('user_segment_intensity_comparison.png', dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "plt.show()"
+   ],
+   "id": "7ca4a6f32ece17e6",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# ============================================\n",
+    "# SEGMENT ANALYSIS FUNCTIONS\n",
+    "# ============================================\n",
+    "\n",
+    "def analyze_error_distribution_by_segment(df, segment_col, segment_values, segment_labels, error_type_col, top_n=10):\n",
+    "    \"\"\"\n",
+    "    Analyze error type distribution across user segments\n",
+    "    \"\"\"\n",
+    "    results = {}\n",
+    "\n",
+    "    for segment_val, segment_label in zip(segment_values, segment_labels):\n",
+    "        if segment_col == 'user_type':\n",
+    "            # Special handling for internal vs external\n",
+    "            if segment_val == 'internal':\n",
+    "                segment_df = df[df['is_deepset_user'] == True]\n",
+    "            else:\n",
+    "                segment_df = df[df['is_deepset_user'] == False]\n",
+    "        else:\n",
+    "            segment_df = df[df[segment_col] == segment_val]\n",
+    "\n",
+    "        # Get error type distribution\n",
+    "        error_counts = segment_df[error_type_col].value_counts().head(top_n)\n",
+    "        total_errors = len(segment_df)\n",
+    "\n",
+    "        results[segment_label] = {\n",
+    "            'counts': error_counts,\n",
+    "            'percentages': (error_counts / total_errors * 100),\n",
+    "            'total_errors': total_errors\n",
+    "        }\n",
+    "\n",
+    "    return results\n",
+    "\n",
+    "def create_comparison_plot(results, title, error_type_name, segment_labels, colors, filename):\n",
+    "    \"\"\"\n",
+    "    Create side-by-side comparison plots for error distributions (percentages only)\n",
+    "    \"\"\"\n",
+    "    fig, axes = plt.subplots(1, len(segment_labels), figsize=(6*len(segment_labels), 6), facecolor='white')\n",
+    "    fig.suptitle(title, fontsize=16, fontweight='bold', y=0.95)\n",
+    "\n",
+    "    # Handle single subplot case\n",
+    "    if len(segment_labels) == 1:\n",
+    "        axes = [axes]\n",
+    "\n",
+    "    # Percentages within each segment\n",
+    "    for i, (segment_label, color) in enumerate(zip(segment_labels, colors)):\n",
+    "        ax = axes[i]\n",
+    "        data = results[segment_label]['percentages']\n",
+    "        counts_data = results[segment_label]['counts']\n",
+    "\n",
+    "        bars = ax.barh(range(len(data)), data.values, color=color, alpha=0.8, height=0.7)\n",
+    "        ax.set_yticks(range(len(data)))\n",
+    "        ax.set_yticklabels([str(x).replace('_', ' ').title() if isinstance(x, str) else str(x)\n",
+    "                            for x in data.index], fontsize=10)\n",
+    "        ax.set_xlabel('Percentage of Errors (%)', fontsize=12)\n",
+    "        ax.set_title(f'{segment_label}\\n({results[segment_label][\"total_errors\"]:,} total errors)',\n",
+    "                     fontsize=12, fontweight='bold', pad=15)\n",
+    "        ax.grid(True, alpha=0.3, axis='x', linestyle=':')\n",
+    "\n",
+    "        # Add percentage and absolute count labels\n",
+    "        for bar, pct, error_type in zip(bars, data.values, data.index):\n",
+    "            count = counts_data[error_type]\n",
+    "            ax.text(pct + data.max()*0.01, bar.get_y() + bar.get_height()/2,\n",
+    "                    f'{pct:.1f}% ({count:,})', va='center', fontsize=9,\n",
+    "                    bbox=dict(boxstyle='round,pad=0.2', facecolor='white', alpha=0.8))\n",
+    "\n",
+    "        ax.spines['top'].set_visible(False)\n",
+    "        ax.spines['right'].set_visible(False)\n",
+    "        ax.spines['left'].set_visible(False)\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.savefig(filename, dpi=300, bbox_inches='tight', facecolor='white')\n",
+    "    plt.show()\n",
+    "\n",
+    "# ============================================\n",
+    "# ANALYSIS 1: INTERNAL VS EXTERNAL USERS\n",
+    "# ============================================\n",
+    "print(\"=\"*80)\n",
+    "print(\"ERROR DISTRIBUTION ANALYSIS BY USER SEGMENTS\")\n",
+    "print(\"=\"*80)\n",
+    "\n",
+    "# Error Classes: Internal vs External\n",
+    "print(\"\\n1. ERROR CLASSES: INTERNAL vs EXTERNAL USERS\")\n",
+    "print(\"-\" * 60)\n",
+    "\n",
+    "error_class_internal_external = analyze_error_distribution_by_segment(\n",
+    "    error_data, 'user_type', ['internal', 'external'],\n",
+    "    ['Internal (Deepset)', 'External (Customers)'], 'error_class', top_n=10\n",
+    ")\n",
+    "\n",
+    "create_comparison_plot(\n",
+    "    error_class_internal_external,\n",
+    "    'Error Class Distribution: Internal vs External Users',\n",
+    "    'Error Class',\n",
+    "    ['Internal (Deepset)', 'External (Customers)'],\n",
+    "    [COLORS['primary'], COLORS['secondary']],\n",
+    "    'error_classes_internal_vs_external.png'\n",
+    ")\n",
+    "\n",
+    "component_internal_external = analyze_error_distribution_by_segment(\n",
+    "    error_data, 'user_type', ['internal', 'external'],\n",
+    "    ['Internal (Deepset)', 'External (Customers)'], 'component_type', top_n=10\n",
+    ")\n",
+    "\n",
+    "create_comparison_plot(\n",
+    "    component_internal_external,\n",
+    "    'Component Type Distribution: Internal vs External Users',\n",
+    "    'Component Type',\n",
+    "    ['Internal (Deepset)', 'External (Customers)'],\n",
+    "    [COLORS['primary'], COLORS['secondary']],\n",
+    "    'component_types_internal_vs_external.png'\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# ============================================\n",
+    "# ANALYSIS 2: ENTERPRISE VS FREE USERS\n",
+    "# ============================================\n",
+    "print(f\"\\n\\n3. ERROR CLASSES: ENTERPRISE vs FREE USERS\")\n",
+    "print(\"-\" * 60)\n",
+    "\n",
+    "error_class_enterprise_free = analyze_error_distribution_by_segment(\n",
+    "    error_data, 'organization_type',\n",
+    "    ['FULL_DEEPSET_CLOUD', 'DEEPSET_STUDIO_WITH_LIMITS'],\n",
+    "    ['Enterprise', 'Free'], 'error_class', top_n=10\n",
+    ")\n",
+    "\n",
+    "create_comparison_plot(\n",
+    "    error_class_enterprise_free,\n",
+    "    'Error Class Distribution: Enterprise vs Free Users',\n",
+    "    'Error Class',\n",
+    "    ['Enterprise', 'Free'],\n",
+    "    [COLORS['success'], COLORS['cyan']],\n",
+    "    'error_classes_enterprise_vs_free.png'\n",
+    ")\n",
+    "\n",
+    "\n",
+    "component_enterprise_free = analyze_error_distribution_by_segment(\n",
+    "    error_data, 'organization_type',\n",
+    "    ['FULL_DEEPSET_CLOUD', 'DEEPSET_STUDIO_WITH_LIMITS'],\n",
+    "    ['Enterprise', 'Free'], 'component_type', top_n=10\n",
+    ")\n",
+    "\n",
+    "create_comparison_plot(\n",
+    "    component_enterprise_free,\n",
+    "    'Component Type Distribution: Enterprise vs Free Users',\n",
+    "    'Component Type',\n",
+    "    ['Enterprise', 'Free'],\n",
+    "    [COLORS['success'], COLORS['cyan']],\n",
+    "    'component_types_enterprise_vs_free.png'\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# ============================================\n",
+    "# ANALYSIS 3: EXTERNAL ENTERPRISE vs EXTERNAL FREE\n",
+    "# ============================================\n",
+    "print(f\"\\n\\n5. ERROR CLASSES: EXTERNAL ENTERPRISE vs EXTERNAL FREE\")\n",
+    "print(\"-\" * 60)\n",
+    "\n",
+    "# Filter for external users only\n",
+    "external_users = error_data[error_data['is_deepset_user'] == False]\n",
+    "\n",
+    "error_class_external_segments = analyze_error_distribution_by_segment(\n",
+    "    external_users, 'organization_type',\n",
+    "    ['FULL_DEEPSET_CLOUD', 'DEEPSET_STUDIO_WITH_LIMITS'],\n",
+    "    ['External Enterprise', 'External Free'], 'error_class', top_n=10\n",
+    ")\n",
+    "\n",
+    "create_comparison_plot(\n",
+    "    error_class_external_segments,\n",
+    "    'Error Class Distribution: External Enterprise vs External Free Users',\n",
+    "    'Error Class',\n",
+    "    ['External Enterprise', 'External Free'],\n",
+    "    [COLORS['purple'], COLORS['pink']],\n",
+    "    'error_classes_external_enterprise_vs_free.png'\n",
+    ")\n",
+    "\n",
+    "component_external_segments = analyze_error_distribution_by_segment(\n",
+    "    external_users, 'organization_type',\n",
+    "    ['FULL_DEEPSET_CLOUD', 'DEEPSET_STUDIO_WITH_LIMITS'],\n",
+    "    ['External Enterprise', 'External Free'], 'component_type', top_n=10\n",
+    ")\n",
+    "\n",
+    "create_comparison_plot(\n",
+    "    component_external_segments,\n",
+    "    'Component Type Distribution: External Enterprise vs External Free Users',\n",
+    "    'Component Type',\n",
+    "    ['External Enterprise', 'External Free'],\n",
+    "    [COLORS['purple'], COLORS['pink']],\n",
+    "    'component_types_external_enterprise_vs_free.png'\n",
+    ")\n"
+   ],
+   "id": "88038fcd25d25346",
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
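The core metric computed in `eda.ipynb` above is an activity-adjusted error rate: per-group error counts are left-joined onto per-group activity counts and reported as errors per 1,000 activity events. A minimal sketch of that calculation on made-up data (only pandas is required; the `week` grouping key and column names mirror the notebook, while the sample values are purely illustrative):

```python
import pandas as pd

# Made-up stand-ins for the notebook's error_data / activity_data frames.
errors = pd.DataFrame({"week": ["2025-05-05", "2025-05-05", "2025-05-12"]})
activity = pd.DataFrame({"week": ["2025-05-05"] * 40 + ["2025-05-12"] * 60})

error_counts = errors.groupby("week").size().reset_index(name="error_count")
activity_counts = activity.groupby("week").size().reset_index(name="activity_count")

# Left-join on the grouping key so groups with activity but no errors keep a count of 0.
merged = pd.merge(activity_counts, error_counts, on="week", how="left")
merged["error_count"] = merged["error_count"].fillna(0)

# Errors per 1,000 activity events, as in calculate_activity_adjusted_metrics.
merged["errors_per_1k_activity"] = merged["error_count"] / merged["activity_count"] * 1000
print(merged)
# week 2025-05-05: 2 errors / 40 activities -> 50.0 per 1K
# week 2025-05-12: 1 error  / 60 activities -> ~16.7 per 1K
```

Joining from the activity side, as the notebook does, keeps segments that saw activity but produced no validation errors in the result with an intensity of 0 rather than dropping them.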