lopace 0.1.6.dev2__py3-none-any.whl → 0.1.6.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lopace/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.1.6.dev2'
32
- __version_tuple__ = version_tuple = (0, 1, 6, 'dev2')
31
+ __version__ = version = '0.1.6.dev3'
32
+ __version_tuple__ = version_tuple = (0, 1, 6, 'dev3')
33
33
 
34
34
  __commit_id__ = commit_id = None
lopace/compressor.py CHANGED
@@ -126,7 +126,8 @@ class PromptCompressor:
126
126
  >>> original = compressor.decompress_token(compressed)
127
127
  """
128
128
  # Step 1: Convert text to list of token IDs
129
- token_ids = list(self.tokenizer.encode(text)) # Ensure it's a list
129
+ # Allow special tokens to be encoded as normal text (disable the check)
130
+ token_ids = list(self.tokenizer.encode(text, disallowed_special=())) # Ensure it's a list
130
131
 
131
132
  if not token_ids:
132
133
  # Empty token list - return just format byte
@@ -221,7 +222,8 @@ class PromptCompressor:
221
222
  >>> original = compressor.decompress_hybrid(compressed)
222
223
  """
223
224
  # Step 1: Tokenize
224
- tokens = list(self.tokenizer.encode(text)) # Ensure it's a list
225
+ # Allow special tokens to be encoded as normal text (disable the check)
226
+ tokens = list(self.tokenizer.encode(text, disallowed_special=())) # Ensure it's a list
225
227
 
226
228
  if not tokens:
227
229
  # Empty token list - return compressed empty data
@@ -393,7 +395,7 @@ class PromptCompressor:
393
395
  original_size = len(text.encode('utf-8'))
394
396
  stats = {
395
397
  'original_size_bytes': original_size,
396
- 'original_size_tokens': len(self.tokenizer.encode(text)),
398
+ 'original_size_tokens': len(self.tokenizer.encode(text, disallowed_special=())),
397
399
  'methods': {}
398
400
  }
399
401
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lopace
3
- Version: 0.1.6.dev2
3
+ Version: 0.1.6.dev3
4
4
  Summary: Lossless Optimized Prompt Accurate Compression Engine
5
5
  Home-page: https://github.com/connectaman/LoPace
6
6
  Author: Aman Ulla
@@ -0,0 +1,14 @@
1
+ lopace/__init__.py,sha256=1X75FQdnzJaDueL_xRGTBn3vvIAXR1T_siSA_mW391E,443
2
+ lopace/_version.py,sha256=i2UCDvrkFPHmTq7NY-S1iwtQtX1H6Bze5geryvTCYAk,717
3
+ lopace/compressor.py,sha256=caWBQUl1hWJ8jTvL5hm_XdZpJirlsXZ2NyB2WldP5A4,19289
4
+ lopace-0.1.6.dev3.dist-info/licenses/LICENSE,sha256=uFUrlsfsOwx_8Nzhq2pUgNaJghcJxXBMML3l7T39Tm0,1067
5
+ scripts/README.md,sha256=UEsrHKKfiEixTMtWV8trYBKnzgkImJxxEnXTyDI4r9g,2226
6
+ scripts/__init__.py,sha256=XLq0VmLoEBfnWjzYmxb_JRzAIqwZDv-2s10TO692TLc,59
7
+ scripts/generate_visualizations.py,sha256=rAwzqDc5l1vi5nya5zUlLL4om4CCTDCPcLZ8mg0NUzE,34870
8
+ scripts/requirements.txt,sha256=EvUUoksfGtvbA45zkCG8to1EaPzWv1eurCONAp8Pdx4,112
9
+ tests/__init__.py,sha256=yXNVJE20E2iHo0qbit5SgRE35eXWq89F1kkhNHy7VJA,31
10
+ tests/test_compressor.py,sha256=-vMztSzY89n5dpShcACrFboEQOlfJ6FxF7eQOEU3swM,8273
11
+ lopace-0.1.6.dev3.dist-info/METADATA,sha256=kl7_6wf_a-3JCHxqoREE7YmOaKiAE9_eg8YV0aTfd0s,20788
12
+ lopace-0.1.6.dev3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
13
+ lopace-0.1.6.dev3.dist-info/top_level.txt,sha256=k-gL-51ulMq50vhNS91c1eyGRNse0vs_PzS9VdAiYlw,21
14
+ lopace-0.1.6.dev3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.10.1)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -7,11 +7,12 @@ import os
7
7
  import sys
8
8
  import time
9
9
  import tracemalloc
10
+ import json
10
11
  from pathlib import Path
11
12
  from typing import List, Dict, Tuple
12
13
  import numpy as np
13
14
  import matplotlib
14
- matplotlib.use('SVG') # Use SVG backend for vector graphics
15
+ matplotlib.use('Agg') # Use Agg backend which supports both SVG and PNG
15
16
  import matplotlib.pyplot as plt
16
17
  import seaborn as sns
17
18
  from matplotlib.patches import Rectangle
@@ -23,6 +24,15 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
23
24
  from lopace import PromptCompressor, CompressionMethod
24
25
 
25
26
 
27
+ def save_both_formats(output_dir: Path, filename_base: str):
28
+ """Save figure in both SVG and high-quality PNG formats."""
29
+ # Save SVG
30
+ plt.savefig(output_dir / f'{filename_base}.svg', format='svg', bbox_inches='tight', dpi=300)
31
+ # Save PNG with high quality
32
+ plt.savefig(output_dir / f'{filename_base}.png', format='png', bbox_inches='tight', dpi=300, facecolor='white')
33
+ print(f" Saved: {filename_base}.svg and {filename_base}.png")
34
+
35
+
26
36
  # Set style for publication-quality plots
27
37
  sns.set_style("whitegrid")
28
38
  plt.rcParams.update({
@@ -37,7 +47,7 @@ plt.rcParams.update({
37
47
  'figure.titlesize': 18,
38
48
  'figure.dpi': 300,
39
49
  'savefig.dpi': 300,
40
- 'savefig.format': 'svg',
50
+ 'savefig.format': 'png', # Default format, but we'll save both
41
51
  'svg.fonttype': 'none', # Editable text in SVG
42
52
  'mathtext.default': 'regular',
43
53
  'axes.linewidth': 1.2,
@@ -47,125 +57,34 @@ plt.rcParams.update({
47
57
  })
48
58
 
49
59
 
50
- def generate_test_prompts() -> List[Tuple[str, str]]:
51
- """Generate test prompts of various sizes."""
60
+ def load_prompts_from_jsonl(jsonl_path: Path) -> List[Tuple[str, str]]:
61
+ """Load prompts from JSONL file. Returns list of (title, prompt_text) tuples."""
52
62
  prompts = []
53
63
 
54
- # Small prompts (50-200 chars)
55
- small_prompts = [
56
- "You are a helpful AI assistant.",
57
- "Translate the following text to French.",
58
- "Summarize this document in 3 sentences.",
59
- "You are an expert Python developer.",
60
- ]
61
-
62
- # Medium prompts (500-2000 chars)
63
- medium_prompts = [
64
- """You are a helpful AI assistant designed to provide accurate,
65
- detailed, and helpful responses to user queries. Your goal is to assist users
66
- by understanding their questions and providing relevant information, explanations,
67
- or guidance. Always be respectful, clear, and concise in your communications.
68
- If you are uncertain about something, it's better to acknowledge that uncertainty
69
- rather than provide potentially incorrect information.""",
70
-
71
- """As an advanced language model, your primary function is to understand
72
- and respond to user inputs in a helpful, accurate, and safe manner. You should
73
- provide informative answers, assist with problem-solving, engage in creative
74
- writing tasks, and support various learning activities. Maintain objectivity,
75
- cite sources when appropriate, and always prioritize user safety and ethical
76
- considerations in your responses.""",
77
-
78
- """You are a professional software engineer with expertise in multiple
79
- programming languages including Python, JavaScript, Java, and C++. Your role
80
- is to help users write clean, efficient, and maintainable code. Provide
81
- code examples, explain best practices, debug issues, and suggest improvements.
82
- Always consider performance, security, and scalability in your recommendations.""",
83
- ]
84
-
85
- # Large prompts (5000-20000 chars)
86
- large_prompts = [
87
- """You are a comprehensive AI assistant specializing in technical documentation
88
- and educational content. Your expertise spans multiple domains including computer science,
89
- data science, machine learning, software engineering, and web development. When responding
90
- to queries, you should provide thorough explanations, include relevant examples, and
91
- structure your responses in a clear and organized manner. Always aim to educate while
92
- solving problems. Break down complex concepts into digestible parts, use analogies when
93
- helpful, and provide practical applications of theoretical knowledge. Maintain accuracy
94
- by acknowledging when you're uncertain and suggest reliable sources for further learning.
95
- Your communication style should be professional yet accessible, avoiding unnecessary
96
- jargon while ensuring precision in technical details. Consider the user's background
97
- and adjust your explanation depth accordingly. For code-related queries, always provide
98
- complete, working examples with comments explaining key parts. For conceptual questions,
99
- use diagrams, step-by-step breakdowns, and real-world analogies. When discussing
100
- best practices, explain not just what to do but why, including trade-offs and
101
- alternative approaches. Your goal is to empower users with knowledge and skills
102
- rather than just providing answers. Encourage critical thinking, experimentation,
103
- and continuous learning. Address potential pitfalls, common mistakes, and how to
104
- avoid them. Provide context about industry standards and emerging trends when relevant.
105
- Remember that effective teaching involves understanding the learner's perspective,
106
- patience, and encouragement. Always prioritize clarity, accuracy, and educational value
107
- in every interaction. Balance thoroughness with conciseness, ensuring responses are
108
- comprehensive yet not overwhelming. Use formatting effectively to improve readability,
109
- including bullet points, numbered lists, and section headers when appropriate.""",
110
-
111
- """System Prompt for Advanced Multi-Modal AI Assistant: This AI system is designed
112
- to be a versatile, intelligent, and highly capable assistant that can handle a wide range
113
- of tasks across multiple domains. The system integrates natural language processing,
114
- reasoning capabilities, knowledge retrieval, and contextual understanding to provide
115
- comprehensive support. Primary capabilities include question answering, problem-solving,
116
- creative tasks, analysis, code generation, data interpretation, and educational support.
117
- The assistant maintains a knowledge base spanning science, technology, humanities,
118
- business, arts, and current events. When interacting with users, the system should
119
- prioritize accuracy, helpfulness, safety, and ethical considerations. Responses should
120
- be well-structured, clear, and appropriately detailed based on the complexity of the query.
121
- The assistant should ask clarifying questions when necessary, acknowledge limitations,
122
- and provide sources or references when making factual claims. For technical questions,
123
- provide detailed explanations with examples. For creative tasks, demonstrate imagination
124
- while maintaining coherence and appropriateness. For analytical tasks, show step-by-step
125
- reasoning and present conclusions clearly. The system should adapt its communication style
126
- to match the user's level of expertise and the context of the conversation. Always aim
127
- to be constructive, respectful, and professional. When dealing with sensitive topics,
128
- exercise caution and provide balanced perspectives. For coding tasks, write clean,
129
- well-commented code following best practices. For writing tasks, ensure proper grammar,
130
- style, and structure. The assistant should continuously learn from interactions while
131
- maintaining core principles and guidelines. It should handle ambiguity gracefully,
132
- provide multiple perspectives when appropriate, and help users think critically about
133
- complex issues. The system is designed to be a tool for empowerment, education, and
134
- efficient problem-solving.""",
135
- ]
136
-
137
- # Combine and label prompts
138
- for prompt in small_prompts:
139
- prompts.append(("Small", prompt))
140
-
141
- for prompt in medium_prompts:
142
- prompts.append(("Medium", prompt))
143
-
144
- for large_prompts_list in large_prompts:
145
- prompts.append(("Large", large_prompts_list))
146
-
147
- # Add one more large prompt if needed
148
- if len(prompts) < 10:
149
- additional_large = """Comprehensive System Prompt for Advanced AI Assistant: This sophisticated
150
- artificial intelligence system represents a state-of-the-art language model designed to excel across
151
- a multitude of domains and applications. The system integrates deep learning architectures, extensive
152
- knowledge bases, and advanced reasoning capabilities to provide exceptional assistance. Core competencies
153
- include natural language understanding and generation, logical reasoning, creative problem-solving,
154
- technical expertise, and ethical decision-making. The assistant maintains extensive knowledge spanning
155
- STEM fields, humanities, arts, business, law, medicine, and contemporary issues. When engaging with users,
156
- the system employs sophisticated contextual understanding, adapts communication styles appropriately,
157
- and provides nuanced, well-reasoned responses. The architecture supports multi-modal interactions,
158
- real-time learning, and seamless integration with external tools and databases. Quality assurance
159
- mechanisms ensure accuracy, relevance, and safety in all outputs. The system demonstrates exceptional
160
- capabilities in code generation and analysis, creative writing, data analysis, educational instruction,
161
- research assistance, and complex problem decomposition. Advanced features include meta-cognitive reasoning,
162
- uncertainty quantification, bias detection and mitigation, and explainable AI principles. The assistant
163
- prioritizes user empowerment through education, transparency, and collaborative problem-solving approaches."""
164
- prompts.append(("Large", additional_large))
165
-
166
- # Ensure we have exactly 10 prompts
167
- prompts = prompts[:10]
168
-
64
+ if not jsonl_path.exists():
65
+ raise FileNotFoundError(f"JSONL file not found: {jsonl_path}")
66
+
67
+ print(f"Loading prompts from: {jsonl_path}")
68
+
69
+ with open(jsonl_path, 'r', encoding='utf-8') as f:
70
+ for line_num, line in enumerate(f, 1):
71
+ if not line.strip():
72
+ continue
73
+
74
+ try:
75
+ data = json.loads(line)
76
+ # Extract markdown content (the actual prompt text)
77
+ markdown = data.get('markdown', '')
78
+ title = data.get('title', f'Prompt {line_num}')
79
+
80
+ # Use markdown as the prompt text
81
+ if markdown and len(markdown.strip()) > 0:
82
+ prompts.append((title, markdown))
83
+ except json.JSONDecodeError as e:
84
+ print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
85
+ continue
86
+
87
+ print(f"Loaded {len(prompts)} prompts from JSONL file")
169
88
  return prompts
170
89
 
171
90
 
@@ -223,21 +142,22 @@ def measure_compression(
223
142
  return metrics
224
143
 
225
144
 
226
- def run_benchmarks() -> pd.DataFrame:
145
+ def run_benchmarks(jsonl_path: Path) -> pd.DataFrame:
227
146
  """Run compression benchmarks on all prompts and methods."""
228
147
  compressor = PromptCompressor(model="cl100k_base", zstd_level=15)
229
- prompts = generate_test_prompts()
148
+ prompts = load_prompts_from_jsonl(jsonl_path)
230
149
 
231
150
  all_results = []
232
151
 
233
152
  print("Running benchmarks...")
234
- for idx, (category, prompt) in enumerate(prompts, 1):
235
- print(f" Processing prompt {idx}/10 ({category}, {len(prompt)} chars)...")
153
+ total_prompts = len(prompts)
154
+ for idx, (title, prompt) in enumerate(prompts, 1):
155
+ print(f" Processing prompt {idx}/{total_prompts} ({len(prompt)} chars)...")
236
156
 
237
157
  for method in [CompressionMethod.ZSTD, CompressionMethod.TOKEN, CompressionMethod.HYBRID]:
238
158
  metrics = measure_compression(compressor, prompt, method)
239
159
  metrics['prompt_id'] = idx
240
- metrics['prompt_category'] = category
160
+ metrics['prompt_title'] = title
241
161
  metrics['prompt_length'] = len(prompt)
242
162
  all_results.append(metrics)
243
163
 
@@ -246,10 +166,10 @@ def run_benchmarks() -> pd.DataFrame:
246
166
 
247
167
 
248
168
  def plot_compression_ratio(df: pd.DataFrame, output_dir: Path):
249
- """Plot compression ratios by method and prompt size."""
169
+ """Plot compression ratios by method."""
250
170
  fig, axes = plt.subplots(1, 2, figsize=(14, 6))
251
171
 
252
- # Left: Compression ratio by method
172
+ # Left: Compression ratio by method (boxplot)
253
173
  ax1 = axes[0]
254
174
  method_order = ['zstd', 'token', 'hybrid']
255
175
  method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
@@ -270,83 +190,71 @@ def plot_compression_ratio(df: pd.DataFrame, output_dir: Path):
270
190
  ax1.grid(True, alpha=0.3, linestyle='--')
271
191
  ax1.set_ylim(bottom=0)
272
192
 
273
- # Right: Compression ratio by prompt category
193
+ # Right: Compression ratio vs prompt length (scatter/line plot)
274
194
  ax2 = axes[1]
275
- categories = ['Small', 'Medium', 'Large']
276
- category_data = []
277
-
278
- for category in categories:
279
- category_df = df[df['prompt_category'] == category]
280
- method_data = [category_df[category_df['method'] == m]['compression_ratio'].mean()
281
- for m in method_order]
282
- category_data.append(method_data)
283
-
284
- x = np.arange(len(categories))
285
- width = 0.25
286
-
287
- for i, (method, color) in enumerate(zip(method_labels, colors)):
288
- values = [category_data[j][i] for j in range(len(categories))]
289
- ax2.bar(x + i * width, values, width, label=method, color=color, alpha=0.8)
195
+ method_order = ['zstd', 'token', 'hybrid']
196
+ method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
197
+ colors = ['#3498db', '#2ecc71', '#9b59b6']
290
198
 
291
- ax2.set_ylabel('Mean Compression Ratio', fontweight='bold')
292
- ax2.set_xlabel('Prompt Category', fontweight='bold')
293
- ax2.set_title('(b) Compression Ratio by Prompt Size', fontweight='bold', pad=15)
294
- ax2.set_xticks(x + width)
295
- ax2.set_xticklabels(categories)
199
+ for method, label, color in zip(method_order, method_labels, colors):
200
+ method_df = df[df['method'] == method]
201
+ ax2.scatter(method_df['prompt_length'], method_df['compression_ratio'],
202
+ label=label, color=color, alpha=0.6, s=50)
203
+
204
+ # Add trend line
205
+ if len(method_df) > 1:
206
+ z = np.polyfit(method_df['prompt_length'], method_df['compression_ratio'], 1)
207
+ p = np.poly1d(z)
208
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
209
+ ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
210
+ linewidth=2, alpha=0.8)
211
+
212
+ ax2.set_ylabel('Compression Ratio', fontweight='bold')
213
+ ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
214
+ ax2.set_title('(b) Compression Ratio vs Prompt Length', fontweight='bold', pad=15)
296
215
  ax2.legend(loc='upper left', framealpha=0.9)
297
- ax2.grid(True, alpha=0.3, linestyle='--', axis='y')
216
+ ax2.grid(True, alpha=0.3, linestyle='--')
298
217
  ax2.set_ylim(bottom=0)
218
+ ax2.set_xscale('log')
299
219
 
300
220
  plt.tight_layout()
301
- plt.savefig(output_dir / 'compression_ratio.svg', format='svg', bbox_inches='tight')
221
+ save_both_formats(output_dir, 'compression_ratio')
302
222
  plt.close()
303
- print(f" Saved: compression_ratio.svg")
304
223
 
305
224
 
306
225
  def plot_space_savings(df: pd.DataFrame, output_dir: Path):
307
226
  """Plot space savings percentages."""
308
227
  fig, ax = plt.subplots(figsize=(12, 7))
309
228
 
310
- categories = ['Small', 'Medium', 'Large']
311
229
  method_order = ['zstd', 'token', 'hybrid']
312
230
  method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
313
231
  colors = ['#3498db', '#2ecc71', '#9b59b6']
314
232
 
315
- x = np.arange(len(categories))
316
- width = 0.25
233
+ # Create boxplot for space savings by method
234
+ data_by_method = [df[df['method'] == m]['space_savings_percent'].values for m in method_order]
317
235
 
318
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
319
- means = []
320
- stds = []
321
- for category in categories:
322
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
323
- means.append(subset['space_savings_percent'].mean())
324
- stds.append(subset['space_savings_percent'].std())
325
-
326
- bars = ax.bar(x + i * width, means, width, label=label, color=color, alpha=0.8,
327
- yerr=stds, capsize=5, error_kw={'elinewidth': 2, 'capthick': 2})
236
+ bp = ax.boxplot(data_by_method, labels=method_labels, patch_artist=True,
237
+ widths=0.6, showmeans=True, meanline=True)
238
+
239
+ for patch, color in zip(bp['boxes'], colors):
240
+ patch.set_facecolor(color)
241
+ patch.set_alpha(0.7)
328
242
 
329
243
  ax.set_ylabel('Space Savings (%)', fontweight='bold')
330
- ax.set_xlabel('Prompt Category', fontweight='bold')
331
- ax.set_title('Space Savings by Compression Method and Prompt Size', fontweight='bold', pad=15)
332
- ax.set_xticks(x + width)
333
- ax.set_xticklabels(categories)
334
- ax.legend(loc='upper left', framealpha=0.9, ncol=3)
244
+ ax.set_xlabel('Compression Method', fontweight='bold')
245
+ ax.set_title('Space Savings by Compression Method', fontweight='bold', pad=15)
335
246
  ax.grid(True, alpha=0.3, linestyle='--', axis='y')
336
247
  ax.set_ylim(0, 100)
337
248
 
338
- # Add value labels on bars
339
- for i, method in enumerate(method_order):
340
- for j, category in enumerate(categories):
341
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
342
- mean_val = subset['space_savings_percent'].mean()
343
- ax.text(j + i * width, mean_val + 2, f'{mean_val:.1f}%',
344
- ha='center', va='bottom', fontsize=10, fontweight='bold')
249
+ # Add mean value annotations
250
+ for i, (method, label) in enumerate(zip(method_order, method_labels)):
251
+ mean_val = df[df['method'] == method]['space_savings_percent'].mean()
252
+ ax.text(i + 1, mean_val + 3, f'Mean: {mean_val:.1f}%',
253
+ ha='center', va='bottom', fontsize=11, fontweight='bold', color=colors[i])
345
254
 
346
255
  plt.tight_layout()
347
- plt.savefig(output_dir / 'space_savings.svg', format='svg', bbox_inches='tight')
256
+ save_both_formats(output_dir, 'space_savings')
348
257
  plt.close()
349
- print(f" Saved: space_savings.svg")
350
258
 
351
259
 
352
260
  def plot_disk_size_comparison(df: pd.DataFrame, output_dir: Path):
@@ -357,68 +265,63 @@ def plot_disk_size_comparison(df: pd.DataFrame, output_dir: Path):
357
265
  method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
358
266
  colors = ['#3498db', '#2ecc71', '#9b59b6']
359
267
 
360
- # Top: Stacked bar chart showing original vs compressed
268
+ # Top: Scatter plot showing original vs compressed sizes
361
269
  ax1 = axes[0]
362
- categories = ['Small', 'Medium', 'Large']
363
270
 
364
- x = np.arange(len(categories))
271
+ # Get unique prompts (by prompt_id)
272
+ unique_prompts = df.groupby('prompt_id').first()
273
+ original_sizes = unique_prompts['original_size_bytes'].values / 1024 # KB
274
+
275
+ x_pos = np.arange(len(unique_prompts))
365
276
  width = 0.25
366
277
 
367
- for category_idx, category in enumerate(categories):
368
- category_df = df[df['prompt_category'] == category]
369
- original_sizes = category_df.groupby('prompt_id')['original_size_bytes'].first().mean()
370
-
371
- compressed_means = []
372
- for method in method_order:
373
- method_df = category_df[category_df['method'] == method]
374
- compressed_means.append(method_df['compressed_size_bytes'].mean())
375
-
376
- # Stack bars
377
- bottom = 0
378
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
379
- if i == 0:
380
- # First method: show original vs compressed
381
- ax1.bar(category_idx + i * width, original_sizes / 1024, width,
382
- label='Original Size' if category_idx == 0 else '', color='#e74c3c', alpha=0.7)
383
- ax1.bar(category_idx + i * width, compressed_means[i] / 1024, width,
384
- bottom=0, label=label if category_idx == 0 else '', color=color, alpha=0.8)
385
- else:
386
- # Other methods: just compressed size
387
- ax1.bar(category_idx + i * width, compressed_means[i] / 1024, width,
388
- label=label if category_idx == 0 else '', color=color, alpha=0.8)
278
+ # Plot original size
279
+ ax1.bar(x_pos - width, original_sizes, width, label='Original Size',
280
+ color='#e74c3c', alpha=0.7)
281
+
282
+ # Plot compressed sizes for each method
283
+ for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
284
+ method_df = df[df['method'] == method].sort_values('prompt_id')
285
+ compressed_sizes = method_df['compressed_size_bytes'].values / 1024 # KB
286
+ ax1.bar(x_pos + i * width, compressed_sizes, width, label=label,
287
+ color=color, alpha=0.8)
389
288
 
390
289
  ax1.set_ylabel('Size (KB)', fontweight='bold')
391
- ax1.set_xlabel('Prompt Category', fontweight='bold')
290
+ ax1.set_xlabel('Prompt ID', fontweight='bold')
392
291
  ax1.set_title('Disk Size: Original vs Compressed', fontweight='bold', pad=15)
393
- ax1.set_xticks(x + width)
394
- ax1.set_xticklabels(categories)
292
+ ax1.set_xticks(x_pos)
293
+ ax1.set_xticklabels([f'P{i+1}' for i in range(len(unique_prompts))], rotation=45, ha='right')
395
294
  ax1.legend(loc='upper left', framealpha=0.9, ncol=4)
396
295
  ax1.grid(True, alpha=0.3, linestyle='--', axis='y')
397
296
  ax1.set_yscale('log')
398
297
 
399
- # Bottom: Percentage reduction
298
+ # Bottom: Space savings distribution
400
299
  ax2 = axes[1]
401
300
 
402
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
403
- means = []
404
- for category in categories:
405
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
406
- means.append(subset['space_savings_percent'].mean())
301
+ for method, label, color in zip(method_order, method_labels, colors):
302
+ method_df = df[df['method'] == method]
303
+ ax2.scatter(method_df['prompt_length'], method_df['space_savings_percent'],
304
+ label=label, color=color, alpha=0.6, s=50)
407
305
 
408
- ax2.plot(categories, means, marker='o', linewidth=2.5, markersize=10,
409
- label=label, color=color, markerfacecolor=color, markeredgewidth=2)
306
+ # Add trend line
307
+ if len(method_df) > 1:
308
+ z = np.polyfit(method_df['prompt_length'], method_df['space_savings_percent'], 1)
309
+ p = np.poly1d(z)
310
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
311
+ ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
312
+ linewidth=2, alpha=0.8)
410
313
 
411
- ax2.set_ylabel('Size Reduction (%)', fontweight='bold')
412
- ax2.set_xlabel('Prompt Category', fontweight='bold')
413
- ax2.set_title('Size Reduction by Prompt Category', fontweight='bold', pad=15)
314
+ ax2.set_ylabel('Space Savings (%)', fontweight='bold')
315
+ ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
316
+ ax2.set_title('Space Savings vs Prompt Length', fontweight='bold', pad=15)
414
317
  ax2.legend(loc='best', framealpha=0.9)
415
318
  ax2.grid(True, alpha=0.3, linestyle='--')
416
319
  ax2.set_ylim(0, 100)
320
+ ax2.set_xscale('log')
417
321
 
418
322
  plt.tight_layout()
419
- plt.savefig(output_dir / 'disk_size_comparison.svg', format='svg', bbox_inches='tight')
323
+ save_both_formats(output_dir, 'disk_size_comparison')
420
324
  plt.close()
421
- print(f" Saved: disk_size_comparison.svg")
422
325
 
423
326
 
424
327
  def plot_speed_metrics(df: pd.DataFrame, output_dir: Path):
@@ -428,89 +331,102 @@ def plot_speed_metrics(df: pd.DataFrame, output_dir: Path):
428
331
  method_order = ['zstd', 'token', 'hybrid']
429
332
  method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
430
333
  colors = ['#3498db', '#2ecc71', '#9b59b6']
431
- categories = ['Small', 'Medium', 'Large']
432
334
 
433
- # Top-left: Compression time
335
+ # Top-left: Compression time vs prompt length
434
336
  ax1 = axes[0, 0]
435
- x = np.arange(len(categories))
436
- width = 0.25
437
-
438
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
439
- means = []
440
- for category in categories:
441
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
442
- means.append(subset['compression_time_ms'].mean())
337
+ for method, label, color in zip(method_order, method_labels, colors):
338
+ method_df = df[df['method'] == method]
339
+ ax1.scatter(method_df['prompt_length'], method_df['compression_time_ms'],
340
+ label=label, color=color, alpha=0.6, s=50)
443
341
 
444
- ax1.bar(x + i * width, means, width, label=label, color=color, alpha=0.8)
342
+ # Add trend line
343
+ if len(method_df) > 1:
344
+ z = np.polyfit(method_df['prompt_length'], method_df['compression_time_ms'], 1)
345
+ p = np.poly1d(z)
346
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
347
+ ax1.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
348
+ linewidth=2, alpha=0.8)
445
349
 
446
350
  ax1.set_ylabel('Compression Time (ms)', fontweight='bold')
447
- ax1.set_xlabel('Prompt Category', fontweight='bold')
448
- ax1.set_title('(a) Compression Time', fontweight='bold', pad=15)
449
- ax1.set_xticks(x + width)
450
- ax1.set_xticklabels(categories)
351
+ ax1.set_xlabel('Prompt Length (characters)', fontweight='bold')
352
+ ax1.set_title('(a) Compression Time vs Prompt Length', fontweight='bold', pad=15)
451
353
  ax1.legend(framealpha=0.9)
452
- ax1.grid(True, alpha=0.3, linestyle='--', axis='y')
354
+ ax1.grid(True, alpha=0.3, linestyle='--')
453
355
  ax1.set_yscale('log')
356
+ ax1.set_xscale('log')
454
357
 
455
- # Top-right: Decompression time
358
+ # Top-right: Decompression time vs prompt length
456
359
  ax2 = axes[0, 1]
457
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
458
- means = []
459
- for category in categories:
460
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
461
- means.append(subset['decompression_time_ms'].mean())
360
+ for method, label, color in zip(method_order, method_labels, colors):
361
+ method_df = df[df['method'] == method]
362
+ ax2.scatter(method_df['prompt_length'], method_df['decompression_time_ms'],
363
+ label=label, color=color, alpha=0.6, s=50)
462
364
 
463
- ax2.bar(x + i * width, means, width, label=label, color=color, alpha=0.8)
365
+ # Add trend line
366
+ if len(method_df) > 1:
367
+ z = np.polyfit(method_df['prompt_length'], method_df['decompression_time_ms'], 1)
368
+ p = np.poly1d(z)
369
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
370
+ ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
371
+ linewidth=2, alpha=0.8)
464
372
 
465
373
  ax2.set_ylabel('Decompression Time (ms)', fontweight='bold')
466
- ax2.set_xlabel('Prompt Category', fontweight='bold')
467
- ax2.set_title('(b) Decompression Time', fontweight='bold', pad=15)
468
- ax2.set_xticks(x + width)
469
- ax2.set_xticklabels(categories)
374
+ ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
375
+ ax2.set_title('(b) Decompression Time vs Prompt Length', fontweight='bold', pad=15)
470
376
  ax2.legend(framealpha=0.9)
471
- ax2.grid(True, alpha=0.3, linestyle='--', axis='y')
377
+ ax2.grid(True, alpha=0.3, linestyle='--')
472
378
  ax2.set_yscale('log')
379
+ ax2.set_xscale('log')
473
380
 
474
- # Bottom-left: Compression throughput
381
+ # Bottom-left: Compression throughput vs prompt length
475
382
  ax3 = axes[1, 0]
476
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
477
- means = []
478
- for category in categories:
479
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
480
- means.append(subset['compression_throughput_mbps'].mean())
383
+ for method, label, color in zip(method_order, method_labels, colors):
384
+ method_df = df[df['method'] == method]
385
+ ax3.scatter(method_df['prompt_length'], method_df['compression_throughput_mbps'],
386
+ label=label, color=color, alpha=0.6, s=50)
481
387
 
482
- ax3.plot(categories, means, marker='o', linewidth=2.5, markersize=10,
483
- label=label, color=color, markerfacecolor=color, markeredgewidth=2)
388
+ # Add trend line
389
+ if len(method_df) > 1:
390
+ z = np.polyfit(method_df['prompt_length'], method_df['compression_throughput_mbps'], 1)
391
+ p = np.poly1d(z)
392
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
393
+ ax3.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
394
+ linewidth=2, alpha=0.8)
484
395
 
485
396
  ax3.set_ylabel('Throughput (MB/s)', fontweight='bold')
486
- ax3.set_xlabel('Prompt Category', fontweight='bold')
487
- ax3.set_title('(c) Compression Throughput', fontweight='bold', pad=15)
397
+ ax3.set_xlabel('Prompt Length (characters)', fontweight='bold')
398
+ ax3.set_title('(c) Compression Throughput vs Prompt Length', fontweight='bold', pad=15)
488
399
  ax3.legend(framealpha=0.9)
489
400
  ax3.grid(True, alpha=0.3, linestyle='--')
490
401
  ax3.set_ylim(bottom=0)
402
+ ax3.set_xscale('log')
491
403
 
492
- # Bottom-right: Decompression throughput
404
+ # Bottom-right: Decompression throughput vs prompt length
493
405
  ax4 = axes[1, 1]
494
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
495
- means = []
496
- for category in categories:
497
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
498
- means.append(subset['decompression_throughput_mbps'].mean())
406
+ for method, label, color in zip(method_order, method_labels, colors):
407
+ method_df = df[df['method'] == method]
408
+ ax4.scatter(method_df['prompt_length'], method_df['decompression_throughput_mbps'],
409
+ label=label, color=color, alpha=0.6, s=50)
499
410
 
500
- ax4.plot(categories, means, marker='s', linewidth=2.5, markersize=10,
501
- label=label, color=color, markerfacecolor=color, markeredgewidth=2)
411
+ # Add trend line
412
+ if len(method_df) > 1:
413
+ z = np.polyfit(method_df['prompt_length'], method_df['decompression_throughput_mbps'], 1)
414
+ p = np.poly1d(z)
415
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
416
+ ax4.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
417
+ linewidth=2, alpha=0.8)
502
418
 
503
419
  ax4.set_ylabel('Throughput (MB/s)', fontweight='bold')
504
- ax4.set_xlabel('Prompt Category', fontweight='bold')
505
- ax4.set_title('(d) Decompression Throughput', fontweight='bold', pad=15)
420
+ ax4.set_xlabel('Prompt Length (characters)', fontweight='bold')
421
+ ax4.set_title('(d) Decompression Throughput vs Prompt Length', fontweight='bold', pad=15)
506
422
  ax4.legend(framealpha=0.9)
507
423
  ax4.grid(True, alpha=0.3, linestyle='--')
508
424
  ax4.set_ylim(bottom=0)
425
+ ax4.set_xscale('log')
509
426
 
510
427
  plt.tight_layout()
511
- plt.savefig(output_dir / 'speed_metrics.svg', format='svg', bbox_inches='tight')
428
+ save_both_formats(output_dir, 'speed_metrics')
512
429
  plt.close()
513
- print(f" Saved: speed_metrics.svg")
514
430
 
515
431
 
516
432
  def plot_memory_usage(df: pd.DataFrame, output_dir: Path):
@@ -520,59 +436,56 @@ def plot_memory_usage(df: pd.DataFrame, output_dir: Path):
520
436
  method_order = ['zstd', 'token', 'hybrid']
521
437
  method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
522
438
  colors = ['#3498db', '#2ecc71', '#9b59b6']
523
- categories = ['Small', 'Medium', 'Large']
524
-
525
- x = np.arange(len(categories))
526
- width = 0.25
527
439
 
528
- # Left: Compression memory
440
+ # Left: Compression memory vs prompt length
529
441
  ax1 = axes[0]
530
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
531
- means = []
532
- stds = []
533
- for category in categories:
534
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
535
- means.append(subset['compression_memory_mb'].mean())
536
- stds.append(subset['compression_memory_mb'].std())
442
+ for method, label, color in zip(method_order, method_labels, colors):
443
+ method_df = df[df['method'] == method]
444
+ ax1.scatter(method_df['prompt_length'], method_df['compression_memory_mb'],
445
+ label=label, color=color, alpha=0.6, s=50)
537
446
 
538
- ax1.bar(x + i * width, means, width, label=label, color=color, alpha=0.8,
539
- yerr=stds, capsize=5, error_kw={'elinewidth': 2, 'capthick': 2})
447
+ # Add trend line
448
+ if len(method_df) > 1:
449
+ z = np.polyfit(method_df['prompt_length'], method_df['compression_memory_mb'], 1)
450
+ p = np.poly1d(z)
451
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
452
+ ax1.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
453
+ linewidth=2, alpha=0.8)
540
454
 
541
455
  ax1.set_ylabel('Memory Usage (MB)', fontweight='bold')
542
- ax1.set_xlabel('Prompt Category', fontweight='bold')
456
+ ax1.set_xlabel('Prompt Length (characters)', fontweight='bold')
543
457
  ax1.set_title('(a) Compression Memory Usage', fontweight='bold', pad=15)
544
- ax1.set_xticks(x + width)
545
- ax1.set_xticklabels(categories)
546
458
  ax1.legend(framealpha=0.9)
547
- ax1.grid(True, alpha=0.3, linestyle='--', axis='y')
459
+ ax1.grid(True, alpha=0.3, linestyle='--')
548
460
  ax1.set_ylim(bottom=0)
461
+ ax1.set_xscale('log')
549
462
 
550
- # Right: Decompression memory
463
+ # Right: Decompression memory vs prompt length
551
464
  ax2 = axes[1]
552
- for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
553
- means = []
554
- stds = []
555
- for category in categories:
556
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
557
- means.append(subset['decompression_memory_mb'].mean())
558
- stds.append(subset['decompression_memory_mb'].std())
465
+ for method, label, color in zip(method_order, method_labels, colors):
466
+ method_df = df[df['method'] == method]
467
+ ax2.scatter(method_df['prompt_length'], method_df['decompression_memory_mb'],
468
+ label=label, color=color, alpha=0.6, s=50)
559
469
 
560
- ax2.bar(x + i * width, means, width, label=label, color=color, alpha=0.8,
561
- yerr=stds, capsize=5, error_kw={'elinewidth': 2, 'capthick': 2})
470
+ # Add trend line
471
+ if len(method_df) > 1:
472
+ z = np.polyfit(method_df['prompt_length'], method_df['decompression_memory_mb'], 1)
473
+ p = np.poly1d(z)
474
+ sorted_lengths = sorted(method_df['prompt_length'].unique())
475
+ ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
476
+ linewidth=2, alpha=0.8)
562
477
 
563
478
  ax2.set_ylabel('Memory Usage (MB)', fontweight='bold')
564
- ax2.set_xlabel('Prompt Category', fontweight='bold')
479
+ ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
565
480
  ax2.set_title('(b) Decompression Memory Usage', fontweight='bold', pad=15)
566
- ax2.set_xticks(x + width)
567
- ax2.set_xticklabels(categories)
568
481
  ax2.legend(framealpha=0.9)
569
- ax2.grid(True, alpha=0.3, linestyle='--', axis='y')
482
+ ax2.grid(True, alpha=0.3, linestyle='--')
570
483
  ax2.set_ylim(bottom=0)
484
+ ax2.set_xscale('log')
571
485
 
572
486
  plt.tight_layout()
573
- plt.savefig(output_dir / 'memory_usage.svg', format='svg', bbox_inches='tight')
487
+ save_both_formats(output_dir, 'memory_usage')
574
488
  plt.close()
575
- print(f" Saved: memory_usage.svg")
576
489
 
577
490
 
578
491
  def plot_comprehensive_comparison(df: pd.DataFrame, output_dir: Path):
@@ -581,117 +494,104 @@ def plot_comprehensive_comparison(df: pd.DataFrame, output_dir: Path):
581
494
 
582
495
  method_order = ['zstd', 'token', 'hybrid']
583
496
  method_labels = ['Zstd', 'Token\n(BPE)', 'Hybrid']
584
- categories = ['Small', 'Medium', 'Large']
585
497
 
586
- # Top-left: Compression ratio heatmap
498
+ # Top-left: Compression ratio by method (boxplot data as heatmap)
587
499
  ax1 = axes[0, 0]
588
- compression_ratio_matrix = []
500
+ compression_ratio_data = []
589
501
  for method in method_order:
590
- row = []
591
- for category in categories:
592
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
593
- row.append(subset['compression_ratio'].mean())
594
- compression_ratio_matrix.append(row)
502
+ method_df = df[df['method'] == method]
503
+ compression_ratio_data.append([method_df['compression_ratio'].mean()])
595
504
 
505
+ # Create a single column heatmap
506
+ compression_ratio_matrix = np.array(compression_ratio_data)
596
507
  im1 = ax1.imshow(compression_ratio_matrix, cmap='YlOrRd', aspect='auto', vmin=0)
597
- ax1.set_xticks(np.arange(len(categories)))
508
+ ax1.set_xticks([0])
598
509
  ax1.set_yticks(np.arange(len(method_labels)))
599
- ax1.set_xticklabels(categories)
510
+ ax1.set_xticklabels(['All Prompts'])
600
511
  ax1.set_yticklabels(method_labels)
601
512
  ax1.set_ylabel('Compression Method', fontweight='bold')
602
- ax1.set_xlabel('Prompt Category', fontweight='bold')
603
- ax1.set_title('(a) Compression Ratio', fontweight='bold', pad=15)
513
+ ax1.set_xlabel('', fontweight='bold')
514
+ ax1.set_title('(a) Mean Compression Ratio', fontweight='bold', pad=15)
604
515
 
605
516
  # Add text annotations
606
517
  for i in range(len(method_labels)):
607
- for j in range(len(categories)):
608
- text = ax1.text(j, i, f'{compression_ratio_matrix[i][j]:.2f}x',
609
- ha="center", va="center", color="black", fontweight='bold')
518
+ text = ax1.text(0, i, f'{compression_ratio_matrix[i][0]:.2f}x',
519
+ ha="center", va="center", color="black", fontweight='bold', fontsize=12)
610
520
 
611
521
  plt.colorbar(im1, ax=ax1, label='Compression Ratio')
612
522
 
613
- # Top-right: Space savings heatmap
523
+ # Top-right: Space savings by method
614
524
  ax2 = axes[0, 1]
615
- space_savings_matrix = []
525
+ space_savings_data = []
616
526
  for method in method_order:
617
- row = []
618
- for category in categories:
619
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
620
- row.append(subset['space_savings_percent'].mean())
621
- space_savings_matrix.append(row)
527
+ method_df = df[df['method'] == method]
528
+ space_savings_data.append([method_df['space_savings_percent'].mean()])
622
529
 
530
+ space_savings_matrix = np.array(space_savings_data)
623
531
  im2 = ax2.imshow(space_savings_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=100)
624
- ax2.set_xticks(np.arange(len(categories)))
532
+ ax2.set_xticks([0])
625
533
  ax2.set_yticks(np.arange(len(method_labels)))
626
- ax2.set_xticklabels(categories)
534
+ ax2.set_xticklabels(['All Prompts'])
627
535
  ax2.set_yticklabels(method_labels)
628
536
  ax2.set_ylabel('Compression Method', fontweight='bold')
629
- ax2.set_xlabel('Prompt Category', fontweight='bold')
630
- ax2.set_title('(b) Space Savings (%)', fontweight='bold', pad=15)
537
+ ax2.set_xlabel('', fontweight='bold')
538
+ ax2.set_title('(b) Mean Space Savings (%)', fontweight='bold', pad=15)
631
539
 
632
540
  for i in range(len(method_labels)):
633
- for j in range(len(categories)):
634
- text = ax2.text(j, i, f'{space_savings_matrix[i][j]:.1f}%',
635
- ha="center", va="center", color="black", fontweight='bold')
541
+ text = ax2.text(0, i, f'{space_savings_matrix[i][0]:.1f}%',
542
+ ha="center", va="center", color="black", fontweight='bold', fontsize=12)
636
543
 
637
544
  plt.colorbar(im2, ax=ax2, label='Space Savings (%)')
638
545
 
639
- # Bottom-left: Compression speed heatmap
546
+ # Bottom-left: Compression throughput by method
640
547
  ax3 = axes[1, 0]
641
- speed_matrix = []
548
+ speed_data = []
642
549
  for method in method_order:
643
- row = []
644
- for category in categories:
645
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
646
- row.append(subset['compression_throughput_mbps'].mean())
647
- speed_matrix.append(row)
550
+ method_df = df[df['method'] == method]
551
+ speed_data.append([method_df['compression_throughput_mbps'].mean()])
648
552
 
553
+ speed_matrix = np.array(speed_data)
649
554
  im3 = ax3.imshow(speed_matrix, cmap='viridis', aspect='auto')
650
- ax3.set_xticks(np.arange(len(categories)))
555
+ ax3.set_xticks([0])
651
556
  ax3.set_yticks(np.arange(len(method_labels)))
652
- ax3.set_xticklabels(categories)
557
+ ax3.set_xticklabels(['All Prompts'])
653
558
  ax3.set_yticklabels(method_labels)
654
559
  ax3.set_ylabel('Compression Method', fontweight='bold')
655
- ax3.set_xlabel('Prompt Category', fontweight='bold')
656
- ax3.set_title('(c) Compression Throughput (MB/s)', fontweight='bold', pad=15)
560
+ ax3.set_xlabel('', fontweight='bold')
561
+ ax3.set_title('(c) Mean Compression Throughput (MB/s)', fontweight='bold', pad=15)
657
562
 
658
563
  for i in range(len(method_labels)):
659
- for j in range(len(categories)):
660
- text = ax3.text(j, i, f'{speed_matrix[i][j]:.2f}',
661
- ha="center", va="center", color="white", fontweight='bold')
564
+ text = ax3.text(0, i, f'{speed_matrix[i][0]:.2f}',
565
+ ha="center", va="center", color="white", fontweight='bold', fontsize=12)
662
566
 
663
567
  plt.colorbar(im3, ax=ax3, label='Throughput (MB/s)')
664
568
 
665
- # Bottom-right: Memory usage heatmap
569
+ # Bottom-right: Memory usage by method
666
570
  ax4 = axes[1, 1]
667
- memory_matrix = []
571
+ memory_data = []
668
572
  for method in method_order:
669
- row = []
670
- for category in categories:
671
- subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
672
- row.append(subset['compression_memory_mb'].mean())
673
- memory_matrix.append(row)
573
+ method_df = df[df['method'] == method]
574
+ memory_data.append([method_df['compression_memory_mb'].mean()])
674
575
 
576
+ memory_matrix = np.array(memory_data)
675
577
  im4 = ax4.imshow(memory_matrix, cmap='plasma', aspect='auto')
676
- ax4.set_xticks(np.arange(len(categories)))
578
+ ax4.set_xticks([0])
677
579
  ax4.set_yticks(np.arange(len(method_labels)))
678
- ax4.set_xticklabels(categories)
580
+ ax4.set_xticklabels(['All Prompts'])
679
581
  ax4.set_yticklabels(method_labels)
680
582
  ax4.set_ylabel('Compression Method', fontweight='bold')
681
- ax4.set_xlabel('Prompt Category', fontweight='bold')
682
- ax4.set_title('(d) Compression Memory Usage (MB)', fontweight='bold', pad=15)
583
+ ax4.set_xlabel('', fontweight='bold')
584
+ ax4.set_title('(d) Mean Compression Memory Usage (MB)', fontweight='bold', pad=15)
683
585
 
684
586
  for i in range(len(method_labels)):
685
- for j in range(len(categories)):
686
- text = ax4.text(j, i, f'{memory_matrix[i][j]:.2f}',
687
- ha="center", va="center", color="white", fontweight='bold')
587
+ text = ax4.text(0, i, f'{memory_matrix[i][0]:.2f}',
588
+ ha="center", va="center", color="white", fontweight='bold', fontsize=12)
688
589
 
689
590
  plt.colorbar(im4, ax=ax4, label='Memory (MB)')
690
591
 
691
592
  plt.tight_layout()
692
- plt.savefig(output_dir / 'comprehensive_comparison.svg', format='svg', bbox_inches='tight')
593
+ save_both_formats(output_dir, 'comprehensive_comparison')
693
594
  plt.close()
694
- print(f" Saved: comprehensive_comparison.svg")
695
595
 
696
596
 
697
597
  def plot_scalability(df: pd.DataFrame, output_dir: Path):
@@ -791,24 +691,18 @@ def plot_scalability(df: pd.DataFrame, output_dir: Path):
791
691
  ax4.set_xscale('log')
792
692
 
793
693
  plt.tight_layout()
794
- plt.savefig(output_dir / 'scalability_analysis.svg', format='svg', bbox_inches='tight')
694
+ save_both_formats(output_dir, 'scalability_analysis')
795
695
  plt.close()
796
- print(f" Saved: scalability_analysis.svg")
797
696
 
798
697
 
799
- def plot_original_vs_decompressed(output_dir: Path):
698
+ def plot_original_vs_decompressed(jsonl_path: Path, output_dir: Path):
800
699
  """Plot original vs decompressed data comparison across multiple prompts."""
801
700
  compressor = PromptCompressor(model="cl100k_base", zstd_level=15)
802
- prompts = generate_test_prompts()
701
+ prompts = load_prompts_from_jsonl(jsonl_path)
803
702
 
804
- # Select a few diverse prompts for visualization
805
- selected_prompts = [
806
- ("Small Prompt 1", prompts[0][1]),
807
- ("Medium Prompt 1", prompts[4][1]),
808
- ("Large Prompt 1", prompts[7][1]),
809
- ("Medium Prompt 2", prompts[5][1]),
810
- ("Small Prompt 2", prompts[1][1]),
811
- ]
703
+ # Select a diverse sample of prompts for visualization (up to 5)
704
+ num_to_show = min(5, len(prompts))
705
+ selected_prompts = prompts[:num_to_show]
812
706
 
813
707
  # Use Hybrid method (best compression)
814
708
  method = CompressionMethod.HYBRID
@@ -889,9 +783,8 @@ def plot_original_vs_decompressed(output_dir: Path):
889
783
  ax.axhspan(-5, 105, alpha=0.05, color='green', zorder=0)
890
784
 
891
785
  plt.tight_layout(rect=[0, 0, 1, 0.99])
892
- plt.savefig(output_dir / 'original_vs_decompressed.svg', format='svg', bbox_inches='tight')
786
+ save_both_formats(output_dir, 'original_vs_decompressed')
893
787
  plt.close()
894
- print(f" Saved: original_vs_decompressed.svg")
895
788
 
896
789
 
897
790
  def main():
@@ -900,15 +793,19 @@ def main():
900
793
  output_dir = Path(__file__).parent.parent / 'screenshots'
901
794
  output_dir.mkdir(exist_ok=True)
902
795
 
796
+ # JSONL file path
797
+ jsonl_path = Path(__file__).parent / 'transformers-4-34-0.jsonl'
798
+
903
799
  print("=" * 70)
904
800
  print("LoPace Visualization Generator")
905
801
  print("=" * 70)
906
802
  print(f"Output directory: {output_dir}")
803
+ print(f"JSONL file: {jsonl_path}")
907
804
  print()
908
805
 
909
806
  # Run benchmarks
910
807
  print("Step 1: Running compression benchmarks...")
911
- df = run_benchmarks()
808
+ df = run_benchmarks(jsonl_path)
912
809
 
913
810
  # Save raw data
914
811
  csv_path = output_dir / 'benchmark_data.csv'
@@ -925,7 +822,7 @@ def main():
925
822
  plot_memory_usage(df, output_dir)
926
823
  plot_comprehensive_comparison(df, output_dir)
927
824
  plot_scalability(df, output_dir)
928
- plot_original_vs_decompressed(output_dir)
825
+ plot_original_vs_decompressed(jsonl_path, output_dir)
929
826
 
930
827
  print("\n" + "=" * 70)
931
828
  print("Visualization generation complete!")
@@ -1,14 +0,0 @@
1
- lopace/__init__.py,sha256=1X75FQdnzJaDueL_xRGTBn3vvIAXR1T_siSA_mW391E,443
2
- lopace/_version.py,sha256=PLPzXBIkQAXc-mvnIPLNmQqit_i0rmO6w4hdXJlO3xc,717
3
- lopace/compressor.py,sha256=nUTWDcAPYvQaeSFKx_lne-D2xIQ02IMVGE4yLODo8qE,19060
4
- lopace-0.1.6.dev2.dist-info/licenses/LICENSE,sha256=uFUrlsfsOwx_8Nzhq2pUgNaJghcJxXBMML3l7T39Tm0,1067
5
- scripts/README.md,sha256=UEsrHKKfiEixTMtWV8trYBKnzgkImJxxEnXTyDI4r9g,2226
6
- scripts/__init__.py,sha256=XLq0VmLoEBfnWjzYmxb_JRzAIqwZDv-2s10TO692TLc,59
7
- scripts/generate_visualizations.py,sha256=AJm2DNs-tiwdTHLivEQL9QkztmCclgGT1u4ds5QY4BQ,41812
8
- scripts/requirements.txt,sha256=EvUUoksfGtvbA45zkCG8to1EaPzWv1eurCONAp8Pdx4,112
9
- tests/__init__.py,sha256=yXNVJE20E2iHo0qbit5SgRE35eXWq89F1kkhNHy7VJA,31
10
- tests/test_compressor.py,sha256=-vMztSzY89n5dpShcACrFboEQOlfJ6FxF7eQOEU3swM,8273
11
- lopace-0.1.6.dev2.dist-info/METADATA,sha256=3vzfkpgjCojuDUQ-Rq8O8IU18-m7SkdZhuB9TobCQGw,20788
12
- lopace-0.1.6.dev2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
13
- lopace-0.1.6.dev2.dist-info/top_level.txt,sha256=k-gL-51ulMq50vhNS91c1eyGRNse0vs_PzS9VdAiYlw,21
14
- lopace-0.1.6.dev2.dist-info/RECORD,,