lopace-0.1.6.dev2-py3-none-any.whl → lopace-0.1.6.dev3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lopace/_version.py +2 -2
- lopace/compressor.py +5 -3
- {lopace-0.1.6.dev2.dist-info → lopace-0.1.6.dev3.dist-info}/METADATA +1 -1
- lopace-0.1.6.dev3.dist-info/RECORD +14 -0
- {lopace-0.1.6.dev2.dist-info → lopace-0.1.6.dev3.dist-info}/WHEEL +1 -1
- scripts/generate_visualizations.py +274 -377
- lopace-0.1.6.dev2.dist-info/RECORD +0 -14
- {lopace-0.1.6.dev2.dist-info → lopace-0.1.6.dev3.dist-info}/licenses/LICENSE +0 -0
- {lopace-0.1.6.dev2.dist-info → lopace-0.1.6.dev3.dist-info}/top_level.txt +0 -0
lopace/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.6.dev2'
-__version_tuple__ = version_tuple = (0, 1, 6, 'dev2')
+__version__ = version = '0.1.6.dev3'
+__version_tuple__ = version_tuple = (0, 1, 6, 'dev3')
 
 __commit_id__ = commit_id = None
lopace/compressor.py
CHANGED
@@ -126,7 +126,8 @@ class PromptCompressor:
             >>> original = compressor.decompress_token(compressed)
         """
         # Step 1: Convert text to list of token IDs
-        token_ids = list(self.tokenizer.encode(text))
+        # Allow special tokens to be encoded as normal text (disable the check)
+        token_ids = list(self.tokenizer.encode(text, disallowed_special=()))  # Ensure it's a list
 
         if not token_ids:
             # Empty token list - return just format byte
@@ -221,7 +222,8 @@ class PromptCompressor:
             >>> original = compressor.decompress_hybrid(compressed)
         """
         # Step 1: Tokenize
-        tokens = list(self.tokenizer.encode(text))
+        # Allow special tokens to be encoded as normal text (disable the check)
+        tokens = list(self.tokenizer.encode(text, disallowed_special=()))  # Ensure it's a list
 
         if not tokens:
             # Empty token list - return compressed empty data
@@ -393,7 +395,7 @@ class PromptCompressor:
         original_size = len(text.encode('utf-8'))
         stats = {
             'original_size_bytes': original_size,
-            'original_size_tokens': len(self.tokenizer.encode(text)),
+            'original_size_tokens': len(self.tokenizer.encode(text, disallowed_special=())),
             'methods': {}
         }
 
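The compressor change above passes disallowed_special=() to tiktoken's encode(). By default tiktoken rejects input text that contains a special-token string such as <|endoftext|>, so arbitrary user prompts could make compression raise; disabling the check encodes such markers as ordinary text. A minimal sketch of the behaviour being worked around, assuming the tiktoken package and the cl100k_base encoding used elsewhere in this package:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "user data containing <|endoftext|> verbatim"

    # Default behaviour: tiktoken refuses text that contains special-token strings.
    try:
        enc.encode(text)
    except ValueError as err:
        print("rejected:", err)

    # With the check disabled, the marker is encoded as plain text,
    # which is what the compressor now does for arbitrary prompts.
    token_ids = enc.encode(text, disallowed_special=())
    print(len(token_ids), "tokens")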
lopace-0.1.6.dev3.dist-info/RECORD
ADDED
@@ -0,0 +1,14 @@
+lopace/__init__.py,sha256=1X75FQdnzJaDueL_xRGTBn3vvIAXR1T_siSA_mW391E,443
+lopace/_version.py,sha256=i2UCDvrkFPHmTq7NY-S1iwtQtX1H6Bze5geryvTCYAk,717
+lopace/compressor.py,sha256=caWBQUl1hWJ8jTvL5hm_XdZpJirlsXZ2NyB2WldP5A4,19289
+lopace-0.1.6.dev3.dist-info/licenses/LICENSE,sha256=uFUrlsfsOwx_8Nzhq2pUgNaJghcJxXBMML3l7T39Tm0,1067
+scripts/README.md,sha256=UEsrHKKfiEixTMtWV8trYBKnzgkImJxxEnXTyDI4r9g,2226
+scripts/__init__.py,sha256=XLq0VmLoEBfnWjzYmxb_JRzAIqwZDv-2s10TO692TLc,59
+scripts/generate_visualizations.py,sha256=rAwzqDc5l1vi5nya5zUlLL4om4CCTDCPcLZ8mg0NUzE,34870
+scripts/requirements.txt,sha256=EvUUoksfGtvbA45zkCG8to1EaPzWv1eurCONAp8Pdx4,112
+tests/__init__.py,sha256=yXNVJE20E2iHo0qbit5SgRE35eXWq89F1kkhNHy7VJA,31
+tests/test_compressor.py,sha256=-vMztSzY89n5dpShcACrFboEQOlfJ6FxF7eQOEU3swM,8273
+lopace-0.1.6.dev3.dist-info/METADATA,sha256=kl7_6wf_a-3JCHxqoREE7YmOaKiAE9_eg8YV0aTfd0s,20788
+lopace-0.1.6.dev3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+lopace-0.1.6.dev3.dist-info/top_level.txt,sha256=k-gL-51ulMq50vhNS91c1eyGRNse0vs_PzS9VdAiYlw,21
+lopace-0.1.6.dev3.dist-info/RECORD,,
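For context, RECORD is the wheel's installed-files manifest: one CSV row per file with its path, a sha256=-prefixed urlsafe-base64 digest, and its size in bytes, while the row for RECORD itself leaves the digest and size fields empty. A small sketch of reading such a manifest; the local path here is illustrative, not shipped by the package:

    import csv
    from pathlib import Path

    # Hypothetical path to an extracted wheel's RECORD file.
    record_path = Path("lopace-0.1.6.dev3.dist-info/RECORD")

    with record_path.open(newline="") as f:
        for path, digest, size in csv.reader(f):
            # The RECORD entry for RECORD itself has empty digest/size fields.
            print(f"{path:55s} {size:>8s} {digest[:18]}")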
scripts/generate_visualizations.py
CHANGED
@@ -7,11 +7,12 @@ import os
 import sys
 import time
 import tracemalloc
+import json
 from pathlib import Path
 from typing import List, Dict, Tuple
 import numpy as np
 import matplotlib
-matplotlib.use('
+matplotlib.use('Agg')  # Use Agg backend which supports both SVG and PNG
 import matplotlib.pyplot as plt
 import seaborn as sns
 from matplotlib.patches import Rectangle
@@ -23,6 +24,15 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from lopace import PromptCompressor, CompressionMethod
 
 
+def save_both_formats(output_dir: Path, filename_base: str):
+    """Save figure in both SVG and high-quality PNG formats."""
+    # Save SVG
+    plt.savefig(output_dir / f'{filename_base}.svg', format='svg', bbox_inches='tight', dpi=300)
+    # Save PNG with high quality
+    plt.savefig(output_dir / f'{filename_base}.png', format='png', bbox_inches='tight', dpi=300, facecolor='white')
+    print(f" Saved: {filename_base}.svg and {filename_base}.png")
+
+
 # Set style for publication-quality plots
 sns.set_style("whitegrid")
 plt.rcParams.update({
@@ -37,7 +47,7 @@ plt.rcParams.update({
     'figure.titlesize': 18,
     'figure.dpi': 300,
     'savefig.dpi': 300,
-    'savefig.format': '
+    'savefig.format': 'png',  # Default format, but we'll save both
     'svg.fonttype': 'none',  # Editable text in SVG
     'mathtext.default': 'regular',
     'axes.linewidth': 1.2,
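The backend switch and the save_both_formats helper are what let the script emit both formats headlessly: the backend must be selected before matplotlib.pyplot is imported, and savefig picks its writer from the requested format regardless of which backend is active. A standalone sketch of the same pattern, with illustrative file names:

    import matplotlib
    matplotlib.use('Agg')  # non-interactive backend; must be set before pyplot is imported
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1, 2], [1, 4, 9])

    # savefig chooses the writer from the format/extension, independent of the backend,
    # so one figure can be written as both editable SVG and high-DPI PNG.
    fig.savefig('demo.svg', format='svg', bbox_inches='tight')
    fig.savefig('demo.png', format='png', dpi=300, bbox_inches='tight', facecolor='white')
    plt.close(fig)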
@@ -47,125 +57,34 @@ plt.rcParams.update({
 })
 
 
-def
-    """
+def load_prompts_from_jsonl(jsonl_path: Path) -> List[Tuple[str, str]]:
+    """Load prompts from JSONL file. Returns list of (title, prompt_text) tuples."""
     prompts = []
 
-        """You are a professional software engineer with expertise in multiple
-programming languages including Python, JavaScript, Java, and C++. Your role
-is to help users write clean, efficient, and maintainable code. Provide
-code examples, explain best practices, debug issues, and suggest improvements.
-Always consider performance, security, and scalability in your recommendations.""",
-    ]
-
-    # Large prompts (5000-20000 chars)
-    large_prompts = [
-        """You are a comprehensive AI assistant specializing in technical documentation
-and educational content. Your expertise spans multiple domains including computer science,
-data science, machine learning, software engineering, and web development. When responding
-to queries, you should provide thorough explanations, include relevant examples, and
-structure your responses in a clear and organized manner. Always aim to educate while
-solving problems. Break down complex concepts into digestible parts, use analogies when
-helpful, and provide practical applications of theoretical knowledge. Maintain accuracy
-by acknowledging when you're uncertain and suggest reliable sources for further learning.
-Your communication style should be professional yet accessible, avoiding unnecessary
-jargon while ensuring precision in technical details. Consider the user's background
-and adjust your explanation depth accordingly. For code-related queries, always provide
-complete, working examples with comments explaining key parts. For conceptual questions,
-use diagrams, step-by-step breakdowns, and real-world analogies. When discussing
-best practices, explain not just what to do but why, including trade-offs and
-alternative approaches. Your goal is to empower users with knowledge and skills
-rather than just providing answers. Encourage critical thinking, experimentation,
-and continuous learning. Address potential pitfalls, common mistakes, and how to
-avoid them. Provide context about industry standards and emerging trends when relevant.
-Remember that effective teaching involves understanding the learner's perspective,
-patience, and encouragement. Always prioritize clarity, accuracy, and educational value
-in every interaction. Balance thoroughness with conciseness, ensuring responses are
-comprehensive yet not overwhelming. Use formatting effectively to improve readability,
-including bullet points, numbered lists, and section headers when appropriate.""",
-
-        """System Prompt for Advanced Multi-Modal AI Assistant: This AI system is designed
-to be a versatile, intelligent, and highly capable assistant that can handle a wide range
-of tasks across multiple domains. The system integrates natural language processing,
-reasoning capabilities, knowledge retrieval, and contextual understanding to provide
-comprehensive support. Primary capabilities include question answering, problem-solving,
-creative tasks, analysis, code generation, data interpretation, and educational support.
-The assistant maintains a knowledge base spanning science, technology, humanities,
-business, arts, and current events. When interacting with users, the system should
-prioritize accuracy, helpfulness, safety, and ethical considerations. Responses should
-be well-structured, clear, and appropriately detailed based on the complexity of the query.
-The assistant should ask clarifying questions when necessary, acknowledge limitations,
-and provide sources or references when making factual claims. For technical questions,
-provide detailed explanations with examples. For creative tasks, demonstrate imagination
-while maintaining coherence and appropriateness. For analytical tasks, show step-by-step
-reasoning and present conclusions clearly. The system should adapt its communication style
-to match the user's level of expertise and the context of the conversation. Always aim
-to be constructive, respectful, and professional. When dealing with sensitive topics,
-exercise caution and provide balanced perspectives. For coding tasks, write clean,
-well-commented code following best practices. For writing tasks, ensure proper grammar,
-style, and structure. The assistant should continuously learn from interactions while
-maintaining core principles and guidelines. It should handle ambiguity gracefully,
-provide multiple perspectives when appropriate, and help users think critically about
-complex issues. The system is designed to be a tool for empowerment, education, and
-efficient problem-solving.""",
-    ]
-
-    # Combine and label prompts
-    for prompt in small_prompts:
-        prompts.append(("Small", prompt))
-
-    for prompt in medium_prompts:
-        prompts.append(("Medium", prompt))
-
-    for large_prompts_list in large_prompts:
-        prompts.append(("Large", large_prompts_list))
-
-    # Add one more large prompt if needed
-    if len(prompts) < 10:
-        additional_large = """Comprehensive System Prompt for Advanced AI Assistant: This sophisticated
-artificial intelligence system represents a state-of-the-art language model designed to excel across
-a multitude of domains and applications. The system integrates deep learning architectures, extensive
-knowledge bases, and advanced reasoning capabilities to provide exceptional assistance. Core competencies
-include natural language understanding and generation, logical reasoning, creative problem-solving,
-technical expertise, and ethical decision-making. The assistant maintains extensive knowledge spanning
-STEM fields, humanities, arts, business, law, medicine, and contemporary issues. When engaging with users,
-the system employs sophisticated contextual understanding, adapts communication styles appropriately,
-and provides nuanced, well-reasoned responses. The architecture supports multi-modal interactions,
-real-time learning, and seamless integration with external tools and databases. Quality assurance
-mechanisms ensure accuracy, relevance, and safety in all outputs. The system demonstrates exceptional
-capabilities in code generation and analysis, creative writing, data analysis, educational instruction,
-research assistance, and complex problem decomposition. Advanced features include meta-cognitive reasoning,
-uncertainty quantification, bias detection and mitigation, and explainable AI principles. The assistant
-prioritizes user empowerment through education, transparency, and collaborative problem-solving approaches."""
-        prompts.append(("Large", additional_large))
-
-    # Ensure we have exactly 10 prompts
-    prompts = prompts[:10]
-
+    if not jsonl_path.exists():
+        raise FileNotFoundError(f"JSONL file not found: {jsonl_path}")
+
+    print(f"Loading prompts from: {jsonl_path}")
+
+    with open(jsonl_path, 'r', encoding='utf-8') as f:
+        for line_num, line in enumerate(f, 1):
+            if not line.strip():
+                continue
+
+            try:
+                data = json.loads(line)
+                # Extract markdown content (the actual prompt text)
+                markdown = data.get('markdown', '')
+                title = data.get('title', f'Prompt {line_num}')
+
+                # Use markdown as the prompt text
+                if markdown and len(markdown.strip()) > 0:
+                    prompts.append((title, markdown))
+            except json.JSONDecodeError as e:
+                print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
+                continue
+
+    print(f"Loaded {len(prompts)} prompts from JSONL file")
     return prompts
 
 
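The new load_prompts_from_jsonl expects one JSON object per line, with a "markdown" field holding the prompt text and an optional "title". A hedged sketch of a compatible input file; the file name and contents here are made up for illustration, while the real script reads scripts/transformers-4-34-0.jsonl:

    import json
    from pathlib import Path

    # Hypothetical two-record JSONL input in the shape the loader expects.
    sample = [
        {"title": "Quickstart", "markdown": "# Quickstart\nInstall the package and ..."},
        {"title": "Pipelines", "markdown": "# Pipelines\nThe pipeline API lets you ..."},
    ]
    jsonl_path = Path("sample_prompts.jsonl")
    jsonl_path.write_text("\n".join(json.dumps(row) for row in sample), encoding="utf-8")

    # load_prompts_from_jsonl(jsonl_path) would return [(title, markdown), ...]
    # for every non-empty, valid line, skipping lines that fail json.loads.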
@@ -223,21 +142,22 @@ def measure_compression(
     return metrics
 
 
-def run_benchmarks() -> pd.DataFrame:
+def run_benchmarks(jsonl_path: Path) -> pd.DataFrame:
     """Run compression benchmarks on all prompts and methods."""
     compressor = PromptCompressor(model="cl100k_base", zstd_level=15)
-    prompts =
+    prompts = load_prompts_from_jsonl(jsonl_path)
 
     all_results = []
 
     print("Running benchmarks...")
+    total_prompts = len(prompts)
+    for idx, (title, prompt) in enumerate(prompts, 1):
+        print(f" Processing prompt {idx}/{total_prompts} ({len(prompt)} chars)...")
 
         for method in [CompressionMethod.ZSTD, CompressionMethod.TOKEN, CompressionMethod.HYBRID]:
             metrics = measure_compression(compressor, prompt, method)
             metrics['prompt_id'] = idx
-            metrics['
+            metrics['prompt_title'] = title
             metrics['prompt_length'] = len(prompt)
             all_results.append(metrics)
 
@@ -246,10 +166,10 @@ def run_benchmarks() -> pd.DataFrame:
 
 
 def plot_compression_ratio(df: pd.DataFrame, output_dir: Path):
-    """Plot compression ratios by method
+    """Plot compression ratios by method."""
     fig, axes = plt.subplots(1, 2, figsize=(14, 6))
 
-    # Left: Compression ratio by method
+    # Left: Compression ratio by method (boxplot)
     ax1 = axes[0]
     method_order = ['zstd', 'token', 'hybrid']
     method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
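The rewritten scatter panels in the hunks below all share one trend-line idiom: fit a degree-1 polynomial with np.polyfit and evaluate it via np.poly1d over the sorted x values. A self-contained sketch of that pattern on synthetic data (the numbers are made up):

    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt

    lengths = np.array([200, 1500, 4000, 12000, 30000])  # synthetic prompt lengths
    ratios = np.array([1.4, 2.1, 2.6, 3.0, 3.3])          # synthetic compression ratios

    fig, ax = plt.subplots()
    ax.scatter(lengths, ratios, alpha=0.6, s=50)

    # Least-squares straight line, then a callable polynomial for the dashed trend line.
    coeffs = np.polyfit(lengths, ratios, 1)
    trend = np.poly1d(coeffs)
    xs = np.sort(np.unique(lengths))
    ax.plot(xs, trend(xs), linestyle='--', linewidth=2, alpha=0.8)

    ax.set_xscale('log')
    fig.savefig('trend_demo.png', dpi=300, bbox_inches='tight')

Note that the fit is linear in raw character count even though the x-axis is logarithmic, so the dashed trend line appears curved on these plots.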
@@ -270,83 +190,71 @@ def plot_compression_ratio(df: pd.DataFrame, output_dir: Path):
     ax1.grid(True, alpha=0.3, linestyle='--')
     ax1.set_ylim(bottom=0)
 
-    # Right: Compression ratio
+    # Right: Compression ratio vs prompt length (scatter/line plot)
     ax2 = axes[1]
-    for category in categories:
-        category_df = df[df['prompt_category'] == category]
-        method_data = [category_df[category_df['method'] == m]['compression_ratio'].mean()
-                       for m in method_order]
-        category_data.append(method_data)
-
-    x = np.arange(len(categories))
-    width = 0.25
-
-    for i, (method, color) in enumerate(zip(method_labels, colors)):
-        values = [category_data[j][i] for j in range(len(categories))]
-        ax2.bar(x + i * width, values, width, label=method, color=color, alpha=0.8)
+    method_order = ['zstd', 'token', 'hybrid']
+    method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
+    colors = ['#3498db', '#2ecc71', '#9b59b6']
 
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax2.scatter(method_df['prompt_length'], method_df['compression_ratio'],
+                    label=label, color=color, alpha=0.6, s=50)
+
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['compression_ratio'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
+
+    ax2.set_ylabel('Compression Ratio', fontweight='bold')
+    ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
+    ax2.set_title('(b) Compression Ratio vs Prompt Length', fontweight='bold', pad=15)
     ax2.legend(loc='upper left', framealpha=0.9)
-    ax2.grid(True, alpha=0.3, linestyle='--'
+    ax2.grid(True, alpha=0.3, linestyle='--')
     ax2.set_ylim(bottom=0)
+    ax2.set_xscale('log')
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'compression_ratio')
     plt.close()
-    print(f" Saved: compression_ratio.svg")
 
 
 def plot_space_savings(df: pd.DataFrame, output_dir: Path):
     """Plot space savings percentages."""
     fig, ax = plt.subplots(figsize=(12, 7))
 
-    categories = ['Small', 'Medium', 'Large']
     method_order = ['zstd', 'token', 'hybrid']
     method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
     colors = ['#3498db', '#2ecc71', '#9b59b6']
 
+    # Create boxplot for space savings by method
+    data_by_method = [df[df['method'] == m]['space_savings_percent'].values for m in method_order]
 
-        stds.append(subset['space_savings_percent'].std())
-
-    bars = ax.bar(x + i * width, means, width, label=label, color=color, alpha=0.8,
-                  yerr=stds, capsize=5, error_kw={'elinewidth': 2, 'capthick': 2})
+    bp = ax.boxplot(data_by_method, labels=method_labels, patch_artist=True,
+                    widths=0.6, showmeans=True, meanline=True)
+
+    for patch, color in zip(bp['boxes'], colors):
+        patch.set_facecolor(color)
+        patch.set_alpha(0.7)
 
     ax.set_ylabel('Space Savings (%)', fontweight='bold')
-    ax.set_xlabel('
-    ax.set_title('Space Savings by Compression Method
-    ax.set_xticks(x + width)
-    ax.set_xticklabels(categories)
-    ax.legend(loc='upper left', framealpha=0.9, ncol=3)
+    ax.set_xlabel('Compression Method', fontweight='bold')
+    ax.set_title('Space Savings by Compression Method', fontweight='bold', pad=15)
     ax.grid(True, alpha=0.3, linestyle='--', axis='y')
     ax.set_ylim(0, 100)
 
-    # Add value
-    for i, method in enumerate(method_order):
-        ax.text(j + i * width, mean_val + 2, f'{mean_val:.1f}%',
-                ha='center', va='bottom', fontsize=10, fontweight='bold')
+    # Add mean value annotations
+    for i, (method, label) in enumerate(zip(method_order, method_labels)):
+        mean_val = df[df['method'] == method]['space_savings_percent'].mean()
+        ax.text(i + 1, mean_val + 3, f'Mean: {mean_val:.1f}%',
+                ha='center', va='bottom', fontsize=11, fontweight='bold', color=colors[i])
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'space_savings')
     plt.close()
-    print(f" Saved: space_savings.svg")
 
 
 def plot_disk_size_comparison(df: pd.DataFrame, output_dir: Path):
@@ -357,68 +265,63 @@ def plot_disk_size_comparison(df: pd.DataFrame, output_dir: Path):
     method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
     colors = ['#3498db', '#2ecc71', '#9b59b6']
 
-    # Top:
+    # Top: Scatter plot showing original vs compressed sizes
     ax1 = axes[0]
-    categories = ['Small', 'Medium', 'Large']
 
+    # Get unique prompts (by prompt_id)
+    unique_prompts = df.groupby('prompt_id').first()
+    original_sizes = unique_prompts['original_size_bytes'].values / 1024  # KB
+
+    x_pos = np.arange(len(unique_prompts))
     width = 0.25
 
-    bottom = 0
-    for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
-        if i == 0:
-            # First method: show original vs compressed
-            ax1.bar(category_idx + i * width, original_sizes / 1024, width,
-                    label='Original Size' if category_idx == 0 else '', color='#e74c3c', alpha=0.7)
-            ax1.bar(category_idx + i * width, compressed_means[i] / 1024, width,
-                    bottom=0, label=label if category_idx == 0 else '', color=color, alpha=0.8)
-        else:
-            # Other methods: just compressed size
-            ax1.bar(category_idx + i * width, compressed_means[i] / 1024, width,
-                    label=label if category_idx == 0 else '', color=color, alpha=0.8)
+    # Plot original size
+    ax1.bar(x_pos - width, original_sizes, width, label='Original Size',
+            color='#e74c3c', alpha=0.7)
+
+    # Plot compressed sizes for each method
+    for i, (method, label, color) in enumerate(zip(method_order, method_labels, colors)):
+        method_df = df[df['method'] == method].sort_values('prompt_id')
+        compressed_sizes = method_df['compressed_size_bytes'].values / 1024  # KB
+        ax1.bar(x_pos + i * width, compressed_sizes, width, label=label,
+                color=color, alpha=0.8)
 
     ax1.set_ylabel('Size (KB)', fontweight='bold')
-    ax1.set_xlabel('Prompt
+    ax1.set_xlabel('Prompt ID', fontweight='bold')
     ax1.set_title('Disk Size: Original vs Compressed', fontweight='bold', pad=15)
-    ax1.set_xticks(
-    ax1.set_xticklabels(
+    ax1.set_xticks(x_pos)
+    ax1.set_xticklabels([f'P{i+1}' for i in range(len(unique_prompts))], rotation=45, ha='right')
     ax1.legend(loc='upper left', framealpha=0.9, ncol=4)
     ax1.grid(True, alpha=0.3, linestyle='--', axis='y')
     ax1.set_yscale('log')
 
-    # Bottom:
+    # Bottom: Space savings distribution
     ax2 = axes[1]
 
-    for
-        means.append(subset['space_savings_percent'].mean())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax2.scatter(method_df['prompt_length'], method_df['space_savings_percent'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['space_savings_percent'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
-    ax2.set_ylabel('
-    ax2.set_xlabel('Prompt
-    ax2.set_title('
+    ax2.set_ylabel('Space Savings (%)', fontweight='bold')
+    ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
+    ax2.set_title('Space Savings vs Prompt Length', fontweight='bold', pad=15)
     ax2.legend(loc='best', framealpha=0.9)
     ax2.grid(True, alpha=0.3, linestyle='--')
     ax2.set_ylim(0, 100)
+    ax2.set_xscale('log')
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'disk_size_comparison')
     plt.close()
-    print(f" Saved: disk_size_comparison.svg")
 
 
 def plot_speed_metrics(df: pd.DataFrame, output_dir: Path):
@@ -428,89 +331,102 @@ def plot_speed_metrics(df: pd.DataFrame, output_dir: Path):
     method_order = ['zstd', 'token', 'hybrid']
     method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
     colors = ['#3498db', '#2ecc71', '#9b59b6']
-    categories = ['Small', 'Medium', 'Large']
 
-    # Top-left: Compression time
+    # Top-left: Compression time vs prompt length
     ax1 = axes[0, 0]
-        means = []
-        for category in categories:
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            means.append(subset['compression_time_ms'].mean())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax1.scatter(method_df['prompt_length'], method_df['compression_time_ms'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['compression_time_ms'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax1.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
     ax1.set_ylabel('Compression Time (ms)', fontweight='bold')
-    ax1.set_xlabel('Prompt
-    ax1.set_title('(a) Compression Time', fontweight='bold', pad=15)
-    ax1.set_xticks(x + width)
-    ax1.set_xticklabels(categories)
+    ax1.set_xlabel('Prompt Length (characters)', fontweight='bold')
+    ax1.set_title('(a) Compression Time vs Prompt Length', fontweight='bold', pad=15)
     ax1.legend(framealpha=0.9)
-    ax1.grid(True, alpha=0.3, linestyle='--'
+    ax1.grid(True, alpha=0.3, linestyle='--')
     ax1.set_yscale('log')
+    ax1.set_xscale('log')
 
-    # Top-right: Decompression time
+    # Top-right: Decompression time vs prompt length
     ax2 = axes[0, 1]
-    for
-            means.append(subset['decompression_time_ms'].mean())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax2.scatter(method_df['prompt_length'], method_df['decompression_time_ms'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['decompression_time_ms'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
     ax2.set_ylabel('Decompression Time (ms)', fontweight='bold')
-    ax2.set_xlabel('Prompt
-    ax2.set_title('(b) Decompression Time', fontweight='bold', pad=15)
-    ax2.set_xticks(x + width)
-    ax2.set_xticklabels(categories)
+    ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
+    ax2.set_title('(b) Decompression Time vs Prompt Length', fontweight='bold', pad=15)
     ax2.legend(framealpha=0.9)
-    ax2.grid(True, alpha=0.3, linestyle='--'
+    ax2.grid(True, alpha=0.3, linestyle='--')
     ax2.set_yscale('log')
+    ax2.set_xscale('log')
 
-    # Bottom-left: Compression throughput
+    # Bottom-left: Compression throughput vs prompt length
    ax3 = axes[1, 0]
-    for
-            means.append(subset['compression_throughput_mbps'].mean())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax3.scatter(method_df['prompt_length'], method_df['compression_throughput_mbps'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['compression_throughput_mbps'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax3.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
     ax3.set_ylabel('Throughput (MB/s)', fontweight='bold')
-    ax3.set_xlabel('Prompt
-    ax3.set_title('(c) Compression Throughput', fontweight='bold', pad=15)
+    ax3.set_xlabel('Prompt Length (characters)', fontweight='bold')
+    ax3.set_title('(c) Compression Throughput vs Prompt Length', fontweight='bold', pad=15)
     ax3.legend(framealpha=0.9)
     ax3.grid(True, alpha=0.3, linestyle='--')
     ax3.set_ylim(bottom=0)
+    ax3.set_xscale('log')
 
-    # Bottom-right: Decompression throughput
+    # Bottom-right: Decompression throughput vs prompt length
     ax4 = axes[1, 1]
-    for
-            means.append(subset['decompression_throughput_mbps'].mean())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax4.scatter(method_df['prompt_length'], method_df['decompression_throughput_mbps'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['decompression_throughput_mbps'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax4.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
     ax4.set_ylabel('Throughput (MB/s)', fontweight='bold')
-    ax4.set_xlabel('Prompt
-    ax4.set_title('(d) Decompression Throughput', fontweight='bold', pad=15)
+    ax4.set_xlabel('Prompt Length (characters)', fontweight='bold')
+    ax4.set_title('(d) Decompression Throughput vs Prompt Length', fontweight='bold', pad=15)
     ax4.legend(framealpha=0.9)
     ax4.grid(True, alpha=0.3, linestyle='--')
     ax4.set_ylim(bottom=0)
+    ax4.set_xscale('log')
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'speed_metrics')
     plt.close()
-    print(f" Saved: speed_metrics.svg")
 
 
 def plot_memory_usage(df: pd.DataFrame, output_dir: Path):
@@ -520,59 +436,56 @@ def plot_memory_usage(df: pd.DataFrame, output_dir: Path):
     method_order = ['zstd', 'token', 'hybrid']
     method_labels = ['Zstd', 'Token (BPE)', 'Hybrid']
     colors = ['#3498db', '#2ecc71', '#9b59b6']
-    categories = ['Small', 'Medium', 'Large']
-
-    x = np.arange(len(categories))
-    width = 0.25
 
-    # Left: Compression memory
+    # Left: Compression memory vs prompt length
     ax1 = axes[0]
-    for
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            means.append(subset['compression_memory_mb'].mean())
-            stds.append(subset['compression_memory_mb'].std())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax1.scatter(method_df['prompt_length'], method_df['compression_memory_mb'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['compression_memory_mb'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax1.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
     ax1.set_ylabel('Memory Usage (MB)', fontweight='bold')
-    ax1.set_xlabel('Prompt
+    ax1.set_xlabel('Prompt Length (characters)', fontweight='bold')
     ax1.set_title('(a) Compression Memory Usage', fontweight='bold', pad=15)
-    ax1.set_xticks(x + width)
-    ax1.set_xticklabels(categories)
     ax1.legend(framealpha=0.9)
-    ax1.grid(True, alpha=0.3, linestyle='--'
+    ax1.grid(True, alpha=0.3, linestyle='--')
     ax1.set_ylim(bottom=0)
+    ax1.set_xscale('log')
 
-    # Right: Decompression memory
+    # Right: Decompression memory vs prompt length
     ax2 = axes[1]
-    for
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            means.append(subset['decompression_memory_mb'].mean())
-            stds.append(subset['decompression_memory_mb'].std())
+    for method, label, color in zip(method_order, method_labels, colors):
+        method_df = df[df['method'] == method]
+        ax2.scatter(method_df['prompt_length'], method_df['decompression_memory_mb'],
+                    label=label, color=color, alpha=0.6, s=50)
 
+        # Add trend line
+        if len(method_df) > 1:
+            z = np.polyfit(method_df['prompt_length'], method_df['decompression_memory_mb'], 1)
+            p = np.poly1d(z)
+            sorted_lengths = sorted(method_df['prompt_length'].unique())
+            ax2.plot(sorted_lengths, p(sorted_lengths), color=color, linestyle='--',
+                     linewidth=2, alpha=0.8)
 
     ax2.set_ylabel('Memory Usage (MB)', fontweight='bold')
-    ax2.set_xlabel('Prompt
+    ax2.set_xlabel('Prompt Length (characters)', fontweight='bold')
     ax2.set_title('(b) Decompression Memory Usage', fontweight='bold', pad=15)
-    ax2.set_xticks(x + width)
-    ax2.set_xticklabels(categories)
     ax2.legend(framealpha=0.9)
-    ax2.grid(True, alpha=0.3, linestyle='--'
+    ax2.grid(True, alpha=0.3, linestyle='--')
     ax2.set_ylim(bottom=0)
+    ax2.set_xscale('log')
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'memory_usage')
     plt.close()
-    print(f" Saved: memory_usage.svg")
 
 
 def plot_comprehensive_comparison(df: pd.DataFrame, output_dir: Path):
@@ -581,117 +494,104 @@ def plot_comprehensive_comparison(df: pd.DataFrame, output_dir: Path):
 
     method_order = ['zstd', 'token', 'hybrid']
     method_labels = ['Zstd', 'Token\n(BPE)', 'Hybrid']
-    categories = ['Small', 'Medium', 'Large']
 
-    # Top-left: Compression ratio heatmap
+    # Top-left: Compression ratio by method (boxplot data as heatmap)
     ax1 = axes[0, 0]
+    compression_ratio_data = []
     for method in method_order:
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            row.append(subset['compression_ratio'].mean())
-        compression_ratio_matrix.append(row)
+        method_df = df[df['method'] == method]
+        compression_ratio_data.append([method_df['compression_ratio'].mean()])
 
+    # Create a single column heatmap
+    compression_ratio_matrix = np.array(compression_ratio_data)
     im1 = ax1.imshow(compression_ratio_matrix, cmap='YlOrRd', aspect='auto', vmin=0)
-    ax1.set_xticks(
+    ax1.set_xticks([0])
     ax1.set_yticks(np.arange(len(method_labels)))
-    ax1.set_xticklabels(
+    ax1.set_xticklabels(['All Prompts'])
     ax1.set_yticklabels(method_labels)
     ax1.set_ylabel('Compression Method', fontweight='bold')
-    ax1.set_xlabel('
-    ax1.set_title('(a) Compression Ratio', fontweight='bold', pad=15)
+    ax1.set_xlabel('', fontweight='bold')
+    ax1.set_title('(a) Mean Compression Ratio', fontweight='bold', pad=15)
 
     # Add text annotations
     for i in range(len(method_labels)):
-                ha="center", va="center", color="black", fontweight='bold')
+        text = ax1.text(0, i, f'{compression_ratio_matrix[i][0]:.2f}x',
+                        ha="center", va="center", color="black", fontweight='bold', fontsize=12)
 
     plt.colorbar(im1, ax=ax1, label='Compression Ratio')
 
-    # Top-right: Space savings
+    # Top-right: Space savings by method
     ax2 = axes[0, 1]
+    space_savings_data = []
     for method in method_order:
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            row.append(subset['space_savings_percent'].mean())
-        space_savings_matrix.append(row)
+        method_df = df[df['method'] == method]
+        space_savings_data.append([method_df['space_savings_percent'].mean()])
 
+    space_savings_matrix = np.array(space_savings_data)
     im2 = ax2.imshow(space_savings_matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=100)
-    ax2.set_xticks(
+    ax2.set_xticks([0])
     ax2.set_yticks(np.arange(len(method_labels)))
-    ax2.set_xticklabels(
+    ax2.set_xticklabels(['All Prompts'])
     ax2.set_yticklabels(method_labels)
     ax2.set_ylabel('Compression Method', fontweight='bold')
-    ax2.set_xlabel('
-    ax2.set_title('(b) Space Savings (%)', fontweight='bold', pad=15)
+    ax2.set_xlabel('', fontweight='bold')
+    ax2.set_title('(b) Mean Space Savings (%)', fontweight='bold', pad=15)
 
     for i in range(len(method_labels)):
-                ha="center", va="center", color="black", fontweight='bold')
+        text = ax2.text(0, i, f'{space_savings_matrix[i][0]:.1f}%',
+                        ha="center", va="center", color="black", fontweight='bold', fontsize=12)
 
     plt.colorbar(im2, ax=ax2, label='Space Savings (%)')
 
-    # Bottom-left: Compression
+    # Bottom-left: Compression throughput by method
     ax3 = axes[1, 0]
+    speed_data = []
     for method in method_order:
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            row.append(subset['compression_throughput_mbps'].mean())
-        speed_matrix.append(row)
+        method_df = df[df['method'] == method]
+        speed_data.append([method_df['compression_throughput_mbps'].mean()])
 
+    speed_matrix = np.array(speed_data)
     im3 = ax3.imshow(speed_matrix, cmap='viridis', aspect='auto')
-    ax3.set_xticks(
+    ax3.set_xticks([0])
     ax3.set_yticks(np.arange(len(method_labels)))
-    ax3.set_xticklabels(
+    ax3.set_xticklabels(['All Prompts'])
     ax3.set_yticklabels(method_labels)
     ax3.set_ylabel('Compression Method', fontweight='bold')
-    ax3.set_xlabel('
-    ax3.set_title('(c) Compression Throughput (MB/s)', fontweight='bold', pad=15)
+    ax3.set_xlabel('', fontweight='bold')
+    ax3.set_title('(c) Mean Compression Throughput (MB/s)', fontweight='bold', pad=15)
 
     for i in range(len(method_labels)):
-                ha="center", va="center", color="white", fontweight='bold')
+        text = ax3.text(0, i, f'{speed_matrix[i][0]:.2f}',
+                        ha="center", va="center", color="white", fontweight='bold', fontsize=12)
 
     plt.colorbar(im3, ax=ax3, label='Throughput (MB/s)')
 
-    # Bottom-right: Memory usage
+    # Bottom-right: Memory usage by method
     ax4 = axes[1, 1]
+    memory_data = []
     for method in method_order:
-            subset = df[(df['method'] == method) & (df['prompt_category'] == category)]
-            row.append(subset['compression_memory_mb'].mean())
-        memory_matrix.append(row)
+        method_df = df[df['method'] == method]
+        memory_data.append([method_df['compression_memory_mb'].mean()])
 
+    memory_matrix = np.array(memory_data)
     im4 = ax4.imshow(memory_matrix, cmap='plasma', aspect='auto')
-    ax4.set_xticks(
+    ax4.set_xticks([0])
     ax4.set_yticks(np.arange(len(method_labels)))
-    ax4.set_xticklabels(
+    ax4.set_xticklabels(['All Prompts'])
     ax4.set_yticklabels(method_labels)
     ax4.set_ylabel('Compression Method', fontweight='bold')
-    ax4.set_xlabel('
-    ax4.set_title('(d) Compression Memory Usage (MB)', fontweight='bold', pad=15)
+    ax4.set_xlabel('', fontweight='bold')
+    ax4.set_title('(d) Mean Compression Memory Usage (MB)', fontweight='bold', pad=15)
 
     for i in range(len(method_labels)):
-                ha="center", va="center", color="white", fontweight='bold')
+        text = ax4.text(0, i, f'{memory_matrix[i][0]:.2f}',
+                        ha="center", va="center", color="white", fontweight='bold', fontsize=12)
 
     plt.colorbar(im4, ax=ax4, label='Memory (MB)')
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'comprehensive_comparison')
     plt.close()
-    print(f" Saved: comprehensive_comparison.svg")
 
 
 def plot_scalability(df: pd.DataFrame, output_dir: Path):
@@ -791,24 +691,18 @@ def plot_scalability(df: pd.DataFrame, output_dir: Path):
     ax4.set_xscale('log')
 
     plt.tight_layout()
+    save_both_formats(output_dir, 'scalability_analysis')
     plt.close()
-    print(f" Saved: scalability_analysis.svg")
 
 
-def plot_original_vs_decompressed(output_dir: Path):
+def plot_original_vs_decompressed(jsonl_path: Path, output_dir: Path):
     """Plot original vs decompressed data comparison across multiple prompts."""
     compressor = PromptCompressor(model="cl100k_base", zstd_level=15)
-    prompts =
+    prompts = load_prompts_from_jsonl(jsonl_path)
 
-    # Select a
-        ("Medium Prompt 1", prompts[4][1]),
-        ("Large Prompt 1", prompts[7][1]),
-        ("Medium Prompt 2", prompts[5][1]),
-        ("Small Prompt 2", prompts[1][1]),
-    ]
+    # Select a diverse sample of prompts for visualization (up to 5)
+    num_to_show = min(5, len(prompts))
+    selected_prompts = prompts[:num_to_show]
 
     # Use Hybrid method (best compression)
     method = CompressionMethod.HYBRID
@@ -889,9 +783,8 @@ def plot_original_vs_decompressed(output_dir: Path):
     ax.axhspan(-5, 105, alpha=0.05, color='green', zorder=0)
 
     plt.tight_layout(rect=[0, 0, 1, 0.99])
+    save_both_formats(output_dir, 'original_vs_decompressed')
     plt.close()
-    print(f" Saved: original_vs_decompressed.svg")
 
 
 def main():
@@ -900,15 +793,19 @@ def main():
     output_dir = Path(__file__).parent.parent / 'screenshots'
     output_dir.mkdir(exist_ok=True)
 
+    # JSONL file path
+    jsonl_path = Path(__file__).parent / 'transformers-4-34-0.jsonl'
+
     print("=" * 70)
     print("LoPace Visualization Generator")
     print("=" * 70)
     print(f"Output directory: {output_dir}")
+    print(f"JSONL file: {jsonl_path}")
     print()
 
     # Run benchmarks
     print("Step 1: Running compression benchmarks...")
-    df = run_benchmarks()
+    df = run_benchmarks(jsonl_path)
 
     # Save raw data
     csv_path = output_dir / 'benchmark_data.csv'
@@ -925,7 +822,7 @@ def main():
     plot_memory_usage(df, output_dir)
     plot_comprehensive_comparison(df, output_dir)
     plot_scalability(df, output_dir)
-    plot_original_vs_decompressed(output_dir)
+    plot_original_vs_decompressed(jsonl_path, output_dir)
 
     print("\n" + "=" * 70)
     print("Visualization generation complete!")
lopace-0.1.6.dev2.dist-info/RECORD
DELETED
@@ -1,14 +0,0 @@
-lopace/__init__.py,sha256=1X75FQdnzJaDueL_xRGTBn3vvIAXR1T_siSA_mW391E,443
-lopace/_version.py,sha256=PLPzXBIkQAXc-mvnIPLNmQqit_i0rmO6w4hdXJlO3xc,717
-lopace/compressor.py,sha256=nUTWDcAPYvQaeSFKx_lne-D2xIQ02IMVGE4yLODo8qE,19060
-lopace-0.1.6.dev2.dist-info/licenses/LICENSE,sha256=uFUrlsfsOwx_8Nzhq2pUgNaJghcJxXBMML3l7T39Tm0,1067
-scripts/README.md,sha256=UEsrHKKfiEixTMtWV8trYBKnzgkImJxxEnXTyDI4r9g,2226
-scripts/__init__.py,sha256=XLq0VmLoEBfnWjzYmxb_JRzAIqwZDv-2s10TO692TLc,59
-scripts/generate_visualizations.py,sha256=AJm2DNs-tiwdTHLivEQL9QkztmCclgGT1u4ds5QY4BQ,41812
-scripts/requirements.txt,sha256=EvUUoksfGtvbA45zkCG8to1EaPzWv1eurCONAp8Pdx4,112
-tests/__init__.py,sha256=yXNVJE20E2iHo0qbit5SgRE35eXWq89F1kkhNHy7VJA,31
-tests/test_compressor.py,sha256=-vMztSzY89n5dpShcACrFboEQOlfJ6FxF7eQOEU3swM,8273
-lopace-0.1.6.dev2.dist-info/METADATA,sha256=3vzfkpgjCojuDUQ-Rq8O8IU18-m7SkdZhuB9TobCQGw,20788
-lopace-0.1.6.dev2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-lopace-0.1.6.dev2.dist-info/top_level.txt,sha256=k-gL-51ulMq50vhNS91c1eyGRNse0vs_PzS9VdAiYlw,21
-lopace-0.1.6.dev2.dist-info/RECORD,,
{lopace-0.1.6.dev2.dist-info → lopace-0.1.6.dev3.dist-info}/licenses/LICENSE
File without changes
{lopace-0.1.6.dev2.dist-info → lopace-0.1.6.dev3.dist-info}/top_level.txt
File without changes