llama-benchy 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
llama_benchy/__main__.py CHANGED
@@ -2,552 +2,44 @@
  Main entry point for the llama-benchy CLI.
  """

- import argparse
- import os
- import time
- import uuid
- import subprocess
- import datetime
- import numpy as np
- from tabulate import tabulate
- import aiohttp
  import asyncio
- import json
- import codecs
- import hashlib
- from transformers import AutoTokenizer
- import requests
-
- # Build number is now imported from __init__.py
+ import datetime
  from . import __version__
-
-
-
- def parse_arguments():
-     parser = argparse.ArgumentParser(description="LLM Benchmark Script")
-     parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
-     parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
-     parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
-     parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
-     parser.add_argument("--served-model-name", type=str, default=None, help="Model name used in API calls (defaults to --model if not specified)")
-     parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer name (defaults to model name)")
-     parser.add_argument("--pp", type=int, nargs='+', required=False, default=[2048], help="List of prompt processing token counts - default: 2048")
-     parser.add_argument("--tg", type=int, nargs='+', required=False, default=[32], help="List of token generation counts - default: 32")
-     parser.add_argument("--depth", type=int, nargs='+', default=[0], help="List of context depths (previous conversation tokens) - default: 0")
-     parser.add_argument("--runs", type=int, default=3, help="Number of runs per test - default: 3")
-     parser.add_argument("--no-cache", action="store_true", help="Ensure unique requests to avoid prefix caching and send cache_prompt=false to the server")
-     parser.add_argument("--post-run-cmd", type=str, default=None, help="Command to execute after each test run")
-     parser.add_argument("--book-url", type=str, default="https://www.gutenberg.org/files/1661/1661-0.txt", help="URL of a book to use for text generation, defaults to Sherlock Holmes (https://www.gutenberg.org/files/1661/1661-0.txt)")
-     parser.add_argument("--latency-mode", type=str, default="api", choices=["api", "generation", "none"], help="Method to measure latency: 'api' (list models) - default, 'generation' (single token generation), or 'none' (skip latency measurement)")
-     parser.add_argument("--no-warmup", action="store_true", help="Skip warmup phase")
-     parser.add_argument("--adapt-prompt", action="store_true", default=True, help="Adapt prompt size based on warmup token usage delta (default: True)")
-     parser.add_argument("--no-adapt-prompt", action="store_false", dest="adapt_prompt", help="Disable prompt size adaptation")
-     parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable prefix caching performance measurement")
-     return parser.parse_args()
-
-
- def get_tokenizer(model_name, tokenizer_name=None):
-     try:
-         name = tokenizer_name if tokenizer_name else model_name
-         return AutoTokenizer.from_pretrained(name)
-     except Exception as e:
-         print(f"Error loading tokenizer: {e}")
-         print("Falling back to 'gpt2' tokenizer as approximation.")
-         return AutoTokenizer.from_pretrained("gpt2")
-
-
- def prepare_text_data(book_url, tokenizer):
-     try:
-         # Create cache directory if it doesn't exist
-         cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "llama-benchy")
-         os.makedirs(cache_dir, exist_ok=True)
-
-         # Generate hash of the URL for the filename
-         url_hash = hashlib.md5(book_url.encode()).hexdigest()
-         cache_file = os.path.join(cache_dir, f"{url_hash}.txt")
-
-         if os.path.exists(cache_file):
-             print(f"Loading text from cache: {cache_file}")
-             with open(cache_file, "r", encoding="utf-8") as f:
-                 text = f.read()
-         else:
-             print(f"Downloading book from {book_url}...")
-             response = requests.get(book_url)
-             response.raise_for_status()
-             text = response.text
-             # Basic cleanup
-             start_idx = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
-             if start_idx != -1:
-                 text = text[start_idx:]
-
-             # Save to cache
-             with open(cache_file, "w", encoding="utf-8") as f:
-                 f.write(text)
-             print(f"Saved text to cache: {cache_file}")
-
-         return tokenizer.encode(text, add_special_tokens=False)
-     except Exception as e:
-         print(f"Error downloading book: {e}")
-         exit(1)
-
-
- def generate_prompt(all_tokens, tokenizer, prompt_tokens, context_tokens=0, no_cache=False):
-     suffix = ""
-     suffix_len = 0
-     if no_cache:
-         suffix = f" {uuid.uuid4()}"
-         suffix_len = len(tokenizer.encode(suffix, add_special_tokens=False))
-
-     # Adjust prompt tokens to fetch from text
-     text_prompt_tokens = max(0, prompt_tokens - suffix_len)
-
-     # Create a pool of tokens large enough
-     total_needed = text_prompt_tokens + context_tokens
-
-     if len(all_tokens) < total_needed:
-         # Repeat tokens if not enough
-         all_tokens = all_tokens * (total_needed // len(all_tokens) + 2)
-
-     # Pick a random start position
-     max_start = len(all_tokens) - total_needed
-     start_idx = np.random.randint(0, max_start)
-
-     selected_tokens = all_tokens[start_idx : start_idx + total_needed]
-
-     context_text = tokenizer.decode(selected_tokens[:context_tokens]) if context_tokens > 0 else ""
-     prompt_text = tokenizer.decode(selected_tokens[context_tokens:])
-
-     if no_cache:
-         prompt_text += suffix
-
-     return context_text, prompt_text
-
-
- async def measure_latency(session, base_url, api_key, mode="api", model_name=None):
-     if mode == "none":
-         print("Skipping latency measurement (assuming 0 ms).")
-         return 0
-
-     print(f"Measuring latency using mode: {mode}...")
-     latencies = []
-     headers = {"Authorization": f"Bearer {api_key}"}
-
-     for _ in range(3):
-         start = time.perf_counter()
-         try:
-             if mode == "api":
-                 async with session.get(f"{base_url}/models", headers=headers) as response:
-                     await response.read()
-                     latencies.append(time.perf_counter() - start)
-             elif mode == "generation":
-                 if not model_name:
-                     raise ValueError("Model name required for generation latency mode")
-                 payload = {
-                     "model": model_name,
-                     "messages": [{"role": "user", "content": "hello"}],
-                     "max_tokens": 1,
-                     "stream": True
-                 }
-                 async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
-                     async for _ in response.content:
-                         # record latency as soon as the first byte is received
-                         latencies.append(time.perf_counter() - start)
-                         break
-                     # Drain the rest of the response to keep the connection alive
-                     async for _ in response.content: pass
-         except Exception as e:
-             print(f"Error measuring latency: {e}")
-
-     if latencies:
-         avg_latency = np.mean(latencies)
-         print(f"Average latency ({mode}): {avg_latency*1000:.2f} ms")
-         return avg_latency
-     return 0
-
-
- async def warmup(session, base_url, api_key, model, tokenizer=None):
-     print("Warming up...")
-     headers = {"Authorization": f"Bearer {api_key}"}
-     warmup_text = "Warmup " * 10
-
-     delta_user = 0
-     delta_context = 0
-
-     # 1. User only (No Context)
-     payload_user = {
-         "model": model,
-         "messages": [{"role": "user", "content": warmup_text}],
-         "max_tokens": 1
-     }
-
-     try:
-         async with session.post(f"{base_url}/chat/completions", json=payload_user, headers=headers) as response:
-             response_json = await response.json()
-             if tokenizer:
-                 if 'usage' in response_json:
-                     prompt_tokens = response_json['usage']['prompt_tokens']
-                     local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
-                     delta_user = prompt_tokens - local_tokens
-                     print(f"Warmup (User only) complete. Delta: {delta_user} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
-                 else:
-                     print("Warmup (User only) complete (no usage stats found).")
-             else:
-                 print("Warmup complete.")
-
-         if tokenizer:
-             # 2. System + Empty User (Context Only)
-             payload_sys_empty = {
-                 "model": model,
-                 "messages": [
-                     {"role": "system", "content": warmup_text},
-                     {"role": "user", "content": ""}
-                 ],
-                 "max_tokens": 1
-             }
-             async with session.post(f"{base_url}/chat/completions", json=payload_sys_empty, headers=headers) as response:
-                 response_json = await response.json()
-                 if 'usage' in response_json:
-                     prompt_tokens = response_json['usage']['prompt_tokens']
-                     local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
-                     delta_context = prompt_tokens - local_tokens
-                     print(f"Warmup (System+Empty) complete. Delta: {delta_context} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
-                 else:
-                     print("Warmup (System+Empty) complete (no usage stats found).")
-                     delta_context = delta_user
-
-     except Exception as e:
-         print(f"Warmup failed: {e}")
-     return delta_user, delta_context
-
-
- async def run_benchmark(session, base_url, api_key, model_name, context_text, prompt_text, expected_pp_tokens, tg, no_cache, latency, post_run_cmd):
-     messages = []
-     if context_text:
-         messages.append({"role": "system", "content": context_text})
-     messages.append({"role": "user", "content": prompt_text})
-
-     ttft = 0
-     e2e_ttft = 0
-     token_count = 0
-     first_token_time = 0
-     first_response_time = 0
-     prompt_usage_tokens = 0
-
-     result = {
-         "pp_speed": None,
-         "tg_speed": None,
-         "ttft": None,
-         "ttfr": None,
-         "est_ppt": None,
-         "e2e_ttft": None
-     }
-
-     # DEBUG: Buffer to store first few lines of raw response
-     debug_lines = []
-
-     try:
-         payload = {
-             "model": model_name,
-             "messages": messages,
-             "max_tokens": tg,
-             "stream": True,
-             "stream_options": {"include_usage": True},
-             # "temperature": 0,
-             # "seed": 42
-         }
-
-         if no_cache:
-             payload["cache_prompt"] = False
-
-         headers = {"Authorization": f"Bearer {api_key}"}
-
-         start_time = time.perf_counter()
-
-         async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
-             if response.status != 200:
-                 error_text = await response.text()
-                 print(f"Error: {response.status} - {error_text}")
-                 return None
-
-             buffer = ""
-             decoder = codecs.getincrementaldecoder("utf-8")(errors='replace')
-             async for chunk_bytes in response.content:
-                 chunk_time = time.perf_counter()
-                 decoded_chunk = decoder.decode(chunk_bytes, final=False)
-                 buffer += decoded_chunk
-
-                 while "\n" in buffer:
-                     line, buffer = buffer.split("\n", 1)
-                     line = line.strip()
-                     if not line:
-                         continue
-
-                     # Capture first 5 lines for debugging if needed
-                     if len(debug_lines) < 5:
-                         debug_lines.append(line)
-
-                     if line == 'data: [DONE]' or line == 'data:[DONE]':
-                         continue
-
-                     if line.startswith('data:'):
-                         try:
-                             # Strip 'data:' and potential whitespace
-                             json_str = line[5:].strip()
-                             chunk = json.loads(json_str)
-
-                             if 'usage' in chunk:
-                                 prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
-
-                             if 'choices' in chunk and len(chunk['choices']) > 0:
-                                 if first_response_time == 0:
-                                     first_response_time = chunk_time
-
-                                 delta = chunk['choices'][0].get('delta', {})
-                                 content = delta.get('content')
-                                 reasoning_content = delta.get('reasoning_content')
-                                 reasoning = delta.get('reasoning')
-
-                                 if content or reasoning_content or reasoning:
-                                     if token_count == 0:
-                                         first_token_time = chunk_time
-                                         e2e_ttft = first_token_time - start_time
-                                         ttft = e2e_ttft-latency
-                                         if ttft < 0:
-                                             ttft = 0
-
-                                     token_count += 1
-                         except json.JSONDecodeError:
-                             continue
-
-         end_time = time.perf_counter()
-
-         # DEBUG: Print warning if no tokens were collected
-         if token_count == 0:
-             print(f"\n[Warning] Run generated 0 tokens. Raw response sample: {debug_lines}")
-
-         if token_count > 0:
-             # Calculate decode time (time for subsequent tokens)
-             # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
-             if token_count > 1:
-                 decode_time = end_time - first_token_time
-                 if decode_time > 0:
-                     # Speed for the generated tokens (excluding the first one which is TTFT)
-                     result["tg_speed"] = (token_count - 1) / decode_time
-                 else:
-                     # Fallback if time is too small
-                     result["tg_speed"] = (token_count - 1) / 0.0001
-
-             # Use expected_pp_tokens for speed calculation
-             total_prompt_tokens = expected_pp_tokens
-
-             # Only use reported usage if it's close to expected (to handle tokenizer differences)
-             # but not if it's vastly different (which happens in prefix caching where usage includes cached tokens)
-             if prompt_usage_tokens > 0:
-                 diff = abs(prompt_usage_tokens - expected_pp_tokens)
-                 if diff < expected_pp_tokens * 0.2: # 20% tolerance
-                     total_prompt_tokens = prompt_usage_tokens
-
-             # Calculate TTFR and Estimated Prompt Processing Time
-             ttfr = 0
-             est_ppt = 0
-             if first_response_time > 0:
-                 ttfr = first_response_time - start_time
-                 est_ppt = ttfr - latency
-                 if est_ppt < 0: est_ppt = 0
-
-             if est_ppt > 0:
-                 result["pp_speed"] = total_prompt_tokens / est_ppt
-                 result["est_ppt"] = est_ppt
-
-             if ttfr > 0:
-                 result["ttfr"] = ttfr
-
-             if ttft > 0:
-                 result["ttft"] = ttft
-
-             if e2e_ttft > 0:
-                 result["e2e_ttft"] = e2e_ttft
-
-     except Exception as e:
-         print(f"Error during run: {e}")
-         return None
-
-     if post_run_cmd:
-         try:
-             subprocess.run(post_run_cmd, shell=True, check=True)
-         except subprocess.CalledProcessError as e:
-             print(f"Post-run command failed: {e}")
-
-     return result
-
+ from .config import BenchmarkConfig
+ from .corpus import TokenizedCorpus
+ from .prompts import PromptGenerator
+ from .client import LLMClient
+ from .runner import BenchmarkRunner

  async def main_async():
-     args = parse_arguments()
+     # 1. Parse Configuration
+     config = BenchmarkConfig.from_args()

-     if args.enable_prefix_caching and args.no_cache:
-         print("Error: --enable-prefix-caching and --no-cache are incompatible.")
-         return
-
-     version_number = __version__
-
+     # 2. Print Header
      current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-     print(f"llama-benchy ({version_number})")
+     print(f"llama-benchy ({__version__})")
      print(f"Date: {current_time}")
-     print(f"Benchmarking model: {args.model} at {args.base_url}")
-
-     served_model_name = args.served_model_name if args.served_model_name else args.model
+     print(f"Benchmarking model: {config.model} at {config.base_url}")
+     print(f"Concurrency levels: {config.concurrency_levels}")

-     tokenizer = get_tokenizer(args.model, args.tokenizer)
-     all_tokens = prepare_text_data(args.book_url, tokenizer)
-     print(f"Total tokens available in text corpus: {len(all_tokens)}")
+     # 3. Prepare Data
+     corpus = TokenizedCorpus(config.book_url, config.tokenizer, config.model)
+     print(f"Total tokens available in text corpus: {len(corpus)}")

-     # Use a large timeout for long-running benchmarks
-     timeout = aiohttp.ClientTimeout(total=3600)
-     connector = aiohttp.TCPConnector(limit=1, force_close=False, keepalive_timeout=600)
-     async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
-         delta_user = 0
-         delta_context = 0
-         should_warmup = not args.no_warmup
-         if args.adapt_prompt:
-             should_warmup = True
-
-         if should_warmup:
-             delta_user, delta_context = await warmup(session, args.base_url, args.api_key, served_model_name, tokenizer if args.adapt_prompt else None)
-
-         latency = await measure_latency(session, args.base_url, args.api_key, args.latency_mode, served_model_name)
-
-         results = []
-
-         for depth in args.depth:
-             for pp in args.pp:
-                 for tg in args.tg:
-                     print(f"Running test: pp={pp}, tg={tg}, depth={depth}")
-                     pp_speeds = []
-                     tg_speeds = []
-                     ttft_values = []
-                     ttfr_values = []
-                     est_ppt_values = []
-                     e2e_ttft_values = []
-
-                     ctx_pp_speeds = []
-                     ctx_tg_speeds = []
-                     ctx_ttfr_values = []
-                     ctx_est_ppt_values = []
-                     ctx_e2e_ttft_values = []
-
-                     for run in range(args.runs):
-                         current_pp = pp
-                         current_depth = depth
-                         if args.adapt_prompt:
-                             if depth == 0:
-                                 current_pp = max(1, pp - delta_user)
-                             else:
-                                 current_depth = max(1, depth - delta_context)
-
-                         context, prompt = generate_prompt(all_tokens, tokenizer, current_pp, current_depth, args.no_cache)
-
-                         if args.enable_prefix_caching and depth > 0:
-                             # Request 1: Context only
-                             # We send context as system message, and empty prompt as user message.
-                             # This establishes the prefix: [System: Context] [User: ""]
-                             # Expected PP tokens = current_depth (context size)
-                             print(f" Run {run+1}/{args.runs} (Context Load)...")
-                             ctx_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, "", current_depth, tg, args.no_cache, latency, None)
-
-                             if ctx_result:
-                                 if ctx_result["pp_speed"] is not None:
-                                     ctx_pp_speeds.append(ctx_result["pp_speed"])
-                                 if ctx_result["tg_speed"] is not None:
-                                     ctx_tg_speeds.append(ctx_result["tg_speed"])
-                                 if ctx_result["ttfr"] is not None:
-                                     ctx_ttfr_values.append(ctx_result["ttfr"])
-                                 if ctx_result["est_ppt"] is not None:
-                                     ctx_est_ppt_values.append(ctx_result["est_ppt"])
-                                 if ctx_result["e2e_ttft"] is not None:
-                                     ctx_e2e_ttft_values.append(ctx_result["e2e_ttft"])
-
-                             # Request 2: Context + Prompt
-                             # We send context as system message, and prompt as user message.
-                             # The prefix [System: Context] should be cached.
-                             # Expected PP tokens = current_pp (prompt size only)
-                             print(f" Run {run+1}/{args.runs} (Inference)...")
-                             run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, current_pp, tg, args.no_cache, latency, args.post_run_cmd)
-                         else:
-                             # Standard run
-                             # Expected PP tokens = current_pp + current_depth
-                             expected_tokens = current_pp + current_depth
-                             run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, expected_tokens, tg, args.no_cache, latency, args.post_run_cmd)

-                         if run_result:
-                             if run_result["tg_speed"] is not None:
-                                 tg_speeds.append(run_result["tg_speed"])
-                             if run_result["pp_speed"] is not None:
-                                 pp_speeds.append(run_result["pp_speed"])
-                             if run_result["est_ppt"] is not None:
-                                 est_ppt_values.append(run_result["est_ppt"])
-                             if run_result["ttfr"] is not None:
-                                 ttfr_values.append(run_result["ttfr"])
-                             if run_result["ttft"] is not None:
-                                 ttft_values.append(run_result["ttft"])
-                             if run_result["e2e_ttft"] is not None:
-                                 e2e_ttft_values.append(run_result["e2e_ttft"])
-
-                     # Aggregate results
-                     def format_result(values, multiplier=1.0):
-                         if not values: return ""
-                         mean = np.mean(values) * multiplier
-                         std = np.std(values) * multiplier
-                         return f"{mean:.2f} ± {std:.2f}"
-
-                     # Context PP (if enabled)
-                     if ctx_pp_speeds:
-                         test_name = f"ctx_pp @ d{depth}"
-                         results.append([
-                             args.model,
-                             test_name,
-                             format_result(ctx_pp_speeds),
-                             format_result(ctx_ttfr_values, 1000),
-                             format_result(ctx_est_ppt_values, 1000),
-                             format_result(ctx_e2e_ttft_values, 1000)
-                         ])
-
-                     # Context TG (if enabled)
-                     if ctx_tg_speeds:
-                         test_name = f"ctx_tg @ d{depth}"
-                         results.append([args.model, test_name, format_result(ctx_tg_speeds), "", "", ""])
-
-                     # Standard PP
-                     if pp_speeds:
-                         test_name = f"pp{pp}"
-                         if depth > 0: test_name += f" @ d{depth}"
-                         results.append([
-                             args.model,
-                             test_name,
-                             format_result(pp_speeds),
-                             format_result(ttfr_values, 1000),
-                             format_result(est_ppt_values, 1000),
-                             format_result(e2e_ttft_values, 1000)
-                         ])
-
-                     # Standard TG
-                     if tg_speeds:
-                         test_name = f"tg{tg}"
-                         if depth > 0: test_name += f" @ d{depth}"
-                         results.append([args.model, test_name, format_result(tg_speeds), "", "", ""])
-
-         print()
-         if not results:
-             print("No results collected. Check if the model is generating tokens.")
-         else:
-             print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-         print(f"\nllama-benchy ({version_number})")
-         print(f"date: {current_time} | latency mode: {args.latency_mode}")
-
+     # 4. Initialize Components
+     prompt_gen = PromptGenerator(corpus)
+     client = LLMClient(config.base_url, config.api_key, config.served_model_name)
+     runner = BenchmarkRunner(config, client, prompt_gen)
+
+     # 5. Run Benchmark Suite
+     await runner.run_suite()
+
+     print(f"\nllama-benchy ({__version__})")
+     print(f"date: {current_time} | latency mode: {config.latency_mode}")

  def main():
      """Entry point for the CLI command."""
      asyncio.run(main_async())

-
  if __name__ == "__main__":
-     main()
+     main()
llama_benchy/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.2'
- __version_tuple__ = version_tuple = (0, 1, 2)
+ __version__ = version = '0.2.1'
+ __version_tuple__ = version_tuple = (0, 2, 1)

  __commit_id__ = commit_id = None
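
The __main__.py change above replaces the monolithic 0.1.2 script with a thin entry point that wires together BenchmarkConfig, TokenizedCorpus, PromptGenerator, LLMClient and BenchmarkRunner. Below is a minimal, hypothetical sketch of driving that entry point from another script; it assumes BenchmarkConfig.from_args() still reads CLI-style flags such as --base-url and --model (taken from the removed 0.1.2 parse_arguments(), so the exact flag names in 0.2.1 may differ).

# Hypothetical usage sketch (not part of either wheel): run the refactored
# 0.2.1 entry point programmatically. Flag names are assumed to carry over
# from the 0.1.2 argparse definition shown above and may have changed.
import asyncio
import sys

from llama_benchy.__main__ import main_async

sys.argv = [
    "llama-benchy",
    "--base-url", "http://localhost:8000/v1",  # assumed OpenAI-compatible endpoint
    "--model", "my-model",                     # assumed model identifier
]

asyncio.run(main_async())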