promptum 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptum/__init__.py +3 -2
- promptum/benchmark/__init__.py +2 -1
- promptum/benchmark/report.py +13 -26
- promptum/benchmark/summary.py +14 -0
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/METADATA +43 -35
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/RECORD +8 -7
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/WHEEL +0 -0
- {promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/licenses/LICENSE +0 -0
promptum/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from promptum.benchmark import Benchmark, Report, Runner, TestCase, TestResult
+from promptum.benchmark import Benchmark, Report, Runner, Summary, TestCase, TestResult
 from promptum.providers import LLMProvider, Metrics, OpenRouterClient, RetryConfig, RetryStrategy
 from promptum.validation import (
     Contains,
@@ -8,11 +8,12 @@ from promptum.validation import (
     Validator,
 )
 
-__version__ = "0.0.2"
+__version__ = "0.0.3"
 
 __all__ = [
     "TestCase",
     "TestResult",
+    "Summary",
     "Metrics",
     "RetryConfig",
     "RetryStrategy",
promptum/benchmark/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from promptum.benchmark.benchmark import Benchmark
 from promptum.benchmark.report import Report
 from promptum.benchmark.result import TestResult
 from promptum.benchmark.runner import Runner
+from promptum.benchmark.summary import Summary
 from promptum.benchmark.test_case import TestCase
 
-__all__ = ["Benchmark", "Report", "Runner", "TestCase", "TestResult"]
+__all__ = ["Benchmark", "Report", "Runner", "Summary", "TestCase", "TestResult"]
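Taken together, the two `__init__` changes mean `Summary` is importable from both the package root and `promptum.benchmark`. A minimal sketch of the new import surface and the typed return value it enables (the `describe` helper is hypothetical, added here only for illustration):

```python
# Sketch only: Summary is re-exported at the top level in 0.0.3.
from promptum import Report, Summary


def describe(report: Report) -> str:
    # get_summary() now returns the typed Summary dataclass
    # rather than the old dict-style summary.
    summary: Summary = report.get_summary()
    return f"{summary.passed}/{summary.total} passed ({summary.pass_rate:.0%})"
```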
promptum/benchmark/report.py
CHANGED
@@ -1,15 +1,15 @@
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass
-from typing import Any
 
 from promptum.benchmark.result import TestResult
+from promptum.benchmark.summary import Summary
 
 
 @dataclass(frozen=True, slots=True)
 class Report:
     results: Sequence[TestResult]
 
-    def get_summary(self) ->
+    def get_summary(self) -> Summary:
         total = len(self.results)
         passed = sum(1 for r in self.results if r.passed)
 
@@ -17,18 +17,17 @@ class Report:
         total_cost = sum(r.metrics.cost_usd or 0 for r in self.results if r.metrics)
         total_tokens = sum(r.metrics.total_tokens or 0 for r in self.results if r.metrics)
 
-        return
-            ...
-        }
+        return Summary(
+            total=total,
+            passed=passed,
+            failed=total - passed,
+            pass_rate=passed / total if total > 0 else 0,
+            avg_latency_ms=sum(latencies) / len(latencies) if latencies else 0,
+            min_latency_ms=min(latencies) if latencies else 0,
+            max_latency_ms=max(latencies) if latencies else 0,
+            total_cost_usd=total_cost,
+            total_tokens=total_tokens,
+        )
 
     def filter(
         self,
@@ -60,15 +59,3 @@ class Report:
             groups[group_key].append(result)
 
         return {k: Report(results=v) for k, v in groups.items()}
-
-    def compare_models(self) -> dict[str, dict[str, Any]]:
-        by_model = self.group_by(lambda r: r.test_case.model)
-        return {model: report.get_summary() for model, report in by_model.items()}
-
-    @staticmethod
-    def _percentile(values: list[float], p: float) -> float:
-        if not values:
-            return 0
-        sorted_values = sorted(values)
-        index = int((len(sorted_values) - 1) * p)
-        return sorted_values[index]
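With `get_summary()` now returning `Summary`, the removed `compare_models()` and `_percentile()` helpers have no direct replacement in `Report`. A hedged sketch of how equivalent behaviour can be recovered with the surviving `group_by()` API and the standard library (the helper names below are illustrative, not part of promptum):

```python
import statistics

from promptum import Report, Summary


def compare_models(report: Report) -> dict[str, Summary]:
    # Mirrors the removed Report.compare_models(): bucket results by model,
    # then summarise each bucket with the new typed Summary.
    by_model = report.group_by(lambda r: r.test_case.model)
    return {model: sub.get_summary() for model, sub in by_model.items()}


def p95(latencies_ms: list[float]) -> float:
    # Rough stand-in for the removed _percentile() helper.
    if len(latencies_ms) < 2:
        return latencies_ms[0] if latencies_ms else 0.0
    return statistics.quantiles(latencies_ms, n=100)[94]
```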
promptum/benchmark/summary.py
ADDED
@@ -0,0 +1,14 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class Summary:
+    total: int
+    passed: int
+    failed: int
+    pass_rate: float
+    avg_latency_ms: float
+    min_latency_ms: float
+    max_latency_ms: float
+    total_cost_usd: float
+    total_tokens: int
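Because `Summary` is declared with `frozen=True, slots=True`, instances are immutable and work with the standard `dataclasses` helpers. A small sketch of serialising one (the field values below are invented purely for illustration):

```python
import json
from dataclasses import asdict

from promptum import Summary

# Invented numbers; in practice this object comes from Report.get_summary().
summary = Summary(
    total=10, passed=9, failed=1, pass_rate=0.9,
    avg_latency_ms=420.0, min_latency_ms=180.0, max_latency_ms=910.0,
    total_cost_usd=0.0123, total_tokens=4567,
)

print(json.dumps(asdict(summary), indent=2))
# summary.passed = 10  # would raise FrozenInstanceError: the dataclass is frozen
```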
{promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: promptum
-Version: 0.0.2
+Version: 0.0.3
 Summary: Async LLM benchmarking library with protocol-based extensibility
 Project-URL: Homepage, https://github.com/deyna256/promptum
 Project-URL: Repository, https://github.com/deyna256/promptum
@@ -46,7 +46,7 @@ Description-Content-Type: text/markdown
 ![Python 3.13+](https://img.shields.io/badge/python-3.13+-blue.svg)
 ![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)
 
-**Benchmark LLMs Like a Pro
+**Benchmark LLMs Like a Pro.**
 
 Stop writing boilerplate to test LLMs. Start getting results.
 
@@ -56,11 +56,12 @@ Stop writing boilerplate to test LLMs. Start getting results.
 
 ## What's This?
 
-A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get
+A dead-simple Python library for benchmarking LLM providers. Write tests once, run them across any model, get structured results.
 
 ```python
 benchmark = Benchmark(provider=client, name="my_test")
 benchmark.add_test(TestCase(
+    name="basic_math",
     prompt="What is 2+2?",
     model="gpt-3.5-turbo",
     validator=Contains("4")
@@ -130,9 +131,9 @@ async def main():
     report = await benchmark.run_async()
     summary = report.get_summary()
 
-    print(f"✓ {summary
-    print(f"⚡ {summary
-    print(f"💰 ${summary
+    print(f"✓ {summary.passed}/{summary.total} tests passed")
+    print(f"⚡ {summary.avg_latency_ms:.0f}ms average")
+    print(f"💰 ${summary.total_cost_usd:.6f} total cost")
 
 asyncio.run(main())
 ```
@@ -146,7 +147,7 @@ python your_script.py
 
 ## What You Get
 
-- [x] **
+- [x] **100+ Models via OpenRouter** - One client for OpenAI, Anthropic, Google, and more
 - [x] **Smart Validation** - ExactMatch, Contains, Regex, JsonSchema, or write your own
 - [x] **Automatic Retries** - Exponential/linear backoff with configurable attempts
 - [x] **Metrics Tracking** - Latency, tokens, cost - automatically captured
@@ -161,35 +162,42 @@ python your_script.py
 Compare GPT-4 vs Claude on your tasks:
 
 ```python
-...
+import asyncio
+from promptum import Benchmark, TestCase, Contains, Regex, OpenRouterClient
+
+async def main():
+    async with OpenRouterClient(api_key="your-key") as client:
+        benchmark = Benchmark(provider=client, name="model_comparison")
+
+        benchmark.add_tests([
+            TestCase(
+                name="json_output_gpt4",
+                prompt='Output JSON: {"status": "ok"}',
+                model="openai/gpt-4",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="json_output_claude",
+                prompt='Output JSON: {"status": "ok"}',
+                model="anthropic/claude-3-5-sonnet",
+                validator=Regex(r'\{"status":\s*"ok"\}')
+            ),
+            TestCase(
+                name="creative_writing",
+                prompt="Write a haiku about Python",
+                model="openai/gpt-4",
+                validator=Contains("Python", case_sensitive=False)
+            ),
+        ])
 
-...
+        report = await benchmark.run_async()
+
+        # Side-by-side model comparison
+        for model, model_report in report.group_by(lambda r: r.test_case.model).items():
+            summary = model_report.get_summary()
+            print(f"{model}: {summary.pass_rate:.0%} pass rate, {summary.avg_latency_ms:.0f}ms avg")
+
+asyncio.run(main())
 ```
 
 ---
{promptum-0.0.2.dist-info → promptum-0.0.3.dist-info}/RECORD
CHANGED
@@ -1,10 +1,11 @@
-promptum/__init__.py,sha256=
+promptum/__init__.py,sha256=F2dfvZUHDnAwQOGDXINgFnu8vtnwfS_twr6O5tmL1K8,606
 promptum/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-promptum/benchmark/__init__.py,sha256=
+promptum/benchmark/__init__.py,sha256=eyeJHW4mZdKv2vuQRfXFOAKSH07YNROeNlBKJCOfWHg,369
 promptum/benchmark/benchmark.py,sha256=hZ3557qPKqFeNNuxrRLPs-b6XBy2JCowIhRDDwatfeI,1403
-promptum/benchmark/report.py,sha256=
+promptum/benchmark/report.py,sha256=IkCpd3cswLtL9vTUIXAAYGvZk_t9wyQAzbIFpOSXnkI,2080
 promptum/benchmark/result.py,sha256=nKh-T4zlam2LxsaFoL8jeVaO6kZJ1sfB_tnp4gdNPhM,482
 promptum/benchmark/runner.py,sha256=5p6JBwjTlEHTh6jNv_iuFH1nIrI4_Gv3wmzCT0TWpvA,2407
+promptum/benchmark/summary.py,sha256=dD-i8m2BUMFhMC_7v-ITDNUP5X7fG0ARErwKB5D9_yE,281
 promptum/benchmark/test_case.py,sha256=Okypf2334ewVrvmQG7M3I3D7BzqXDsQ2ihjNw9gGF00,598
 promptum/providers/__init__.py,sha256=UprvJ4vxHqo-VTzzUmZ4wFCj6VybP9xBd7HtpPPSvbI,335
 promptum/providers/metrics.py,sha256=FnS10nHFjQ5Clj5X21C_nW6zAUJU_ZHt0s2fLgp6L28,427
@@ -14,7 +15,7 @@ promptum/providers/retry.py,sha256=mA_RRz9_9J_mge_AUd9f1A-gACOxZLGTI8vTIstAr8s,5
 promptum/validation/__init__.py,sha256=mhykyxaIwn2PJh2RXAi0fi2NRIveFmlC5bg1nyCbfVU,252
 promptum/validation/protocol.py,sha256=xqxm23YX6eNeZHKMLMZ-Wz8iQKn4ZRzAI5Xryxg0uq4,418
 promptum/validation/validators.py,sha256=qSMva2P2miXXJJ5XeTKJsyYgh2x5wORi3dhOnBYuACE,2686
-promptum-0.0.
-promptum-0.0.
-promptum-0.0.
-promptum-0.0.
+promptum-0.0.3.dist-info/METADATA,sha256=P4d6Mf35ly_xATjxAUWZXjQPYyfb5xfBMqiXoGkkDkc,8278
+promptum-0.0.3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+promptum-0.0.3.dist-info/licenses/LICENSE,sha256=Fgn285H5Vy9diOlqO1TzS3hD97WcdF6-GFHvUcFNtmg,1067
+promptum-0.0.3.dist-info/RECORD,,
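Each RECORD entry is `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64, unpadded SHA-256 of the file, per the wheel spec. A standalone sketch for checking one of the new 0.0.3 entries against a file from the unpacked wheel (the local path is an assumption):

```python
import base64
import hashlib
from pathlib import Path


def record_digest(path: Path) -> str:
    # RECORD hashes are sha256, urlsafe base64 encoded with '=' padding stripped.
    raw = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")


# Assumes the 0.0.3 wheel has been unpacked into the current directory.
path = Path("promptum/benchmark/summary.py")
print(record_digest(path))  # expected per RECORD: dD-i8m2BUMFhMC_7v-ITDNUP5X7fG0ARErwKB5D9_yE
```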
File without changes
|
|
File without changes
|