levelapp 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {levelapp-0.1.1 → levelapp-0.1.3}/PKG-INFO +12 -13
- {levelapp-0.1.1 → levelapp-0.1.3}/README.md +7 -8
- levelapp-0.1.3/docs/media/simulator-module-diagram.PNG +0 -0
- levelapp-0.1.3/docs/media/simulator-sequence-diagram.png +0 -0
- levelapp-0.1.3/examples/README.md +322 -0
- levelapp-0.1.3/examples/conversation_script.json +38 -0
- levelapp-0.1.3/examples/example_chatbot.py +48 -0
- levelapp-0.1.3/examples/example_evaluation.py +28 -0
- levelapp-0.1.3/examples/workflow_configuration.yaml +38 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/monitor.py +6 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/__init__.py +0 -1
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/scorer.py +0 -2
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/config/endpoint.py +22 -13
- levelapp-0.1.3/levelapp/config/endpoint_.py +62 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/config/prompts.py +22 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/schemas.py +0 -2
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/session.py +29 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/evaluator/evaluator.py +16 -4
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/__init__.py +1 -5
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/schemas.py +7 -13
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/simulator.py +24 -21
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/utils.py +40 -78
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/base.py +38 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/config.py +31 -4
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/context.py +0 -1
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/factory.py +16 -3
- {levelapp-0.1.1 → levelapp-0.1.3}/pyproject.toml +67 -67
- levelapp-0.1.3/src/data/evaluation_results.json +1 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/workflow_config.yaml +3 -3
- levelapp-0.1.3/src/level_app/main_session.py +48 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/uv.lock +1 -5
- levelapp-0.1.1/examples/example_evaluation.py +0 -0
- levelapp-0.1.1/src/level_app/main_session.py +0 -46
- {levelapp-0.1.1 → levelapp-0.1.3}/.gitignore +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/.python-version +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/LICENSE +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/MANIFEST.in +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/Makefile +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/loader.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/logger.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/sanitizer.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/anthropic.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/ionos.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/mistral.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/openai.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/comparator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/extractor.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/schemas.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/comparator/utils.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/config/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/core/base.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/evaluator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/embedding.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/exact.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/fuzzy.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/metrics/token.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/plugins/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/repository/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/repository/firestore.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/simulator/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/registration.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/levelapp/workflow/runtime.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/make.bat +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/project_structure.txt +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/conversation_example_1.json +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/endpoint_configuration.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/payload_example_1.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/payload_example_2.yaml +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/data/workflow_config_2.json +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/main.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/main_monitoring.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/src/level_app/main_simulator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/__init__.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_anthropic.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_comparator.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_ionos.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_mistral.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_monitoring.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_openai.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_session.py +0 -0
- {levelapp-0.1.1 → levelapp-0.1.3}/tests/test_simulator.py +0 -0
{levelapp-0.1.1 → levelapp-0.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: levelapp
-Version: 0.1.1
+Version: 0.1.3
 Summary: LevelApp is an evaluation framework for AI/LLM-based software application. [Powered by Norma]
 Project-URL: Homepage, https://github.com/levelapp-org
 Project-URL: Repository, https://github.com/levelapp-org/levelapp-framework
@@ -17,14 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Testing
 Requires-Python: >=3.12
-Requires-Dist: arrow>=1.3.0
 Requires-Dist: google-api-core>=2.25.1
 Requires-Dist: google-auth>=2.40.3
 Requires-Dist: google-cloud-firestore>=2.21.0
 Requires-Dist: httpx>=0.28.1
 Requires-Dist: humanize>=4.13.0
 Requires-Dist: numpy>=2.3.2
-Requires-Dist: openai>=1.99.9
 Requires-Dist: pandas-stubs==2.3.0.250703
 Requires-Dist: pandas>=2.3.1
 Requires-Dist: pydantic>=2.11.7
@@ -35,10 +33,12 @@ Requires-Dist: rapidfuzz>=3.13.0
 Requires-Dist: requests>=2.32.4
 Requires-Dist: tenacity>=9.1.2
 Provides-Extra: dev
-Requires-Dist:
+Requires-Dist: google-api-core>=2.25.1; extra == 'dev'
+Requires-Dist: google-auth>=2.40.3; extra == 'dev'
+Requires-Dist: google-cloud-firestore>=2.21.0; extra == 'dev'
 Requires-Dist: httpx>=0.28.1; extra == 'dev'
+Requires-Dist: humanize>=4.13.0; extra == 'dev'
 Requires-Dist: numpy>=2.3.2; extra == 'dev'
-Requires-Dist: openai>=1.99.9; extra == 'dev'
 Requires-Dist: pandas-stubs==2.3.0.250703; extra == 'dev'
 Requires-Dist: pandas>=2.3.1; extra == 'dev'
 Requires-Dist: pydantic>=2.11.7; extra == 'dev'
@@ -133,7 +133,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"
 
 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -220,14 +220,14 @@ To run an evaluation:
 
 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
     # Load configuration from YAML
     config = WorkflowConfig.load(path="../data/workflow_config.yaml")
 
-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -243,14 +243,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
-
-
+
     config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
        "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }
 
     content = {
{levelapp-0.1.1 → levelapp-0.1.3}/README.md
@@ -81,7 +81,7 @@ endpoint:
     generated_metadata: "${generated_metadata}"
 
 repository:
-  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
+  type: FIRESTORE # Pick one of the following: FIRESTORE, FILESYSTEM
   project_id: "(default)"
   database_name: ""
 ```
@@ -168,14 +168,14 @@ To run an evaluation:
 
 ```python
 if __name__ == "__main__":
-    from levelapp.workflow
+    from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
     # Load configuration from YAML
     config = WorkflowConfig.load(path="../data/workflow_config.yaml")
 
-    # Run evaluation session
-    with EvaluationSession(session_name="test-session-1", workflow_config=config) as session:
+    # Run evaluation session (You can enable/disable the monitoring aspect)
+    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
         session.run()
         results = session.workflow.collect_results()
         print("Results:", results)
@@ -191,14 +191,13 @@ if __name__ == "__main__":
     from levelapp.workflow import WorkflowConfig
     from levelapp.core.session import EvaluationSession
 
-
-
+
     config_dict = {
         "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
-        "evaluation": {"evaluators": ["JUDGE"], "providers": ["openai", "ionos"]},
+        "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
         "reference_data": {"path": "", "data": {}},
         "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
-        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"
+        "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
     }
 
     content = {
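For context, the dict-based configuration that the PKG-INFO and README.md hunks above complete (closing the `repository` entry and extending `evaluation` with a `REFERENCE` evaluator and a `metrics_map`) can be read as one piece. The sketch below is assembled only from calls that appear elsewhere in this diff (`WorkflowConfig.from_dict`, `set_reference_data`, `EvaluationSession`); the empty `scripts` list and the `field_1` metric mapping are illustrative placeholders rather than package defaults.

```Python
# Hedged sketch: the updated README snippet assembled end to end, using only
# calls visible in this diff. The inline reference data and "field_1" mapping
# are placeholders, not values shipped by the package.
from levelapp.workflow import WorkflowConfig
from levelapp.core.session import EvaluationSession

config_dict = {
    "process": {"project_name": "test-project", "workflow_type": "SIMULATOR", "evaluation_params": {"attempts": 2}},
    "evaluation": {"evaluators": ["JUDGE", "REFERENCE"], "providers": ["openai", "ionos"], "metrics_map": {"field_1": "EXACT"}},
    "reference_data": {"path": "", "data": {}},
    "endpoint": {"base_url": "http://127.0.0.1:8000", "api_key": "key", "model_id": "model"},
    "repository": {"type": "FIRESTORE", "source": "IN_MEMORY"},
}

if __name__ == "__main__":
    config = WorkflowConfig.from_dict(content=config_dict)
    # Inline reference data instead of a file path (see examples/README.md below).
    config.set_reference_data(content={"scripts": []})

    # enable_monitoring=False skips the monitoring aspect, as in the updated README example.
    with EvaluationSession(session_name="test-session-1", workflow_config=config, enable_monitoring=False) as session:
        session.run()
        print("Results:", session.workflow.collect_results())
```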
levelapp-0.1.3/docs/media/simulator-module-diagram.PNG
Binary file
levelapp-0.1.3/docs/media/simulator-sequence-diagram.png
Binary file
levelapp-0.1.3/examples/README.md
@@ -0,0 +1,322 @@
+# Quickstart Guide: Using LevelApp's Conversation Simulator for Developers
+
+---
+#### Welcome to LevelApp Quickstart Guide!
+This guide provides a step-by-step walkthrough for developers to set up and use the Simulator Module in LevelApp.
+<br>
+<br>
+The **Simulator** focuses on black-box testing by simulating dialogues using predefined scripts, evaluating responses against references, and computing metrics on extracted metadata.
+It leverages LLM-as-a-judge for qualitative scoring and supports quantitative metrics like exact matches or fuzzy comparisons.
+<br>
+<figure>
+<img
+src="../docs/media/simulator-module-diagram.PNG"
+alt="Sequence Diagram">
+<figcaption>Fig.1 - Simulator Module Diagram</figcaption>
+</figure>
+<br>
+We'll emphasize technical details, including configuration schemas, placeholders, evaluators, metrics, and code execution flow. This assumes you're familiar with Python, YAML/JSON, and REST APIs for LLM endpoints. By the end, you'll have a runnable example for evaluating a chatbot's conversation flow.
+
+---
+## Introduction
+First, let's have a quick introduction on what LevelApp is and what it provides as a framework.
+
+The idea behind LevelApp is to build a framework that assists developers to perform regression tests on their LLM-powered systems ensuring that recent changes to code have not negatively impacted existing functionality or introduced new defects. <br>
+The evaluation of dialogue systems is very cost/time intensive and problematic since assessing the quality of a dialogue requires multiple iteration where a human conducts a message/reply evaluation for each interaction (quite tedious and boring task, if you ask me!).
+
+Automating the evaluation and introducing an LLM-as-a-judge as an approach to evaluate the correctness of responses can
+ease the process and render it more efficient.
+---
+## Walkthrough
+### Step1: Installation and Prerequisites
+Install LevelApp using pip. This pulls in dependencies like `pydantic`, `numpy`, `python-dotenv`,
+and others for handling LLM clients, data validation, and metrics computation.
+
+```bash
+pip install levelapp
+```
+
+#### Technical Prerequisites:
+
+* **Python Version**: 3.12+. LevelApp uses modern features like type hints and async support (via `asyncio` for potential batch processing).
+* **LLM Provider Credentials**: You'll need API keys for at least one supported provider (e.g., OpenAI, Anthropic, IONOS, Mistral). These are loaded via `python-dotenv` from a `.env` file. Without them, evaluators like JUDGE won't function.
+* **No Internet for Dependencies**: All core deps are installed automatically; no manual `pip install` needed beyond the initial command.
+* **Environment Setup**: Create a `.env` file in your project root. Example structure (replace with your actual keys):
+
+```
+IONOS_API_KEY=your-ionos-key
+OPENAI_API_KEY=sk-your-openai-key
+ANTHROPIC_API_KEY=your-anthropic-key
+MISTRAL_API_KEY=your-mistral-key
+
+IONOS_BASE_URL=https://inference.de-txl.ionos.com
+IONOS_MODEL_ID=0b6c4a15-bb8d-4092-82b0-f357b77c59fd
+
+# Optional: Path to workflow config if not loading programmatically
+WORKFLOW_CONFIG_PATH=../data/workflow_config.yaml
+```
+**Note**: For IONOS, the base_url and model_id are mandatory in .env as they aren't always configurable via YAML alone.
+LevelApp uses these to construct API requests.
+
+### Step2: Understanding the Simulator Workflow
+The Simulator Module simulates conversations by:
+1. Sending user messages (from a JSON script) to your LLM-based system's endpoint.
+2. Capturing generated responses and metadata.
+3. Evaluating them using selected evaluators (e.g., JUDGE for LLM-scored quality, REFERENCE for direct comparison).
+4. Computing metrics on metadata (e.g., EXACT for string matching, LEVENSHTEIN for edit distance).
+
+<figure>
+<img
+src="../docs/media/simulator-sequence-diagram.png"
+alt="Sequence Diagram">
+<figcaption>Fig.2 - Conversation Simulator Sequence Diagram</figcaption>
+</figure>
+
+#### Key Technical Concepts:
+
+Workflow Type: Set to `SIMULATOR` in YAML. This triggers dialogue simulation logic in `levelapp.workflow`.
+
+* **Evaluators**:
+  * `JUDGE`: Uses an LLM (from providers like OpenAI) to score generated replies against references (e.g., on relevance, fluency). Configurable via providers list.
+  * `REFERENCE`: Direct comparison without LLM, using metrics for metadata (Used for comparing extracted metadata).
+  * `RAG`: Retrieval-Augmented Generation evaluator (for knowledge-grounded responses; requires additional setup).
+<br>
+<br>
+* **Metrics Map**: A dict mapping metadata fields to comparison methods (e.g., `EXACT` for exact string match, `LEVENSHTEIN` for fuzzy matching with distance thresholds).
+Full list in docs: includes Token-based, Embedded (vector similarity), Fuzzy.
+<br>
+<br>
+* **Attempts and Batching**: `evaluation_params` attempts runs simulations multiple times for averaging scores (useful for non-deterministic LLMs). batch_size controls concurrent requests to avoid rate limits.
+<br>
+<br>
+* **Placeholders in Payloads**:
+  * `default_request_payload_template`: For this section, you need to change **field** (e.g,. change the field name `prompt` to `message`) names and not the **placeholder** values. The placeholders are used by the simulator to populate the request body.
+  * `default_response_payload_template`: For this section, you need to change the placeholders values and not the fields, contrary to the request section. The simulator will use the provided placeholder values to extract and map the reply and metadata from the response body.
+
+### Step 3: Creating the YAML Configuration File
+Create `workflow_config.yaml` to define the workflow. This is parsed into a `WorkflowConfig` Pydantic model for validation.
+
+Example `workflow_config.yaml` for Simulator:
+```YAML
+# PROCESS SECTION:
+process:
+  project_name: "chatbot-evaluation"
+  workflow_type: SIMULATOR # Must be SIMULATOR for conversation testing
+  evaluation_params:
+    attempts: 3 # Run each interaction 3 times, average results
+    batch_size: 10 # Process 10 interactions concurrently
+
+# EVALUATION SECTION:
+evaluation:
+  evaluators: # Array of evaluators to apply
+    - JUDGE
+    - REFERENCE # REFERENCE evaluator can be used if your dialogue system returns additional metadata.
+  providers: # LLM providers for JUDGE (At least one must be provided for the JUDGE evaluator)
+    - openai
+    - ionos
+  metrics_map: # Map metadata fields to metrics
+    appointment_type: EXACT # Exact match for strings
+    date: LEVENSHTEIN # Fuzzy match for dates (e.g., tolerates formatting differences)
+    time: TOKEN_BASED # Token-level overlap
+
+# REFERENCE DATA SECTION:
+reference_data:
+  path: "conversation_script.json" # Path to JSON script
+  data: {} # Inline data if not using path (dict of scripts)
+
+# ENDPOINT CONFIGURATION SECTION:
+endpoint:
+  base_url: "http://127.0.0.1:8000" # Your chatbot's API base URL
+  url_path: "chat" # Endpoint path (full URL = base_url + url_path)
+  api_key: "" # Optional; overrides .env if set
+  bearer_token: "" # For auth
+  model_id: "meta-llama/Meta-Llama-3.1-8B-Instruct" # Model for your endpoint (if applicable).
+  default_request_payload_template: # Template for POST body
+    message: "${user_message}" # Adapt to your API (e.g., 'prompt' for some)
+    payload: "${request_payload}" # Additional data from JSON script
+  default_response_payload_template: # Extract from API response
+    agent_reply: "${generated_reply}" # Map to your response field
+    generated_metadata: "${metadata}" # e.g., extracted entities
+
+# REPOSITORY SECTION (Optional):
+repository:
+  type: FILESYSTEM # Or FIRESTORE/MONGODB for persistence
+  project_id: "" # For FIRESTORE
+  database_name: "" # For FIRESTORE/MONGODB
+  source: "LOCAL" # Or IN_MEMORY for non-persistent
+```
+
+For the endpoint configuration section (`endpoint`), essentially, you need to provide:
+* base_url
+* url_path
+* headers data: API Key, Bearer Token, or any additional header data.
+
+As for the request payload, for example, if you have the following request payload schema:
+```JSON
+{
+  "prompt": "Hello, world!",
+  "user_id": "0001",
+  "user_role": "ADMIN",
+}
+```
+You need to configure the `default_request_payload_template` like the following:
+```YAML
+default_request_payload_template:
+  prompt: "${user_message}" # As you can notice, we only changed the field name and not the placeholder value.
+  payload: "${request_payload}" # The rest of the data will be fetched from the "request_payload" field in the reference data JSON file.
+```
+while providing the rest of the payload request inside the reference data JSON file content:
+```JSON
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, I would like to book an appointment with a doctor.",
+          "reference_reply": "Sure, I can help with that. Could you please specify the type of doctor you need to see?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "0001", "user_role": "ADMIN"} // Here we add the rest of the request payload data.
+        }
+        ...
+```
+And for the response payload, if you have the following response payload schema:
+```JSON
+{
+  "response": "Hello, world!",
+  "metadata": {"k1": "v1", "k2": "v2"},
+  "timestamp": "2025-10-14T14:49:00.123Z",
+  "status": "COMPLETE"
+}
+```
+You need to configure the `default_response_payload_template` like the following:
+```YAML
+default_response_payload_template:
+  agent_reply: "${response}" # we changed the placeholder value here by adding "response" field where the reply value is held.
+  generated_metadata: "${metadata}"
+```
+
+### Step 4: Creating the JSON Conversation Script
+The script defines simulation flows. It's a dict with a `scripts` array, each containing `interactions` (sequential turns).
+<br>
+Example `conversation_script.json`:
+```JSON
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, book a doctor appointment.",
+          "reference_reply": "What type of doctor?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Cardiologist.",
+          "reference_reply": "When?",
+          "interaction_type": "intermediate",
+          "reference_metadata": {"type": "Cardiology"},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Next Monday at 10 AM.",
+          "reference_reply": "Booked for 10 AM next Monday.",
+          "interaction_type": "final",
+          "reference_metadata": {
+            "appointment_type": "Cardiology",
+            "date": "2025-10-20",
+            "time": "10:00"
+          },
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        }
+      ],
+      "description": "Doctor booking flow",
+      "details": {"context": "Medical chatbot"}
+    }
+  ]
+}
+```
+#### Technical Notes:
+
+* **Schema Validation**: Interactions are validated against a schema (e.g., user_message: str, reference_metadata: dict).
+* **Metadata Comparison**: generated_metadata from your endpoint is compared to reference_metadata using metrics_map.
+* **Interaction Types**: initial/intermediate/final for flow control; can influence evaluator behavior (e.g., stricter on final turns).
+* **Request Payload**: Merged into the endpoint request template for context (e.g., user auth).
+
+### Step 5: Writing and Running the Python Script
+Use this to load configs, run the simulation, and collect results. LevelApp handles session management via context managers.
+<br>
+Example run_simulation.py:
+```Python
+from dotenv import load_dotenv
+from levelapp.workflow import WorkflowConfig
+from levelapp.core.session import EvaluationSession
+
+
+# Load .env (automatically done in LevelApp, but explicit for clarity)
+load_dotenv()
+
+if __name__ == "__main__":
+    # Load YAML config (validates via Pydantic)
+    config = WorkflowConfig.load(path="workflow_config.yaml")
+
+    # Alternative: Load from dict for in-memory config (e.g., from DB)
+    # config_dict = {...} # As in README
+    # config = WorkflowConfig.from_dict(content=config_dict)
+    # config.set_reference_data(content={"scripts": [...]}) # Inline script
+
+    # Create session (handles logging, repository init)
+    with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+        # Run simulation: Sends requests, evaluates, stores in repo
+        session.run()
+
+        # Collect the evaluation results
+        results = session.workflow.collect_results()
+        print("Evaluation Results:", results)
+
+        stats = session.get_stats()
+        print("Session Stats:\n", stats)
+```
+
+Technical Execution Flow:
+
+1. `WorkflowConfig.load()`: Parses YAML, loads .env secrets, validates.
+2. `EvaluationSession`: Initializes the evaluation session.
+3. `session.run()`: Loops over scripts/interactions:
+   * Substitutes placeholders, sends POST to endpoint.
+   * Extracts chatbot reply and generated metadata.
+   * Applies evaluators (e.g., JUDGE prompts LLM with "Score reply on scale 0-3: generated vs reference").
+   * Computes metrics (e.g., Levenshtein distance via numpy).
+4. `collect_results()`: Returns the evaluation results.
+5. `get_stats()`: Retrieves monitoring stats (API calls details, caching details, processing time, etc.).
+
+---
+### Let's Test It:
+First, install the packages required to run the examples test:
+<br>
+(it is always recommended to set up a virtual environment for testing)
+```Bash
+pip install fastapi uvicorn levelapp
+```
+Second, run the chatbot (`example_chatbot.py`) using `uvicorn`:
+<br>
+(don't forget to add your `OPENAI_API_KEY`!)
+```Bash
+uvicorn example_chatbot:app --reload --port 8000
+```
+Next, optionally, run a health test to see if the chatbot is alive:
+```Bash
+curl http://localhost:8000/healthz
+```
+Finally, run the evaluation:
+```Bash
+python example_evaluation.py
+```
+
+That's it! All you need now is to verify and interpret the evaluation results.
+**Good luck!**
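The quickstart above leans on the `${...}` placeholder convention for `default_request_payload_template` and `default_response_payload_template`. As a reading aid only (this is not the simulator's actual code), the sketch below shows the substitution idea: the field names are yours to adapt, while the placeholder values name the interaction or response fields being mapped.

```Python
# Conceptual sketch only, not levelapp's implementation: resolves "${...}"
# placeholders in a payload template against an interaction's fields.
from typing import Any

def fill_template(template: dict[str, Any], values: dict[str, Any]) -> dict[str, Any]:
    """Replace "${name}" strings in a payload template with values[name]."""
    filled: dict[str, Any] = {}
    for field, placeholder in template.items():
        if isinstance(placeholder, str) and placeholder.startswith("${") and placeholder.endswith("}"):
            filled[field] = values.get(placeholder[2:-1])
        else:
            filled[field] = placeholder
    return filled

request_template = {"message": "${user_message}", "payload": "${request_payload}"}
interaction = {"user_message": "Hello, book a doctor appointment.",
               "request_payload": {"user_id": "123", "role": "user"}}
print(fill_template(request_template, interaction))
# {'message': 'Hello, book a doctor appointment.', 'payload': {'user_id': '123', 'role': 'user'}}
```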
levelapp-0.1.3/examples/conversation_script.json
@@ -0,0 +1,38 @@
+{
+  "scripts": [
+    {
+      "interactions": [
+        {
+          "user_message": "Hello, book a doctor appointment.",
+          "reference_reply": "What type of doctor?",
+          "interaction_type": "initial",
+          "reference_metadata": {},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Cardiologist.",
+          "reference_reply": "When?",
+          "interaction_type": "intermediate",
+          "reference_metadata": {"type": "Cardiology"},
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        },
+        {
+          "user_message": "Next Monday at 10 AM.",
+          "reference_reply": "Booked for 10 AM next Monday.",
+          "interaction_type": "final",
+          "reference_metadata": {
+            "appointment_type": "Cardiology",
+            "date": "2025-10-20",
+            "time": "10:00"
+          },
+          "guardrail_flag": false,
+          "request_payload": {"user_id": "123", "role": "user"}
+        }
+      ],
+      "description": "Doctor booking flow",
+      "details": {"context": "Medical chatbot"}
+    }
+  ]
+}
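The script added above follows the structure documented in Step 4 of the quickstart. A small standalone check like the one below (illustrative, not part of the package; it assumes you run it from the `examples/` directory) can catch missing keys before a simulation run.

```Python
# Illustrative sanity check of conversation_script.json, independent of levelapp.
import json

with open("conversation_script.json", encoding="utf-8") as f:
    data = json.load(f)

required = {"user_message", "reference_reply", "interaction_type",
            "reference_metadata", "guardrail_flag", "request_payload"}
for script in data["scripts"]:
    for turn in script["interactions"]:
        missing = required - turn.keys()
        assert not missing, f"interaction missing keys: {missing}"
    print(f"{script['description']}: {len(script['interactions'])} interactions OK")
```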
levelapp-0.1.3/examples/example_chatbot.py
@@ -0,0 +1,48 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from openai import OpenAI
+
+app = FastAPI(title="Tiny Chatbot")
+
+client = OpenAI(api_key="<YOUR-API-KEY-HERE>")
+
+SYSTEM_PROMPT = "Play role as a medical assistant."
+
+
+class ChatRequest(BaseModel):
+    message: str
+
+
+class ChatResponse(BaseModel):
+    reply: str
+
+
+def generate_reply(user_message: str) -> str:
+    try:
+        resp = client.chat.completions.create(
+            model="gpt-4o-mini",  # pick any chat-capable model you have access to
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": user_message},
+            ],
+            temperature=0.3,
+        )
+        return resp.choices[0].message.content.strip()
+    except Exception as e:
+        raise RuntimeError(f"LLM error: {e}")
+
+
+@app.post("/chat", response_model=ChatResponse)
+def chat(req: ChatRequest):
+    if not req.message:
+        raise HTTPException(status_code=400, detail="`message` is required.")
+    try:
+        reply = generate_reply(req.message)
+        return ChatResponse(reply=reply)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/healthz")
+def health():
+    return {"status": "ok"}
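Besides the `curl` health check shown in the quickstart, the `/chat` route above can be exercised directly. The short client below is illustrative and assumes the app is running locally on port 8000 (`uvicorn example_chatbot:app --port 8000`); it uses `httpx`, which is already a levelapp dependency, but any HTTP client works.

```Python
# Quick manual check of the example /chat endpoint defined above.
import httpx

resp = httpx.post(
    "http://127.0.0.1:8000/chat",
    json={"message": "Hello, I would like to book an appointment."},
    timeout=30.0,
)
resp.raise_for_status()
print(resp.json()["reply"])  # ChatResponse exposes a single "reply" field
```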
levelapp-0.1.3/examples/example_evaluation.py
@@ -0,0 +1,28 @@
+from dotenv import load_dotenv
+from levelapp.core.session import EvaluationSession
+from levelapp.workflow import WorkflowConfig
+
+# Load .env (automatically done in LevelApp, but explicit for clarity)
+load_dotenv()
+
+if __name__ == "__main__":
+    # 1. Load YAML config
+    config = WorkflowConfig.load(path="workflow_configuration.yaml")
+
+    # Alternatively: Load from dict for in-memory config (e.g., from DB)
+    # config_dict = {...} # As in README
+    # config = WorkflowConfig.from_dict(content=config_dict)
+    # config.set_reference_data(content={"scripts": [...]}) # Inline script
+
+    # 2. Create an evaluation session
+    with EvaluationSession(session_name="chatbot-sim-1", workflow_config=config) as session:
+        # 2.1. Run session (simulation session)
+        session.run()
+
+        # 2.2. Collect evaluation results
+        results = session.workflow.collect_results()
+        print("Evaluation Results:", results)
+
+        # 3. Get aggregated stats (monitoring stats)
+        stats = session.get_stats()
+        print("Session Stats:\n", stats)
levelapp-0.1.3/examples/workflow_configuration.yaml
@@ -0,0 +1,38 @@
+process:
+  project_name: "chatbot-evaluation"
+  workflow_type: SIMULATOR # Must be SIMULATOR for conversation testing
+  evaluation_params:
+    attempts: 3
+    batch_size: 10
+
+evaluation:
+  evaluators:
+    - JUDGE
+    - REFERENCE
+  providers:
+    - openai
+    - ionos
+  metrics_map:
+    appointment_type: EXACT
+    date: LEVENSHTEIN
+    time: TOKEN_BASED
+
+reference_data:
+  path: "conversation_script.json"
+  data: {}
+
+endpoint:
+  base_url: "http://127.0.0.1:8000"
+  url_path: "chat"
+  api_key: ""
+  bearer_token: ""
+  default_request_payload_template:
+    message: "${user_message}"
+  default_response_payload_template:
+    agent_reply: "${reply}"
+
+repository:
+  type: FIRESTORE # Keep this value as if for now.
+  project_id: ""
+  database_name: ""
+  source: "LOCAL"
{levelapp-0.1.1 → levelapp-0.1.3}/levelapp/aspects/monitor.py
@@ -343,6 +343,7 @@ class FunctionMonitor:
         category: MetricType,
         enable_timing: bool,
         track_memory: bool,
+        verbose=False
     ) -> Callable[P, T]:
         """
         Wrap function execution with timing and error handling.
@@ -352,6 +353,7 @@ class FunctionMonitor:
             name: Unique identifier for the function
             enable_timing: Enable execution time logging
             track_memory: Enable memory tracking
+            verbose: Enable verbose logging
 
         Returns:
             Wrapped function
@@ -402,7 +404,7 @@ class FunctionMonitor:
 
             self._aggregated_stats[name].update(metrics=metrics)
 
-            if enable_timing and metrics.duration is not None:
+            if verbose and enable_timing and metrics.duration is not None:
                 log_message = f"[FunctionMonitor] Executed '{name}' in {metrics.duration:.4f}s"
                 if metrics.cache_hit:
                     log_message += " (cache hit)"
@@ -420,7 +422,8 @@ class FunctionMonitor:
         maxsize: int | None = 128,
         enable_timing: bool = True,
         track_memory: bool = True,
-        collectors: List[Type[MetricsCollector]] | None = None
+        collectors: List[Type[MetricsCollector]] | None = None,
+        verbose: bool = False
     ) -> Callable[[Callable[P, T]], Callable[P, T]]:
         """
         Decorator factory for monitoring functions.
@@ -454,7 +457,7 @@ class FunctionMonitor:
             )
 
         with self._lock:
-            if name in self._monitored_procedures:
+            if name in self._monitored_procedures and verbose:
                 raise ValueError(f"Function '{name}' is already registered.")
 
             self._monitored_procedures[name] = monitored_func
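The monitor.py hunks above add an opt-in `verbose` flag so the per-call "[FunctionMonitor] Executed ..." timing log is emitted only on request, while aggregated stats are still collected. The sketch below only illustrates the intended usage pattern; the decorator factory's public name is not visible in these hunks, so `monitor.monitor(...)` and the direct `FunctionMonitor()` construction are hypothetical stand-ins, and any required arguments not shown in the hunks (such as a registration name) are omitted.

```Python
# Hypothetical usage sketch for the new `verbose` flag introduced in this diff.
# NOTE: the factory name `monitor` and the FunctionMonitor() constructor call are
# assumptions; only the keyword arguments enable_timing, track_memory, collectors,
# maxsize, and verbose appear in the hunks above.
from levelapp.aspects.monitor import FunctionMonitor  # module path taken from the file list

monitor = FunctionMonitor()

@monitor.monitor(enable_timing=True, track_memory=True, verbose=True)
def score_reply(generated: str, reference: str) -> float:
    # ... evaluation work ...
    return 1.0

score_reply("Booked.", "Booked for 10 AM next Monday.")
# With verbose=False (the new default) the per-call timing log line is suppressed;
# verbose=True restores the pre-0.1.3 logging behavior.
```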
{levelapp-0.1.1 → levelapp-0.1.3}/levelapp/clients/__init__.py
@@ -44,7 +44,6 @@ class ClientRegistry:
 
         cls._wrap_client_methods(client_class)
         cls._clients[provider] = client_class
-        logger.info(f"[ClientRegistry] Registered client for provider: {provider}")
 
     @classmethod
     def _wrap_client_methods(cls, client_class: Type[BaseChatClient]) -> None: