flock-core 0.5.0b28__py3-none-any.whl → 0.5.56b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flock-core might be problematic. Click here for more details.
- flock/__init__.py +12 -217
- flock/agent.py +678 -0
- flock/api/themes.py +71 -0
- flock/artifacts.py +79 -0
- flock/cli.py +75 -0
- flock/components.py +173 -0
- flock/dashboard/__init__.py +28 -0
- flock/dashboard/collector.py +283 -0
- flock/dashboard/events.py +182 -0
- flock/dashboard/launcher.py +230 -0
- flock/dashboard/service.py +537 -0
- flock/dashboard/websocket.py +235 -0
- flock/engines/__init__.py +6 -0
- flock/engines/dspy_engine.py +856 -0
- flock/examples.py +128 -0
- flock/{core/util → helper}/cli_helper.py +4 -3
- flock/{core/logging → logging}/__init__.py +2 -3
- flock/{core/logging → logging}/formatters/enum_builder.py +3 -4
- flock/{core/logging → logging}/formatters/theme_builder.py +19 -44
- flock/{core/logging → logging}/formatters/themed_formatter.py +69 -115
- flock/{core/logging → logging}/logging.py +77 -61
- flock/{core/logging → logging}/telemetry.py +20 -26
- flock/{core/logging → logging}/telemetry_exporter/base_exporter.py +2 -2
- flock/{core/logging → logging}/telemetry_exporter/file_exporter.py +6 -9
- flock/{core/logging → logging}/telemetry_exporter/sqlite_exporter.py +2 -3
- flock/{core/logging → logging}/trace_and_logged.py +20 -24
- flock/mcp/__init__.py +91 -0
- flock/{core/mcp/mcp_client.py → mcp/client.py} +103 -154
- flock/{core/mcp/mcp_config.py → mcp/config.py} +62 -117
- flock/mcp/manager.py +255 -0
- flock/mcp/servers/sse/__init__.py +1 -1
- flock/mcp/servers/sse/flock_sse_server.py +11 -53
- flock/mcp/servers/stdio/__init__.py +1 -1
- flock/mcp/servers/stdio/flock_stdio_server.py +8 -48
- flock/mcp/servers/streamable_http/flock_streamable_http_server.py +17 -62
- flock/mcp/servers/websockets/flock_websocket_server.py +7 -40
- flock/{core/mcp/flock_mcp_tool.py → mcp/tool.py} +16 -26
- flock/mcp/types/__init__.py +42 -0
- flock/{core/mcp → mcp}/types/callbacks.py +9 -15
- flock/{core/mcp → mcp}/types/factories.py +7 -6
- flock/{core/mcp → mcp}/types/handlers.py +13 -18
- flock/{core/mcp → mcp}/types/types.py +70 -74
- flock/{core/mcp → mcp}/util/helpers.py +1 -1
- flock/orchestrator.py +645 -0
- flock/registry.py +148 -0
- flock/runtime.py +262 -0
- flock/service.py +140 -0
- flock/store.py +69 -0
- flock/subscription.py +111 -0
- flock/themes/andromeda.toml +1 -1
- flock/themes/apple-system-colors.toml +1 -1
- flock/themes/arcoiris.toml +1 -1
- flock/themes/atomonelight.toml +1 -1
- flock/themes/ayu copy.toml +1 -1
- flock/themes/ayu-light.toml +1 -1
- flock/themes/belafonte-day.toml +1 -1
- flock/themes/belafonte-night.toml +1 -1
- flock/themes/blulocodark.toml +1 -1
- flock/themes/breeze.toml +1 -1
- flock/themes/broadcast.toml +1 -1
- flock/themes/brogrammer.toml +1 -1
- flock/themes/builtin-dark.toml +1 -1
- flock/themes/builtin-pastel-dark.toml +1 -1
- flock/themes/catppuccin-latte.toml +1 -1
- flock/themes/catppuccin-macchiato.toml +1 -1
- flock/themes/catppuccin-mocha.toml +1 -1
- flock/themes/cga.toml +1 -1
- flock/themes/chalk.toml +1 -1
- flock/themes/ciapre.toml +1 -1
- flock/themes/coffee-theme.toml +1 -1
- flock/themes/cyberpunkscarletprotocol.toml +1 -1
- flock/themes/dark+.toml +1 -1
- flock/themes/darkermatrix.toml +1 -1
- flock/themes/darkside.toml +1 -1
- flock/themes/desert.toml +1 -1
- flock/themes/django.toml +1 -1
- flock/themes/djangosmooth.toml +1 -1
- flock/themes/doomone.toml +1 -1
- flock/themes/dotgov.toml +1 -1
- flock/themes/dracula+.toml +1 -1
- flock/themes/duckbones.toml +1 -1
- flock/themes/encom.toml +1 -1
- flock/themes/espresso.toml +1 -1
- flock/themes/everblush.toml +1 -1
- flock/themes/fairyfloss.toml +1 -1
- flock/themes/fideloper.toml +1 -1
- flock/themes/fishtank.toml +1 -1
- flock/themes/flexoki-light.toml +1 -1
- flock/themes/floraverse.toml +1 -1
- flock/themes/framer.toml +1 -1
- flock/themes/galizur.toml +1 -1
- flock/themes/github.toml +1 -1
- flock/themes/grass.toml +1 -1
- flock/themes/grey-green.toml +1 -1
- flock/themes/gruvboxlight.toml +1 -1
- flock/themes/guezwhoz.toml +1 -1
- flock/themes/harper.toml +1 -1
- flock/themes/hax0r-blue.toml +1 -1
- flock/themes/hopscotch.256.toml +1 -1
- flock/themes/ic-green-ppl.toml +1 -1
- flock/themes/iceberg-dark.toml +1 -1
- flock/themes/japanesque.toml +1 -1
- flock/themes/jubi.toml +1 -1
- flock/themes/kibble.toml +1 -1
- flock/themes/kolorit.toml +1 -1
- flock/themes/kurokula.toml +1 -1
- flock/themes/materialdesigncolors.toml +1 -1
- flock/themes/matrix.toml +1 -1
- flock/themes/mellifluous.toml +1 -1
- flock/themes/midnight-in-mojave.toml +1 -1
- flock/themes/monokai-remastered.toml +1 -1
- flock/themes/monokai-soda.toml +1 -1
- flock/themes/neon.toml +1 -1
- flock/themes/neopolitan.toml +1 -1
- flock/themes/nord-light.toml +1 -1
- flock/themes/ocean.toml +1 -1
- flock/themes/onehalfdark.toml +1 -1
- flock/themes/onehalflight.toml +1 -1
- flock/themes/palenighthc.toml +1 -1
- flock/themes/paulmillr.toml +1 -1
- flock/themes/pencildark.toml +1 -1
- flock/themes/pnevma.toml +1 -1
- flock/themes/purple-rain.toml +1 -1
- flock/themes/purplepeter.toml +1 -1
- flock/themes/raycast-dark.toml +1 -1
- flock/themes/red-sands.toml +1 -1
- flock/themes/relaxed.toml +1 -1
- flock/themes/retro.toml +1 -1
- flock/themes/rose-pine.toml +1 -1
- flock/themes/royal.toml +1 -1
- flock/themes/ryuuko.toml +1 -1
- flock/themes/sakura.toml +1 -1
- flock/themes/scarlet-protocol.toml +1 -1
- flock/themes/seoulbones-dark.toml +1 -1
- flock/themes/shades-of-purple.toml +1 -1
- flock/themes/smyck.toml +1 -1
- flock/themes/softserver.toml +1 -1
- flock/themes/solarized-darcula.toml +1 -1
- flock/themes/square.toml +1 -1
- flock/themes/sugarplum.toml +1 -1
- flock/themes/thayer-bright.toml +1 -1
- flock/themes/tokyonight.toml +1 -1
- flock/themes/tomorrow.toml +1 -1
- flock/themes/ubuntu.toml +1 -1
- flock/themes/ultradark.toml +1 -1
- flock/themes/ultraviolent.toml +1 -1
- flock/themes/unikitty.toml +1 -1
- flock/themes/urple.toml +1 -1
- flock/themes/vesper.toml +1 -1
- flock/themes/vimbones.toml +1 -1
- flock/themes/wildcherry.toml +1 -1
- flock/themes/wilmersdorf.toml +1 -1
- flock/themes/wryan.toml +1 -1
- flock/themes/xcodedarkhc.toml +1 -1
- flock/themes/xcodelight.toml +1 -1
- flock/themes/zenbones-light.toml +1 -1
- flock/themes/zenwritten-dark.toml +1 -1
- flock/utilities.py +301 -0
- flock/{components/utility → utility}/output_utility_component.py +68 -53
- flock/visibility.py +107 -0
- flock_core-0.5.56b0.dist-info/METADATA +747 -0
- flock_core-0.5.56b0.dist-info/RECORD +398 -0
- flock_core-0.5.56b0.dist-info/entry_points.txt +2 -0
- {flock_core-0.5.0b28.dist-info → flock_core-0.5.56b0.dist-info}/licenses/LICENSE +1 -1
- flock/adapter/__init__.py +0 -14
- flock/adapter/azure_adapter.py +0 -68
- flock/adapter/chroma_adapter.py +0 -73
- flock/adapter/faiss_adapter.py +0 -97
- flock/adapter/pinecone_adapter.py +0 -51
- flock/adapter/vector_base.py +0 -47
- flock/cli/assets/release_notes.md +0 -140
- flock/cli/config.py +0 -8
- flock/cli/constants.py +0 -36
- flock/cli/create_agent.py +0 -1
- flock/cli/create_flock.py +0 -280
- flock/cli/execute_flock.py +0 -620
- flock/cli/load_agent.py +0 -1
- flock/cli/load_examples.py +0 -1
- flock/cli/load_flock.py +0 -192
- flock/cli/load_release_notes.py +0 -20
- flock/cli/loaded_flock_cli.py +0 -254
- flock/cli/manage_agents.py +0 -459
- flock/cli/registry_management.py +0 -889
- flock/cli/runner.py +0 -41
- flock/cli/settings.py +0 -857
- flock/cli/utils.py +0 -135
- flock/cli/view_results.py +0 -29
- flock/cli/yaml_editor.py +0 -396
- flock/components/__init__.py +0 -30
- flock/components/evaluation/__init__.py +0 -9
- flock/components/evaluation/declarative_evaluation_component.py +0 -606
- flock/components/routing/__init__.py +0 -15
- flock/components/routing/conditional_routing_component.py +0 -494
- flock/components/routing/default_routing_component.py +0 -103
- flock/components/routing/llm_routing_component.py +0 -206
- flock/components/utility/__init__.py +0 -22
- flock/components/utility/example_utility_component.py +0 -250
- flock/components/utility/feedback_utility_component.py +0 -206
- flock/components/utility/memory_utility_component.py +0 -550
- flock/components/utility/metrics_utility_component.py +0 -700
- flock/config.py +0 -61
- flock/core/__init__.py +0 -110
- flock/core/agent/__init__.py +0 -16
- flock/core/agent/default_agent.py +0 -216
- flock/core/agent/flock_agent_components.py +0 -104
- flock/core/agent/flock_agent_execution.py +0 -101
- flock/core/agent/flock_agent_integration.py +0 -260
- flock/core/agent/flock_agent_lifecycle.py +0 -186
- flock/core/agent/flock_agent_serialization.py +0 -381
- flock/core/api/__init__.py +0 -10
- flock/core/api/custom_endpoint.py +0 -45
- flock/core/api/endpoints.py +0 -254
- flock/core/api/main.py +0 -162
- flock/core/api/models.py +0 -97
- flock/core/api/run_store.py +0 -224
- flock/core/api/runner.py +0 -44
- flock/core/api/service.py +0 -214
- flock/core/component/__init__.py +0 -15
- flock/core/component/agent_component_base.py +0 -309
- flock/core/component/evaluation_component.py +0 -62
- flock/core/component/routing_component.py +0 -74
- flock/core/component/utility_component.py +0 -69
- flock/core/config/flock_agent_config.py +0 -58
- flock/core/config/scheduled_agent_config.py +0 -40
- flock/core/context/context.py +0 -213
- flock/core/context/context_manager.py +0 -37
- flock/core/context/context_vars.py +0 -10
- flock/core/evaluation/utils.py +0 -396
- flock/core/execution/batch_executor.py +0 -369
- flock/core/execution/evaluation_executor.py +0 -438
- flock/core/execution/local_executor.py +0 -31
- flock/core/execution/opik_executor.py +0 -103
- flock/core/execution/temporal_executor.py +0 -164
- flock/core/flock.py +0 -634
- flock/core/flock_agent.py +0 -336
- flock/core/flock_factory.py +0 -613
- flock/core/flock_scheduler.py +0 -166
- flock/core/flock_server_manager.py +0 -136
- flock/core/interpreter/python_interpreter.py +0 -689
- flock/core/mcp/__init__.py +0 -1
- flock/core/mcp/flock_mcp_server.py +0 -680
- flock/core/mcp/mcp_client_manager.py +0 -201
- flock/core/mcp/types/__init__.py +0 -1
- flock/core/mixin/dspy_integration.py +0 -403
- flock/core/mixin/prompt_parser.py +0 -125
- flock/core/orchestration/__init__.py +0 -15
- flock/core/orchestration/flock_batch_processor.py +0 -94
- flock/core/orchestration/flock_evaluator.py +0 -113
- flock/core/orchestration/flock_execution.py +0 -295
- flock/core/orchestration/flock_initialization.py +0 -149
- flock/core/orchestration/flock_server_manager.py +0 -67
- flock/core/orchestration/flock_web_server.py +0 -117
- flock/core/registry/__init__.py +0 -45
- flock/core/registry/agent_registry.py +0 -69
- flock/core/registry/callable_registry.py +0 -139
- flock/core/registry/component_discovery.py +0 -142
- flock/core/registry/component_registry.py +0 -64
- flock/core/registry/config_mapping.py +0 -64
- flock/core/registry/decorators.py +0 -137
- flock/core/registry/registry_hub.py +0 -205
- flock/core/registry/server_registry.py +0 -57
- flock/core/registry/type_registry.py +0 -86
- flock/core/serialization/__init__.py +0 -13
- flock/core/serialization/callable_registry.py +0 -52
- flock/core/serialization/flock_serializer.py +0 -832
- flock/core/serialization/json_encoder.py +0 -41
- flock/core/serialization/secure_serializer.py +0 -175
- flock/core/serialization/serializable.py +0 -342
- flock/core/serialization/serialization_utils.py +0 -412
- flock/core/util/file_path_utils.py +0 -223
- flock/core/util/hydrator.py +0 -309
- flock/core/util/input_resolver.py +0 -164
- flock/core/util/loader.py +0 -59
- flock/core/util/splitter.py +0 -219
- flock/di.py +0 -27
- flock/platform/docker_tools.py +0 -49
- flock/platform/jaeger_install.py +0 -86
- flock/webapp/__init__.py +0 -1
- flock/webapp/app/__init__.py +0 -0
- flock/webapp/app/api/__init__.py +0 -0
- flock/webapp/app/api/agent_management.py +0 -241
- flock/webapp/app/api/execution.py +0 -709
- flock/webapp/app/api/flock_management.py +0 -129
- flock/webapp/app/api/registry_viewer.py +0 -30
- flock/webapp/app/chat.py +0 -665
- flock/webapp/app/config.py +0 -104
- flock/webapp/app/dependencies.py +0 -117
- flock/webapp/app/main.py +0 -1070
- flock/webapp/app/middleware.py +0 -113
- flock/webapp/app/models_ui.py +0 -7
- flock/webapp/app/services/__init__.py +0 -0
- flock/webapp/app/services/feedback_file_service.py +0 -363
- flock/webapp/app/services/flock_service.py +0 -337
- flock/webapp/app/services/sharing_models.py +0 -81
- flock/webapp/app/services/sharing_store.py +0 -762
- flock/webapp/app/templates/theme_mapper.html +0 -326
- flock/webapp/app/theme_mapper.py +0 -812
- flock/webapp/app/utils.py +0 -85
- flock/webapp/run.py +0 -215
- flock/webapp/static/css/chat.css +0 -301
- flock/webapp/static/css/components.css +0 -167
- flock/webapp/static/css/header.css +0 -39
- flock/webapp/static/css/layout.css +0 -46
- flock/webapp/static/css/sidebar.css +0 -127
- flock/webapp/static/css/two-pane.css +0 -48
- flock/webapp/templates/base.html +0 -200
- flock/webapp/templates/chat.html +0 -152
- flock/webapp/templates/chat_settings.html +0 -19
- flock/webapp/templates/flock_editor.html +0 -16
- flock/webapp/templates/index.html +0 -12
- flock/webapp/templates/partials/_agent_detail_form.html +0 -93
- flock/webapp/templates/partials/_agent_list.html +0 -18
- flock/webapp/templates/partials/_agent_manager_view.html +0 -51
- flock/webapp/templates/partials/_agent_tools_checklist.html +0 -14
- flock/webapp/templates/partials/_chat_container.html +0 -15
- flock/webapp/templates/partials/_chat_messages.html +0 -57
- flock/webapp/templates/partials/_chat_settings_form.html +0 -85
- flock/webapp/templates/partials/_create_flock_form.html +0 -50
- flock/webapp/templates/partials/_dashboard_flock_detail.html +0 -17
- flock/webapp/templates/partials/_dashboard_flock_file_list.html +0 -16
- flock/webapp/templates/partials/_dashboard_flock_properties_preview.html +0 -28
- flock/webapp/templates/partials/_dashboard_upload_flock_form.html +0 -16
- flock/webapp/templates/partials/_dynamic_input_form_content.html +0 -22
- flock/webapp/templates/partials/_env_vars_table.html +0 -23
- flock/webapp/templates/partials/_execution_form.html +0 -118
- flock/webapp/templates/partials/_execution_view_container.html +0 -28
- flock/webapp/templates/partials/_flock_file_list.html +0 -23
- flock/webapp/templates/partials/_flock_properties_form.html +0 -52
- flock/webapp/templates/partials/_flock_upload_form.html +0 -16
- flock/webapp/templates/partials/_header_flock_status.html +0 -5
- flock/webapp/templates/partials/_load_manager_view.html +0 -49
- flock/webapp/templates/partials/_registry_table.html +0 -25
- flock/webapp/templates/partials/_registry_viewer_content.html +0 -70
- flock/webapp/templates/partials/_results_display.html +0 -78
- flock/webapp/templates/partials/_settings_env_content.html +0 -9
- flock/webapp/templates/partials/_settings_theme_content.html +0 -14
- flock/webapp/templates/partials/_settings_view.html +0 -36
- flock/webapp/templates/partials/_share_chat_link_snippet.html +0 -11
- flock/webapp/templates/partials/_share_link_snippet.html +0 -35
- flock/webapp/templates/partials/_sidebar.html +0 -74
- flock/webapp/templates/partials/_streaming_results_container.html +0 -195
- flock/webapp/templates/partials/_structured_data_view.html +0 -40
- flock/webapp/templates/partials/_theme_preview.html +0 -36
- flock/webapp/templates/registry_viewer.html +0 -84
- flock/webapp/templates/shared_run_page.html +0 -140
- flock/workflow/__init__.py +0 -0
- flock/workflow/activities.py +0 -196
- flock/workflow/agent_activities.py +0 -24
- flock/workflow/agent_execution_activity.py +0 -202
- flock/workflow/flock_workflow.py +0 -214
- flock/workflow/temporal_config.py +0 -96
- flock/workflow/temporal_setup.py +0 -68
- flock_core-0.5.0b28.dist-info/METADATA +0 -274
- flock_core-0.5.0b28.dist-info/RECORD +0 -561
- flock_core-0.5.0b28.dist-info/entry_points.txt +0 -2
- /flock/{core/logging → logging}/formatters/themes.py +0 -0
- /flock/{core/logging → logging}/span_middleware/baggage_span_processor.py +0 -0
- /flock/{core/mcp → mcp}/util/__init__.py +0 -0
- {flock_core-0.5.0b28.dist-info → flock_core-0.5.56b0.dist-info}/WHEEL +0 -0
flock/core/context/context.py
DELETED
|
@@ -1,213 +0,0 @@
|
|
|
1
|
-
import uuid
|
|
2
|
-
from dataclasses import asdict
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from typing import Any, Literal
|
|
5
|
-
|
|
6
|
-
from opentelemetry import trace
|
|
7
|
-
from pydantic import BaseModel, Field
|
|
8
|
-
|
|
9
|
-
from flock.core.context.context_vars import FLOCK_LAST_AGENT, FLOCK_LAST_RESULT
|
|
10
|
-
from flock.core.logging.logging import get_logger
|
|
11
|
-
from flock.core.serialization.serializable import Serializable
|
|
12
|
-
|
|
13
|
-
logger = get_logger("context")
|
|
14
|
-
tracer = trace.get_tracer(__name__)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
class AgentRunRecord(BaseModel):
|
|
18
|
-
id: str = Field(default="")
|
|
19
|
-
agent: str = Field(default="")
|
|
20
|
-
data: dict[str, Any] = Field(default_factory=dict)
|
|
21
|
-
timestamp: str = Field(default="")
|
|
22
|
-
hand_off: dict | None = Field(default_factory=dict)
|
|
23
|
-
called_from: str | None = Field(default=None)
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class AgentDefinition(BaseModel):
|
|
27
|
-
agent_type: str = Field(default="")
|
|
28
|
-
agent_name: str = Field(default="")
|
|
29
|
-
agent_data: dict = Field(default_factory=dict)
|
|
30
|
-
serializer: Literal["json", "cloudpickle", "msgpack"] = Field(
|
|
31
|
-
default="cloudpickle"
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class FlockContext(Serializable, BaseModel):
|
|
36
|
-
state: dict[str, Any] = Field(default_factory=dict)
|
|
37
|
-
history: list[AgentRunRecord] = Field(default_factory=list)
|
|
38
|
-
agent_definitions: dict[str, AgentDefinition] = Field(default_factory=dict)
|
|
39
|
-
run_id: str = Field(default="")
|
|
40
|
-
workflow_id: str = Field(default="")
|
|
41
|
-
workflow_timestamp: str = Field(default="")
|
|
42
|
-
|
|
43
|
-
def record(
|
|
44
|
-
self,
|
|
45
|
-
agent_name: str,
|
|
46
|
-
data: dict[str, Any],
|
|
47
|
-
timestamp: str,
|
|
48
|
-
hand_off: str,
|
|
49
|
-
called_from: str,
|
|
50
|
-
) -> None:
|
|
51
|
-
record = AgentRunRecord(
|
|
52
|
-
id=agent_name + "_" + uuid.uuid4().hex[:4],
|
|
53
|
-
agent=agent_name,
|
|
54
|
-
data=data.copy(),
|
|
55
|
-
timestamp=timestamp,
|
|
56
|
-
hand_off=hand_off,
|
|
57
|
-
called_from=called_from,
|
|
58
|
-
)
|
|
59
|
-
self.history.append(record)
|
|
60
|
-
for key, value in data.items():
|
|
61
|
-
self.set_variable(f"{agent_name}.{key}", value)
|
|
62
|
-
self.set_variable(FLOCK_LAST_RESULT, data)
|
|
63
|
-
self.set_variable(FLOCK_LAST_AGENT, agent_name)
|
|
64
|
-
logger.info(
|
|
65
|
-
f"Agent run recorded - run_id '{record.id}'",
|
|
66
|
-
agent=agent_name,
|
|
67
|
-
timestamp=timestamp,
|
|
68
|
-
data=data,
|
|
69
|
-
)
|
|
70
|
-
current_span = trace.get_current_span()
|
|
71
|
-
if current_span.get_span_context().is_valid:
|
|
72
|
-
current_span.add_event(
|
|
73
|
-
"record",
|
|
74
|
-
attributes={"agent": agent_name, "timestamp": timestamp},
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
def get_variable(self, key: str, default: Any = None) -> Any:
|
|
78
|
-
return self.state.get(key, default)
|
|
79
|
-
|
|
80
|
-
def set_variable(self, key: str, value: Any) -> None:
|
|
81
|
-
old_value = self.state.get(key)
|
|
82
|
-
self.state[key] = value
|
|
83
|
-
if old_value != value:
|
|
84
|
-
escaped_value = str(value).replace("{", "{{").replace("}", "}}")
|
|
85
|
-
|
|
86
|
-
logger.info(
|
|
87
|
-
"Context variable updated - {} -> {}",
|
|
88
|
-
key,
|
|
89
|
-
escaped_value, # Arguments in order
|
|
90
|
-
)
|
|
91
|
-
|
|
92
|
-
current_span = trace.get_current_span()
|
|
93
|
-
if current_span.get_span_context().is_valid:
|
|
94
|
-
current_span.add_event(
|
|
95
|
-
"set_variable",
|
|
96
|
-
attributes={
|
|
97
|
-
"key": key,
|
|
98
|
-
"old": str(old_value),
|
|
99
|
-
"new": str(value),
|
|
100
|
-
},
|
|
101
|
-
)
|
|
102
|
-
|
|
103
|
-
def deepcopy(self) -> "FlockContext":
|
|
104
|
-
return FlockContext.from_dict(self.to_dict())
|
|
105
|
-
|
|
106
|
-
def get_agent_history(self, agent_name: str) -> list[AgentRunRecord]:
|
|
107
|
-
return [record for record in self.history if record.agent == agent_name]
|
|
108
|
-
|
|
109
|
-
def next_input_for(self, agent) -> Any:
|
|
110
|
-
try:
|
|
111
|
-
if hasattr(agent, "input") and isinstance(agent.input, str):
|
|
112
|
-
keys = [k.strip() for k in agent.input.split(",") if k.strip()]
|
|
113
|
-
if len(keys) == 1:
|
|
114
|
-
return self.get_variable(keys[0])
|
|
115
|
-
else:
|
|
116
|
-
return {key: self.get_variable(key) for key in keys}
|
|
117
|
-
else:
|
|
118
|
-
return self.get_variable("init_input")
|
|
119
|
-
except Exception as e:
|
|
120
|
-
logger.error(
|
|
121
|
-
"Error getting next input for agent",
|
|
122
|
-
agent=agent.name,
|
|
123
|
-
error=str(e),
|
|
124
|
-
)
|
|
125
|
-
raise
|
|
126
|
-
|
|
127
|
-
def get_most_recent_value(self, variable_name: str) -> Any:
|
|
128
|
-
for history_record in reversed(self.history):
|
|
129
|
-
if variable_name in history_record.data:
|
|
130
|
-
return history_record.data[variable_name]
|
|
131
|
-
|
|
132
|
-
def get_agent_definition(self, agent_name: str) -> AgentDefinition | None:
|
|
133
|
-
return self.agent_definitions.get(agent_name)
|
|
134
|
-
|
|
135
|
-
def get_last_agent_name(self) -> str | None:
|
|
136
|
-
"""Returns the name of the agent from the most recent history record."""
|
|
137
|
-
if not self.history:
|
|
138
|
-
return None
|
|
139
|
-
last_record = self.history[-1]
|
|
140
|
-
# The 'called_from' field in the *next* record is the previous agent.
|
|
141
|
-
# However, to get the name of the *last executed agent*, we look at the 'agent' field.
|
|
142
|
-
return last_record.agent
|
|
143
|
-
|
|
144
|
-
def add_agent_definition(
|
|
145
|
-
self, agent_type: type, agent_name: str, agent_data: Any
|
|
146
|
-
) -> None:
|
|
147
|
-
definition = AgentDefinition(
|
|
148
|
-
agent_type=agent_type.__name__,
|
|
149
|
-
agent_name=agent_name,
|
|
150
|
-
agent_data=agent_data,
|
|
151
|
-
)
|
|
152
|
-
self.agent_definitions[agent_name] = definition
|
|
153
|
-
|
|
154
|
-
# Use the reactive setter for dict-like access.
|
|
155
|
-
def __getitem__(self, key: str) -> Any:
|
|
156
|
-
return self.get_variable(key)
|
|
157
|
-
|
|
158
|
-
def __setitem__(self, key: str, value: Any) -> None:
|
|
159
|
-
self.set_variable(key, value)
|
|
160
|
-
|
|
161
|
-
def to_dict(self) -> dict[str, Any]:
|
|
162
|
-
def convert(obj):
|
|
163
|
-
if isinstance(obj, datetime):
|
|
164
|
-
return obj.isoformat()
|
|
165
|
-
if hasattr(obj, "__dataclass_fields__"):
|
|
166
|
-
return asdict(
|
|
167
|
-
obj, dict_factory=lambda x: {k: convert(v) for k, v in x}
|
|
168
|
-
)
|
|
169
|
-
return obj
|
|
170
|
-
|
|
171
|
-
return convert(asdict(self))
|
|
172
|
-
|
|
173
|
-
@classmethod
|
|
174
|
-
def from_dict(cls, data: dict[str, Any]) -> "FlockContext":
|
|
175
|
-
def convert(obj):
|
|
176
|
-
if isinstance(obj, dict):
|
|
177
|
-
if "timestamp" in obj:
|
|
178
|
-
return AgentRunRecord(
|
|
179
|
-
**{
|
|
180
|
-
**obj,
|
|
181
|
-
"timestamp": obj["timestamp"]
|
|
182
|
-
,
|
|
183
|
-
}
|
|
184
|
-
)
|
|
185
|
-
if "agent_type" in obj:
|
|
186
|
-
return AgentDefinition(**obj)
|
|
187
|
-
return {k: convert(v) for k, v in obj.items()}
|
|
188
|
-
if isinstance(obj, list):
|
|
189
|
-
return [convert(v) for v in obj]
|
|
190
|
-
return obj
|
|
191
|
-
|
|
192
|
-
converted = convert(data)
|
|
193
|
-
return cls(**converted)
|
|
194
|
-
|
|
195
|
-
def resolve(self, svc_type):
|
|
196
|
-
"""Resolve a service from the request-scoped DI container if present.
|
|
197
|
-
|
|
198
|
-
The bootstrap code is expected to store the active `ServiceProvider` from
|
|
199
|
-
`wd.di` in the context variable key ``di.container``. This helper
|
|
200
|
-
provides a convenient façade so that Flock components can simply call
|
|
201
|
-
``context.resolve(SomeType)`` regardless of whether a container is
|
|
202
|
-
available. When the container is missing or the service cannot be
|
|
203
|
-
resolved, ``None`` is returned instead of raising to keep backward
|
|
204
|
-
compatibility.
|
|
205
|
-
"""
|
|
206
|
-
container = self.get_variable("di.container")
|
|
207
|
-
if container is None:
|
|
208
|
-
return None
|
|
209
|
-
try:
|
|
210
|
-
return container.get_service(svc_type)
|
|
211
|
-
except Exception:
|
|
212
|
-
# Service not registered or other resolution error – fall back to None
|
|
213
|
-
return None
|
|
@@ -1,37 +0,0 @@
|
|
|
1
|
-
"""Module for managing the FlockContext."""
|
|
2
|
-
|
|
3
|
-
from flock.core.context.context import FlockContext
|
|
4
|
-
from flock.core.context.context_vars import (
|
|
5
|
-
FLOCK_CURRENT_AGENT,
|
|
6
|
-
FLOCK_INITIAL_INPUT,
|
|
7
|
-
FLOCK_LOCAL_DEBUG,
|
|
8
|
-
FLOCK_MODEL,
|
|
9
|
-
FLOCK_RUN_ID,
|
|
10
|
-
)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def initialize_context(
|
|
14
|
-
context: FlockContext,
|
|
15
|
-
agent_name: str,
|
|
16
|
-
input_data: dict,
|
|
17
|
-
run_id: str,
|
|
18
|
-
local_debug: bool,
|
|
19
|
-
model: str,
|
|
20
|
-
) -> None:
|
|
21
|
-
"""Initialize the FlockContext with standard variables before running an agent.
|
|
22
|
-
|
|
23
|
-
Args:
|
|
24
|
-
context: The FlockContext instance.
|
|
25
|
-
agent_name: The name of the current agent.
|
|
26
|
-
input_data: A dictionary of inputs for the agent.
|
|
27
|
-
run_id: A unique identifier for the run.
|
|
28
|
-
local_debug: Flag indicating whether local debugging is enabled.
|
|
29
|
-
"""
|
|
30
|
-
context.set_variable(FLOCK_CURRENT_AGENT, agent_name)
|
|
31
|
-
for key, value in input_data.items():
|
|
32
|
-
context.set_variable("flock." + key, value)
|
|
33
|
-
context.set_variable(FLOCK_INITIAL_INPUT, input_data)
|
|
34
|
-
context.set_variable(FLOCK_LOCAL_DEBUG, local_debug)
|
|
35
|
-
context.run_id = run_id
|
|
36
|
-
context.set_variable(FLOCK_RUN_ID, run_id)
|
|
37
|
-
context.set_variable(FLOCK_MODEL, model)
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
"""Context variables for Flock."""
|
|
2
|
-
|
|
3
|
-
FLOCK_CURRENT_AGENT = "flock.current_agent"
|
|
4
|
-
FLOCK_INITIAL_INPUT = "flock.initial_input"
|
|
5
|
-
FLOCK_LOCAL_DEBUG = "flock.local_debug"
|
|
6
|
-
FLOCK_RUN_ID = "flock.run_id"
|
|
7
|
-
FLOCK_LAST_AGENT = "flock.last_agent"
|
|
8
|
-
FLOCK_LAST_RESULT = "flock.last_result"
|
|
9
|
-
FLOCK_MODEL = "flock.model"
|
|
10
|
-
FLOCK_BATCH_SILENT_MODE = "flock.batch_silent"
|
flock/core/evaluation/utils.py
DELETED
|
@@ -1,396 +0,0 @@
|
|
|
1
|
-
# src/flock/core/util/evaluation_helpers.py
|
|
2
|
-
import inspect
|
|
3
|
-
import sys
|
|
4
|
-
from collections.abc import Callable
|
|
5
|
-
from pathlib import Path
|
|
6
|
-
from typing import Any, Union
|
|
7
|
-
|
|
8
|
-
import pandas as pd
|
|
9
|
-
from box import Box
|
|
10
|
-
from datasets import (
|
|
11
|
-
Dataset as HFDataset,
|
|
12
|
-
get_dataset_config_names,
|
|
13
|
-
load_dataset,
|
|
14
|
-
)
|
|
15
|
-
from opik import Opik
|
|
16
|
-
from opik.evaluation import evaluate
|
|
17
|
-
|
|
18
|
-
from flock.core.flock import Flock
|
|
19
|
-
from flock.core.flock_agent import FlockAgent
|
|
20
|
-
|
|
21
|
-
# Legacy FlockEvaluator import removed
|
|
22
|
-
from flock.core.logging.logging import get_logger
|
|
23
|
-
|
|
24
|
-
# Potentially import metrics libraries like rouge_score, nltk, sentence_transformers
|
|
25
|
-
|
|
26
|
-
logger_helpers = get_logger("util.evaluation")
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def evaluate_with_opik(
|
|
30
|
-
dataset: str | Path | list[dict[str, Any]] | pd.DataFrame | HFDataset,
|
|
31
|
-
dataset_name: str,
|
|
32
|
-
experiment_name: str,
|
|
33
|
-
start_agent: FlockAgent | str,
|
|
34
|
-
input_mapping: dict[str, str],
|
|
35
|
-
answer_mapping: dict[str, str],
|
|
36
|
-
metrics: list[
|
|
37
|
-
str
|
|
38
|
-
| Callable[[Any, Any], bool | float | dict[str, Any]]
|
|
39
|
-
| FlockAgent
|
|
40
|
-
| FlockEvaluator
|
|
41
|
-
],
|
|
42
|
-
):
|
|
43
|
-
df = normalize_dataset(dataset)
|
|
44
|
-
client = Opik()
|
|
45
|
-
dataset = client.get_or_create_dataset(name=dataset_name)
|
|
46
|
-
|
|
47
|
-
dataset.insert_from_pandas(dataframe=df, ignore_keys=["source"])
|
|
48
|
-
|
|
49
|
-
# Create a single Flock instance outside the task function
|
|
50
|
-
shared_flock = Flock(
|
|
51
|
-
name="opik_eval", model="azure/gpt-4.1", show_flock_banner=False
|
|
52
|
-
)
|
|
53
|
-
shared_flock.add_agent(start_agent)
|
|
54
|
-
|
|
55
|
-
def evaluation_task(dataset_item):
|
|
56
|
-
agent_input = {
|
|
57
|
-
value: dataset_item[key] for key, value in input_mapping.items()
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
# Use the shared Flock instance instead of creating a new one
|
|
61
|
-
result_flock = shared_flock.run(
|
|
62
|
-
agent=start_agent, input=agent_input, box_result=False
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
# agent_output = result_flock.get(answer_mapping[key], "No answer found")
|
|
66
|
-
|
|
67
|
-
key = next(iter(answer_mapping.keys()))
|
|
68
|
-
reference = dataset_item[key]
|
|
69
|
-
answer = result_flock.get(answer_mapping[key], "No answer found")
|
|
70
|
-
|
|
71
|
-
result = {
|
|
72
|
-
"input": agent_input,
|
|
73
|
-
"output": answer,
|
|
74
|
-
"reference": reference,
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
return result
|
|
78
|
-
|
|
79
|
-
eval_results = evaluate(
|
|
80
|
-
experiment_name=experiment_name,
|
|
81
|
-
dataset=dataset,
|
|
82
|
-
task=evaluation_task,
|
|
83
|
-
scoring_metrics=metrics,
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def load_and_merge_all_configs(dataset_name: str) -> pd.DataFrame:
|
|
88
|
-
all_configs = get_dataset_config_names(dataset_name)
|
|
89
|
-
all_dfs = []
|
|
90
|
-
|
|
91
|
-
for config in all_configs:
|
|
92
|
-
dataset_dict = load_dataset(dataset_name, config)
|
|
93
|
-
for split_name, split_dataset in dataset_dict.items():
|
|
94
|
-
df = split_dataset.to_pandas()
|
|
95
|
-
df["config"] = config
|
|
96
|
-
df["split"] = split_name
|
|
97
|
-
all_dfs.append(df)
|
|
98
|
-
|
|
99
|
-
merged_df = pd.concat(all_dfs, ignore_index=True)
|
|
100
|
-
logger_helpers.info(f"merged_df.head(): {merged_df.head()}")
|
|
101
|
-
return merged_df
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def import_hf_dataset_to_opik(dataset_name: str) -> pd.DataFrame:
|
|
105
|
-
df = load_and_merge_all_configs(dataset_name)
|
|
106
|
-
logger_helpers.info(
|
|
107
|
-
f"type(df): {type(df)}"
|
|
108
|
-
) # ➜ <class 'pandas.core.frame.DataFrame'>
|
|
109
|
-
logger_helpers.info(f"df.shape: {df.shape}") # e.g. (123456, N_COLUMNS+2)
|
|
110
|
-
logger_helpers.info(
|
|
111
|
-
f"df['split'].value_counts(): {df['split'].value_counts()}"
|
|
112
|
-
)
|
|
113
|
-
logger_helpers.info(f"df['config'].unique(): {df['config'].unique()}")
|
|
114
|
-
client = Opik()
|
|
115
|
-
dataset = client.get_or_create_dataset(name=dataset_name)
|
|
116
|
-
|
|
117
|
-
dataset.insert_from_pandas(dataframe=df, ignore_keys=["source"])
|
|
118
|
-
return df
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def normalize_dataset(dataset: Any) -> pd.DataFrame:
    """Convert a supported dataset representation into a pandas DataFrame.

    Supported inputs:

    * ``pd.DataFrame`` - a copy is returned, the caller's frame is untouched.
    * ``str`` / ``Path`` - an existing ``.csv``, ``.json`` or ``.jsonl`` file
      is read with pandas. A path that does not exist is treated as a dataset
      name and handed to ``load_and_merge_all_configs``.
    * non-empty ``list`` whose first element is a ``dict``.
    * ``datasets.Dataset`` - only recognised when the ``datasets`` package has
      already been imported somewhere in the process.

    Args:
        dataset: The dataset in any of the forms listed above.

    Returns:
        The dataset as a DataFrame.

    Raises:
        FileNotFoundError: The path does not exist and loading it by name
            failed (the original error is chained as the cause).
        ValueError: The file suffix is not supported, or the list is empty or
            does not start with a dictionary.
        TypeError: ``dataset`` is of an unsupported type.
    """
    if isinstance(dataset, pd.DataFrame):
        return dataset.copy()

    if isinstance(dataset, (str, Path)):
        path = Path(dataset)
        if not path.exists():
            # Not a local file: fall back to interpreting it as a dataset name.
            try:
                return load_and_merge_all_configs(dataset)
            except Exception as e:
                raise FileNotFoundError(
                    f"Dataset file not found: {path}"
                ) from e

        suffix = path.suffix.lower()
        if suffix == ".csv":
            return pd.read_csv(path)
        if suffix == ".json":
            return pd.read_json(path)
        if suffix == ".jsonl":
            # JSON Lines: one JSON object per line.
            return pd.read_json(path, lines=True)
        raise ValueError(f"Unsupported file type for dataset: {path.suffix}")

    if isinstance(dataset, list):
        if not dataset or not isinstance(dataset[0], dict):
            raise ValueError("Dataset list must contain dictionaries.")
        return pd.DataFrame(dataset)

    # Looked up via sys.modules so 'datasets' stays an optional dependency.
    datasets_module = sys.modules.get("datasets")
    if datasets_module is not None and isinstance(
        dataset, datasets_module.Dataset
    ):
        return dataset.to_pandas()

    raise TypeError(f"Unsupported dataset type: {type(dataset)}")
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def extract_value_by_dot_notation(data: dict | Box, key: str) -> Any:
    """Look up a nested value using a dot-separated key path.

    The path is split on ``"."`` and followed one segment at a time with
    ``.get``. Only ``dict`` and ``Box`` containers are traversed; lists are
    not indexed.

    Args:
        data: The nested mapping to search.
        key: Dot-separated path such as ``"outer.inner.field"``.

    Returns:
        The value at the end of the path, or ``None`` when the key is empty,
        a segment is missing (or maps to ``None``), or an intermediate value
        is not a ``dict``/``Box``.
    """
    if not key:
        return None

    segments = key.split(".")
    current = data
    try:
        for segment in segments:
            if not isinstance(current, (dict, Box)):
                # Hit a leaf (or a list) before the path was exhausted.
                return None
            current = current.get(segment)
            if current is None:
                # Segment absent at this level.
                return None
    except (KeyError, IndexError, AttributeError):
        return None
    return current
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def calculate_evaluation_metrics(
    metrics: list[Union[str, Callable, "FlockAgent", "FlockEvaluator"]],
    metric_configs: dict[str, dict[str, Any]],
    predicted_answers: dict[str, Any],
    expected_answers: dict[str, Any],
    agent_inputs: dict[str, Any],  # For context
    agent_output: Any,  # For context
) -> dict[str, Any]:
    """Compute every requested metric for one evaluation item.

    How a metric is handled depends on its type:

    * ``str`` - resolved through ``_get_metric_function`` and applied to the
      predicted/expected values stored under that same name. If either side
      is missing, a warning is logged and the result is ``None``.
    * callable - invoked with the full ``predicted_answers`` and
      ``expected_answers`` dicts plus its configured kwargs; ``agent_inputs``
      and ``agent_output`` are passed too when the signature names them.
    * objects whose type name contains ``FlockAgent`` / ``FlockEvaluator`` -
      not executed yet; a placeholder string is recorded.
    * anything else - a warning is logged and the metric is skipped.

    Args:
        metrics: The metrics to evaluate.
        metric_configs: Extra keyword arguments per metric name.
        predicted_answers: Values produced by the agent.
        expected_answers: Reference values.
        agent_inputs: The inputs the agent received (context for callables).
        agent_output: The raw agent output (context for callables).

    Returns:
        Mapping of metric name to result. A metric returning a dict is
        flattened into ``"<name>_<sub_key>"`` entries; a metric that raises is
        recorded as ``"[Error: ...]"``.
    """
    scores = {}
    context = {"agent_inputs": agent_inputs, "agent_output": agent_output}

    for metric in metrics:
        label = ""
        outcome = None
        try:
            if isinstance(metric, str):
                label = metric
                # Built-in metric: the name doubles as the answer key.
                if (
                    label not in predicted_answers
                    or label not in expected_answers
                ):
                    logger_helpers.warning(
                        f"Could not find matching predicted/expected values for metric '{label}' based on answer_mapping keys."
                    )
                    outcome = None
                else:
                    scorer = _get_metric_function(label)
                    options = metric_configs.get(label, {})
                    outcome = scorer(
                        predicted_answers[label],
                        expected_answers[label],
                        **options,
                    )

            elif isinstance(metric, Callable):
                label = getattr(metric, "__name__", "custom_function")
                options = metric_configs.get(label, {})
                accepted = inspect.signature(metric).parameters
                extra = options.copy()
                # Only hand over context the function explicitly asks for.
                for param_name, param_value in context.items():
                    if param_name in accepted:
                        extra[param_name] = param_value
                outcome = metric(predicted_answers, expected_answers, **extra)

            # Type is matched by name to avoid a hard import of the class.
            elif "FlockAgent" in str(type(metric)):
                label = getattr(metric, "name", "judge_agent")
                # Looked up for the future judge call; not used yet.
                options = metric_configs.get(label, {})
                logger_helpers.warning(
                    f"Agent-based metric '{label}' execution not implemented in this sketch."
                )
                outcome = "[Agent Judge Not Implemented]"

            elif "FlockEvaluator" in str(type(metric)):
                label = getattr(metric, "name", "judge_evaluator")
                # Looked up for the future judge call; not used yet.
                options = metric_configs.get(label, {})
                logger_helpers.warning(
                    f"Evaluator-based metric '{label}' execution not implemented in this sketch."
                )
                outcome = "[Evaluator Judge Not Implemented]"

            else:
                logger_helpers.warning(
                    f"Unsupported metric type: {type(metric)}"
                )
                continue

            if isinstance(outcome, dict):
                # Flatten multi-valued metrics into prefixed entries.
                scores.update(
                    {
                        f"{label}_{sub_key}": sub_value
                        for sub_key, sub_value in outcome.items()
                    }
                )
            else:
                scores[label] = outcome

        except Exception as e:
            logger_helpers.error(f"Error calculating metric '{label}': {e}")
            scores[label] = f"[Error: {e}]"

    return scores
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
# Name of the sentence-transformers model used for "semantic_similarity".
_SIMILARITY_MODEL_NAME = "all-MiniLM-L6-v2"

# Loaded SentenceTransformer models keyed by model name, so that repeated
# metric lookups (one per evaluated item) do not reload the model each time.
_SENTENCE_MODEL_CACHE: dict = {}


def _rouge_type_for(metric_name: str) -> str:
    """Translate a ``rouge_*`` metric name into the key rouge-score expects."""
    compact = metric_name.replace("_", "")
    variant = compact[len("rouge"):].lower()
    # rouge-score spells the longest-common-subsequence variants with a
    # capital "L"; a lowercase "rougel" is rejected as an invalid rouge type.
    if variant == "l":
        return "rougeL"
    if variant == "lsum":
        return "rougeLsum"
    return compact


def _get_metric_function(metric_name: str) -> Callable:
    """Return the scoring function for a built-in metric name.

    Every returned callable takes ``(predicted, expected, **config)``.
    Optional third-party libraries are imported lazily; when one is missing a
    warning is logged and the returned callable always yields ``None``.

    Supported names:

    * ``exact_match`` - stripped string equality.
    * ``fuzzy_match`` - ``thefuzz`` ratio compared with ``threshold``
      (default 85).
    * ``rouge_1``, ``rouge_2``, ``rouge_l``, ... - rouge-score with stemming;
      ``score_type`` selects the field of the score (default ``"fmeasure"``).
    * ``semantic_similarity`` - cosine similarity of sentence-transformers
      embeddings.
    * ``llm_judge`` - placeholder returning a fixed marker string.

    Args:
        metric_name: Name of the built-in metric.

    Returns:
        The scoring callable.

    Raises:
        ValueError: ``metric_name`` is not a known built-in metric.
    """
    if metric_name == "exact_match":
        return lambda pred, act, **kw: str(pred).strip() == str(act).strip()

    if metric_name == "fuzzy_match":
        try:
            from thefuzz import fuzz

            def fuzzy_match(pred, act, threshold=85, **kw):
                return fuzz.ratio(str(pred), str(act)) >= threshold

            return fuzzy_match
        except ImportError:
            logger_helpers.warning(
                "fuzzy_match requires 'thefuzz': pip install thefuzz[speedup]"
            )
            return lambda p, a, **kw: None

    if metric_name.startswith("rouge"):  # rouge_1, rouge_2, rouge_l
        try:
            from rouge_score import rouge_scorer

            rouge_type = _rouge_type_for(metric_name)
            scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True)

            def calculate_rouge(pred, act, score_type="fmeasure", **kw):
                # rouge-score takes the reference first, the prediction second.
                scores = scorer.score(str(act), str(pred))
                return scores[rouge_type]._asdict().get(score_type, 0.0)

            return calculate_rouge
        except ImportError:
            logger_helpers.warning(
                "rouge requires 'rouge-score': pip install rouge-score"
            )
            return lambda p, a, **kw: None

    if metric_name == "semantic_similarity":
        try:
            from sentence_transformers import SentenceTransformer, util

            model = _SENTENCE_MODEL_CACHE.get(_SIMILARITY_MODEL_NAME)
            if model is None:
                model = SentenceTransformer(_SIMILARITY_MODEL_NAME)
                _SENTENCE_MODEL_CACHE[_SIMILARITY_MODEL_NAME] = model

            def calculate_similarity(pred, act, **kw):
                emb1 = model.encode(str(pred), convert_to_tensor=True)
                emb2 = model.encode(str(act), convert_to_tensor=True)
                return util.pytorch_cos_sim(emb1, emb2).item()

            return calculate_similarity
        except ImportError:
            logger_helpers.warning(
                "semantic_similarity requires 'sentence-transformers': pip install sentence-transformers"
            )
            return lambda p, a, **kw: None

    if metric_name == "llm_judge":
        # Judges are dispatched by type in calculate_evaluation_metrics; this
        # placeholder only exists so a lookup by string does not fail.
        return lambda p, a, **kw: "[LLM Judge Not Implemented Directly]"

    raise ValueError(f"Unknown built-in metric: {metric_name}")
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
def aggregate_results(results_list: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-item evaluation results into a summary.

    Args:
        results_list: One dict per evaluated item. A truthy ``"error"`` entry
            marks the item as failed; ``"metrics"`` maps metric names to
            values and may be missing or ``None``.

    Returns:
        A dict with:

        * ``"total_items"`` - number of items received.
        * ``"errors"`` - number of items with a truthy ``"error"``.
        * ``"metrics_summary"`` - per metric name, either
          ``{"accuracy": ...}`` when every collected value is a bool, or
          ``{"mean": ..., "count": ...}`` otherwise. Non-numeric values
          (strings, ``None``, ...) are ignored.
    """
    summary: dict[str, Any] = {"total_items": len(results_list), "errors": 0}
    metric_values: dict[str, list[float | bool]] = {}

    for item in results_list:
        if item.get("error"):
            summary["errors"] += 1
        # A failed item may carry "metrics": None; treat that as no metrics.
        item_metrics = item.get("metrics") or {}
        for name, value in item_metrics.items():
            # bool is a subclass of int, so this also admits True/False.
            if isinstance(value, (int, float)):
                metric_values.setdefault(name, []).append(value)

    metrics_summary: dict[str, dict[str, Any]] = {}
    for name, values in metric_values.items():
        # Every list holds at least one numeric value by construction.
        if all(isinstance(v, bool) for v in values):
            metrics_summary[name] = {"accuracy": sum(values) / len(values)}
        else:
            metrics_summary[name] = {
                "mean": sum(values) / len(values),
                "count": len(values),
            }

    summary["metrics_summary"] = metrics_summary
    return summary
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
# --- Placeholder for async judge execution ---
|
|
382
|
-
# Need to run these within the main async context or manage loops carefully
|
|
383
|
-
async def _run_judge_agent(judge_agent, predicted, expected, config):
    """Placeholder for scoring a prediction with a judge agent.

    None of the arguments are used yet; the coroutine always resolves to a
    fixed marker string.
    """
    # TODO: build the judge input from predicted/expected/config to match the
    # agent's signature, await the agent's async run and return its verdict
    # (or a score extracted from it).
    not_implemented_marker = "[Agent Judge Not Implemented]"
    return not_implemented_marker
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
async def _run_judge_evaluator(judge_evaluator, predicted, expected, config):
    """Placeholder for scoring a prediction with a judge evaluator.

    None of the arguments are used yet; the coroutine always resolves to a
    fixed marker string.
    """
    # TODO: build the judge input from predicted/expected plus config, await
    # the evaluator's evaluate call (an agent may not be required) and return
    # its verdict (or a score extracted from it).
    not_implemented_marker = "[Evaluator Judge Not Implemented]"
    return not_implemented_marker
|