flock-core 0.4.505__py3-none-any.whl → 0.4.508__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flock-core might be problematic. Click here for more details.
- flock/core/evaluation/utils.py +85 -2
- flock/core/flock.py +102 -56
- flock/core/flock_agent.py +2 -2
- flock/core/flock_evaluator.py +8 -1
- flock/core/flock_factory.py +4 -0
- flock/core/mixin/dspy_integration.py +7 -6
- flock/evaluators/declarative/declarative_evaluator.py +112 -88
- flock/webapp/templates/chat.html +1 -0
- flock/webapp/templates/partials/_chat_messages.html +3 -2
- {flock_core-0.4.505.dist-info → flock_core-0.4.508.dist-info}/METADATA +28 -10
- {flock_core-0.4.505.dist-info → flock_core-0.4.508.dist-info}/RECORD +14 -17
- flock/core/api/ui/__init__.py +0 -0
- flock/core/api/ui/routes.py +0 -271
- flock/core/api/ui/utils.py +0 -119
- {flock_core-0.4.505.dist-info → flock_core-0.4.508.dist-info}/WHEEL +0 -0
- {flock_core-0.4.505.dist-info → flock_core-0.4.508.dist-info}/entry_points.txt +0 -0
- {flock_core-0.4.505.dist-info → flock_core-0.4.508.dist-info}/licenses/LICENSE +0 -0
flock/core/evaluation/utils.py
CHANGED
|
@@ -7,8 +7,15 @@ from typing import Any, Union
|
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from box import Box
|
|
10
|
-
from datasets import
|
|
11
|
-
|
|
10
|
+
from datasets import (
|
|
11
|
+
Dataset as HFDataset,
|
|
12
|
+
get_dataset_config_names,
|
|
13
|
+
load_dataset,
|
|
14
|
+
)
|
|
15
|
+
from opik import Opik
|
|
16
|
+
from opik.evaluation import evaluate
|
|
17
|
+
|
|
18
|
+
from flock.core.flock import Flock
|
|
12
19
|
from flock.core.flock_agent import FlockAgent
|
|
13
20
|
from flock.core.flock_evaluator import FlockEvaluator
|
|
14
21
|
from flock.core.logging.logging import get_logger
|
|
@@ -18,6 +25,64 @@ from flock.core.logging.logging import get_logger
|
|
|
18
25
|
logger_helpers = get_logger("util.evaluation")
|
|
19
26
|
|
|
20
27
|
|
|
28
|
+
def evaluate_with_opik(
|
|
29
|
+
dataset: str | Path | list[dict[str, Any]] | pd.DataFrame | HFDataset,
|
|
30
|
+
dataset_name: str,
|
|
31
|
+
experiment_name: str,
|
|
32
|
+
start_agent: FlockAgent | str,
|
|
33
|
+
input_mapping: dict[str, str],
|
|
34
|
+
answer_mapping: dict[str, str],
|
|
35
|
+
metrics: list[
|
|
36
|
+
str
|
|
37
|
+
| Callable[[Any, Any], bool | float | dict[str, Any]]
|
|
38
|
+
| FlockAgent
|
|
39
|
+
| FlockEvaluator
|
|
40
|
+
],
|
|
41
|
+
):
|
|
42
|
+
df = normalize_dataset(dataset)
|
|
43
|
+
client = Opik()
|
|
44
|
+
dataset = client.get_or_create_dataset(name=dataset_name)
|
|
45
|
+
|
|
46
|
+
dataset.insert_from_pandas(dataframe=df, ignore_keys=["source"])
|
|
47
|
+
|
|
48
|
+
# Create a single Flock instance outside the task function
|
|
49
|
+
shared_flock = Flock(
|
|
50
|
+
name="opik_eval", model="azure/gpt-4.1", show_flock_banner=False
|
|
51
|
+
)
|
|
52
|
+
shared_flock.add_agent(start_agent)
|
|
53
|
+
|
|
54
|
+
def evaluation_task(dataset_item):
|
|
55
|
+
agent_input = {
|
|
56
|
+
value: dataset_item[key] for key, value in input_mapping.items()
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
# Use the shared Flock instance instead of creating a new one
|
|
60
|
+
result_flock = shared_flock.run(
|
|
61
|
+
start_agent=start_agent, input=agent_input, box_result=False
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# agent_output = result_flock.get(answer_mapping[key], "No answer found")
|
|
65
|
+
|
|
66
|
+
key = next(iter(answer_mapping.keys()))
|
|
67
|
+
reference = dataset_item[key]
|
|
68
|
+
answer = result_flock.get(answer_mapping[key], "No answer found")
|
|
69
|
+
|
|
70
|
+
result = {
|
|
71
|
+
"input": agent_input,
|
|
72
|
+
"output": answer,
|
|
73
|
+
"reference": reference,
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
return result
|
|
77
|
+
|
|
78
|
+
eval_results = evaluate(
|
|
79
|
+
experiment_name=experiment_name,
|
|
80
|
+
dataset=dataset,
|
|
81
|
+
task=evaluation_task,
|
|
82
|
+
scoring_metrics=metrics,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
|
|
21
86
|
def load_and_merge_all_configs(dataset_name: str) -> pd.DataFrame:
|
|
22
87
|
all_configs = get_dataset_config_names(dataset_name)
|
|
23
88
|
all_dfs = []
|
|
@@ -31,9 +96,27 @@ def load_and_merge_all_configs(dataset_name: str) -> pd.DataFrame:
|
|
|
31
96
|
all_dfs.append(df)
|
|
32
97
|
|
|
33
98
|
merged_df = pd.concat(all_dfs, ignore_index=True)
|
|
99
|
+
logger_helpers.info(f"merged_df.head(): {merged_df.head()}")
|
|
34
100
|
return merged_df
|
|
35
101
|
|
|
36
102
|
|
|
103
|
+
def import_hf_dataset_to_opik(dataset_name: str) -> pd.DataFrame:
|
|
104
|
+
df = load_and_merge_all_configs(dataset_name)
|
|
105
|
+
logger_helpers.info(
|
|
106
|
+
f"type(df): {type(df)}"
|
|
107
|
+
) # ➜ <class 'pandas.core.frame.DataFrame'>
|
|
108
|
+
logger_helpers.info(f"df.shape: {df.shape}") # e.g. (123456, N_COLUMNS+2)
|
|
109
|
+
logger_helpers.info(
|
|
110
|
+
f"df['split'].value_counts(): {df['split'].value_counts()}"
|
|
111
|
+
)
|
|
112
|
+
logger_helpers.info(f"df['config'].unique(): {df['config'].unique()}")
|
|
113
|
+
client = Opik()
|
|
114
|
+
dataset = client.get_or_create_dataset(name=dataset_name)
|
|
115
|
+
|
|
116
|
+
dataset.insert_from_pandas(dataframe=df, ignore_keys=["source"])
|
|
117
|
+
return df
|
|
118
|
+
|
|
119
|
+
|
|
37
120
|
def normalize_dataset(dataset: Any) -> pd.DataFrame:
|
|
38
121
|
"""Converts various dataset formats into a pandas DataFrame."""
|
|
39
122
|
if isinstance(dataset, pd.DataFrame):
|
flock/core/flock.py
CHANGED
|
@@ -17,7 +17,6 @@ from typing import (
|
|
|
17
17
|
TypeVar,
|
|
18
18
|
)
|
|
19
19
|
|
|
20
|
-
_R = TypeVar("_R")
|
|
21
20
|
# Third-party imports
|
|
22
21
|
from box import Box
|
|
23
22
|
from temporalio import workflow
|
|
@@ -32,8 +31,11 @@ with workflow.unsafe.imports_passed_through():
|
|
|
32
31
|
from flock.core.execution.local_executor import (
|
|
33
32
|
run_local_workflow,
|
|
34
33
|
)
|
|
34
|
+
|
|
35
|
+
import opik
|
|
35
36
|
from opentelemetry import trace
|
|
36
37
|
from opentelemetry.baggage import get_baggage, set_baggage
|
|
38
|
+
from opik.integrations.dspy.callback import OpikCallback
|
|
37
39
|
from pandas import DataFrame # type: ignore
|
|
38
40
|
from pydantic import BaseModel, Field
|
|
39
41
|
|
|
@@ -67,7 +69,7 @@ try:
|
|
|
67
69
|
|
|
68
70
|
PANDAS_AVAILABLE = True
|
|
69
71
|
except ImportError:
|
|
70
|
-
pd = None
|
|
72
|
+
pd = None # type: ignore
|
|
71
73
|
PANDAS_AVAILABLE = False
|
|
72
74
|
|
|
73
75
|
logger = get_logger("flock.api")
|
|
@@ -77,6 +79,7 @@ FlockRegistry = get_registry() # Get the registry instance
|
|
|
77
79
|
|
|
78
80
|
# Define TypeVar for generic class methods like from_dict
|
|
79
81
|
T = TypeVar("T", bound="Flock")
|
|
82
|
+
_R = TypeVar("_R")
|
|
80
83
|
|
|
81
84
|
|
|
82
85
|
class Flock(BaseModel, Serializable):
|
|
@@ -103,6 +106,10 @@ class Flock(BaseModel, Serializable):
|
|
|
103
106
|
default=False,
|
|
104
107
|
description="If True, execute workflows via Temporal; otherwise, run locally.",
|
|
105
108
|
)
|
|
109
|
+
enable_opik: bool = Field(
|
|
110
|
+
default=False,
|
|
111
|
+
description="If True, enable Opik for cost tracking and model management.",
|
|
112
|
+
)
|
|
106
113
|
show_flock_banner: bool = Field(
|
|
107
114
|
default=True,
|
|
108
115
|
description="If True, show the Flock banner on console interactions.",
|
|
@@ -159,11 +166,11 @@ class Flock(BaseModel, Serializable):
|
|
|
159
166
|
"""
|
|
160
167
|
try:
|
|
161
168
|
asyncio.get_running_loop()
|
|
162
|
-
except RuntimeError:
|
|
169
|
+
except RuntimeError: # no loop → simple
|
|
163
170
|
return asyncio.run(coro)
|
|
164
171
|
|
|
165
172
|
# A loop is already running – Jupyter / ASGI / etc.
|
|
166
|
-
ctx = contextvars.copy_context()
|
|
173
|
+
ctx = contextvars.copy_context() # propagate baggage
|
|
167
174
|
with ThreadPoolExecutor(max_workers=1) as pool:
|
|
168
175
|
future = pool.submit(ctx.run, asyncio.run, coro)
|
|
169
176
|
try:
|
|
@@ -179,6 +186,7 @@ class Flock(BaseModel, Serializable):
|
|
|
179
186
|
description: str | None = None,
|
|
180
187
|
show_flock_banner: bool = True,
|
|
181
188
|
enable_temporal: bool = False,
|
|
189
|
+
enable_opik: bool = False,
|
|
182
190
|
agents: list[FlockAgent] | None = None,
|
|
183
191
|
servers: list[FlockMCPServerBase] | None = None,
|
|
184
192
|
temporal_config: TemporalWorkflowConfig | None = None,
|
|
@@ -195,6 +203,7 @@ class Flock(BaseModel, Serializable):
|
|
|
195
203
|
model=model,
|
|
196
204
|
description=description,
|
|
197
205
|
enable_temporal=enable_temporal,
|
|
206
|
+
enable_opik=enable_opik,
|
|
198
207
|
show_flock_banner=show_flock_banner,
|
|
199
208
|
temporal_config=temporal_config,
|
|
200
209
|
temporal_start_in_process_worker=temporal_start_in_process_worker,
|
|
@@ -208,7 +217,6 @@ class Flock(BaseModel, Serializable):
|
|
|
208
217
|
self._start_input = {}
|
|
209
218
|
self._mgr = FlockServerManager()
|
|
210
219
|
|
|
211
|
-
|
|
212
220
|
# Register passed servers
|
|
213
221
|
# (need to be registered first so that agents can retrieve them from the registry)
|
|
214
222
|
# This will also add them to the managed list of self._mgr
|
|
@@ -225,7 +233,6 @@ class Flock(BaseModel, Serializable):
|
|
|
225
233
|
f"Item provided in 'servers' list is not a FlockMCPServer: {type(server)}"
|
|
226
234
|
)
|
|
227
235
|
|
|
228
|
-
|
|
229
236
|
# Register passed agents
|
|
230
237
|
if agents:
|
|
231
238
|
from flock.core.flock_agent import (
|
|
@@ -241,7 +248,7 @@ class Flock(BaseModel, Serializable):
|
|
|
241
248
|
)
|
|
242
249
|
|
|
243
250
|
# Initialize console if needed for banner
|
|
244
|
-
if self.show_flock_banner:
|
|
251
|
+
if self.show_flock_banner: # Check instance attribute
|
|
245
252
|
init_console(clear_screen=True, show_banner=self.show_flock_banner)
|
|
246
253
|
|
|
247
254
|
# Set Temporal debug environment variable
|
|
@@ -252,6 +259,15 @@ class Flock(BaseModel, Serializable):
|
|
|
252
259
|
|
|
253
260
|
FlockRegistry.discover_and_register_components()
|
|
254
261
|
|
|
262
|
+
if self.enable_opik:
|
|
263
|
+
import dspy
|
|
264
|
+
|
|
265
|
+
opik.configure(use_local=True, automatic_approvals=True)
|
|
266
|
+
opik_callback = OpikCallback(project_name=self.name, log_graph=True)
|
|
267
|
+
dspy.settings.configure(
|
|
268
|
+
callbacks=[opik_callback],
|
|
269
|
+
)
|
|
270
|
+
|
|
255
271
|
logger.info(
|
|
256
272
|
"Flock instance initialized",
|
|
257
273
|
name=self.name,
|
|
@@ -259,39 +275,54 @@ class Flock(BaseModel, Serializable):
|
|
|
259
275
|
enable_temporal=self.enable_temporal,
|
|
260
276
|
)
|
|
261
277
|
|
|
262
|
-
def prepare_benchmark(
|
|
278
|
+
def prepare_benchmark(
|
|
279
|
+
self,
|
|
280
|
+
agent: FlockAgent | str | None = None,
|
|
281
|
+
input_field: str | None = None,
|
|
282
|
+
eval_field: str | None = None,
|
|
283
|
+
):
|
|
263
284
|
"""Prepare a benchmark for the Flock instance."""
|
|
264
285
|
from flock.core.flock_agent import FlockAgent as ConcreteFlockAgent
|
|
265
286
|
|
|
266
|
-
logger.info(
|
|
287
|
+
logger.info(
|
|
288
|
+
f"Preparing benchmark for Flock instance '{self.name}' with agent '{agent}'."
|
|
289
|
+
)
|
|
267
290
|
|
|
268
291
|
name = agent.name if isinstance(agent, ConcreteFlockAgent) else agent
|
|
269
292
|
|
|
270
293
|
if self._agents.get(name) is None:
|
|
271
|
-
raise ValueError(
|
|
294
|
+
raise ValueError(
|
|
295
|
+
f"Agent '{name}' not found in Flock instance '{self.name}'."
|
|
296
|
+
)
|
|
272
297
|
|
|
273
298
|
self.benchmark_agent_name = name
|
|
274
299
|
self.benchmark_eval_field = eval_field
|
|
275
300
|
self.benchmark_input_field = input_field
|
|
276
301
|
|
|
277
|
-
|
|
278
|
-
|
|
279
302
|
def inspect(self):
|
|
280
303
|
"""Inspect the Flock instance."""
|
|
281
|
-
logger.info(
|
|
304
|
+
logger.info(
|
|
305
|
+
f"Inspecting Flock instance '{self.name}' with start agent '{self.benchmark_agent_name}' and input '{input}'."
|
|
306
|
+
)
|
|
282
307
|
|
|
283
|
-
async def run(input: dict[str, Any])-> dict[str, Any]:
|
|
308
|
+
async def run(input: dict[str, Any]) -> dict[str, Any]:
|
|
284
309
|
"""Inspect the Flock instance."""
|
|
285
|
-
logger.info(
|
|
310
|
+
logger.info(
|
|
311
|
+
f"Inspecting Flock instance '{self.name}' with start agent '{self.benchmark_agent_name}' and input '{input}'."
|
|
312
|
+
)
|
|
286
313
|
msg_content = input.get("messages")[0].get("content")
|
|
287
314
|
|
|
288
|
-
agent_input = {
|
|
289
|
-
self.benchmark_input_field: msg_content
|
|
290
|
-
}
|
|
315
|
+
agent_input = {self.benchmark_input_field: msg_content}
|
|
291
316
|
|
|
292
|
-
result = await self.run_async(
|
|
317
|
+
result = await self.run_async(
|
|
318
|
+
start_agent=self.benchmark_agent_name,
|
|
319
|
+
input=agent_input,
|
|
320
|
+
box_result=False,
|
|
321
|
+
)
|
|
293
322
|
|
|
294
|
-
agent_output = result.get(
|
|
323
|
+
agent_output = result.get(
|
|
324
|
+
self.benchmark_eval_field, "No answer found"
|
|
325
|
+
)
|
|
295
326
|
|
|
296
327
|
return {
|
|
297
328
|
"output": agent_output,
|
|
@@ -299,8 +330,6 @@ class Flock(BaseModel, Serializable):
|
|
|
299
330
|
|
|
300
331
|
return run
|
|
301
332
|
|
|
302
|
-
|
|
303
|
-
|
|
304
333
|
def _set_temporal_debug_flag(self):
|
|
305
334
|
"""Set or remove LOCAL_DEBUG env var based on enable_temporal."""
|
|
306
335
|
if not self.enable_temporal:
|
|
@@ -373,10 +402,14 @@ class Flock(BaseModel, Serializable):
|
|
|
373
402
|
if agent.name in self._agents:
|
|
374
403
|
# Allow re-adding the same instance, but raise error for different instance with same name
|
|
375
404
|
if self._agents[agent.name] is not agent:
|
|
376
|
-
raise ValueError(
|
|
405
|
+
raise ValueError(
|
|
406
|
+
f"Agent with name '{agent.name}' already exists with a different instance."
|
|
407
|
+
)
|
|
377
408
|
else:
|
|
378
|
-
logger.debug(
|
|
379
|
-
|
|
409
|
+
logger.debug(
|
|
410
|
+
f"Agent '{agent.name}' is already added. Skipping."
|
|
411
|
+
)
|
|
412
|
+
return agent # Return existing agent
|
|
380
413
|
|
|
381
414
|
self._agents[agent.name] = agent
|
|
382
415
|
FlockRegistry.register_agent(agent) # Register globally
|
|
@@ -415,7 +448,7 @@ class Flock(BaseModel, Serializable):
|
|
|
415
448
|
box_result: bool = True,
|
|
416
449
|
agents: list[FlockAgent] | None = None,
|
|
417
450
|
servers: list[FlockMCPServerBase] | None = None,
|
|
418
|
-
memo: dict[str, Any] | None = None
|
|
451
|
+
memo: dict[str, Any] | None = None,
|
|
419
452
|
) -> Box | dict:
|
|
420
453
|
return self._run_sync(
|
|
421
454
|
self.run_async(
|
|
@@ -430,7 +463,6 @@ class Flock(BaseModel, Serializable):
|
|
|
430
463
|
)
|
|
431
464
|
)
|
|
432
465
|
|
|
433
|
-
|
|
434
466
|
async def run_async(
|
|
435
467
|
self,
|
|
436
468
|
start_agent: FlockAgent | str | None = None,
|
|
@@ -474,11 +506,13 @@ class Flock(BaseModel, Serializable):
|
|
|
474
506
|
start_agent_name: str | None = None
|
|
475
507
|
if isinstance(start_agent, ConcreteFlockAgent):
|
|
476
508
|
start_agent_name = start_agent.name
|
|
477
|
-
if
|
|
509
|
+
if (
|
|
510
|
+
start_agent_name not in self._agents
|
|
511
|
+
): # Add if not already present
|
|
478
512
|
self.add_agent(start_agent)
|
|
479
513
|
elif isinstance(start_agent, str):
|
|
480
514
|
start_agent_name = start_agent
|
|
481
|
-
else:
|
|
515
|
+
else: # start_agent is None
|
|
482
516
|
start_agent_name = self._start_agent_name
|
|
483
517
|
|
|
484
518
|
# Default to first agent if only one exists and none specified
|
|
@@ -516,23 +550,27 @@ class Flock(BaseModel, Serializable):
|
|
|
516
550
|
|
|
517
551
|
try:
|
|
518
552
|
resolved_start_agent = self._agents.get(start_agent_name)
|
|
519
|
-
if not resolved_start_agent:
|
|
520
|
-
raise ValueError(
|
|
553
|
+
if not resolved_start_agent: # Should have been handled by now
|
|
554
|
+
raise ValueError(
|
|
555
|
+
f"Start agent '{start_agent_name}' not found after checks."
|
|
556
|
+
)
|
|
521
557
|
|
|
522
558
|
run_context = context if context else FlockContext()
|
|
523
|
-
set_baggage("run_id", effective_run_id)
|
|
559
|
+
set_baggage("run_id", effective_run_id) # Set for OpenTelemetry
|
|
524
560
|
|
|
525
561
|
initialize_context(
|
|
526
562
|
run_context,
|
|
527
563
|
start_agent_name,
|
|
528
564
|
run_input,
|
|
529
565
|
effective_run_id,
|
|
530
|
-
not self.enable_temporal,
|
|
566
|
+
not self.enable_temporal, # local_debug is inverse of enable_temporal
|
|
531
567
|
self.model or resolved_start_agent.model or DEFAULT_MODEL,
|
|
532
568
|
)
|
|
533
569
|
# Add agent definitions to context for routing/serialization within workflow
|
|
534
570
|
for agent_name_iter, agent_instance_iter in self.agents.items():
|
|
535
|
-
agent_dict_repr =
|
|
571
|
+
agent_dict_repr = (
|
|
572
|
+
agent_instance_iter.to_dict()
|
|
573
|
+
) # Agents handle their own serialization
|
|
536
574
|
run_context.add_agent_definition(
|
|
537
575
|
agent_type=type(agent_instance_iter),
|
|
538
576
|
agent_name=agent_name_iter,
|
|
@@ -568,13 +606,14 @@ class Flock(BaseModel, Serializable):
|
|
|
568
606
|
# Execute workflow
|
|
569
607
|
if not self.enable_temporal:
|
|
570
608
|
result = await run_local_workflow(
|
|
571
|
-
run_context,
|
|
609
|
+
run_context,
|
|
610
|
+
box_result=False, # Boxing handled below
|
|
572
611
|
)
|
|
573
612
|
else:
|
|
574
613
|
result = await run_temporal_workflow(
|
|
575
|
-
self,
|
|
614
|
+
self, # Pass the Flock instance
|
|
576
615
|
run_context,
|
|
577
|
-
box_result=False,
|
|
616
|
+
box_result=False, # Boxing handled below
|
|
578
617
|
memo=memo,
|
|
579
618
|
)
|
|
580
619
|
|
|
@@ -616,7 +655,6 @@ class Flock(BaseModel, Serializable):
|
|
|
616
655
|
}
|
|
617
656
|
return Box(error_output) if box_result else error_output
|
|
618
657
|
|
|
619
|
-
|
|
620
658
|
# --- Batch Processing (Delegation) ---
|
|
621
659
|
async def run_batch_async(
|
|
622
660
|
self,
|
|
@@ -689,19 +727,18 @@ class Flock(BaseModel, Serializable):
|
|
|
689
727
|
)
|
|
690
728
|
)
|
|
691
729
|
|
|
692
|
-
|
|
693
730
|
# --- Evaluation (Delegation) ---
|
|
694
731
|
async def evaluate_async(
|
|
695
732
|
self,
|
|
696
|
-
dataset: str | Path | list[dict[str, Any]] | DataFrame | Dataset,
|
|
733
|
+
dataset: str | Path | list[dict[str, Any]] | DataFrame | Dataset, # type: ignore
|
|
697
734
|
start_agent: FlockAgent | str,
|
|
698
735
|
input_mapping: dict[str, str],
|
|
699
736
|
answer_mapping: dict[str, str],
|
|
700
737
|
metrics: list[
|
|
701
738
|
str
|
|
702
739
|
| Callable[[Any, Any], bool | float | dict[str, Any]]
|
|
703
|
-
| FlockAgent
|
|
704
|
-
| FlockEvaluator
|
|
740
|
+
| FlockAgent # Type hint only
|
|
741
|
+
| FlockEvaluator # Type hint only
|
|
705
742
|
],
|
|
706
743
|
metric_configs: dict[str, dict[str, Any]] | None = None,
|
|
707
744
|
static_inputs: dict[str, Any] | None = None,
|
|
@@ -713,7 +750,7 @@ class Flock(BaseModel, Serializable):
|
|
|
713
750
|
return_dataframe: bool = True,
|
|
714
751
|
silent_mode: bool = False,
|
|
715
752
|
metadata_columns: list[str] | None = None,
|
|
716
|
-
) -> DataFrame | list[dict[str, Any]]:
|
|
753
|
+
) -> DataFrame | list[dict[str, Any]]: # type: ignore
|
|
717
754
|
"""Evaluates the Flock's performance against a dataset (delegated)."""
|
|
718
755
|
# Import processor locally
|
|
719
756
|
from flock.core.execution.evaluation_executor import (
|
|
@@ -741,15 +778,15 @@ class Flock(BaseModel, Serializable):
|
|
|
741
778
|
|
|
742
779
|
def evaluate(
|
|
743
780
|
self,
|
|
744
|
-
dataset: str | Path | list[dict[str, Any]] | DataFrame | Dataset,
|
|
781
|
+
dataset: str | Path | list[dict[str, Any]] | DataFrame | Dataset, # type: ignore
|
|
745
782
|
start_agent: FlockAgent | str,
|
|
746
783
|
input_mapping: dict[str, str],
|
|
747
784
|
answer_mapping: dict[str, str],
|
|
748
785
|
metrics: list[
|
|
749
786
|
str
|
|
750
787
|
| Callable[[Any, Any], bool | float | dict[str, Any]]
|
|
751
|
-
| FlockAgent
|
|
752
|
-
| FlockEvaluator
|
|
788
|
+
| FlockAgent # Type hint only
|
|
789
|
+
| FlockEvaluator # Type hint only
|
|
753
790
|
],
|
|
754
791
|
metric_configs: dict[str, dict[str, Any]] | None = None,
|
|
755
792
|
static_inputs: dict[str, Any] | None = None,
|
|
@@ -761,7 +798,7 @@ class Flock(BaseModel, Serializable):
|
|
|
761
798
|
return_dataframe: bool = True,
|
|
762
799
|
silent_mode: bool = False,
|
|
763
800
|
metadata_columns: list[str] | None = None,
|
|
764
|
-
) -> DataFrame | list[dict[str, Any]]:
|
|
801
|
+
) -> DataFrame | list[dict[str, Any]]: # type: ignore
|
|
765
802
|
return self._run_sync(
|
|
766
803
|
self.evaluate_async(
|
|
767
804
|
dataset=dataset,
|
|
@@ -781,18 +818,22 @@ class Flock(BaseModel, Serializable):
|
|
|
781
818
|
metadata_columns=metadata_columns,
|
|
782
819
|
)
|
|
783
820
|
)
|
|
821
|
+
|
|
784
822
|
# --- Server & CLI Starters (Delegation) ---
|
|
785
823
|
def start_api(
|
|
786
824
|
self,
|
|
787
825
|
host: str = "127.0.0.1",
|
|
788
826
|
port: int = 8344,
|
|
789
827
|
server_name: str = "Flock Server",
|
|
790
|
-
create_ui: bool = True,
|
|
828
|
+
create_ui: bool = True, # Default to True for the integrated experience
|
|
791
829
|
ui_theme: str | None = None,
|
|
792
|
-
custom_endpoints: Sequence[FlockEndpoint]
|
|
830
|
+
custom_endpoints: Sequence[FlockEndpoint]
|
|
831
|
+
| dict[tuple[str, list[str] | None], Callable[..., Any]]
|
|
832
|
+
| None = None,
|
|
793
833
|
) -> None:
|
|
794
834
|
"""Starts a unified REST API server and/or Web UI for this Flock instance."""
|
|
795
835
|
import warnings
|
|
836
|
+
|
|
796
837
|
warnings.warn(
|
|
797
838
|
"start_api() is deprecated and will be removed in a future release. "
|
|
798
839
|
"Use serve() instead.",
|
|
@@ -825,7 +866,9 @@ class Flock(BaseModel, Serializable):
|
|
|
825
866
|
chat_history_key: str = "history",
|
|
826
867
|
chat_response_key: str = "response",
|
|
827
868
|
ui_theme: str | None = None,
|
|
828
|
-
custom_endpoints: Sequence[FlockEndpoint]
|
|
869
|
+
custom_endpoints: Sequence[FlockEndpoint]
|
|
870
|
+
| dict[tuple[str, list[str] | None], Callable[..., Any]]
|
|
871
|
+
| None = None,
|
|
829
872
|
) -> None:
|
|
830
873
|
"""Launch an HTTP server that exposes the core REST API and, optionally, the
|
|
831
874
|
browser-based UI.
|
|
@@ -871,7 +914,9 @@ class Flock(BaseModel, Serializable):
|
|
|
871
914
|
|
|
872
915
|
def start_cli(
|
|
873
916
|
self,
|
|
874
|
-
start_agent: FlockAgent
|
|
917
|
+
start_agent: FlockAgent
|
|
918
|
+
| str
|
|
919
|
+
| None = None, # Added start_agent to match method signature in file_26
|
|
875
920
|
server_name: str = "Flock CLI",
|
|
876
921
|
show_results: bool = False,
|
|
877
922
|
edit_mode: bool = False,
|
|
@@ -893,14 +938,13 @@ class Flock(BaseModel, Serializable):
|
|
|
893
938
|
# If start_agent is crucial here, start_flock_cli needs to handle it.
|
|
894
939
|
logger.info(f"Starting CLI for Flock '{self.name}'...")
|
|
895
940
|
start_flock_cli(
|
|
896
|
-
flock=self,
|
|
941
|
+
flock=self, # Pass the Flock instance
|
|
897
942
|
# start_agent=start_agent, # This argument is not in the definition of start_flock_cli in file_50
|
|
898
943
|
server_name=server_name,
|
|
899
944
|
show_results=show_results,
|
|
900
|
-
edit_mode=edit_mode
|
|
945
|
+
edit_mode=edit_mode,
|
|
901
946
|
)
|
|
902
947
|
|
|
903
|
-
|
|
904
948
|
# --- Serialization Delegation Methods ---
|
|
905
949
|
def to_dict(self, path_type: str = "relative") -> dict[str, Any]:
|
|
906
950
|
"""Serialize Flock instance to dictionary using FlockSerializer."""
|
|
@@ -917,12 +961,14 @@ class Flock(BaseModel, Serializable):
|
|
|
917
961
|
|
|
918
962
|
# --- Static Method Loader (Delegates to loader module) ---
|
|
919
963
|
@staticmethod
|
|
920
|
-
def load_from_file(file_path: str) -> Flock:
|
|
964
|
+
def load_from_file(file_path: str) -> Flock: # Ensure return type is Flock
|
|
921
965
|
"""Load a Flock instance from various file formats (delegates to loader)."""
|
|
922
966
|
from flock.core.util.loader import load_flock_from_file
|
|
923
967
|
|
|
924
968
|
loaded_flock = load_flock_from_file(file_path)
|
|
925
969
|
# Ensure the loaded object is indeed a Flock instance
|
|
926
970
|
if not isinstance(loaded_flock, Flock):
|
|
927
|
-
raise TypeError(
|
|
971
|
+
raise TypeError(
|
|
972
|
+
f"Loaded object from {file_path} is not a Flock instance, but {type(loaded_flock)}"
|
|
973
|
+
)
|
|
928
974
|
return loaded_flock
|
flock/core/flock_agent.py
CHANGED
|
@@ -691,8 +691,8 @@ class FlockAgent(BaseModel, Serializable, DSPyIntegrationMixin, ABC):
|
|
|
691
691
|
|
|
692
692
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
|
693
693
|
filename = f"{agent_name}_output_{timestamp}.json"
|
|
694
|
-
filepath = os.path.join("output/", filename)
|
|
695
|
-
os.makedirs("output/", exist_ok=True)
|
|
694
|
+
filepath = os.path.join(".flock/output/", filename)
|
|
695
|
+
os.makedirs(".flock/output/", exist_ok=True)
|
|
696
696
|
|
|
697
697
|
output_data = {
|
|
698
698
|
"agent": agent_name,
|
flock/core/flock_evaluator.py
CHANGED
|
@@ -45,9 +45,16 @@ class FlockEvaluator(ABC, BaseModel):
|
|
|
45
45
|
description="Evaluator configuration",
|
|
46
46
|
)
|
|
47
47
|
|
|
48
|
+
def __init__(self, **data):
|
|
49
|
+
super().__init__(**data)
|
|
50
|
+
|
|
48
51
|
@abstractmethod
|
|
49
52
|
async def evaluate(
|
|
50
|
-
self,
|
|
53
|
+
self,
|
|
54
|
+
agent: Any,
|
|
55
|
+
inputs: dict[str, Any],
|
|
56
|
+
tools: list[Any],
|
|
57
|
+
mcp_tools: list[Any] | None = None,
|
|
51
58
|
) -> dict[str, Any]:
|
|
52
59
|
"""Evaluate inputs to produce outputs."""
|
|
53
60
|
pass
|
flock/core/flock_factory.py
CHANGED
|
@@ -321,6 +321,8 @@ class FlockFactory:
|
|
|
321
321
|
wait_for_input: bool = False,
|
|
322
322
|
temperature: float = 0.0,
|
|
323
323
|
max_tokens: int = 8192,
|
|
324
|
+
max_tool_calls: int = 10,
|
|
325
|
+
max_retries: int = 3,
|
|
324
326
|
alert_latency_threshold_ms: int = 30000,
|
|
325
327
|
no_output: bool = False,
|
|
326
328
|
print_context: bool = False,
|
|
@@ -343,6 +345,8 @@ class FlockFactory:
|
|
|
343
345
|
use_cache=use_cache,
|
|
344
346
|
max_tokens=max_tokens,
|
|
345
347
|
temperature=temperature,
|
|
348
|
+
max_tool_calls=max_tool_calls,
|
|
349
|
+
max_retries=max_retries,
|
|
346
350
|
stream=stream,
|
|
347
351
|
include_thought_process=include_thought_process,
|
|
348
352
|
)
|
|
@@ -75,7 +75,9 @@ def _resolve_type_string(type_str: str) -> type:
|
|
|
75
75
|
try:
|
|
76
76
|
return tuple(ast.literal_eval(f"[{args_str}]"))
|
|
77
77
|
except (SyntaxError, ValueError) as exc:
|
|
78
|
-
raise ValueError(
|
|
78
|
+
raise ValueError(
|
|
79
|
+
f"Cannot parse {args_str!r} as literals"
|
|
80
|
+
) from exc
|
|
79
81
|
|
|
80
82
|
literal_args = parse_literal_args(args_str)
|
|
81
83
|
logger.debug(
|
|
@@ -250,8 +252,7 @@ class DSPyIntegrationMixin:
|
|
|
250
252
|
f"Failed to create dynamic type 'dspy_{agent_name}': {e}",
|
|
251
253
|
exc_info=True,
|
|
252
254
|
)
|
|
253
|
-
raise TypeError(
|
|
254
|
-
f"Could not create DSPy signature type: {e}") from e
|
|
255
|
+
raise TypeError(f"Could not create DSPy signature type: {e}") from e
|
|
255
256
|
|
|
256
257
|
def _configure_language_model(
|
|
257
258
|
self,
|
|
@@ -308,6 +309,7 @@ class DSPyIntegrationMixin:
|
|
|
308
309
|
self,
|
|
309
310
|
signature: Any,
|
|
310
311
|
override_evaluator_type: AgentType,
|
|
312
|
+
max_tool_calls: int = 10,
|
|
311
313
|
tools: list[Any] | None = None,
|
|
312
314
|
mcp_tools: list[Any] | None = None,
|
|
313
315
|
kwargs: dict[str, Any] = {},
|
|
@@ -370,7 +372,7 @@ class DSPyIntegrationMixin:
|
|
|
370
372
|
dspy_program = dspy.ChainOfThought(signature, **kwargs)
|
|
371
373
|
elif selected_type == "ReAct":
|
|
372
374
|
if not kwargs:
|
|
373
|
-
kwargs = {"max_iters":
|
|
375
|
+
kwargs = {"max_iters": max_tool_calls}
|
|
374
376
|
dspy_program = dspy.ReAct(
|
|
375
377
|
signature, tools=merged_tools or [], **kwargs
|
|
376
378
|
)
|
|
@@ -427,8 +429,7 @@ class DSPyIntegrationMixin:
|
|
|
427
429
|
final_result = {**inputs, **output_dict}
|
|
428
430
|
|
|
429
431
|
lm = dspy.settings.get("lm")
|
|
430
|
-
cost = sum([x["cost"]
|
|
431
|
-
for x in lm.history if x["cost"] is not None])
|
|
432
|
+
cost = sum([x["cost"] for x in lm.history if x["cost"] is not None])
|
|
432
433
|
lm_history = lm.history
|
|
433
434
|
|
|
434
435
|
return final_result, cost, lm_history
|