arthur-common 2.1.56__py3-none-any.whl → 2.1.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of arthur-common might be problematic. Click here for more details.
- arthur_common/aggregations/functions/agentic_aggregations.py +25 -14
- {arthur_common-2.1.56.dist-info → arthur_common-2.1.57.dist-info}/METADATA +1 -1
- {arthur_common-2.1.56.dist-info → arthur_common-2.1.57.dist-info}/RECORD +4 -4
- {arthur_common-2.1.56.dist-info → arthur_common-2.1.57.dist-info}/WHEEL +0 -0
|
@@ -36,7 +36,9 @@ def extract_spans_with_metrics_and_agents(root_spans):
|
|
|
36
36
|
spans_with_metrics_and_agents = []
|
|
37
37
|
|
|
38
38
|
def traverse_spans(spans, current_agent_name="unknown"):
|
|
39
|
-
for span in spans:
|
|
39
|
+
for span_str in spans:
|
|
40
|
+
span = json.loads(span_str) if type(span_str) == str else span_str
|
|
41
|
+
|
|
40
42
|
# Update current agent name if this span is an AGENT
|
|
41
43
|
if span.get("span_kind") == "AGENT":
|
|
42
44
|
try:
|
|
@@ -142,7 +144,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
142
144
|
results = ddb_conn.sql(
|
|
143
145
|
f"""
|
|
144
146
|
SELECT
|
|
145
|
-
time_bucket(INTERVAL '5 minutes',
|
|
147
|
+
time_bucket(INTERVAL '5 minutes', start_time) as ts,
|
|
146
148
|
root_spans
|
|
147
149
|
FROM {dataset.dataset_table_name}
|
|
148
150
|
WHERE root_spans IS NOT NULL AND length(root_spans) > 0
|
|
@@ -175,7 +177,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
175
177
|
|
|
176
178
|
for metric_result in metric_results:
|
|
177
179
|
metric_type = metric_result.get("metric_type")
|
|
178
|
-
details = metric_result.get("details", {})
|
|
180
|
+
details = json.loads(metric_result.get("details", '{}'))
|
|
179
181
|
|
|
180
182
|
if metric_type == "ToolSelection":
|
|
181
183
|
tool_selection = details.get("tool_selection", {})
|
|
@@ -193,6 +195,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
193
195
|
"ts": ts,
|
|
194
196
|
"tool_selection_score": tool_selection_score,
|
|
195
197
|
"tool_selection_reason": tool_selection_reason,
|
|
198
|
+
"agent_name": agent_name,
|
|
196
199
|
},
|
|
197
200
|
)
|
|
198
201
|
|
|
@@ -209,6 +212,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
209
212
|
"ts": ts,
|
|
210
213
|
"tool_usage_score": tool_usage_score,
|
|
211
214
|
"tool_usage_reason": tool_usage_reason,
|
|
215
|
+
"agent_name": agent_name,
|
|
212
216
|
},
|
|
213
217
|
)
|
|
214
218
|
|
|
@@ -228,6 +232,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
228
232
|
"score_type": "llm_relevance_score",
|
|
229
233
|
"score_value": llm_score,
|
|
230
234
|
"reason": reason,
|
|
235
|
+
"agent_name": agent_name,
|
|
231
236
|
},
|
|
232
237
|
)
|
|
233
238
|
|
|
@@ -238,6 +243,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
238
243
|
"score_type": "reranker_relevance_score",
|
|
239
244
|
"score_value": reranker_score,
|
|
240
245
|
"reason": reason,
|
|
246
|
+
"agent_name": agent_name,
|
|
241
247
|
},
|
|
242
248
|
)
|
|
243
249
|
|
|
@@ -248,6 +254,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
248
254
|
"score_type": "bert_f_score",
|
|
249
255
|
"score_value": bert_score,
|
|
250
256
|
"reason": reason,
|
|
257
|
+
"agent_name": agent_name,
|
|
251
258
|
},
|
|
252
259
|
)
|
|
253
260
|
|
|
@@ -269,6 +276,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
269
276
|
"score_type": "llm_relevance_score",
|
|
270
277
|
"score_value": llm_score,
|
|
271
278
|
"reason": reason,
|
|
279
|
+
"agent_name": agent_name,
|
|
272
280
|
},
|
|
273
281
|
)
|
|
274
282
|
|
|
@@ -289,6 +297,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
289
297
|
"score_type": "bert_f_score",
|
|
290
298
|
"score_value": bert_score,
|
|
291
299
|
"reason": reason,
|
|
300
|
+
"agent_name": agent_name,
|
|
292
301
|
},
|
|
293
302
|
)
|
|
294
303
|
|
|
@@ -300,7 +309,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
300
309
|
series = self.group_query_results_to_sketch_metrics(
|
|
301
310
|
df,
|
|
302
311
|
"tool_selection_score",
|
|
303
|
-
["tool_selection_reason"],
|
|
312
|
+
["tool_selection_reason", "agent_name"],
|
|
304
313
|
"ts",
|
|
305
314
|
)
|
|
306
315
|
metrics.append(
|
|
@@ -313,7 +322,7 @@ class AgenticMetricsOverTimeAggregation(SketchAggregationFunction):
|
|
|
313
322
|
series = self.group_query_results_to_sketch_metrics(
|
|
314
323
|
df,
|
|
315
324
|
"tool_usage_score",
|
|
316
|
-
["tool_usage_reason"],
|
|
325
|
+
["tool_usage_reason", "agent_name"],
|
|
317
326
|
"ts",
|
|
318
327
|
)
|
|
319
328
|
metrics.append(self.series_to_metric(self.TOOL_USAGE_METRIC_NAME, series))
|
|
@@ -392,7 +401,7 @@ class AgenticRelevancePassFailCountAggregation(NumericAggregationFunction):
|
|
|
392
401
|
results = ddb_conn.sql(
|
|
393
402
|
f"""
|
|
394
403
|
SELECT
|
|
395
|
-
time_bucket(INTERVAL '5 minutes',
|
|
404
|
+
time_bucket(INTERVAL '5 minutes', start_time) as ts,
|
|
396
405
|
root_spans
|
|
397
406
|
FROM {dataset.dataset_table_name}
|
|
398
407
|
WHERE root_spans IS NOT NULL AND length(root_spans) > 0
|
|
@@ -421,7 +430,7 @@ class AgenticRelevancePassFailCountAggregation(NumericAggregationFunction):
|
|
|
421
430
|
|
|
422
431
|
for metric_result in metric_results:
|
|
423
432
|
metric_type = metric_result.get("metric_type")
|
|
424
|
-
details = metric_result.get("details", {})
|
|
433
|
+
details = json.loads(metric_result.get("details", '{}'))
|
|
425
434
|
|
|
426
435
|
if metric_type in ["QueryRelevance", "ResponseRelevance"]:
|
|
427
436
|
relevance_data = details.get(
|
|
@@ -517,7 +526,7 @@ class AgenticToolPassFailCountAggregation(NumericAggregationFunction):
|
|
|
517
526
|
results = ddb_conn.sql(
|
|
518
527
|
f"""
|
|
519
528
|
SELECT
|
|
520
|
-
time_bucket(INTERVAL '5 minutes',
|
|
529
|
+
time_bucket(INTERVAL '5 minutes', start_time) as ts,
|
|
521
530
|
root_spans
|
|
522
531
|
FROM {dataset.dataset_table_name}
|
|
523
532
|
WHERE root_spans IS NOT NULL AND length(root_spans) > 0
|
|
@@ -546,7 +555,7 @@ class AgenticToolPassFailCountAggregation(NumericAggregationFunction):
|
|
|
546
555
|
|
|
547
556
|
for metric_result in metric_results:
|
|
548
557
|
if metric_result.get("metric_type") == "ToolSelection":
|
|
549
|
-
details = metric_result.get("details", {})
|
|
558
|
+
details = json.loads(metric_result.get("details", '{}'))
|
|
550
559
|
tool_selection = details.get("tool_selection", {})
|
|
551
560
|
|
|
552
561
|
tool_selection_score = tool_selection.get("tool_selection")
|
|
@@ -638,7 +647,7 @@ class AgenticEventCountAggregation(NumericAggregationFunction):
|
|
|
638
647
|
results = ddb_conn.sql(
|
|
639
648
|
f"""
|
|
640
649
|
SELECT
|
|
641
|
-
time_bucket(INTERVAL '5 minutes',
|
|
650
|
+
time_bucket(INTERVAL '5 minutes', start_time) as ts,
|
|
642
651
|
COUNT(*) as count
|
|
643
652
|
FROM {dataset.dataset_table_name}
|
|
644
653
|
GROUP BY ts
|
|
@@ -695,7 +704,7 @@ class AgenticLLMCallCountAggregation(NumericAggregationFunction):
|
|
|
695
704
|
results = ddb_conn.sql(
|
|
696
705
|
f"""
|
|
697
706
|
SELECT
|
|
698
|
-
time_bucket(INTERVAL '5 minutes',
|
|
707
|
+
time_bucket(INTERVAL '5 minutes', start_time) as ts,
|
|
699
708
|
root_spans
|
|
700
709
|
FROM {dataset.dataset_table_name}
|
|
701
710
|
WHERE root_spans IS NOT NULL AND length(root_spans) > 0
|
|
@@ -716,7 +725,9 @@ class AgenticLLMCallCountAggregation(NumericAggregationFunction):
|
|
|
716
725
|
# Count LLM spans in the tree
|
|
717
726
|
def count_llm_spans(spans):
|
|
718
727
|
count = 0
|
|
719
|
-
for span in spans:
|
|
728
|
+
for span_str in spans:
|
|
729
|
+
span = json.loads(span_str) if type(span_str) == str else span_str
|
|
730
|
+
|
|
720
731
|
# Check if this span is an LLM span
|
|
721
732
|
if span.get("span_kind") == "LLM":
|
|
722
733
|
count += 1
|
|
@@ -790,7 +801,7 @@ class AgenticToolSelectionAndUsageByAgentAggregation(NumericAggregationFunction)
|
|
|
790
801
|
results = ddb_conn.sql(
|
|
791
802
|
f"""
|
|
792
803
|
SELECT
|
|
793
|
-
time_bucket(INTERVAL '5 minutes',
|
|
804
|
+
time_bucket(INTERVAL '5 minutes', start_time) as ts,
|
|
794
805
|
root_spans
|
|
795
806
|
FROM {dataset.dataset_table_name}
|
|
796
807
|
WHERE root_spans IS NOT NULL AND length(root_spans) > 0
|
|
@@ -819,7 +830,7 @@ class AgenticToolSelectionAndUsageByAgentAggregation(NumericAggregationFunction)
|
|
|
819
830
|
|
|
820
831
|
for metric_result in metric_results:
|
|
821
832
|
if metric_result.get("metric_type") == "ToolSelection":
|
|
822
|
-
details = metric_result.get("details", {})
|
|
833
|
+
details = json.loads(metric_result.get("details", '{}'))
|
|
823
834
|
tool_selection = details.get("tool_selection", {})
|
|
824
835
|
|
|
825
836
|
tool_selection_score = tool_selection.get("tool_selection")
|
|
@@ -3,7 +3,7 @@ arthur_common/aggregations/__init__.py,sha256=vISWyciQAtksa71OKeHNP-QyFGd1NzBKq_
|
|
|
3
3
|
arthur_common/aggregations/aggregator.py,sha256=kS9Qru0AhZzZz4Ym20NT7aNrbcQaqg2zgBVYFogFbbg,7936
|
|
4
4
|
arthur_common/aggregations/functions/README.md,sha256=MkZoTAJ94My96R5Z8GAxud7S6vyR0vgVi9gqdt9a4XY,5460
|
|
5
5
|
arthur_common/aggregations/functions/__init__.py,sha256=HqC3UNRURX7ZQHgamTrQvfA8u_FiZGZ4I4eQW7Ooe5o,1299
|
|
6
|
-
arthur_common/aggregations/functions/agentic_aggregations.py,sha256=
|
|
6
|
+
arthur_common/aggregations/functions/agentic_aggregations.py,sha256=AXPuIgESf-q2JG4vRc8XYARFbI8R92e7uaR7cgaTMqY,33401
|
|
7
7
|
arthur_common/aggregations/functions/categorical_count.py,sha256=wc1ovL8JoiSeoSTk9h1fgrLj1QuQeYYZmEqgffGc2cw,5328
|
|
8
8
|
arthur_common/aggregations/functions/confusion_matrix.py,sha256=Zac-biMeIVyLRcMXWmENgYq8X4I7Trm8gOE5NRLGKU0,22108
|
|
9
9
|
arthur_common/aggregations/functions/inference_count.py,sha256=SrRfxQVnX-wRTZ1zbqUKupPdACvfKeUpZDidZs45ZUY,4079
|
|
@@ -39,6 +39,6 @@ arthur_common/tools/functions.py,sha256=FWL4eWO5-vLp86WudT-MGUKvf2B8f02IdoXQFKd6
|
|
|
39
39
|
arthur_common/tools/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
40
|
arthur_common/tools/schema_inferer.py,sha256=Ur4CXGAkd6ZMSU0nMNrkOEElsBopHXq0lctTV8X92W8,5188
|
|
41
41
|
arthur_common/tools/time_utils.py,sha256=4gfiu9NXfvPZltiVNLSIQGylX6h2W0viNi9Kv4bKyfw,1410
|
|
42
|
-
arthur_common-2.1.
|
|
43
|
-
arthur_common-2.1.
|
|
44
|
-
arthur_common-2.1.
|
|
42
|
+
arthur_common-2.1.57.dist-info/METADATA,sha256=tciX3Iwg2PhiaJkObFd625vI7fcLO8S4JvICHdkzPvw,1609
|
|
43
|
+
arthur_common-2.1.57.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
44
|
+
arthur_common-2.1.57.dist-info/RECORD,,
|
|
File without changes
|