supervisely 6.73.238__py3-none-any.whl → 6.73.240__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supervisely/annotation/annotation.py +2 -2
- supervisely/api/entity_annotation/tag_api.py +11 -4
- supervisely/api/file_api.py +17 -3
- supervisely/nn/__init__.py +1 -0
- supervisely/nn/benchmark/__init__.py +14 -2
- supervisely/nn/benchmark/base_benchmark.py +84 -37
- supervisely/nn/benchmark/base_evaluator.py +120 -0
- supervisely/nn/benchmark/base_visualizer.py +265 -0
- supervisely/nn/benchmark/comparison/detection_visualization/text_templates.py +5 -5
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/calibration_score.py +2 -2
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/explore_predicttions.py +39 -16
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/localization_accuracy.py +1 -1
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/outcome_counts.py +4 -4
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/overview.py +12 -11
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/pr_curve.py +1 -1
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/precision_recal_f1.py +6 -6
- supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/speedtest.py +3 -3
- supervisely/nn/benchmark/{instance_segmentation_benchmark.py → instance_segmentation/benchmark.py} +9 -3
- supervisely/nn/benchmark/instance_segmentation/evaluator.py +58 -0
- supervisely/nn/benchmark/{visualization/text_templates/instance_segmentation_text.py → instance_segmentation/text_templates.py} +53 -69
- supervisely/nn/benchmark/instance_segmentation/visualizer.py +18 -0
- supervisely/nn/benchmark/object_detection/__init__.py +0 -0
- supervisely/nn/benchmark/object_detection/base_vis_metric.py +51 -0
- supervisely/nn/benchmark/{object_detection_benchmark.py → object_detection/benchmark.py} +4 -2
- supervisely/nn/benchmark/object_detection/evaluation_params.yaml +2 -0
- supervisely/nn/benchmark/{evaluation/object_detection_evaluator.py → object_detection/evaluator.py} +67 -9
- supervisely/nn/benchmark/{evaluation/coco → object_detection}/metric_provider.py +13 -14
- supervisely/nn/benchmark/{visualization/text_templates/object_detection_text.py → object_detection/text_templates.py} +49 -41
- supervisely/nn/benchmark/object_detection/vis_metrics/__init__.py +48 -0
- supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/confidence_distribution.py +20 -24
- supervisely/nn/benchmark/object_detection/vis_metrics/confidence_score.py +119 -0
- supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/confusion_matrix.py +34 -22
- supervisely/nn/benchmark/object_detection/vis_metrics/explore_predictions.py +129 -0
- supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/f1_score_at_different_iou.py +21 -26
- supervisely/nn/benchmark/object_detection/vis_metrics/frequently_confused.py +137 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/iou_distribution.py +106 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/key_metrics.py +136 -0
- supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/model_predictions.py +53 -49
- supervisely/nn/benchmark/object_detection/vis_metrics/outcome_counts.py +188 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/outcome_counts_per_class.py +191 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/overview.py +116 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/pr_curve.py +106 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/pr_curve_by_class.py +49 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/precision.py +72 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/precision_avg_per_class.py +59 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/recall.py +71 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/recall_vs_precision.py +56 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/reliability_diagram.py +110 -0
- supervisely/nn/benchmark/object_detection/vis_metrics/speedtest.py +151 -0
- supervisely/nn/benchmark/object_detection/visualizer.py +697 -0
- supervisely/nn/benchmark/semantic_segmentation/__init__.py +9 -0
- supervisely/nn/benchmark/semantic_segmentation/base_vis_metric.py +55 -0
- supervisely/nn/benchmark/semantic_segmentation/benchmark.py +32 -0
- supervisely/nn/benchmark/semantic_segmentation/evaluation_params.yaml +0 -0
- supervisely/nn/benchmark/semantic_segmentation/evaluator.py +162 -0
- supervisely/nn/benchmark/semantic_segmentation/metric_provider.py +153 -0
- supervisely/nn/benchmark/semantic_segmentation/text_templates.py +130 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/__init__.py +0 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/acknowledgement.py +15 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/classwise_error_analysis.py +57 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/confusion_matrix.py +92 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/explore_predictions.py +84 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/frequently_confused.py +101 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/iou_eou.py +45 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/key_metrics.py +60 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/model_predictions.py +107 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/overview.py +112 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/renormalized_error_ou.py +48 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/speedtest.py +178 -0
- supervisely/nn/benchmark/semantic_segmentation/vis_metrics/vis_texts.py +21 -0
- supervisely/nn/benchmark/semantic_segmentation/visualizer.py +304 -0
- supervisely/nn/benchmark/utils/__init__.py +12 -0
- supervisely/nn/benchmark/utils/detection/__init__.py +2 -0
- supervisely/nn/benchmark/{evaluation/coco → utils/detection}/calculate_metrics.py +6 -4
- supervisely/nn/benchmark/utils/detection/metric_provider.py +533 -0
- supervisely/nn/benchmark/{coco_utils → utils/detection}/sly2coco.py +4 -4
- supervisely/nn/benchmark/{coco_utils/utils.py → utils/detection/utlis.py} +11 -0
- supervisely/nn/benchmark/utils/semantic_segmentation/__init__.py +0 -0
- supervisely/nn/benchmark/utils/semantic_segmentation/calculate_metrics.py +35 -0
- supervisely/nn/benchmark/utils/semantic_segmentation/evaluator.py +804 -0
- supervisely/nn/benchmark/utils/semantic_segmentation/loader.py +65 -0
- supervisely/nn/benchmark/utils/semantic_segmentation/utils.py +109 -0
- supervisely/nn/benchmark/visualization/evaluation_result.py +17 -3
- supervisely/nn/benchmark/visualization/vis_click_data.py +1 -1
- supervisely/nn/benchmark/visualization/widgets/__init__.py +3 -0
- supervisely/nn/benchmark/visualization/widgets/chart/chart.py +12 -4
- supervisely/nn/benchmark/visualization/widgets/gallery/gallery.py +35 -8
- supervisely/nn/benchmark/visualization/widgets/gallery/template.html +8 -4
- supervisely/nn/benchmark/visualization/widgets/markdown/markdown.py +1 -1
- supervisely/nn/benchmark/visualization/widgets/notification/notification.py +11 -7
- supervisely/nn/benchmark/visualization/widgets/radio_group/__init__.py +0 -0
- supervisely/nn/benchmark/visualization/widgets/radio_group/radio_group.py +34 -0
- supervisely/nn/benchmark/visualization/widgets/table/table.py +9 -3
- supervisely/nn/benchmark/visualization/widgets/widget.py +4 -0
- supervisely/project/project.py +18 -6
- {supervisely-6.73.238.dist-info → supervisely-6.73.240.dist-info}/METADATA +3 -1
- {supervisely-6.73.238.dist-info → supervisely-6.73.240.dist-info}/RECORD +104 -82
- supervisely/nn/benchmark/coco_utils/__init__.py +0 -2
- supervisely/nn/benchmark/evaluation/__init__.py +0 -3
- supervisely/nn/benchmark/evaluation/base_evaluator.py +0 -64
- supervisely/nn/benchmark/evaluation/coco/__init__.py +0 -2
- supervisely/nn/benchmark/evaluation/instance_segmentation_evaluator.py +0 -88
- supervisely/nn/benchmark/utils.py +0 -13
- supervisely/nn/benchmark/visualization/inference_speed/__init__.py +0 -19
- supervisely/nn/benchmark/visualization/inference_speed/speedtest_batch.py +0 -161
- supervisely/nn/benchmark/visualization/inference_speed/speedtest_intro.py +0 -28
- supervisely/nn/benchmark/visualization/inference_speed/speedtest_overview.py +0 -141
- supervisely/nn/benchmark/visualization/inference_speed/speedtest_real_time.py +0 -63
- supervisely/nn/benchmark/visualization/text_templates/inference_speed_text.py +0 -23
- supervisely/nn/benchmark/visualization/vis_metric_base.py +0 -337
- supervisely/nn/benchmark/visualization/vis_metrics/__init__.py +0 -67
- supervisely/nn/benchmark/visualization/vis_metrics/classwise_error_analysis.py +0 -55
- supervisely/nn/benchmark/visualization/vis_metrics/confidence_score.py +0 -93
- supervisely/nn/benchmark/visualization/vis_metrics/explorer_grid.py +0 -144
- supervisely/nn/benchmark/visualization/vis_metrics/frequently_confused.py +0 -115
- supervisely/nn/benchmark/visualization/vis_metrics/iou_distribution.py +0 -86
- supervisely/nn/benchmark/visualization/vis_metrics/outcome_counts.py +0 -119
- supervisely/nn/benchmark/visualization/vis_metrics/outcome_counts_per_class.py +0 -148
- supervisely/nn/benchmark/visualization/vis_metrics/overall_error_analysis.py +0 -109
- supervisely/nn/benchmark/visualization/vis_metrics/overview.py +0 -189
- supervisely/nn/benchmark/visualization/vis_metrics/percision_avg_per_class.py +0 -57
- supervisely/nn/benchmark/visualization/vis_metrics/pr_curve.py +0 -101
- supervisely/nn/benchmark/visualization/vis_metrics/pr_curve_by_class.py +0 -46
- supervisely/nn/benchmark/visualization/vis_metrics/precision.py +0 -56
- supervisely/nn/benchmark/visualization/vis_metrics/recall.py +0 -54
- supervisely/nn/benchmark/visualization/vis_metrics/recall_vs_precision.py +0 -57
- supervisely/nn/benchmark/visualization/vis_metrics/reliability_diagram.py +0 -88
- supervisely/nn/benchmark/visualization/vis_metrics/what_is.py +0 -23
- supervisely/nn/benchmark/visualization/vis_templates.py +0 -241
- supervisely/nn/benchmark/visualization/vis_widgets.py +0 -128
- supervisely/nn/benchmark/visualization/visualizer.py +0 -729
- /supervisely/nn/benchmark/{visualization/text_templates → instance_segmentation}/__init__.py +0 -0
- /supervisely/nn/benchmark/{evaluation/coco → instance_segmentation}/evaluation_params.yaml +0 -0
- /supervisely/nn/benchmark/{evaluation/coco → utils/detection}/metrics.py +0 -0
- {supervisely-6.73.238.dist-info → supervisely-6.73.240.dist-info}/LICENSE +0 -0
- {supervisely-6.73.238.dist-info → supervisely-6.73.240.dist-info}/WHEEL +0 -0
- {supervisely-6.73.238.dist-info → supervisely-6.73.240.dist-info}/entry_points.txt +0 -0
- {supervisely-6.73.238.dist-info → supervisely-6.73.240.dist-info}/top_level.txt +0 -0
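The bulk of this release reorganizes `supervisely.nn.benchmark` into per-task packages (`object_detection`, `instance_segmentation`, `semantic_segmentation`, plus shared `utils`). A rough sketch of how import paths move, inferred only from the renames listed above (whether the old paths keep working via re-exports in the updated `__init__.py` files is not visible from this listing):

```python
# Old layout (6.73.238) -- paths taken from the left side of the renames above:
# from supervisely.nn.benchmark.object_detection_benchmark import ObjectDetectionBenchmark
# from supervisely.nn.benchmark.evaluation.object_detection_evaluator import ObjectDetectionEvaluator

# New layout (6.73.240) -- paths taken from the right side of the renames above:
from supervisely.nn.benchmark.object_detection.benchmark import ObjectDetectionBenchmark
from supervisely.nn.benchmark.object_detection.evaluator import ObjectDetectionEvaluator
from supervisely.nn.benchmark.instance_segmentation.evaluator import InstanceSegmentationEvaluator
```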
supervisely/nn/benchmark/instance_segmentation/evaluator.py
@@ -0,0 +1,58 @@
+import os
+from pathlib import Path
+
+from supervisely.io.json import dump_json_file
+from supervisely.nn.benchmark.object_detection.evaluator import (
+    ObjectDetectionEvalResult,
+    ObjectDetectionEvaluator,
+)
+from supervisely.nn.benchmark.object_detection.metric_provider import MetricProvider
+from supervisely.nn.benchmark.utils import calculate_metrics, read_coco_datasets
+
+
+class InstanceSegmentationEvalResult(ObjectDetectionEvalResult):
+    mp_cls = MetricProvider
+
+
+class InstanceSegmentationEvaluator(ObjectDetectionEvaluator):
+    EVALUATION_PARAMS_YAML_PATH = f"{Path(__file__).parent}/evaluation_params.yaml"
+    eval_result_cls = InstanceSegmentationEvalResult
+    accepted_shapes = ["polygon", "bitmap"]
+
+    def evaluate(self):
+        try:
+            self.cocoGt_json, self.cocoDt_json = self._convert_to_coco()
+        except AssertionError as e:
+            raise ValueError(
+                f"{e}. Please make sure that your GT and DT projects are correct. "
+                "If GT project has nested datasets and DT project was crated with NN app, "
+                "try to use newer version of NN app."
+            )
+
+        self._dump_datasets()
+        self.cocoGt, self.cocoDt = read_coco_datasets(self.cocoGt_json, self.cocoDt_json)
+        with self.pbar(message="Evaluation: Calculating metrics", total=5) as p:
+            self.eval_data = calculate_metrics(
+                self.cocoGt,
+                self.cocoDt,
+                iouType="segm",
+                progress_cb=p.update,
+                evaluation_params=self.evaluation_params,
+            )
+        self._dump_eval_results()
+
+    def _dump_eval_results(self):
+        _, _, eval_data_path = self._get_eval_paths()
+        self._dump_pickle(self.eval_data, eval_data_path)
+
+    def _get_eval_paths(self):
+        base_dir = self.result_dir
+        cocoGt_path = os.path.join(base_dir, "cocoGt.json")
+        cocoDt_path = os.path.join(base_dir, "cocoDt.json")
+        eval_data_path = os.path.join(base_dir, "eval_data.pkl")
+        return cocoGt_path, cocoDt_path, eval_data_path
+
+    def _dump_datasets(self):
+        cocoGt_path, cocoDt_path, _ = self._get_eval_paths()
+        dump_json_file(self.cocoGt_json, cocoGt_path, indent=None)
+        dump_json_file(self.cocoDt_json, cocoDt_path, indent=None)
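The `iouType="segm"` argument indicates COCO-style mask evaluation, which `calculate_metrics` appears to wrap. A minimal stand-alone sketch of the same measurement with pycocotools, assuming the `cocoGt.json`/`cocoDt.json` files written by `_dump_datasets` above (the wrapper's extra bookkeeping such as progress callbacks, evaluation params, and the pickled eval data is omitted):

```python
# Rough stand-alone equivalent of the metric calculation the evaluator wraps,
# using pycocotools directly. Assumes cocoGt.json / cocoDt.json exist in the
# current directory, as dumped by _dump_datasets above.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("cocoGt.json")             # ground-truth annotations
coco_dt = coco_gt.loadRes("cocoDt.json")  # predictions with confidence scores

coco_eval = COCOeval(coco_gt, coco_dt, iouType="segm")  # mask IoU, not bbox
coco_eval.evaluate()    # per-image, per-category matching
coco_eval.accumulate()  # precision/recall over IoU thresholds and areas
coco_eval.summarize()   # prints the AP/AR summary (AP@[.5:.95], AP50, ...)
```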
supervisely/nn/benchmark/{visualization/text_templates/instance_segmentation_text.py → instance_segmentation/text_templates.py}
RENAMED
@@ -13,6 +13,10 @@ definitions = SimpleNamespace(
     iou_threshold="The IoU threshold is a predefined value (set to 0.5 in many benchmarks) that determines the minimum acceptable IoU score for a predicted mask to be considered a correct prediction. When the IoU of a predicted mask and actual mask is higher than this IoU threshold, the prediction is considered correct. Some metrics will evaluate the model with different IoU thresholds to provide more insights about the model's performance.",
 )
 
+docs_url = (
+    "https://docs.supervisely.com/neural-networks/model-evaluation-benchmark/instant-segmentation"
+)
+
 # <i class="zmdi zmdi-check-circle" style="color: #13ce66; margin-right: 5px"></i>
 clickable_label = """
 > <span style="color: #5a6772">
@@ -35,17 +39,14 @@ markdown_overview = """
 - **Architecture**: {}
 - **Task type**: {}
 - **Runtime**: {}
-- **Checkpoint file**: <a href="{}" target="_blank">{}</a>
+- **Checkpoint file**: <a class="checkpoint-url" href="{}" target="_blank">{}</a>
 - **Ground Truth project**: <a href="/projects/{}/datasets" target="_blank">{}</a>, {}{}
 {}
+- **IoU threshold**: {}
+- **Optimal confidence threshold**: {} (calculated automatically), <a href="{}" target="_blank">learn more</a>.
 
 Learn more about Model Benchmark, implementation details, and how to use the charts in our <a href="{}" target="_blank">Technical Report</a>.
 """
-# - **Model**: {}
-# - **Training dataset (?)**: COCO 2017 train
-# - **Model classes (?)**: (80): a, b, c, … (collapse)
-# - **Model weights (?)**: [/path/to/yolov8l.pt]()
-# - **License (?)**: AGPL-3.0
 
 markdown_key_metrics = """## Key Metrics
 
@@ -67,17 +68,6 @@ In this section you can visually assess the model performance through examples.
 > Filtering options allow you to adjust the confidence threshold (only for predictions) and the model's false outcomes (only for differences). Differences are calculated only for the optimal confidence threshold, allowing you to focus on the most accurate predictions made by the model.
 """
 
-markdown_predictions_gallery = """
-
-"""
-# You can choose one of the sorting method:
-
-# - **Auto**: The algorithm is trying to gather a diverse set of images that illustrate the model's performance across various scenarios.
-# - **Least accurate**: Displays images where the model made more errors.
-# - **Most accurate**: Displays images where the model made fewer or no errors.
-# - **Dataset order**: Displays images in the original order of the dataset.
-# """
-
 markdown_predictions_table = """### Prediction details for every image
 
 The table helps you in finding samples with specific cases of interest. You can sort by parameters such as the number of predictions, or specific a metric, e.g, recall, then click on a row to view this image and predictions.
@@ -88,27 +78,15 @@ The table helps you in finding samples with specific cases of interest. You can
 > Click on the row to view the image with **Ground Truth**, **Prediction**, or the **Difference** annotations.
 """
 
-
-"""
-
-markdown_experts = """
-"""
-
-markdown_how_to_use = """
-"""
-
-markdown_outcome_counts = (
-    """## Outcome Counts
+markdown_outcome_counts = """## Outcome Counts
 
 This chart is used to evaluate the overall model performance by breaking down all predictions into <abbr title="{}">True Positives</abbr> (TP), <abbr title="{}">False Positives</abbr> (FP), and <abbr title="{}">False Negatives</abbr> (FN). This helps to visually assess the type of errors the model often encounters.
 
 """
-    + clickable_label
-)
 
 markdown_R = """## Recall
 
-This section measures the ability of the model to find **all relevant instances in the dataset**. In other words, it answers the question:
+This section measures the ability of the model to find **all relevant instances in the dataset**. In other words, it answers the question: "Of all instances in the dataset, how many of them is the model managed to find out?"
 
 To measure this, we calculate **Recall**. Recall counts errors, when the model does not predict an object that actually is present in a dataset and should be predicted. Recall is calculated as the portion of correct predictions (true positives) over all instances in the dataset (true positives + false negatives).
 """
@@ -118,8 +96,7 @@ notification_recall = {
     "description": "The model correctly found <b>{} of {}</b> total instances in the dataset.",
 }
 
-markdown_R_perclass = (
-    """### Per-class Recall
+markdown_R_perclass = """### Per-class Recall
 
 This chart further analyzes Recall, breaking it down to each class in separate.
 
@@ -128,15 +105,13 @@ Since the overall recall is calculated as an average across all classes, we prov
 _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
 """
-    + clickable_label
-)
 
 
 markdown_P = """## Precision
 
-This section measures the accuracy of all predictions made by the model. In other words, it answers the question:
+This section measures the accuracy of all predictions made by the model. In other words, it answers the question: "Of all predictions made by the model, how many of them are actually correct?".
 
-To measure this, we calculate **Precision**. Precision counts errors, when the model predicts an object, but the image has no objects of the predicted class in this place. Precision is calculated as a portion of correct predictions (true positives) over all model
+To measure this, we calculate **Precision**. Precision counts errors, when the model predicts an object, but the image has no objects of the predicted class in this place. Precision is calculated as a portion of correct predictions (true positives) over all model's predictions (true positives + false positives).
 """
 
 notification_precision = {
@@ -144,8 +119,7 @@ notification_precision = {
     "description": "The model correctly predicted <b>{} of {}</b> predictions made by the model in total.",
 }
 
-markdown_P_perclass = (
-    """### Per-class Precision
+markdown_P_perclass = """### Per-class Precision
 
 This chart further analyzes Precision, breaking it down to each class in separate.
 
@@ -154,20 +128,15 @@ Since the overall precision is computed as an average across all classes, we pro
 _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
 """
-    + clickable_label
-)
 
 
-markdown_PR = (
-    """## Recall vs. Precision
+markdown_PR = """## Recall vs. Precision
 
 This section compares Precision and Recall in one graph, identifying **imbalance** between these two.
 
 _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
 """
-    + clickable_label
-)
 
 
 markdown_pr_curve = """## Precision-Recall Curve
@@ -199,44 +168,39 @@ notification_ap = {
     "description": "",
 }
 
-markdown_pr_by_class = (
-    """### Precision-Recall Curve by Class
+markdown_pr_by_class = """### Precision-Recall Curve by Class
 
 In this plot, you can evaluate PR curve for each class individually.
 
 """
-    + clickable_label
-)
 
-markdown_confusion_matrix = (
-    """## Confusion Matrix
+markdown_confusion_matrix = """## Confusion Matrix
 
 Confusion matrix helps to find the number of confusions between different classes made by the model.
 Each row of the matrix represents the instances in a ground truth class, while each column represents the instances in a predicted class.
 The diagonal elements represent the number of correct predictions for each class (True Positives), and the off-diagonal elements show misclassifications.
 
 """
-    + clickable_label
-)
 
+markdown_frequently_confused_empty = """### Frequently Confused Classes
+
+No frequently confused class pairs found
+"""
 
-markdown_frequently_confused = (
-    """### Frequently Confused Classes
+markdown_frequently_confused = """### Frequently Confused Classes
 
 This chart displays the most frequently confused pairs of classes. In general, it finds out which classes visually seem very similar to the model.
 
-The chart calculates the **probability of confusion** between different pairs of classes. For instance, if the probability of confusion for the pair
+The chart calculates the **probability of confusion** between different pairs of classes. For instance, if the probability of confusion for the pair "{} - {}" is {}, this means that when the model predicts either "{}" or "{}", there is a {}% chance that the model might mistakenly predict one instead of the other.
 
 The measure is class-symmetric, meaning that the probability of confusing a {} with a {} is equal to the probability of confusing a {} with a {}.
 
 """
-    + clickable_label
-)
 
 
-markdown_localization_accuracy = """##
+markdown_localization_accuracy = """## Localization Accuracy (IoU)
 
-This section measures how accurately predicted masks match the actual shapes of ground truth instances.
+This section measures how accurately predicted masks match the actual shapes of ground truth instances.
 """
 
 markdown_iou_calculation = """<img src='https://github.com/dataset-ninja/model-benchmark-template/assets/78355358/8d7c63d0-2f3b-4f3f-9fd8-c6383a4bfba4' alt='alt text' width='300' />
@@ -261,6 +225,7 @@ This section analyzes <abbr title="{}">confidence scores</abbr> (or predicted pr
 """
 
 markdown_what_is_calibration = """In some applications, it's crucial for a model not only to make accurate predictions but also to provide reliable **confidence levels**. A well-calibrated model aligns its confidence scores with the actual likelihood of predictions being correct. For example, if a model claims 90% confidence for predictions but they are correct only half the time, it is **overconfident**. Conversely, **underconfidence** occurs when a model assigns lower confidence scores than the actual likelihood of its predictions. In the context of autonomous driving, this might cause a vehicle to brake or slow down too frequently, reducing travel efficiency and potentially causing traffic issues."""
+
 markdown_calibration_score_2 = """To evaluate the calibration, we draw a <b>Reliability Diagram</b> and calculate <b>Expected Calibration Error</b> (ECE)."""
 
 markdown_reliability_diagram = """### Reliability Diagram
@@ -269,7 +234,7 @@ Reliability diagram, also known as a Calibration curve, helps in understanding w
 """
 
 markdown_calibration_curve_interpretation = """
-1. **The curve is above the perfect line (Underconfidence):** If the calibration curve is consistently above the perfect line, this indicates underconfidence. The model
+1. **The curve is above the perfect line (Underconfidence):** If the calibration curve is consistently above the perfect line, this indicates underconfidence. The model's predictions are more correct than the confidence scores suggest. For example, if the model assigns 70% confidence to some predictions but, empirically, 90% of these predictions are correct, the model is underconfident.
 2. **The curve is below the perfect line (Overconfidence):** If the calibration curve is below the perfect line, the model exhibits overconfidence. This means it is too sure of its predictions. For example, if the model assigns 80% confidence to some predictions, but only 40% of these predictions are correct, the model is overconfident.
 
 To quantify the calibration, we calculate **Expected Calibration Error (ECE).** Intuitively, ECE can be viewed as a deviation of the model's calibration curve from the diagonal line, that corresponds to a perfectly calibrated model. When ECE is high, we can not trust predicted probabilities so much.
@@ -316,14 +281,11 @@ Additionally, it provides a view of how predicted probabilities are distributed.
 Ideally, the green histogram (TP predictions) should have higher confidence scores and be shifted to the right, indicating that the model is sure about its correct predictions, and the red histogram (FP predictions) should have lower confidence scores and be shifted to the left.
 """
 
-markdown_class_ap = (
-    """## Average Precision by Class
+markdown_class_ap = """## Average Precision by Class
 
 A quick visual comparison of the model performance across all classes. Each axis in the chart represents a different class, and the distance to the center indicates the <abbr title="{}">Average Precision</abbr> (AP) for that class.
 
 """
-    + clickable_label
-)
 
 
 markdown_class_outcome_counts_1 = """### Outcome Counts by Class
@@ -337,16 +299,38 @@ markdown_normalization = """Normalization is used for better interclass comparis
 If normalization is off, the chart will display the total count of instances that correspond to outcome type (one of TP, FP or FN). This mode is identical to the main Outcome Counts graph on the top of the page. However, when normalization is off, you may encounter a class imbalance problem. Visually, bars that correspond to classes with many instances in the dataset will be much larger than others. This complicates the visual analysis.
 """
 
-markdown_class_outcome_counts_2 = (
-    """You can switch the plot view between normalized and absolute values.
+markdown_class_outcome_counts_2 = """You can switch the plot view between normalized and absolute values.
 
 _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
 """
-    + clickable_label
-)
 
 empty = """### {}
 
 > {}
 """
+
+
+markdown_speedtest_intro = """## Inference Speed
+
+This is a speed test benchmark for this model. The model was tested with the following configuration:
+
+- **Device**: {}
+- **Hardware**: {}
+- **Runtime**: {}
+"""
+
+markdown_speedtest_table = """
+The table below shows the speed test results. For each test, the time taken to process one batch of images is shown, as well as the model's throughput (i.e, the number of images processed per second, or FPS). Results are averaged across **{}** iterations.
+"""
+
+markdown_real_time_inference = """## Real-time Inference
+
+This chart compares different runtimes and devices (CPU or GPU)."""
+
+# We additionally divide **predict** procedure into three stages: pre-process, inference, and post-process. Each bar in this chart consists of these three stages. For example, in the chart you can find how long the post-process phase lasts in a CPU device with an ONNXRuntime environment."""
+
+
+markdown_speedtest_chart = """
+This chart shows how the model's speed changes with different batch sizes . As the batch size increases, you can observe an increase in FPS (images per second).
+"""
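These templates are ordinary module-level strings with positional `{}` placeholders, presumably filled by the visualizer via `str.format`. For instance, the new speed-test intro could be rendered like this (the values are made up; how the visualizer actually supplies them is not shown in this diff):

```python
from supervisely.nn.benchmark.instance_segmentation import text_templates

# Hypothetical values -- the real ones come from the speedtest results.
device, hardware, runtime = "cuda:0", "NVIDIA GeForce RTX 4090", "PyTorch"
print(text_templates.markdown_speedtest_intro.format(device, hardware, runtime))
```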
supervisely/nn/benchmark/instance_segmentation/visualizer.py
@@ -0,0 +1,18 @@
+import supervisely.nn.benchmark.instance_segmentation.text_templates as vis_texts
+from supervisely.nn.benchmark.cv_tasks import CVTask
+from supervisely.nn.benchmark.object_detection.visualizer import (
+    ObjectDetectionVisualizer,
+)
+
+
+class InstanceSegmentationVisualizer(ObjectDetectionVisualizer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.vis_texts = vis_texts
+        self._widgets = False
+        self.ann_opacity = 0.7
+
+    @property
+    def cv_task(self):
+        return CVTask.INSTANCE_SEGMENTATION

supervisely/nn/benchmark/object_detection/__init__.py
File without changes
supervisely/nn/benchmark/object_detection/base_vis_metric.py
@@ -0,0 +1,51 @@
+from typing import Dict, Optional
+
+from supervisely.nn.benchmark.base_visualizer import BaseVisMetric
+from supervisely.nn.benchmark.object_detection.evaluator import (
+    ObjectDetectionEvalResult,
+)
+
+
+class DetectionVisMetric(BaseVisMetric):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.eval_result: ObjectDetectionEvalResult
+
+    def get_click_data(self) -> Optional[Dict]:
+        if not self.clickable:
+            return
+
+        res = {}
+
+        res["layoutTemplate"] = [None, None, None]
+        res["clickData"] = {}
+        for key, v in self.eval_result.click_data.objects_by_class.items():
+            res["clickData"][key] = {}
+            res["clickData"][key]["imagesIds"] = []
+
+            img_ids = set()
+            obj_ids = set()
+
+            res["clickData"][key][
+                "title"
+            ] = f"{key} class: {len(v)} object{'s' if len(v) > 1 else ''}"
+
+            for x in v:
+                img_ids.add(x["dt_img_id"])
+                obj_id = x["dt_obj_id"]
+                if obj_id is not None:
+                    obj_ids.add(obj_id)
+
+            res["clickData"][key]["imagesIds"] = list(img_ids)
+            res["clickData"][key]["filters"] = [
+                {
+                    "type": "tag",
+                    "tagId": "confidence",
+                    "value": [self.eval_result.mp.f1_optimal_conf, 1],
+                },
+                {"type": "tag", "tagId": "outcome", "value": "TP"},
+                {"type": "specific_objects", "tagId": None, "value": list(obj_ids)},
+            ]
+
+        return res
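For one class, the payload built by `get_click_data` looks roughly like the dict below; the ids and the confidence threshold are hypothetical, only the structure mirrors the code above:

```python
# Hypothetical example of the structure produced by get_click_data() for one class.
click_data = {
    "layoutTemplate": [None, None, None],
    "clickData": {
        "car": {
            "imagesIds": [101, 102, 105],          # images containing TP "car" predictions
            "title": "car class: 3 objects",
            "filters": [
                {"type": "tag", "tagId": "confidence", "value": [0.45, 1]},  # f1-optimal threshold .. 1
                {"type": "tag", "tagId": "outcome", "value": "TP"},
                {"type": "specific_objects", "tagId": None, "value": [901, 902, 903]},
            ],
        }
    },
}
```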
supervisely/nn/benchmark/{object_detection_benchmark.py → object_detection/benchmark.py}
RENAMED
@@ -1,7 +1,8 @@
 from supervisely.nn.benchmark.base_benchmark import BaseBenchmark
 from supervisely.nn.benchmark.cv_tasks import CVTask
-from supervisely.nn.benchmark.
-
+from supervisely.nn.benchmark.object_detection.evaluator import ObjectDetectionEvaluator
+from supervisely.nn.benchmark.object_detection.visualizer import (
+    ObjectDetectionVisualizer,
 )
 from supervisely.nn.benchmark.utils import try_set_conf_auto
 
@@ -9,6 +10,7 @@ CONF_THRES = 0.05
 
 
 class ObjectDetectionBenchmark(BaseBenchmark):
+    visualizer_cls = ObjectDetectionVisualizer
 
     @property
     def cv_task(self) -> str:
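The new `visualizer_cls` attribute is the hook a benchmark subclass uses to plug in its report renderer without overriding methods. A minimal generic sketch of that pattern (not the actual `BaseBenchmark` code, whose internals are outside this diff):

```python
class Benchmark:
    visualizer_cls = None  # subclasses assign their visualizer class here

    def visualize(self, eval_result):
        # the base class only knows it receives *some* visualizer class
        return self.visualizer_cls(eval_result).render()


class DetectionVisualizer:
    def __init__(self, eval_result):
        self.eval_result = eval_result

    def render(self):
        return f"report for {self.eval_result}"


class DetectionBenchmark(Benchmark):
    visualizer_cls = DetectionVisualizer  # mirrors visualizer_cls = ObjectDetectionVisualizer
```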
supervisely/nn/benchmark/{evaluation/object_detection_evaluator.py → object_detection/evaluator.py}
RENAMED
@@ -1,14 +1,72 @@
 import os
-
-from supervisely.io.json import dump_json_file
-from supervisely.nn.benchmark.coco_utils import read_coco_datasets, sly2coco
-from supervisely.nn.benchmark.evaluation import BaseEvaluator
-from supervisely.nn.benchmark.evaluation.coco import calculate_metrics
+import pickle
 from pathlib import Path
 
+import pandas as pd
+
+from supervisely.io.json import dump_json_file, load_json_file
+from supervisely.nn.benchmark.base_evaluator import BaseEvalResult, BaseEvaluator
+from supervisely.nn.benchmark.object_detection.metric_provider import MetricProvider
+from supervisely.nn.benchmark.utils import (
+    calculate_metrics,
+    read_coco_datasets,
+    sly2coco,
+)
+from supervisely.nn.benchmark.visualization.vis_click_data import ClickData, IdMapper
+
+
+class ObjectDetectionEvalResult(BaseEvalResult):
+    mp_cls = MetricProvider
+
+    def _read_eval_data(self):
+        from pycocotools.coco import COCO  # pylint: disable=import-error
+
+        gt_path = str(Path(self.directory) / "cocoGt.json")
+        dt_path = str(Path(self.directory) / "cocoDt.json")
+        coco_gt, coco_dt = COCO(gt_path), COCO(dt_path)
+        self.coco_gt = coco_gt
+        self.coco_dt = coco_dt
+        self.eval_data = None
+        with open(Path(self.directory, "eval_data.pkl"), "rb") as f:
+            self.eval_data = pickle.load(f)
+
+        inference_info_path = Path(self.directory) / "inference_info.json"
+        self.inference_info = load_json_file(str(inference_info_path))
+
+        speedtest_info_path = Path(self.directory).parent / "speedtest" / "speedtest.json"
+        if speedtest_info_path.exists():
+            self.speedtest_info = load_json_file(str(speedtest_info_path))
+
+        self.mp = MetricProvider(
+            self.eval_data["matches"],
+            self.eval_data["coco_metrics"],
+            self.eval_data["params"],
+            self.coco_gt,
+            self.coco_dt,
+        )
+        self.mp.calculate()
+
+        self.df_score_profile = pd.DataFrame(
+            self.mp.confidence_score_profile(), columns=["scores", "precision", "recall", "f1"]
+        )
+
+        # downsample
+        if len(self.df_score_profile) > 5000:
+            self.dfsp_down = self.df_score_profile.iloc[:: len(self.df_score_profile) // 1000]
+        else:
+            self.dfsp_down = self.df_score_profile
+
+        # Click data
+        gt_id_mapper = IdMapper(self.coco_gt.dataset)
+        dt_id_mapper = IdMapper(self.coco_dt.dataset)
+
+        self.click_data = ClickData(self.mp.m, gt_id_mapper, dt_id_mapper)
+
 
 class ObjectDetectionEvaluator(BaseEvaluator):
-    EVALUATION_PARAMS_YAML_PATH = f"{Path(__file__).parent}/
+    EVALUATION_PARAMS_YAML_PATH = f"{Path(__file__).parent}/evaluation_params.yaml"
+    eval_result_cls = ObjectDetectionEvalResult
+    accepted_shapes = ["rectangle"]
 
     def evaluate(self):
         try:
@@ -43,14 +101,14 @@ class ObjectDetectionEvaluator(BaseEvaluator):
         cocoGt_json = sly2coco(
             self.gt_project_path,
             is_dt_dataset=False,
-            accepted_shapes=
+            accepted_shapes=self.accepted_shapes,
             progress=self.pbar,
             classes_whitelist=self.classes_whitelist,
         )
         cocoDt_json = sly2coco(
-            self.
+            self.pred_project_path,
             is_dt_dataset=True,
-            accepted_shapes=
+            accepted_shapes=self.accepted_shapes,
             progress=self.pbar,
             classes_whitelist=self.classes_whitelist,
         )
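`_read_eval_data` thins the confidence-score profile before plotting with strided `iloc` slicing. A self-contained illustration of that exact downsampling rule (the 5000/1000 constants are the ones in the diff; the data here is synthetic):

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for mp.confidence_score_profile(): one row per score step.
n = 20_000
df = pd.DataFrame({
    "scores": np.linspace(0, 1, n),
    "precision": np.random.rand(n),
    "recall": np.random.rand(n),
    "f1": np.random.rand(n),
})

# Same rule as in ObjectDetectionEvalResult._read_eval_data: once the profile
# exceeds 5000 points, keep every (len // 1000)-th row, so charts stay at
# roughly 1000 points regardless of dataset size.
if len(df) > 5000:
    df_down = df.iloc[:: len(df) // 1000]
else:
    df_down = df

print(len(df), "->", len(df_down))  # 20000 -> 1000
```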
supervisely/nn/benchmark/{evaluation/coco → object_detection}/metric_provider.py
RENAMED
@@ -4,7 +4,7 @@ from copy import deepcopy
 import numpy as np
 import pandas as pd
 
-from supervisely.nn.benchmark.
+from supervisely.nn.benchmark.utils.detection import metrics
 
 METRIC_NAMES = {
     "mAP": "mAP",
@@ -266,12 +266,12 @@ class _MetricProvider:
     def _init_counts(self):
         cat_ids = self.cat_ids
         iouThrs = self.iouThrs
-
+        cat_id_to_idx = {cat_id: idx for idx, cat_id in enumerate(cat_ids)}
         ious = []
         cats = []
         for match in self.tp_matches:
             ious.append(match["iou"])
-            cats.append(
+            cats.append(cat_id_to_idx[match["category_id"]])
         ious = np.array(ious) + np.spacing(1)
         iou_idxs = np.searchsorted(iouThrs, ious) - 1
         cats = np.array(cats)
@@ -285,7 +285,7 @@ class _MetricProvider:
         true_positives = true_positives[:, ::-1].cumsum(1)[:, ::-1]
         tp_count = true_positives[:, 0]
         # FN
-        cats_fn = np.array([
+        cats_fn = np.array([cat_id_to_idx[match["category_id"]] for match in self.fn_matches])
         if cats_fn.size == 0:
             fn_count = np.zeros((len(cat_ids),), dtype=int)
         else:
@@ -293,7 +293,7 @@ class _MetricProvider:
         gt_count = fn_count + tp_count
         false_negatives = gt_count[:, None] - true_positives
         # FP
-        cats_fp = np.array([
+        cats_fp = np.array([cat_id_to_idx[match["category_id"]] for match in self.fp_matches])
         if cats_fp.size == 0:
             fp_count = np.zeros((len(cat_ids),), dtype=int)
         else:
@@ -380,33 +380,32 @@ class _MetricProvider:
 
     def confusion_matrix(self):
         K = len(self.cat_ids)
-
-        idx2catId = {i: cat_id for cat_id, i in catId2idx.items()}
+        cat_id_to_idx = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
 
         confusion_matrix = np.zeros((K + 1, K + 1), dtype=int)
 
         for m in self.confused_matches:
-            cat_idx_pred =
-            cat_idx_gt =
+            cat_idx_pred = cat_id_to_idx[m["category_id"]]
+            cat_idx_gt = cat_id_to_idx[self.cocoGt.anns[m["gt_id"]]["category_id"]]
             confusion_matrix[cat_idx_pred, cat_idx_gt] += 1
 
         for m in self.tp_matches:
-            cat_idx =
+            cat_idx = cat_id_to_idx[m["category_id"]]
             confusion_matrix[cat_idx, cat_idx] += 1
 
         for m in self.fp_not_confused_matches:
-            cat_idx_pred =
+            cat_idx_pred = cat_id_to_idx[m["category_id"]]
            confusion_matrix[cat_idx_pred, -1] += 1
 
         for m in self.fn_matches:
-            cat_idx_gt =
+            cat_idx_gt = cat_id_to_idx[m["category_id"]]
             confusion_matrix[-1, cat_idx_gt] += 1
 
         return confusion_matrix
 
     def frequently_confused(self, confusion_matrix, topk_pairs=20):
         # Frequently confused class pairs
-
+        cat_id_enum = {i: cat_id for i, cat_id in enumerate(self.cat_ids)}
         cm = confusion_matrix[:-1, :-1]
         cm_l = np.tril(cm, -1)
         cm_u = np.triu(cm, 1)
@@ -427,7 +426,7 @@ class _MetricProvider:
         confused_name_pairs = [(self.cat_names[i], self.cat_names[j]) for i, j in confused_idxs]
         confused_counts = confused_counts[inds_sort2]
         confused_prob = confused_prob[inds_sort2]
-        confused_catIds = [(
+        confused_catIds = [(cat_id_enum[i], cat_id_enum[j]) for i, j in confused_idxs]
 
         return pd.DataFrame(
             {