supervisely 6.73.238__py3-none-any.whl → 6.73.239__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (137)
  1. supervisely/annotation/annotation.py +2 -2
  2. supervisely/api/entity_annotation/tag_api.py +11 -4
  3. supervisely/nn/__init__.py +1 -0
  4. supervisely/nn/benchmark/__init__.py +14 -2
  5. supervisely/nn/benchmark/base_benchmark.py +84 -37
  6. supervisely/nn/benchmark/base_evaluator.py +120 -0
  7. supervisely/nn/benchmark/base_visualizer.py +265 -0
  8. supervisely/nn/benchmark/comparison/detection_visualization/text_templates.py +5 -5
  9. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/calibration_score.py +2 -2
  10. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/explore_predicttions.py +39 -16
  11. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/localization_accuracy.py +1 -1
  12. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/outcome_counts.py +4 -4
  13. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/overview.py +12 -11
  14. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/pr_curve.py +1 -1
  15. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/precision_recal_f1.py +6 -6
  16. supervisely/nn/benchmark/comparison/detection_visualization/vis_metrics/speedtest.py +3 -3
  17. supervisely/nn/benchmark/{instance_segmentation_benchmark.py → instance_segmentation/benchmark.py} +9 -3
  18. supervisely/nn/benchmark/instance_segmentation/evaluator.py +58 -0
  19. supervisely/nn/benchmark/{visualization/text_templates/instance_segmentation_text.py → instance_segmentation/text_templates.py} +53 -69
  20. supervisely/nn/benchmark/instance_segmentation/visualizer.py +18 -0
  21. supervisely/nn/benchmark/object_detection/__init__.py +0 -0
  22. supervisely/nn/benchmark/object_detection/base_vis_metric.py +51 -0
  23. supervisely/nn/benchmark/{object_detection_benchmark.py → object_detection/benchmark.py} +4 -2
  24. supervisely/nn/benchmark/object_detection/evaluation_params.yaml +2 -0
  25. supervisely/nn/benchmark/{evaluation/object_detection_evaluator.py → object_detection/evaluator.py} +67 -9
  26. supervisely/nn/benchmark/{evaluation/coco → object_detection}/metric_provider.py +13 -14
  27. supervisely/nn/benchmark/{visualization/text_templates/object_detection_text.py → object_detection/text_templates.py} +49 -41
  28. supervisely/nn/benchmark/object_detection/vis_metrics/__init__.py +48 -0
  29. supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/confidence_distribution.py +20 -24
  30. supervisely/nn/benchmark/object_detection/vis_metrics/confidence_score.py +119 -0
  31. supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/confusion_matrix.py +34 -22
  32. supervisely/nn/benchmark/object_detection/vis_metrics/explore_predictions.py +129 -0
  33. supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/f1_score_at_different_iou.py +21 -26
  34. supervisely/nn/benchmark/object_detection/vis_metrics/frequently_confused.py +137 -0
  35. supervisely/nn/benchmark/object_detection/vis_metrics/iou_distribution.py +106 -0
  36. supervisely/nn/benchmark/object_detection/vis_metrics/key_metrics.py +136 -0
  37. supervisely/nn/benchmark/{visualization → object_detection}/vis_metrics/model_predictions.py +53 -49
  38. supervisely/nn/benchmark/object_detection/vis_metrics/outcome_counts.py +188 -0
  39. supervisely/nn/benchmark/object_detection/vis_metrics/outcome_counts_per_class.py +191 -0
  40. supervisely/nn/benchmark/object_detection/vis_metrics/overview.py +116 -0
  41. supervisely/nn/benchmark/object_detection/vis_metrics/pr_curve.py +106 -0
  42. supervisely/nn/benchmark/object_detection/vis_metrics/pr_curve_by_class.py +49 -0
  43. supervisely/nn/benchmark/object_detection/vis_metrics/precision.py +72 -0
  44. supervisely/nn/benchmark/object_detection/vis_metrics/precision_avg_per_class.py +59 -0
  45. supervisely/nn/benchmark/object_detection/vis_metrics/recall.py +71 -0
  46. supervisely/nn/benchmark/object_detection/vis_metrics/recall_vs_precision.py +56 -0
  47. supervisely/nn/benchmark/object_detection/vis_metrics/reliability_diagram.py +110 -0
  48. supervisely/nn/benchmark/object_detection/vis_metrics/speedtest.py +151 -0
  49. supervisely/nn/benchmark/object_detection/visualizer.py +697 -0
  50. supervisely/nn/benchmark/semantic_segmentation/__init__.py +9 -0
  51. supervisely/nn/benchmark/semantic_segmentation/base_vis_metric.py +55 -0
  52. supervisely/nn/benchmark/semantic_segmentation/benchmark.py +32 -0
  53. supervisely/nn/benchmark/semantic_segmentation/evaluation_params.yaml +0 -0
  54. supervisely/nn/benchmark/semantic_segmentation/evaluator.py +162 -0
  55. supervisely/nn/benchmark/semantic_segmentation/metric_provider.py +153 -0
  56. supervisely/nn/benchmark/semantic_segmentation/text_templates.py +130 -0
  57. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/__init__.py +0 -0
  58. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/acknowledgement.py +15 -0
  59. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/classwise_error_analysis.py +57 -0
  60. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/confusion_matrix.py +92 -0
  61. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/explore_predictions.py +84 -0
  62. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/frequently_confused.py +101 -0
  63. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/iou_eou.py +45 -0
  64. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/key_metrics.py +60 -0
  65. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/model_predictions.py +107 -0
  66. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/overview.py +112 -0
  67. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/renormalized_error_ou.py +48 -0
  68. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/speedtest.py +178 -0
  69. supervisely/nn/benchmark/semantic_segmentation/vis_metrics/vis_texts.py +21 -0
  70. supervisely/nn/benchmark/semantic_segmentation/visualizer.py +304 -0
  71. supervisely/nn/benchmark/utils/__init__.py +12 -0
  72. supervisely/nn/benchmark/utils/detection/__init__.py +2 -0
  73. supervisely/nn/benchmark/{evaluation/coco → utils/detection}/calculate_metrics.py +6 -4
  74. supervisely/nn/benchmark/utils/detection/metric_provider.py +533 -0
  75. supervisely/nn/benchmark/{coco_utils → utils/detection}/sly2coco.py +4 -4
  76. supervisely/nn/benchmark/{coco_utils/utils.py → utils/detection/utlis.py} +11 -0
  77. supervisely/nn/benchmark/utils/semantic_segmentation/__init__.py +0 -0
  78. supervisely/nn/benchmark/utils/semantic_segmentation/calculate_metrics.py +35 -0
  79. supervisely/nn/benchmark/utils/semantic_segmentation/evaluator.py +804 -0
  80. supervisely/nn/benchmark/utils/semantic_segmentation/loader.py +65 -0
  81. supervisely/nn/benchmark/utils/semantic_segmentation/utils.py +109 -0
  82. supervisely/nn/benchmark/visualization/evaluation_result.py +17 -3
  83. supervisely/nn/benchmark/visualization/vis_click_data.py +1 -1
  84. supervisely/nn/benchmark/visualization/widgets/__init__.py +3 -0
  85. supervisely/nn/benchmark/visualization/widgets/chart/chart.py +12 -4
  86. supervisely/nn/benchmark/visualization/widgets/gallery/gallery.py +35 -8
  87. supervisely/nn/benchmark/visualization/widgets/gallery/template.html +8 -4
  88. supervisely/nn/benchmark/visualization/widgets/markdown/markdown.py +1 -1
  89. supervisely/nn/benchmark/visualization/widgets/notification/notification.py +11 -7
  90. supervisely/nn/benchmark/visualization/widgets/radio_group/__init__.py +0 -0
  91. supervisely/nn/benchmark/visualization/widgets/radio_group/radio_group.py +34 -0
  92. supervisely/nn/benchmark/visualization/widgets/table/table.py +9 -3
  93. supervisely/nn/benchmark/visualization/widgets/widget.py +4 -0
  94. supervisely/project/project.py +18 -6
  95. {supervisely-6.73.238.dist-info → supervisely-6.73.239.dist-info}/METADATA +3 -1
  96. {supervisely-6.73.238.dist-info → supervisely-6.73.239.dist-info}/RECORD +103 -81
  97. supervisely/nn/benchmark/coco_utils/__init__.py +0 -2
  98. supervisely/nn/benchmark/evaluation/__init__.py +0 -3
  99. supervisely/nn/benchmark/evaluation/base_evaluator.py +0 -64
  100. supervisely/nn/benchmark/evaluation/coco/__init__.py +0 -2
  101. supervisely/nn/benchmark/evaluation/instance_segmentation_evaluator.py +0 -88
  102. supervisely/nn/benchmark/utils.py +0 -13
  103. supervisely/nn/benchmark/visualization/inference_speed/__init__.py +0 -19
  104. supervisely/nn/benchmark/visualization/inference_speed/speedtest_batch.py +0 -161
  105. supervisely/nn/benchmark/visualization/inference_speed/speedtest_intro.py +0 -28
  106. supervisely/nn/benchmark/visualization/inference_speed/speedtest_overview.py +0 -141
  107. supervisely/nn/benchmark/visualization/inference_speed/speedtest_real_time.py +0 -63
  108. supervisely/nn/benchmark/visualization/text_templates/inference_speed_text.py +0 -23
  109. supervisely/nn/benchmark/visualization/vis_metric_base.py +0 -337
  110. supervisely/nn/benchmark/visualization/vis_metrics/__init__.py +0 -67
  111. supervisely/nn/benchmark/visualization/vis_metrics/classwise_error_analysis.py +0 -55
  112. supervisely/nn/benchmark/visualization/vis_metrics/confidence_score.py +0 -93
  113. supervisely/nn/benchmark/visualization/vis_metrics/explorer_grid.py +0 -144
  114. supervisely/nn/benchmark/visualization/vis_metrics/frequently_confused.py +0 -115
  115. supervisely/nn/benchmark/visualization/vis_metrics/iou_distribution.py +0 -86
  116. supervisely/nn/benchmark/visualization/vis_metrics/outcome_counts.py +0 -119
  117. supervisely/nn/benchmark/visualization/vis_metrics/outcome_counts_per_class.py +0 -148
  118. supervisely/nn/benchmark/visualization/vis_metrics/overall_error_analysis.py +0 -109
  119. supervisely/nn/benchmark/visualization/vis_metrics/overview.py +0 -189
  120. supervisely/nn/benchmark/visualization/vis_metrics/percision_avg_per_class.py +0 -57
  121. supervisely/nn/benchmark/visualization/vis_metrics/pr_curve.py +0 -101
  122. supervisely/nn/benchmark/visualization/vis_metrics/pr_curve_by_class.py +0 -46
  123. supervisely/nn/benchmark/visualization/vis_metrics/precision.py +0 -56
  124. supervisely/nn/benchmark/visualization/vis_metrics/recall.py +0 -54
  125. supervisely/nn/benchmark/visualization/vis_metrics/recall_vs_precision.py +0 -57
  126. supervisely/nn/benchmark/visualization/vis_metrics/reliability_diagram.py +0 -88
  127. supervisely/nn/benchmark/visualization/vis_metrics/what_is.py +0 -23
  128. supervisely/nn/benchmark/visualization/vis_templates.py +0 -241
  129. supervisely/nn/benchmark/visualization/vis_widgets.py +0 -128
  130. supervisely/nn/benchmark/visualization/visualizer.py +0 -729
  131. /supervisely/nn/benchmark/{visualization/text_templates → instance_segmentation}/__init__.py +0 -0
  132. /supervisely/nn/benchmark/{evaluation/coco → instance_segmentation}/evaluation_params.yaml +0 -0
  133. /supervisely/nn/benchmark/{evaluation/coco → utils/detection}/metrics.py +0 -0
  134. {supervisely-6.73.238.dist-info → supervisely-6.73.239.dist-info}/LICENSE +0 -0
  135. {supervisely-6.73.238.dist-info → supervisely-6.73.239.dist-info}/WHEEL +0 -0
  136. {supervisely-6.73.238.dist-info → supervisely-6.73.239.dist-info}/entry_points.txt +0 -0
  137. {supervisely-6.73.238.dist-info → supervisely-6.73.239.dist-info}/top_level.txt +0 -0
supervisely/nn/benchmark/instance_segmentation/evaluator.py (new file)
@@ -0,0 +1,58 @@
+ import os
+ from pathlib import Path
+
+ from supervisely.io.json import dump_json_file
+ from supervisely.nn.benchmark.object_detection.evaluator import (
+     ObjectDetectionEvalResult,
+     ObjectDetectionEvaluator,
+ )
+ from supervisely.nn.benchmark.object_detection.metric_provider import MetricProvider
+ from supervisely.nn.benchmark.utils import calculate_metrics, read_coco_datasets
+
+
+ class InstanceSegmentationEvalResult(ObjectDetectionEvalResult):
+     mp_cls = MetricProvider
+
+
+ class InstanceSegmentationEvaluator(ObjectDetectionEvaluator):
+     EVALUATION_PARAMS_YAML_PATH = f"{Path(__file__).parent}/evaluation_params.yaml"
+     eval_result_cls = InstanceSegmentationEvalResult
+     accepted_shapes = ["polygon", "bitmap"]
+
+     def evaluate(self):
+         try:
+             self.cocoGt_json, self.cocoDt_json = self._convert_to_coco()
+         except AssertionError as e:
+             raise ValueError(
+                 f"{e}. Please make sure that your GT and DT projects are correct. "
+                 "If GT project has nested datasets and DT project was crated with NN app, "
+                 "try to use newer version of NN app."
+             )
+
+         self._dump_datasets()
+         self.cocoGt, self.cocoDt = read_coco_datasets(self.cocoGt_json, self.cocoDt_json)
+         with self.pbar(message="Evaluation: Calculating metrics", total=5) as p:
+             self.eval_data = calculate_metrics(
+                 self.cocoGt,
+                 self.cocoDt,
+                 iouType="segm",
+                 progress_cb=p.update,
+                 evaluation_params=self.evaluation_params,
+             )
+         self._dump_eval_results()
+
+     def _dump_eval_results(self):
+         _, _, eval_data_path = self._get_eval_paths()
+         self._dump_pickle(self.eval_data, eval_data_path)
+
+     def _get_eval_paths(self):
+         base_dir = self.result_dir
+         cocoGt_path = os.path.join(base_dir, "cocoGt.json")
+         cocoDt_path = os.path.join(base_dir, "cocoDt.json")
+         eval_data_path = os.path.join(base_dir, "eval_data.pkl")
+         return cocoGt_path, cocoDt_path, eval_data_path
+
+     def _dump_datasets(self):
+         cocoGt_path, cocoDt_path, _ = self._get_eval_paths()
+         dump_json_file(self.cocoGt_json, cocoGt_path, indent=None)
+         dump_json_file(self.cocoDt_json, cocoDt_path, indent=None)
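The evaluate() above delegates the metric computation to calculate_metrics(..., iouType="segm"). For reference, a minimal sketch of the plain pycocotools flow this presumably wraps (an illustration only, not the supervisely API; the file names are the ones the evaluator dumps):

# Sketch: COCO-style segmentation evaluation, assumed to roughly match what
# calculate_metrics(iouType="segm") does under the hood (illustrative only).
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("cocoGt.json")  # ground truth dumped by _dump_datasets()
coco_dt = COCO("cocoDt.json")  # predictions dumped by _dump_datasets(), with "score" fields

coco_eval = COCOeval(coco_gt, coco_dt, iouType="segm")
coco_eval.evaluate()    # per-image, per-category matching of masks
coco_eval.accumulate()  # aggregate precision/recall over IoU thresholds
coco_eval.summarize()   # prints mAP, AP50, AR, ...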
supervisely/nn/benchmark/instance_segmentation/text_templates.py (renamed from supervisely/nn/benchmark/visualization/text_templates/instance_segmentation_text.py)
@@ -13,6 +13,10 @@ definitions = SimpleNamespace(
      iou_threshold="The IoU threshold is a predefined value (set to 0.5 in many benchmarks) that determines the minimum acceptable IoU score for a predicted mask to be considered a correct prediction. When the IoU of a predicted mask and actual mask is higher than this IoU threshold, the prediction is considered correct. Some metrics will evaluate the model with different IoU thresholds to provide more insights about the model's performance.",
  )
 
+ docs_url = (
+     "https://docs.supervisely.com/neural-networks/model-evaluation-benchmark/instant-segmentation"
+ )
+
  # <i class="zmdi zmdi-check-circle" style="color: #13ce66; margin-right: 5px"></i>
  clickable_label = """
  > <span style="color: #5a6772">
@@ -35,17 +39,14 @@ markdown_overview = """
  - **Architecture**: {}
  - **Task type**: {}
  - **Runtime**: {}
- - **Checkpoint file**: <a href="{}" target="_blank">{}</a>
+ - **Checkpoint file**: <a class="checkpoint-url" href="{}" target="_blank">{}</a>
  - **Ground Truth project**: <a href="/projects/{}/datasets" target="_blank">{}</a>, {}{}
  {}
+ - **IoU threshold**: {}
+ - **Optimal confidence threshold**: {} (calculated automatically), <a href="{}" target="_blank">learn more</a>.
 
  Learn more about Model Benchmark, implementation details, and how to use the charts in our <a href="{}" target="_blank">Technical Report</a>.
  """
- # - **Model**: {}
- # - **Training dataset (?)**: COCO 2017 train
- # - **Model classes (?)**: (80): a, b, c, … (collapse)
- # - **Model weights (?)**: [/path/to/yolov8l.pt]()
- # - **License (?)**: AGPL-3.0
 
  markdown_key_metrics = """## Key Metrics
 
@@ -67,17 +68,6 @@ In this section you can visually assess the model performance through examples.
  > Filtering options allow you to adjust the confidence threshold (only for predictions) and the model's false outcomes (only for differences). Differences are calculated only for the optimal confidence threshold, allowing you to focus on the most accurate predictions made by the model.
  """
 
- markdown_predictions_gallery = """
-
- """
- # You can choose one of the sorting method:
-
- # - **Auto**: The algorithm is trying to gather a diverse set of images that illustrate the model's performance across various scenarios.
- # - **Least accurate**: Displays images where the model made more errors.
- # - **Most accurate**: Displays images where the model made fewer or no errors.
- # - **Dataset order**: Displays images in the original order of the dataset.
- # """
-
  markdown_predictions_table = """### Prediction details for every image
 
  The table helps you in finding samples with specific cases of interest. You can sort by parameters such as the number of predictions, or specific a metric, e.g, recall, then click on a row to view this image and predictions.
@@ -88,27 +78,15 @@ The table helps you in finding samples with specific cases of interest. You can
  > Click on the row to view the image with **Ground Truth**, **Prediction**, or the **Difference** annotations.
  """
 
- markdown_what_is = """
- """
-
- markdown_experts = """
- """
-
- markdown_how_to_use = """
- """
-
- markdown_outcome_counts = (
-     """## Outcome Counts
+ markdown_outcome_counts = """## Outcome Counts
 
  This chart is used to evaluate the overall model performance by breaking down all predictions into <abbr title="{}">True Positives</abbr> (TP), <abbr title="{}">False Positives</abbr> (FP), and <abbr title="{}">False Negatives</abbr> (FN). This helps to visually assess the type of errors the model often encounters.
 
  """
-     + clickable_label
- )
 
  markdown_R = """## Recall
 
- This section measures the ability of the model to find **all relevant instances in the dataset**. In other words, it answers the question: Of all instances in the dataset, how many of them is the model managed to find out?”
+ This section measures the ability of the model to find **all relevant instances in the dataset**. In other words, it answers the question: "Of all instances in the dataset, how many of them is the model managed to find out?"
 
  To measure this, we calculate **Recall**. Recall counts errors, when the model does not predict an object that actually is present in a dataset and should be predicted. Recall is calculated as the portion of correct predictions (true positives) over all instances in the dataset (true positives + false negatives).
  """
@@ -118,8 +96,7 @@ notification_recall = {
  "description": "The model correctly found <b>{} of {}</b> total instances in the dataset.",
  }
 
- markdown_R_perclass = (
-     """### Per-class Recall
+ markdown_R_perclass = """### Per-class Recall
 
  This chart further analyzes Recall, breaking it down to each class in separate.
 
@@ -128,15 +105,13 @@ Since the overall recall is calculated as an average across all classes, we prov
  _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
  """
-     + clickable_label
- )
 
 
  markdown_P = """## Precision
 
- This section measures the accuracy of all predictions made by the model. In other words, it answers the question: Of all predictions made by the model, how many of them are actually correct?”.
+ This section measures the accuracy of all predictions made by the model. In other words, it answers the question: "Of all predictions made by the model, how many of them are actually correct?".
 
- To measure this, we calculate **Precision**. Precision counts errors, when the model predicts an object, but the image has no objects of the predicted class in this place. Precision is calculated as a portion of correct predictions (true positives) over all models predictions (true positives + false positives).
+ To measure this, we calculate **Precision**. Precision counts errors, when the model predicts an object, but the image has no objects of the predicted class in this place. Precision is calculated as a portion of correct predictions (true positives) over all model's predictions (true positives + false positives).
  """
 
  notification_precision = {
@@ -144,8 +119,7 @@ notification_precision = {
  "description": "The model correctly predicted <b>{} of {}</b> predictions made by the model in total.",
  }
 
- markdown_P_perclass = (
-     """### Per-class Precision
+ markdown_P_perclass = """### Per-class Precision
 
  This chart further analyzes Precision, breaking it down to each class in separate.
 
@@ -154,20 +128,15 @@ Since the overall precision is computed as an average across all classes, we pro
  _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
  """
-     + clickable_label
- )
 
 
- markdown_PR = (
-     """## Recall vs. Precision
+ markdown_PR = """## Recall vs. Precision
 
  This section compares Precision and Recall in one graph, identifying **imbalance** between these two.
 
  _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
  """
-     + clickable_label
- )
 
 
  markdown_pr_curve = """## Precision-Recall Curve
@@ -199,44 +168,39 @@ notification_ap = {
  "description": "",
  }
 
- markdown_pr_by_class = (
-     """### Precision-Recall Curve by Class
+ markdown_pr_by_class = """### Precision-Recall Curve by Class
 
  In this plot, you can evaluate PR curve for each class individually.
 
  """
-     + clickable_label
- )
 
- markdown_confusion_matrix = (
-     """## Confusion Matrix
+ markdown_confusion_matrix = """## Confusion Matrix
 
  Confusion matrix helps to find the number of confusions between different classes made by the model.
  Each row of the matrix represents the instances in a ground truth class, while each column represents the instances in a predicted class.
  The diagonal elements represent the number of correct predictions for each class (True Positives), and the off-diagonal elements show misclassifications.
 
  """
-     + clickable_label
- )
 
+ markdown_frequently_confused_empty = """### Frequently Confused Classes
+
+ No frequently confused class pairs found
+ """
 
- markdown_frequently_confused = (
-     """### Frequently Confused Classes
+ markdown_frequently_confused = """### Frequently Confused Classes
 
  This chart displays the most frequently confused pairs of classes. In general, it finds out which classes visually seem very similar to the model.
 
- The chart calculates the **probability of confusion** between different pairs of classes. For instance, if the probability of confusion for the pair {} - {} is {}, this means that when the model predicts either {} or {}”, there is a {}% chance that the model might mistakenly predict one instead of the other.
+ The chart calculates the **probability of confusion** between different pairs of classes. For instance, if the probability of confusion for the pair "{} - {}" is {}, this means that when the model predicts either "{}" or "{}", there is a {}% chance that the model might mistakenly predict one instead of the other.
 
  The measure is class-symmetric, meaning that the probability of confusing a {} with a {} is equal to the probability of confusing a {} with a {}.
 
  """
-     + clickable_label
- )
 
 
- markdown_localization_accuracy = """## Mask accuracy (IoU)
+ markdown_localization_accuracy = """## Localization Accuracy (IoU)
 
- This section measures how accurately predicted masks match the actual shapes of ground truth instances. We calculate the average <abbr title="{}">IoU score</abbr> of predictions and visualize a histogram of IoU scores.
+ This section measures how accurately predicted masks match the actual shapes of ground truth instances.
  """
 
  markdown_iou_calculation = """<img src='https://github.com/dataset-ninja/model-benchmark-template/assets/78355358/8d7c63d0-2f3b-4f3f-9fd8-c6383a4bfba4' alt='alt text' width='300' />
@@ -261,6 +225,7 @@ This section analyzes <abbr title="{}">confidence scores</abbr> (or predicted pr
  """
 
  markdown_what_is_calibration = """In some applications, it's crucial for a model not only to make accurate predictions but also to provide reliable **confidence levels**. A well-calibrated model aligns its confidence scores with the actual likelihood of predictions being correct. For example, if a model claims 90% confidence for predictions but they are correct only half the time, it is **overconfident**. Conversely, **underconfidence** occurs when a model assigns lower confidence scores than the actual likelihood of its predictions. In the context of autonomous driving, this might cause a vehicle to brake or slow down too frequently, reducing travel efficiency and potentially causing traffic issues."""
+
  markdown_calibration_score_2 = """To evaluate the calibration, we draw a <b>Reliability Diagram</b> and calculate <b>Expected Calibration Error</b> (ECE)."""
 
  markdown_reliability_diagram = """### Reliability Diagram
@@ -269,7 +234,7 @@ Reliability diagram, also known as a Calibration curve, helps in understanding w
  """
 
  markdown_calibration_curve_interpretation = """
- 1. **The curve is above the perfect line (Underconfidence):** If the calibration curve is consistently above the perfect line, this indicates underconfidence. The models predictions are more correct than the confidence scores suggest. For example, if the model assigns 70% confidence to some predictions but, empirically, 90% of these predictions are correct, the model is underconfident.
+ 1. **The curve is above the perfect line (Underconfidence):** If the calibration curve is consistently above the perfect line, this indicates underconfidence. The model's predictions are more correct than the confidence scores suggest. For example, if the model assigns 70% confidence to some predictions but, empirically, 90% of these predictions are correct, the model is underconfident.
  2. **The curve is below the perfect line (Overconfidence):** If the calibration curve is below the perfect line, the model exhibits overconfidence. This means it is too sure of its predictions. For example, if the model assigns 80% confidence to some predictions, but only 40% of these predictions are correct, the model is overconfident.
 
  To quantify the calibration, we calculate **Expected Calibration Error (ECE).** Intuitively, ECE can be viewed as a deviation of the model's calibration curve from the diagonal line, that corresponds to a perfectly calibrated model. When ECE is high, we can not trust predicted probabilities so much.
@@ -316,14 +281,11 @@ Additionally, it provides a view of how predicted probabilities are distributed.
  Ideally, the green histogram (TP predictions) should have higher confidence scores and be shifted to the right, indicating that the model is sure about its correct predictions, and the red histogram (FP predictions) should have lower confidence scores and be shifted to the left.
  """
 
- markdown_class_ap = (
-     """## Average Precision by Class
+ markdown_class_ap = """## Average Precision by Class
 
  A quick visual comparison of the model performance across all classes. Each axis in the chart represents a different class, and the distance to the center indicates the <abbr title="{}">Average Precision</abbr> (AP) for that class.
 
  """
-     + clickable_label
- )
 
 
  markdown_class_outcome_counts_1 = """### Outcome Counts by Class
@@ -337,16 +299,38 @@ markdown_normalization = """Normalization is used for better interclass comparis
  If normalization is off, the chart will display the total count of instances that correspond to outcome type (one of TP, FP or FN). This mode is identical to the main Outcome Counts graph on the top of the page. However, when normalization is off, you may encounter a class imbalance problem. Visually, bars that correspond to classes with many instances in the dataset will be much larger than others. This complicates the visual analysis.
  """
 
- markdown_class_outcome_counts_2 = (
-     """You can switch the plot view between normalized and absolute values.
+ markdown_class_outcome_counts_2 = """You can switch the plot view between normalized and absolute values.
 
  _Bars in the chart are sorted by <abbr title="{}">F1-score</abbr> to keep a unified order of classes between different charts._
 
  """
-     + clickable_label
- )
 
  empty = """### {}
 
  > {}
  """
+
+
+ markdown_speedtest_intro = """## Inference Speed
+
+ This is a speed test benchmark for this model. The model was tested with the following configuration:
+
+ - **Device**: {}
+ - **Hardware**: {}
+ - **Runtime**: {}
+ """
+
+ markdown_speedtest_table = """
+ The table below shows the speed test results. For each test, the time taken to process one batch of images is shown, as well as the model's throughput (i.e, the number of images processed per second, or FPS). Results are averaged across **{}** iterations.
+ """
+
+ markdown_real_time_inference = """## Real-time Inference
+
+ This chart compares different runtimes and devices (CPU or GPU)."""
+
+ # We additionally divide **predict** procedure into three stages: pre-process, inference, and post-process. Each bar in this chart consists of these three stages. For example, in the chart you can find how long the post-process phase lasts in a CPU device with an ONNXRuntime environment."""
+
+
+ markdown_speedtest_chart = """
+ This chart shows how the model's speed changes with different batch sizes . As the batch size increases, you can observe an increase in FPS (images per second).
+ """
supervisely/nn/benchmark/instance_segmentation/visualizer.py (new file)
@@ -0,0 +1,18 @@
+ import supervisely.nn.benchmark.instance_segmentation.text_templates as vis_texts
+ from supervisely.nn.benchmark.cv_tasks import CVTask
+ from supervisely.nn.benchmark.object_detection.visualizer import (
+     ObjectDetectionVisualizer,
+ )
+
+
+ class InstanceSegmentationVisualizer(ObjectDetectionVisualizer):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self.vis_texts = vis_texts
+         self._widgets = False
+         self.ann_opacity = 0.7
+
+     @property
+     def cv_task(self):
+         return CVTask.INSTANCE_SEGMENTATION
supervisely/nn/benchmark/object_detection/__init__.py (file without changes)
supervisely/nn/benchmark/object_detection/base_vis_metric.py (new file)
@@ -0,0 +1,51 @@
+ from typing import Dict, Optional
+
+ from supervisely.nn.benchmark.base_visualizer import BaseVisMetric
+ from supervisely.nn.benchmark.object_detection.evaluator import (
+     ObjectDetectionEvalResult,
+ )
+
+
+ class DetectionVisMetric(BaseVisMetric):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.eval_result: ObjectDetectionEvalResult
+
+     def get_click_data(self) -> Optional[Dict]:
+         if not self.clickable:
+             return
+
+         res = {}
+
+         res["layoutTemplate"] = [None, None, None]
+         res["clickData"] = {}
+         for key, v in self.eval_result.click_data.objects_by_class.items():
+             res["clickData"][key] = {}
+             res["clickData"][key]["imagesIds"] = []
+
+             img_ids = set()
+             obj_ids = set()
+
+             res["clickData"][key][
+                 "title"
+             ] = f"{key} class: {len(v)} object{'s' if len(v) > 1 else ''}"
+
+             for x in v:
+                 img_ids.add(x["dt_img_id"])
+                 obj_id = x["dt_obj_id"]
+                 if obj_id is not None:
+                     obj_ids.add(obj_id)
+
+             res["clickData"][key]["imagesIds"] = list(img_ids)
+             res["clickData"][key]["filters"] = [
+                 {
+                     "type": "tag",
+                     "tagId": "confidence",
+                     "value": [self.eval_result.mp.f1_optimal_conf, 1],
+                 },
+                 {"type": "tag", "tagId": "outcome", "value": "TP"},
+                 {"type": "specific_objects", "tagId": None, "value": list(obj_ids)},
+             ]
+
+         return res
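For orientation, the dictionary that get_click_data() assembles has roughly the following shape; the class name, ids, and confidence value below are made up for illustration:

# Illustrative shape of a get_click_data() result (values are invented).
click_data = {
    "layoutTemplate": [None, None, None],
    "clickData": {
        "car": {
            "imagesIds": [101, 102],
            "title": "car class: 2 objects",
            "filters": [
                {"type": "tag", "tagId": "confidence", "value": [0.45, 1]},
                {"type": "tag", "tagId": "outcome", "value": "TP"},
                {"type": "specific_objects", "tagId": None, "value": [9001, 9002]},
            ],
        },
    },
}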
supervisely/nn/benchmark/object_detection/benchmark.py (renamed from supervisely/nn/benchmark/object_detection_benchmark.py)
@@ -1,7 +1,8 @@
  from supervisely.nn.benchmark.base_benchmark import BaseBenchmark
  from supervisely.nn.benchmark.cv_tasks import CVTask
- from supervisely.nn.benchmark.evaluation.object_detection_evaluator import (
-     ObjectDetectionEvaluator,
+ from supervisely.nn.benchmark.object_detection.evaluator import ObjectDetectionEvaluator
+ from supervisely.nn.benchmark.object_detection.visualizer import (
+     ObjectDetectionVisualizer,
  )
  from supervisely.nn.benchmark.utils import try_set_conf_auto
 
@@ -9,6 +10,7 @@ CONF_THRES = 0.05
 
 
  class ObjectDetectionBenchmark(BaseBenchmark):
+     visualizer_cls = ObjectDetectionVisualizer
 
      @property
      def cv_task(self) -> str:
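The new visualizer_cls attribute follows the same pluggable-class-attribute pattern as eval_result_cls and accepted_shapes elsewhere in this release: a subclass swaps in behavior by overriding a class attribute rather than a method. A generic sketch of that pattern (the names below are illustrative, not the supervisely classes):

# Generic sketch of the pluggable class-attribute pattern (illustrative names).
class BaseBench:
    visualizer_cls = None  # subclasses plug in their visualizer class here

    def visualize(self):
        return self.visualizer_cls().render()


class DummyVisualizer:
    def render(self):
        return "report.html"


class DetectionBench(BaseBench):
    visualizer_cls = DummyVisualizer


print(DetectionBench().visualize())  # -> "report.html"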
supervisely/nn/benchmark/object_detection/evaluation_params.yaml (new file)
@@ -0,0 +1,2 @@
+ # Intersection over Union threshold that will be used for objects mathcing
+ iou_threshold: 0.5
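The evaluator exposes this file through EVALUATION_PARAMS_YAML_PATH; a minimal sketch of reading such a params file (assuming PyYAML is available; this is not the library's own loader):

# Sketch: load the evaluation params shown above (PyYAML assumed).
import yaml

with open("evaluation_params.yaml") as f:
    params = yaml.safe_load(f)

print(params["iou_threshold"])  # 0.5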
supervisely/nn/benchmark/object_detection/evaluator.py (renamed from supervisely/nn/benchmark/evaluation/object_detection_evaluator.py)
@@ -1,14 +1,72 @@
  import os
-
- from supervisely.io.json import dump_json_file
- from supervisely.nn.benchmark.coco_utils import read_coco_datasets, sly2coco
- from supervisely.nn.benchmark.evaluation import BaseEvaluator
- from supervisely.nn.benchmark.evaluation.coco import calculate_metrics
+ import pickle
  from pathlib import Path
 
+ import pandas as pd
+
+ from supervisely.io.json import dump_json_file, load_json_file
+ from supervisely.nn.benchmark.base_evaluator import BaseEvalResult, BaseEvaluator
+ from supervisely.nn.benchmark.object_detection.metric_provider import MetricProvider
+ from supervisely.nn.benchmark.utils import (
+     calculate_metrics,
+     read_coco_datasets,
+     sly2coco,
+ )
+ from supervisely.nn.benchmark.visualization.vis_click_data import ClickData, IdMapper
+
+
+ class ObjectDetectionEvalResult(BaseEvalResult):
+     mp_cls = MetricProvider
+
+     def _read_eval_data(self):
+         from pycocotools.coco import COCO  # pylint: disable=import-error
+
+         gt_path = str(Path(self.directory) / "cocoGt.json")
+         dt_path = str(Path(self.directory) / "cocoDt.json")
+         coco_gt, coco_dt = COCO(gt_path), COCO(dt_path)
+         self.coco_gt = coco_gt
+         self.coco_dt = coco_dt
+         self.eval_data = None
+         with open(Path(self.directory, "eval_data.pkl"), "rb") as f:
+             self.eval_data = pickle.load(f)
+
+         inference_info_path = Path(self.directory) / "inference_info.json"
+         self.inference_info = load_json_file(str(inference_info_path))
+
+         speedtest_info_path = Path(self.directory).parent / "speedtest" / "speedtest.json"
+         if speedtest_info_path.exists():
+             self.speedtest_info = load_json_file(str(speedtest_info_path))
+
+         self.mp = MetricProvider(
+             self.eval_data["matches"],
+             self.eval_data["coco_metrics"],
+             self.eval_data["params"],
+             self.coco_gt,
+             self.coco_dt,
+         )
+         self.mp.calculate()
+
+         self.df_score_profile = pd.DataFrame(
+             self.mp.confidence_score_profile(), columns=["scores", "precision", "recall", "f1"]
+         )
+
+         # downsample
+         if len(self.df_score_profile) > 5000:
+             self.dfsp_down = self.df_score_profile.iloc[:: len(self.df_score_profile) // 1000]
+         else:
+             self.dfsp_down = self.df_score_profile
+
+         # Click data
+         gt_id_mapper = IdMapper(self.coco_gt.dataset)
+         dt_id_mapper = IdMapper(self.coco_dt.dataset)
+
+         self.click_data = ClickData(self.mp.m, gt_id_mapper, dt_id_mapper)
+
 
  class ObjectDetectionEvaluator(BaseEvaluator):
-     EVALUATION_PARAMS_YAML_PATH = f"{Path(__file__).parent}/coco/evaluation_params.yaml"
+     EVALUATION_PARAMS_YAML_PATH = f"{Path(__file__).parent}/evaluation_params.yaml"
+     eval_result_cls = ObjectDetectionEvalResult
+     accepted_shapes = ["rectangle"]
 
      def evaluate(self):
          try:
@@ -43,14 +101,14 @@ class ObjectDetectionEvaluator(BaseEvaluator):
          cocoGt_json = sly2coco(
              self.gt_project_path,
              is_dt_dataset=False,
-             accepted_shapes=["rectangle"],
+             accepted_shapes=self.accepted_shapes,
              progress=self.pbar,
              classes_whitelist=self.classes_whitelist,
          )
          cocoDt_json = sly2coco(
-             self.dt_project_path,
+             self.pred_project_path,
              is_dt_dataset=True,
-             accepted_shapes=["rectangle"],
+             accepted_shapes=self.accepted_shapes,
              progress=self.pbar,
              classes_whitelist=self.classes_whitelist,
          )
supervisely/nn/benchmark/object_detection/metric_provider.py (renamed from supervisely/nn/benchmark/evaluation/coco/metric_provider.py)
@@ -4,7 +4,7 @@ from copy import deepcopy
  import numpy as np
  import pandas as pd
 
- from supervisely.nn.benchmark.evaluation.coco import metrics
+ from supervisely.nn.benchmark.utils.detection import metrics
 
  METRIC_NAMES = {
      "mAP": "mAP",
@@ -266,12 +266,12 @@ class _MetricProvider:
      def _init_counts(self):
          cat_ids = self.cat_ids
          iouThrs = self.iouThrs
-         catId2idx = {cat_id: idx for idx, cat_id in enumerate(cat_ids)}
+         cat_id_to_idx = {cat_id: idx for idx, cat_id in enumerate(cat_ids)}
          ious = []
          cats = []
          for match in self.tp_matches:
              ious.append(match["iou"])
-             cats.append(catId2idx[match["category_id"]])
+             cats.append(cat_id_to_idx[match["category_id"]])
          ious = np.array(ious) + np.spacing(1)
          iou_idxs = np.searchsorted(iouThrs, ious) - 1
          cats = np.array(cats)
@@ -285,7 +285,7 @@ class _MetricProvider:
          true_positives = true_positives[:, ::-1].cumsum(1)[:, ::-1]
          tp_count = true_positives[:, 0]
          # FN
-         cats_fn = np.array([catId2idx[match["category_id"]] for match in self.fn_matches])
+         cats_fn = np.array([cat_id_to_idx[match["category_id"]] for match in self.fn_matches])
          if cats_fn.size == 0:
              fn_count = np.zeros((len(cat_ids),), dtype=int)
          else:
@@ -293,7 +293,7 @@ class _MetricProvider:
          gt_count = fn_count + tp_count
          false_negatives = gt_count[:, None] - true_positives
          # FP
-         cats_fp = np.array([catId2idx[match["category_id"]] for match in self.fp_matches])
+         cats_fp = np.array([cat_id_to_idx[match["category_id"]] for match in self.fp_matches])
          if cats_fp.size == 0:
              fp_count = np.zeros((len(cat_ids),), dtype=int)
          else:
@@ -380,33 +380,32 @@ class _MetricProvider:
 
      def confusion_matrix(self):
          K = len(self.cat_ids)
-         catId2idx = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
-         idx2catId = {i: cat_id for cat_id, i in catId2idx.items()}
+         cat_id_to_idx = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
 
          confusion_matrix = np.zeros((K + 1, K + 1), dtype=int)
 
         for m in self.confused_matches:
-             cat_idx_pred = catId2idx[m["category_id"]]
-             cat_idx_gt = catId2idx[self.cocoGt.anns[m["gt_id"]]["category_id"]]
+             cat_idx_pred = cat_id_to_idx[m["category_id"]]
+             cat_idx_gt = cat_id_to_idx[self.cocoGt.anns[m["gt_id"]]["category_id"]]
              confusion_matrix[cat_idx_pred, cat_idx_gt] += 1
 
          for m in self.tp_matches:
-             cat_idx = catId2idx[m["category_id"]]
+             cat_idx = cat_id_to_idx[m["category_id"]]
              confusion_matrix[cat_idx, cat_idx] += 1
 
          for m in self.fp_not_confused_matches:
-             cat_idx_pred = catId2idx[m["category_id"]]
+             cat_idx_pred = cat_id_to_idx[m["category_id"]]
              confusion_matrix[cat_idx_pred, -1] += 1
 
          for m in self.fn_matches:
-             cat_idx_gt = catId2idx[m["category_id"]]
+             cat_idx_gt = cat_id_to_idx[m["category_id"]]
              confusion_matrix[-1, cat_idx_gt] += 1
 
          return confusion_matrix
 
      def frequently_confused(self, confusion_matrix, topk_pairs=20):
          # Frequently confused class pairs
-         idx2catId = {i: cat_id for i, cat_id in enumerate(self.cat_ids)}
+         cat_id_enum = {i: cat_id for i, cat_id in enumerate(self.cat_ids)}
          cm = confusion_matrix[:-1, :-1]
          cm_l = np.tril(cm, -1)
          cm_u = np.triu(cm, 1)
@@ -427,7 +426,7 @@ class _MetricProvider:
          confused_name_pairs = [(self.cat_names[i], self.cat_names[j]) for i, j in confused_idxs]
          confused_counts = confused_counts[inds_sort2]
          confused_prob = confused_prob[inds_sort2]
-         confused_catIds = [(idx2catId[i], idx2catId[j]) for i, j in confused_idxs]
+         confused_catIds = [(cat_id_enum[i], cat_id_enum[j]) for i, j in confused_idxs]
 
          return pd.DataFrame(
              {
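To make the class-symmetric confusion probability concrete, here is a small standalone sketch that mirrors the tril/triu symmetrization used in frequently_confused() above (toy numbers; the normalization denominator is an assumption, not the library's exact formula):

import numpy as np

# Toy 3-class confusion matrix (rows: predicted class, cols: ground-truth class),
# laid out like the one built in confusion_matrix() above.
cm = np.array([
    [50,  4,  1],
    [ 6, 40,  2],
    [ 0,  3, 30],
])

# Symmetrize the off-diagonal confusions, as frequently_confused() does with
# np.tril / np.triu: pair_counts[i, j] = cm[i, j] + cm[j, i] for i > j.
pair_counts = np.tril(cm, -1) + np.triu(cm, 1).T

for i in range(cm.shape[0]):
    for j in range(i):
        if pair_counts[i, j] == 0:
            continue
        # One plausible normalization: confusions relative to all predictions
        # involving either class; the library's exact denominator may differ.
        prob = pair_counts[i, j] / (cm[i].sum() + cm[j].sum())
        print(f"classes {i} and {j}: confusion probability ~ {prob:.2f}")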