active-vision 0.0.2__tar.gz → 0.0.4__tar.gz
- {active_vision-0.0.2 → active_vision-0.0.4}/PKG-INFO +68 -28
- {active_vision-0.0.2 → active_vision-0.0.4}/README.md +65 -27
- {active_vision-0.0.2 → active_vision-0.0.4}/pyproject.toml +3 -1
- active_vision-0.0.4/src/active_vision/__init__.py +3 -0
- active_vision-0.0.4/src/active_vision/core.py +291 -0
- {active_vision-0.0.2 → active_vision-0.0.4}/src/active_vision.egg-info/PKG-INFO +68 -28
- {active_vision-0.0.2 → active_vision-0.0.4}/src/active_vision.egg-info/requires.txt +2 -0
- active_vision-0.0.2/src/active_vision/__init__.py +0 -3
- active_vision-0.0.2/src/active_vision/core.py +0 -149
- {active_vision-0.0.2 → active_vision-0.0.4}/LICENSE +0 -0
- {active_vision-0.0.2 → active_vision-0.0.4}/setup.cfg +0 -0
- {active_vision-0.0.2 → active_vision-0.0.4}/src/active_vision.egg-info/SOURCES.txt +0 -0
- {active_vision-0.0.2 → active_vision-0.0.4}/src/active_vision.egg-info/dependency_links.txt +0 -0
- {active_vision-0.0.2 → active_vision-0.0.4}/src/active_vision.egg-info/top_level.txt +0 -0
{active_vision-0.0.2 → active_vision-0.0.4}/PKG-INFO

@@ -1,53 +1,77 @@
 Metadata-Version: 2.2
 Name: active-vision
-Version: 0.0.2
+Version: 0.0.4
 Summary: Active learning for edge vision.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: datasets>=3.2.0
 Requires-Dist: fastai>=2.7.18
+Requires-Dist: gradio>=5.12.0
 Requires-Dist: ipykernel>=6.29.5
 Requires-Dist: ipywidgets>=8.1.5
 Requires-Dist: loguru>=0.7.3
 Requires-Dist: seaborn>=0.13.2
+Requires-Dist: timm>=1.0.13
 
(The remainder of this file is the package long description, which is the README; its changes are identical to the README.md diff below.)
{active_vision-0.0.2 → active_vision-0.0.4}/README.md

@@ -1,39 +1,61 @@
 ![Python Version](https://img.shields.io/badge/python-3.10%2B-blue?style=for-the-badge)
 ![License](https://img.shields.io/badge/License-Apache%202.0-green.svg?style=for-the-badge)
-![PyPI](https://img.shields.io/pypi/v/active-vision?style=for-the-badge)
+[![PyPI](https://img.shields.io/pypi/v/active-vision?style=for-the-badge)](https://pypi.org/project/active-vision/)
 ![Downloads](https://img.shields.io/pepy/dt/active-vision?style=for-the-badge&logo=pypi&logoColor=white&label=Downloads&color=purple)
 
 <p align="center">
-  <img src="https://
+  <img src="https://raw.githubusercontent.com/dnth/active-vision/main/assets/logo.png" alt="active-vision">
 </p>
 
 Active learning at the edge for computer vision.
 
-The goal of this project is to create a framework for active learning
+The goal of this project is to create a framework for the active learning loop for computer vision deployed on edge devices.
 
-
+Supported tasks:
+- [X] Image classification
+- [ ] Object detection
+- [ ] Segmentation
 
-- Training framework: fastai
-- User interface: streamlit
-- Database: sqlite
-- Experiment tracking: wandb
 
 ## Installation
 
-PyPI
+Get a release from PyPI
 ```bash
 pip install active-vision
 ```
 
-
+Install from source
 ```bash
 git clone https://github.com/dnth/active-vision.git
 cd active-vision
 pip install -e .
 ```
 
+I recommend using [uv](https://docs.astral.sh/uv/) to set up a virtual environment and install the package. You can also use any other virtual environment of your choice.
+
+If you're using uv:
+
+```bash
+uv venv
+uv sync
+```
+Once the virtual environment is created, you can install the package using pip.
+
+> [!TIP]
+> If you're using uv, add `uv` before the pip install command to install into your virtual environment, e.g.:
+> ```bash
+> uv pip install active-vision
+> ```
+
 ## Usage
-See the [notebook](./nbs/
+See the [notebook](./nbs/04_relabel_loop.ipynb) for a complete example.
+
+Be sure to prepare 3 datasets:
+- [initial_samples](./nbs/initial_samples.parquet): A dataframe of an existing labeled training dataset to seed the training set.
+- [unlabeled](./nbs/unlabeled_samples.parquet): A dataframe of unlabeled data which we will sample from using active learning.
+- [eval](./nbs/evaluation_samples.parquet): A dataframe of labeled data which we will use to evaluate the performance of the model.
+
+As a toy example, I created the above 3 datasets from the imagenette dataset.
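The three files are plain pandas dataframes saved as parquet; the only columns the API relies on are a filepath column and, for the labeled sets, a label column. A minimal sketch of building them, assuming a class-per-folder image layout; the paths and helper below are illustrative, not part of the package.

```python
from pathlib import Path

import pandas as pd


def folder_to_df(root: str) -> pd.DataFrame:
    # Illustrative: derive the label from the parent folder name
    rows = [{"filepath": str(p), "label": p.parent.name} for p in Path(root).glob("*/*")]
    return pd.DataFrame(rows)


folder_to_df("data/seed").to_parquet("initial_samples.parquet")        # small labeled seed set
folder_to_df("data/holdout").to_parquet("evaluation_samples.parquet")  # labeled evaluation set

# The unlabeled pool only needs filepaths; labels are what the loop will produce
unlabeled = pd.DataFrame({"filepath": [str(p) for p in Path("data/pool").glob("*")]})
unlabeled.to_parquet("unlabeled_samples.parquet")
```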
@@ -42,29 +64,45 @@
 ```python
 from active_vision import ActiveLearner
 import pandas as pd
 
 # Create an active learner instance with a model
 al = ActiveLearner("resnet18")
 
-# Load
+# Load dataset
 train_df = pd.read_parquet("training_samples.parquet")
-al.load_dataset(
+al.load_dataset(df, filepath_col="filepath", label_col="label")
 
-# Train
+# Train model
 al.train(epochs=3, lr=1e-3)
 
-#
-
+# Evaluate the model on a *labeled* evaluation set
+accuracy = al.evaluate(eval_df, filepath_col="filepath", label_col="label")
 
-#
-accuracy = al.evaluate(eval_df, "filepath", "label")
-
-# Get predictions from an unlabeled set
+# Get predictions from an *unlabeled* set
 pred_df = al.predict(filepaths)
 
-# Sample low confidence predictions
+# Sample low confidence predictions from unlabeled set
 uncertain_df = al.sample_uncertain(pred_df, num_samples=10)
 
-#
-al.
+# Launch a Gradio UI to label the low confidence samples
+al.label(uncertain_df, output_filename="uncertain")
 ```
 
+![Gradio UI](./assets/labeling_ui.png)
+
+Once complete, the labeled samples will be saved into a new df.
+We can now add the newly labeled data to the training set.
+
+```python
+# Add newly labeled data to training set and save as a new file active_labeled
+al.add_to_train_set(labeled_df, output_filename="active_labeled")
+```
+
+Repeat the process until the model is good enough. Use the dataset to train a larger model and deploy.
+
+> [!TIP]
+> For the toy dataset, I got to about 93% accuracy on the evaluation set with 200+ labeled images. The best performing model on the [leaderboard](https://github.com/fastai/imagenette) got 95.11% accuracy training on all 9469 labeled images.
+>
+> This took me about 6 iterations of relabeling. Each iteration took about 5 minutes to complete including labeling and model training (resnet18). See the [notebook](./nbs/04_relabel_loop.ipynb) for more details.
+>
+> But using the dataset of 200+ images, I trained a more capable model (convnext_small_in22k) and got 99.3% accuracy on the evaluation set. See the [notebook](./nbs/05_retrain_larger.ipynb) for more details.
+
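The "repeat until good enough" step is the one part the README snippet does not show end to end. Below is a minimal sketch of the whole loop stitched together from the calls in this diff, assuming the three parquet files and the filepath/label column names used above, and an assumed accuracy threshold that is not part of the library. Two details taken from the new core.py: label() writes its output to <output_filename>.parquet once "Finish Labeling" is clicked in the Gradio UI, and add_to_train_set() only updates the in-memory dataframe and the saved parquet, so load_dataset() has to be called again before retraining. In the notebook these steps run cell by cell; in a plain script you would close the Gradio window before the loop continues.

```python
import pandas as pd
from active_vision import ActiveLearner

TARGET_ACCURACY = 0.95  # assumed stopping threshold, not something the library defines

al = ActiveLearner("resnet18")
al.load_dataset(pd.read_parquet("initial_samples.parquet"),
                filepath_col="filepath", label_col="label")
al.train(epochs=3, lr=1e-3)

eval_df = pd.read_parquet("evaluation_samples.parquet")
unlabeled_filepaths = pd.read_parquet("unlabeled_samples.parquet")["filepath"].tolist()

while al.evaluate(eval_df, filepath_col="filepath", label_col="label") < TARGET_ACCURACY:
    pred_df = al.predict(unlabeled_filepaths)
    uncertain_df = al.sample_uncertain(pred_df, num_samples=10)

    # Opens the Gradio UI; clicking "Finish Labeling" writes uncertain.parquet
    al.label(uncertain_df, output_filename="uncertain")

    # Fold the fresh labels back in, rebuild the dataloaders, and fine-tune again
    al.add_to_train_set(pd.read_parquet("uncertain.parquet"), output_filename="active_labeled")
    al.load_dataset(al.train_set, filepath_col="filepath", label_col="label")
    al.train(epochs=3, lr=1e-3)

# Good enough: active_labeled.parquet now holds the grown training set for a larger model
```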
 ## Workflow
 There are two workflows for active learning at the edge that we can use depending on the availability of labeled data.
 
@@ -72,10 +110,10 @@
 If we have no labeled data, we can use active learning to iteratively improve the model and build a labeled dataset.
 
 1. Load a small proxy model.
-2. Label an initial dataset.
+2. Label an initial dataset. If there is none, you'll have to label some images.
 3. Train the proxy model on the labeled dataset.
 4. Run inference on the unlabeled dataset.
-5. Evaluate the performance of the proxy model
+5. Evaluate the performance of the proxy model.
 6. Is model good enough?
    - Yes: Save the proxy model and the dataset.
    - No: Select the most informative images to label using active learning.
@@ -127,7 +165,7 @@ graph TD
 ```
 
 
-## Methodology
+<!-- ## Methodology
 To test out the workflows we will use the [imagenette dataset](https://huggingface.co/datasets/frgfm/imagenette). But this will be applicable to any dataset.
 
 Imagenette is a subset of the ImageNet dataset with 10 classes. We will use this dataset to test out the workflows. Additionally, Imagenette has an existing leaderboard which we can use to evaluate the performance of the models.
@@ -178,4 +216,4 @@ After the first iteration we got 94.57% accuracy on the validation set. See the
 > [!TIP]
 > | Train Epochs | Number of Images | Validation Accuracy | Source |
 > |--------------|-----------------|----------------------|------------------|
-> | 10 | 200 | 94.57% | First relabeling [notebook](./nbs/03_retrain_model.ipynb) |
+> | 10 | 200 | 94.57% | First relabeling [notebook](./nbs/03_retrain_model.ipynb) | -->
{active_vision-0.0.2 → active_vision-0.0.4}/pyproject.toml

@@ -1,14 +1,16 @@
 [project]
 name = "active-vision"
-version = "0.0.2"
+version = "0.0.4"
 description = "Active learning for edge vision."
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
     "datasets>=3.2.0",
     "fastai>=2.7.18",
+    "gradio>=5.12.0",
     "ipykernel>=6.29.5",
     "ipywidgets>=8.1.5",
     "loguru>=0.7.3",
     "seaborn>=0.13.2",
+    "timm>=1.0.13",
 ]
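The two new dependencies map to the new features in this release: gradio powers the label() UI added in core.py, and timm supplies the larger backbones mentioned in the README tip (convnext_small_in22k). ActiveLearner in this version still only maps "resnet18" and "resnet34" by name, so the sketch below goes through fastai directly, which accepts timm architecture names as strings when timm is installed; the exact model identifier can vary between timm versions, and the parquet path is illustrative.

```python
import pandas as pd
from fastai.vision.all import (
    ImageDataLoaders, Resize, aug_transforms, vision_learner, accuracy,
)

# Illustrative: retrain a larger timm backbone on the actively labeled dataset
df = pd.read_parquet("active_labeled.parquet")
dls = ImageDataLoaders.from_df(
    df, path=".", fn_col="filepath", label_col="label",
    valid_pct=0.2, bs=16, item_tfms=Resize(224),
    batch_tfms=aug_transforms(size=224, min_scale=0.75),
)
# fastai resolves string architecture names through timm
learn = vision_learner(dls, "convnext_small_in22k", metrics=accuracy).to_fp16()
learn.fine_tune(10, 1e-3)
```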
active_vision-0.0.4/src/active_vision/core.py (new file)

@@ -0,0 +1,291 @@
+import pandas as pd
+from loguru import logger
+from fastai.vision.models import resnet18, resnet34
+from fastai.callback.all import ShowGraphCallback
+from fastai.vision.all import (
+    ImageDataLoaders,
+    aug_transforms,
+    Resize,
+    vision_learner,
+    accuracy,
+    valley,
+    slide,
+    minimum,
+    steep,
+)
+import torch
+import torch.nn.functional as F
+
+import warnings
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+class ActiveLearner:
+    def __init__(self, model_name: str):
+        self.model = self.load_model(model_name)
+
+    def load_model(self, model_name: str):
+        models = {"resnet18": resnet18, "resnet34": resnet34}
+        logger.info(f"Loading model {model_name}")
+        if model_name not in models:
+            logger.error(f"Model {model_name} not found")
+            raise ValueError(f"Model {model_name} not found")
+        return models[model_name]
+
+    def load_dataset(
+        self,
+        df: pd.DataFrame,
+        filepath_col: str,
+        label_col: str,
+        valid_pct: float = 0.2,
+        batch_size: int = 16,
+        image_size: int = 224,
+    ):
+        logger.info(f"Loading dataset from {filepath_col} and {label_col}")
+        self.train_set = df.copy()
+
+        logger.info("Creating dataloaders")
+        self.dls = ImageDataLoaders.from_df(
+            df,
+            path=".",
+            valid_pct=valid_pct,
+            fn_col=filepath_col,
+            label_col=label_col,
+            bs=batch_size,
+            item_tfms=Resize(image_size),
+            batch_tfms=aug_transforms(size=image_size, min_scale=0.75),
+        )
+        logger.info("Creating learner")
+        self.learn = vision_learner(self.dls, self.model, metrics=accuracy).to_fp16()
+        self.class_names = self.dls.vocab
+        logger.info("Done. Ready to train.")
+
+    def lr_find(self):
+        logger.info("Finding optimal learning rate")
+        self.lrs = self.learn.lr_find(suggest_funcs=(minimum, steep, valley, slide))
+        logger.info(f"Optimal learning rate: {self.lrs.valley}")
+
+    def train(self, epochs: int, lr: float):
+        logger.info(f"Training for {epochs} epochs with learning rate: {lr}")
+        self.learn.fine_tune(epochs, lr, cbs=[ShowGraphCallback()])
+
+    def predict(self, filepaths: list[str], batch_size: int = 16):
+        """
+        Run inference on an unlabeled dataset. Returns a df with filepaths and predicted labels, and confidence scores.
+        """
+        logger.info(f"Running inference on {len(filepaths)} samples")
+        test_dl = self.dls.test_dl(filepaths, bs=batch_size)
+        preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
+
+        self.pred_df = pd.DataFrame(
+            {
+                "filepath": filepaths,
+                "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
+                "pred_conf": torch.max(F.softmax(preds, dim=1), dim=1)[0].numpy(),
+            }
+        )
+        return self.pred_df
+
+    def evaluate(
+        self, df: pd.DataFrame, filepath_col: str, label_col: str, batch_size: int = 16
+    ):
+        """
+        Evaluate on a labeled dataset. Returns a score.
+        """
+        self.eval_set = df.copy()
+
+        filepaths = self.eval_set[filepath_col].tolist()
+        labels = self.eval_set[label_col].tolist()
+        test_dl = self.dls.test_dl(filepaths, bs=batch_size)
+        preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
+
+        self.eval_df = pd.DataFrame(
+            {
+                "filepath": filepaths,
+                "label": labels,
+                "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
+            }
+        )
+
+        accuracy = float((self.eval_df["label"] == self.eval_df["pred_label"]).mean())
+        logger.info(f"Accuracy: {accuracy:.2%}")
+        return accuracy
+
+    def sample_uncertain(self, df: pd.DataFrame, num_samples: int):
+        """
+        Sample top `num_samples` low confidence samples. Returns a df with filepaths and predicted labels, and confidence scores.
+        """
+        logger.info(f"Getting top {num_samples} low confidence samples")
+        uncertain_df = df.sort_values(by="pred_conf", ascending=True).head(num_samples)
+        return uncertain_df
+
+    def label(self, df: pd.DataFrame, output_filename: str = "labeled"):
+        """
+        Launch a labeling interface for the user to label the samples.
+        Input is a df with filepaths listing the files to be labeled. Output is a df with filepaths and labels.
+        """
+        import gradio as gr
+
+        shortcut_js = """
+        <script>
+        function shortcuts(e) {
+            // Only block shortcuts if we're in a text input or textarea
+            if (e.target.tagName.toLowerCase() === "textarea" ||
+                (e.target.tagName.toLowerCase() === "input" && e.target.type.toLowerCase() === "text")) {
+                return;
+            }
+
+            if (e.key.toLowerCase() == "w") {
+                document.getElementById("submit_btn").click();
+            } else if (e.key.toLowerCase() == "d") {
+                document.getElementById("next_btn").click();
+            } else if (e.key.toLowerCase() == "a") {
+                document.getElementById("back_btn").click();
+            }
+        }
+        document.addEventListener('keypress', shortcuts, false);
+        </script>
+        """
+
+        logger.info(f"Launching labeling interface for {len(df)} samples")
+
+        filepaths = df["filepath"].tolist()
+
+        with gr.Blocks(head=shortcut_js) as demo:
+            current_index = gr.State(value=0)
+
+            filename = gr.Textbox(
+                label="Filename", value=filepaths[0], interactive=False
+            )
+
+            image = gr.Image(
+                type="filepath", label="Image", value=filepaths[0], height=500
+            )
+            category = gr.Radio(choices=self.class_names, label="Select Category")
+
+            with gr.Row():
+                back_btn = gr.Button("← Previous (A)", elem_id="back_btn")
+                submit_btn = gr.Button(
+                    "Submit (W)",
+                    variant="primary",
+                    elem_id="submit_btn",
+                    interactive=False,
+                )
+                next_btn = gr.Button("Next → (D)", elem_id="next_btn")
+
+            progress = gr.Slider(
+                minimum=0,
+                maximum=len(filepaths) - 1,
+                value=0,
+                label="Progress",
+                interactive=False,
+            )
+
+            finish_btn = gr.Button("Finish Labeling", variant="primary")
+
+            def update_submit_btn(choice):
+                return gr.Button(interactive=choice is not None)
+
+            category.change(
+                fn=update_submit_btn, inputs=[category], outputs=[submit_btn]
+            )
+
+            def navigate(current_idx, direction):
+                next_idx = current_idx + direction
+                if 0 <= next_idx < len(filepaths):
+                    return filepaths[next_idx], filepaths[next_idx], next_idx, next_idx
+                return (
+                    filepaths[current_idx],
+                    filepaths[current_idx],
+                    current_idx,
+                    current_idx,
+                )
+
+            def save_and_next(current_idx, selected_category):
+                if selected_category is None:
+                    return (
+                        filepaths[current_idx],
+                        filepaths[current_idx],
+                        current_idx,
+                        current_idx,
+                    )
+
+                # Save the current annotation
+                with open(f"{output_filename}.csv", "a") as f:
+                    f.write(f"{filepaths[current_idx]},{selected_category}\n")
+
+                # Move to next image if not at the end
+                next_idx = current_idx + 1
+                if next_idx >= len(filepaths):
+                    return (
+                        filepaths[current_idx],
+                        filepaths[current_idx],
+                        current_idx,
+                        current_idx,
+                    )
+                return filepaths[next_idx], filepaths[next_idx], next_idx, next_idx
+
+            def convert_csv_to_parquet():
+                try:
+                    df = pd.read_csv(f"{output_filename}.csv", header=None)
+                    df.columns = ["filepath", "label"]
+                    df = df.drop_duplicates(subset=["filepath"], keep="last")
+                    df.to_parquet(f"{output_filename}.parquet")
+                    gr.Info(f"Annotation saved to {output_filename}.parquet")
+                except Exception as e:
+                    logger.error(e)
+                    return
+
+            back_btn.click(
+                fn=lambda idx: navigate(idx, -1),
+                inputs=[current_index],
+                outputs=[filename, image, current_index, progress],
+            )
+
+            next_btn.click(
+                fn=lambda idx: navigate(idx, 1),
+                inputs=[current_index],
+                outputs=[filename, image, current_index, progress],
+            )
+
+            submit_btn.click(
+                fn=save_and_next,
+                inputs=[current_index, category],
+                outputs=[filename, image, current_index, progress],
+            )
+
+            finish_btn.click(fn=convert_csv_to_parquet)
+
+        demo.launch(height=1000)
+
+    def add_to_train_set(self, df: pd.DataFrame, output_filename: str):
+        """
+        Add samples to the training set.
+        """
+        new_train_set = df.copy()
+        # new_train_set.drop(columns=["pred_conf"], inplace=True)
+        # new_train_set.rename(columns={"pred_label": "label"}, inplace=True)
+
+        # len_old = len(self.train_set)
+
+        logger.info(f"Adding {len(new_train_set)} samples to training set")
+        self.train_set = pd.concat([self.train_set, new_train_set])
+
+        self.train_set = self.train_set.drop_duplicates(
+            subset=["filepath"], keep="last"
+        )
+        self.train_set.reset_index(drop=True, inplace=True)
+
+        self.train_set.to_parquet(f"{output_filename}.parquet")
+        logger.info(f"Saved training set to {output_filename}.parquet")
+
+        # if len(self.train_set) == len_old:
+        #     logger.warning("No new samples added to training set")
+
+        # elif len_old + len(new_train_set) < len(self.train_set):
+        #     logger.warning("Some samples were duplicates and removed from training set")
+
+        # else:
+        #     logger.info("All new samples added to training set")
+        #     logger.info(f"Training set now has {len(self.train_set)} samples")
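The acquisition strategy in this release is plain least-confidence sampling: predict() records the softmax probability of the top class as pred_conf, and sample_uncertain() sorts ascending and takes the head. A toy illustration of what that does to a prediction dataframe; the filenames and scores are made up, the class names are from imagenette.

```python
import pandas as pd

# A miniature stand-in for the dataframe that ActiveLearner.predict() returns
pred_df = pd.DataFrame(
    {
        "filepath": ["a.jpg", "b.jpg", "c.jpg", "d.jpg"],
        "pred_label": ["church", "golf ball", "parachute", "tench"],
        "pred_conf": [0.99, 0.41, 0.63, 0.88],
    }
)

# Least confidence first: b.jpg (0.41) and c.jpg (0.63) are the ones sent to the labeling UI
uncertain_df = pred_df.sort_values(by="pred_conf", ascending=True).head(2)
print(uncertain_df)
```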
{active_vision-0.0.2 → active_vision-0.0.4}/src/active_vision.egg-info/PKG-INFO

(Same changes as the PKG-INFO diff above.)
active_vision-0.0.2/src/active_vision/core.py (removed)

@@ -1,149 +0,0 @@
-import pandas as pd
-from loguru import logger
-from fastai.vision.models import resnet18, resnet34
-from fastai.callback.all import ShowGraphCallback
-from fastai.vision.all import (
-    ImageDataLoaders,
-    aug_transforms,
-    Resize,
-    vision_learner,
-    accuracy,
-    valley,
-    slide,
-    minimum,
-    steep,
-)
-import torch
-import torch.nn.functional as F
-
-import warnings
-
-warnings.filterwarnings("ignore", category=FutureWarning)
-
-
-class ActiveLearner:
-    def __init__(self, model_name: str):
-        self.model = self.load_model(model_name)
-
-    def load_model(self, model_name: str):
-        models = {"resnet18": resnet18, "resnet34": resnet34}
-        logger.info(f"Loading model {model_name}")
-        if model_name not in models:
-            logger.error(f"Model {model_name} not found")
-            raise ValueError(f"Model {model_name} not found")
-        return models[model_name]
-
-    def load_dataset(
-        self,
-        df: pd.DataFrame,
-        filepath_col: str,
-        label_col: str,
-        valid_pct: float = 0.2,
-        batch_size: int = 16,
-        image_size: int = 224,
-    ):
-        logger.info(f"Loading dataset from {filepath_col} and {label_col}")
-        self.train_set = df.copy()
-
-        logger.info("Creating dataloaders")
-        self.dls = ImageDataLoaders.from_df(
-            df,
-            path=".",
-            valid_pct=valid_pct,
-            fn_col=filepath_col,
-            label_col=label_col,
-            bs=batch_size,
-            item_tfms=Resize(image_size),
-            batch_tfms=aug_transforms(size=image_size, min_scale=0.75),
-        )
-        logger.info("Creating learner")
-        self.learn = vision_learner(self.dls, self.model, metrics=accuracy).to_fp16()
-        self.class_names = self.dls.vocab
-        logger.info("Done. Ready to train.")
-
-    def lr_find(self):
-        logger.info("Finding optimal learning rate")
-        self.lrs = self.learn.lr_find(suggest_funcs=(minimum, steep, valley, slide))
-        logger.info(f"Optimal learning rate: {self.lrs.valley}")
-
-    def train(self, epochs: int, lr: float):
-        logger.info(f"Training for {epochs} epochs with learning rate: {lr}")
-        self.learn.fine_tune(epochs, lr, cbs=[ShowGraphCallback()])
-
-    def predict(self, filepaths: list[str], batch_size: int = 16):
-        """
-        Run inference on an unlabeled dataset. Returns a df with filepaths and predicted labels, and confidence scores.
-        """
-        logger.info(f"Running inference on {len(filepaths)} samples")
-        test_dl = self.dls.test_dl(filepaths, bs=batch_size)
-        preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
-
-        self.pred_df = pd.DataFrame(
-            {
-                "filepath": filepaths,
-                "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
-                "pred_conf": torch.max(F.softmax(preds, dim=1), dim=1)[0].numpy(),
-            }
-        )
-        return self.pred_df
-
-    def evaluate(self, df: pd.DataFrame, filepath_col: str, label_col: str, batch_size: int = 16):
-        """
-        Evaluate on a labeled dataset. Returns a score.
-        """
-        self.eval_set = df.copy()
-
-        filepaths = self.eval_set[filepath_col].tolist()
-        labels = self.eval_set[label_col].tolist()
-        test_dl = self.dls.test_dl(filepaths, bs=batch_size)
-        preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
-
-        self.eval_df = pd.DataFrame(
-            {
-                "filepath": filepaths,
-                "label": labels,
-                "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
-            }
-        )
-
-        accuracy = float((self.eval_df["label"] == self.eval_df["pred_label"]).mean())
-        logger.info(f"Accuracy: {accuracy:.2%}")
-        return accuracy
-
-    def sample_uncertain(self, df: pd.DataFrame, num_samples: int):
-        """
-        Sample top `num_samples` low confidence samples. Returns a df with filepaths and predicted labels, and confidence scores.
-        """
-        uncertain_df = df.sort_values(
-            by="pred_conf", ascending=True
-        ).head(num_samples)
-        return uncertain_df
-
-    def add_to_train_set(self, df: pd.DataFrame):
-        """
-        Add samples to the training set.
-        """
-        new_train_set = df.copy()
-        new_train_set.drop(columns=["pred_conf"], inplace=True)
-        new_train_set.rename(columns={"pred_label": "label"}, inplace=True)
-
-        len_old = len(self.train_set)
-
-        logger.info(f"Adding {len(new_train_set)} samples to training set")
-        self.train_set = pd.concat([self.train_set, new_train_set])
-
-        self.train_set = self.train_set.drop_duplicates(
-            subset=["filepath"], keep="last"
-        )
-        self.train_set.reset_index(drop=True, inplace=True)
-
-
-        if len(self.train_set) == len_old:
-            logger.warning("No new samples added to training set")
-
-        elif len_old + len(new_train_set) < len(self.train_set):
-            logger.warning("Some samples were duplicates and removed from training set")
-
-        else:
-            logger.info("All new samples added to training set")
-            logger.info(f"Training set now has {len(self.train_set)} samples")
Files without changes: LICENSE, setup.cfg, src/active_vision.egg-info/SOURCES.txt, src/active_vision.egg-info/dependency_links.txt, src/active_vision.egg-info/top_level.txt