active-vision 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
active_vision/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
- __version__ = "0.4.1"
1
+ __version__ = "0.4.3"
2
2
 
3
3
  from .core import *
active_vision/core.py CHANGED
@@ -20,8 +20,11 @@ from fastai.vision.all import (
20
20
  valley,
21
21
  vision_learner,
22
22
  )
23
+ from itables import show
23
24
  from loguru import logger
24
25
 
26
+ from .utils import get_base64_image
27
+
25
28
  warnings.filterwarnings("ignore", category=FutureWarning)
26
29
  pd.set_option("display.max_colwidth", 50)
27
30
 
@@ -102,6 +105,9 @@ class ActiveLearner:
102
105
  self.valid_set = self.learn.dls.valid_ds.items
103
106
  self.class_names = self.dls.vocab
104
107
  self.num_classes = self.dls.c
108
+
109
+ logger.info(f"Training set size: {len(self.train_set)}")
110
+ logger.info(f"Validation set size: {len(self.valid_set)}")
105
111
  logger.info("Done. Ready to train.")
106
112
 
107
113
  def load_dataset(
@@ -209,7 +215,9 @@ class ActiveLearner:
209
215
  logger.info(f"Learning rate: {lr} with one-cycle learning rate scheduler")
210
216
  self.learn.fine_tune(epochs, lr, freeze_epochs=head_tuning_epochs)
211
217
 
212
- def predict(self, filepaths: list[str], batch_size: int = 16):
218
+ def predict(
219
+ self, filepaths: list[str], batch_size: int = 16, interactive: bool = False
220
+ ):
213
221
  """
214
222
  Run inference on an unlabeled dataset. Returns a df with filepaths and predicted labels, and confidence scores.
215
223
  """
@@ -257,47 +265,108 @@ class ActiveLearner:
257
265
  lambda x: [round(e, 4) for e in x]
258
266
  )
259
267
 
268
+ if interactive:
269
+ logger.info("Rendering interactive table")
270
+ interactive_pred_df = self.pred_df.copy()
271
+ interactive_pred_df["image"] = interactive_pred_df["filepath"].apply(
272
+ get_base64_image
273
+ )
274
+ interactive_pred_df = interactive_pred_df[
275
+ ["image", "filepath", "pred_label", "pred_conf", "logits", "embeddings"]
276
+ ]
277
+
278
+ show(
279
+ interactive_pred_df,
280
+ columnDefs=[{"width": "200px", "targets": "_all"}],
281
+ style="width:1200px",
282
+ autoWidth=False,
283
+ )
284
+
260
285
  return self.pred_df
261
286
 
262
287
  def evaluate(
263
- self, df: pd.DataFrame, filepath_col: str, label_col: str, batch_size: int = 16
288
+ self,
289
+ df: pd.DataFrame,
290
+ filepath_col: str,
291
+ label_col: str,
292
+ batch_size: int = 16,
293
+ interactive: bool = False,
264
294
  ):
265
295
  """
266
296
  Evaluate on a labeled dataset. Returns a DataFrame of per-sample predictions; the overall accuracy is logged and stored in `self.eval_accuracy`.
267
297
  """
268
298
  self.eval_set = df.copy()
269
299
 
270
- filepaths = self.eval_set[filepath_col].tolist()
271
- labels = self.eval_set[label_col].tolist()
272
- test_dl = self.dls.test_dl(filepaths, bs=batch_size)
273
- preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
300
+ test_dl = self.dls.test_dl(self.eval_set, bs=batch_size, with_labels=True)
301
+ probs, targs, cls_preds, loss = self.learn.get_preds(
302
+ dl=test_dl, with_decoded=True, with_loss=True
303
+ )
274
304
 
275
- self.eval_df = pd.DataFrame(
305
+ eval_df = pd.DataFrame(
276
306
  {
277
- "filepath": filepaths,
278
- "label": labels,
307
+ "filepath": self.eval_set[filepath_col].tolist(),
308
+ "label": self.eval_set[label_col].tolist(),
279
309
  "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
310
+ "pred_conf": torch.max(F.softmax(probs, dim=1), dim=1)[0].numpy(),
311
+ "probs": probs.numpy().tolist(),
312
+ "loss": loss.numpy().tolist(),
280
313
  }
281
314
  )
282
315
 
283
- accuracy = float((self.eval_df["label"] == self.eval_df["pred_label"]).mean())
316
+ accuracy = float((eval_df["label"] == eval_df["pred_label"]).mean())
284
317
  self.eval_accuracy = accuracy
285
318
  logger.info(f"Accuracy: {accuracy:.2%}")
286
- return accuracy
319
+
320
+ if interactive:
321
+ logger.info("Rendering interactive table")
322
+
323
+ interactive_eval_df = eval_df.copy()
324
+ interactive_eval_df["image"] = interactive_eval_df["filepath"].apply(
325
+ get_base64_image
326
+ )
327
+ interactive_eval_df = interactive_eval_df[
328
+ [
329
+ "image",
330
+ "filepath",
331
+ "label",
332
+ "pred_label",
333
+ "pred_conf",
334
+ "loss",
335
+ "probs",
336
+ ]
337
+ ]
338
+
339
+ show(
340
+ interactive_eval_df,
341
+ columnDefs=[{"width": "200px", "targets": "_all"}],
342
+ style="width:1200px",
343
+ autoWidth=False,
344
+ )
345
+
346
+ return eval_df
287
347
 
288
348
  def sample_uncertain(
289
- self, df: pd.DataFrame, num_samples: int, strategy: str = "least-confidence"
349
+ self,
350
+ df: pd.DataFrame,
351
+ num_samples: int,
352
+ strategy: str = "least-confidence",
353
+ interactive: bool = False,
290
354
  ):
291
355
  """
292
356
  Sample top `num_samples` low confidence samples. Returns a df with filepaths and predicted labels, and confidence scores.
293
357
 
358
+ Args:
359
+ df: DataFrame with predictions
360
+ num_samples: Number of samples to select
361
+ strategy: Sampling strategy to use
362
+ interactive: Whether to display an interactive table of results
363
+
294
364
  Strategies:
295
- - least-confidence: Get top `num_samples` low confidence samples.
296
- - margin-of-confidence: Get top `num_samples` samples with the smallest margin between the top two predictions.
297
- - ratio-of-confidence: Get top `num_samples` samples with the highest ratio between the top two predictions.
298
- - entropy: Get top `num_samples` samples with the highest entropy.
365
+ - least-confidence: Get top `num_samples` low confidence samples.
366
+ - margin-of-confidence: Get top `num_samples` samples with the smallest margin between the top two predictions.
367
+ - ratio-of-confidence: Get top `num_samples` samples with the highest ratio between the top two predictions.
368
+ - entropy: Get top `num_samples` samples with the highest entropy.
299
369
  """
300
-
301
370
  # Remove samples that are already in the training set
302
371
  df = df[~df["filepath"].isin(self.dataset["filepath"])].copy()
303
372
 
@@ -366,20 +435,45 @@ class ActiveLearner:
366
435
  ]
367
436
 
368
437
  df["score"] = df["score"].round(4)
438
+ result_df = df.sort_values(by="score", ascending=False).head(num_samples)
439
+
440
+ if interactive:
441
+ logger.info("Rendering interactive table")
442
+ interactive_df = result_df.copy()
443
+ interactive_df["image"] = interactive_df["filepath"].apply(get_base64_image)
444
+ interactive_df = interactive_df[
445
+ ["image", "filepath", "strategy", "score", "pred_label", "pred_conf"]
446
+ ]
447
+
448
+ show(
449
+ interactive_df,
450
+ columnDefs=[{"width": "200px", "targets": "_all"}],
451
+ style="width:1200px",
452
+ autoWidth=False,
453
+ )
369
454
 
370
- return df.sort_values(by="score", ascending=False).head(num_samples)
455
+ return result_df
371
456
 
372
457
  def sample_diverse(
373
- self, df: pd.DataFrame, num_samples: int, strategy: str = "model-based-outlier"
458
+ self,
459
+ df: pd.DataFrame,
460
+ num_samples: int,
461
+ strategy: str = "model-based-outlier",
462
+ interactive: bool = False,
374
463
  ):
375
464
  """
376
465
  Sample top `num_samples` diverse samples. Returns a df with filepaths and predicted labels, and confidence scores.
377
466
 
378
- Strategies:
379
- - model-based-outlier: Get top `num_samples` samples with lowest activation of the model's last layer.
380
- - cluster-based: Get top `num_samples` samples with the highest distance to the nearest neighbor.
381
- - representative: Get top `num_samples` samples with the highest distance to the centroid of the training set.
467
+ Args:
468
+ df: DataFrame with predictions
469
+ num_samples: Number of samples to select
470
+ strategy: Sampling strategy to use
471
+ interactive: Whether to display an interactive table of results
382
472
 
473
+ Strategies:
474
+ - model-based-outlier: Get top `num_samples` samples with lowest activation of the model's last layer.
475
+ - cluster-based: Get top `num_samples` samples with the highest distance to the nearest neighbor.
476
+ - representative: Get top `num_samples` samples with the highest distance to the centroid of the training set.
383
477
  """
384
478
  # Remove samples that are already in the training set
385
479
  df = df[~df["filepath"].isin(self.dataset["filepath"])].copy()
@@ -444,19 +538,54 @@ class ActiveLearner:
444
538
  ]
445
539
 
446
540
  df["score"] = df["score"].round(4)
541
+ result_df = df.sort_values(by="score", ascending=False).head(num_samples)
447
542
 
448
- # Sort by score ascending higher rank = more outlier-like compared to the validation set
449
- return df.sort_values(by="score", ascending=False).head(num_samples)
543
+ if interactive:
544
+ logger.info("Rendering interactive table")
545
+ interactive_df = result_df.copy()
546
+ interactive_df["image"] = interactive_df["filepath"].apply(
547
+ get_base64_image
548
+ )
549
+ interactive_df = interactive_df[
550
+ [
551
+ "image",
552
+ "filepath",
553
+ "strategy",
554
+ "score",
555
+ "pred_label",
556
+ "pred_conf",
557
+ ]
558
+ ]
559
+
560
+ show(
561
+ interactive_df,
562
+ columnDefs=[{"width": "200px", "targets": "_all"}],
563
+ style="width:1200px",
564
+ autoWidth=False,
565
+ )
566
+
567
+ return result_df
450
568
 
451
569
  else:
452
570
  logger.error(f"Unknown strategy: {strategy}")
453
571
  raise ValueError(f"Unknown strategy: {strategy}")
454
572
 
455
- def sample_random(self, df: pd.DataFrame, num_samples: int, seed: int = None):
573
+ def sample_random(
574
+ self,
575
+ df: pd.DataFrame,
576
+ num_samples: int,
577
+ seed: int = None,
578
+ interactive: bool = False,
579
+ ):
456
580
  """
457
581
  Sample `num_samples` random samples. Returns a df with filepaths and predicted labels, and confidence scores.
458
- """
459
582
 
583
+ Args:
584
+ df: DataFrame with predictions
585
+ num_samples: Number of samples to select
586
+ seed: Random seed for reproducibility
587
+ interactive: Whether to display an interactive table of results
588
+ """
460
589
  logger.info(f"Sampling {num_samples} random samples")
461
590
  df = df[~df["filepath"].isin(self.dataset["filepath"])].copy()
462
591
  df["strategy"] = "random"
@@ -464,9 +593,32 @@ class ActiveLearner:
464
593
 
465
594
  if seed is not None:
466
595
  logger.info(f"Using seed: {seed}")
467
- return df.sample(n=num_samples, random_state=seed)
596
+ result_df = df.sample(n=num_samples, random_state=seed)
597
+
598
+ if interactive:
599
+ logger.info("Rendering interactive table")
600
+ interactive_df = result_df.copy()
601
+ interactive_df["image"] = interactive_df["filepath"].apply(get_base64_image)
602
+ interactive_df = interactive_df[
603
+ ["image", "filepath", "strategy", "score", "pred_label", "pred_conf"]
604
+ ]
468
605
 
469
- def sample_combination(self, df: pd.DataFrame, num_samples: int, combination: dict):
606
+ show(
607
+ interactive_df,
608
+ columnDefs=[{"width": "200px", "targets": "_all"}],
609
+ style="width:1200px",
610
+ autoWidth=False,
611
+ )
612
+
613
+ return result_df
614
+
615
+ def sample_combination(
616
+ self,
617
+ df: pd.DataFrame,
618
+ num_samples: int,
619
+ combination: dict,
620
+ interactive: bool = False,
621
+ ):
470
622
  """
471
623
  Sample samples based on a combination of strategies.
472
624
 
@@ -491,6 +643,7 @@ class ActiveLearner:
491
643
  - representative
492
644
  Other:
493
645
  - random
646
+ interactive: Whether to display an interactive table of results
494
647
 
495
648
  Returns:
496
649
  DataFrame containing the combined samples
@@ -528,14 +681,16 @@ class ActiveLearner:
528
681
  "entropy",
529
682
  ]:
530
683
  strategy_df = self.sample_uncertain(
531
- df=df, num_samples=n_samples, strategy=strategy
684
+ df=df, num_samples=n_samples, strategy=strategy, interactive=False
532
685
  )
533
686
  elif strategy in ["model-based-outlier", "cluster-based", "representative"]:
534
687
  strategy_df = self.sample_diverse(
535
- df=df, num_samples=n_samples, strategy=strategy
688
+ df=df, num_samples=n_samples, strategy=strategy, interactive=False
536
689
  )
537
690
  elif strategy == "random":
538
- strategy_df = self.sample_random(df=df, num_samples=n_samples)
691
+ strategy_df = self.sample_random(
692
+ df=df, num_samples=n_samples, interactive=False
693
+ )
539
694
  else:
540
695
  raise ValueError(f"Unknown strategy: {strategy}")
541
696
 
@@ -543,7 +698,24 @@ class ActiveLearner:
543
698
  # Remove selected samples from the pool to avoid duplicates
544
699
  df = df[~df["filepath"].isin(strategy_df["filepath"])]
545
700
 
546
- return pd.concat(sampled_dfs, ignore_index=True)
701
+ combined_df = pd.concat(sampled_dfs, ignore_index=True)
702
+
703
+ if interactive:
704
+ logger.info("Rendering interactive table")
705
+ interactive_df = combined_df.copy()
706
+ interactive_df["image"] = interactive_df["filepath"].apply(get_base64_image)
707
+ interactive_df = interactive_df[
708
+ ["image", "filepath", "strategy", "score", "pred_label", "pred_conf"]
709
+ ]
710
+
711
+ show(
712
+ interactive_df,
713
+ columnDefs=[{"width": "200px", "targets": "_all"}],
714
+ style="width:1200px",
715
+ autoWidth=False,
716
+ )
717
+
718
+ return combined_df
547
719
 
548
720
  def summary(self, filename: str = None, show: bool = True):
549
721
  results_df = pd.DataFrame(
@@ -567,13 +739,22 @@ class ActiveLearner:
567
739
  if filename is None:
568
740
  # Generate filename with timestamp, accuracy and dataset size
569
741
  from datetime import datetime
742
+
570
743
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
571
- accuracy_str = f"{self.eval_accuracy:.2%}" if self.eval_accuracy is not None else "no_eval"
744
+ accuracy_str = (
745
+ f"{self.eval_accuracy:.2%}"
746
+ if self.eval_accuracy is not None
747
+ else "no_eval"
748
+ )
572
749
  dataset_size = len(self.train_set) + len(self.valid_set)
573
- filename = f"{self.name}_{timestamp}_acc_{accuracy_str}_n_{dataset_size}.parquet"
750
+ filename = f"{self.name}_{timestamp}_acc_{accuracy_str}_n_{dataset_size}_results.parquet"
574
751
  elif not filename.endswith(".parquet"):
575
752
  filename = f"{filename}.parquet"
576
753
 
754
+ logger.info("Saving learner to pickle file")
755
+ learner_name = filename.replace("_results.parquet", "_learner.pkl")
756
+ self.learn.export(learner_name)
757
+
577
758
  results_df.to_parquet(filename)
578
759
  logger.info(f"Saved results to {filename}")
579
760
  if show:
@@ -597,12 +778,14 @@ class ActiveLearner:
597
778
  return;
598
779
  }
599
780
 
600
- if (e.key === "ArrowUp") {
781
+ if (e.key === "ArrowUp" || e.key === "w") {
601
782
  document.getElementById("submit_btn").click();
602
- } else if (e.key === "ArrowRight") {
783
+ } else if (e.key === "ArrowRight" || e.key === "d") {
603
784
  document.getElementById("next_btn").click();
604
- } else if (e.key === "ArrowLeft") {
785
+ } else if (e.key === "ArrowLeft" || e.key === "a") {
605
786
  document.getElementById("back_btn").click();
787
+ } else if (e.key === "ArrowDown" || e.key === "s") {
788
+ document.getElementById("finish_btn").click();
606
789
  }
607
790
  }
608
791
  document.addEventListener('keydown', shortcuts, false);
@@ -710,6 +893,11 @@ class ActiveLearner:
710
893
  interactive=True,
711
894
  )
712
895
 
896
+ # Add element_id to finish button
897
+ finish_btn = gr.Button(
898
+ "Finish Labeling", variant="primary", elem_id="finish_btn"
899
+ )
900
+
713
901
  # Add event handler for slider changes
714
902
  progress.change(
715
903
  fn=lambda idx: navigate(idx, 0),
@@ -728,8 +916,6 @@ class ActiveLearner:
728
916
  ],
729
917
  )
730
918
 
731
- finish_btn = gr.Button("Finish Labeling", variant="primary")
732
-
733
919
  with gr.Tab("Zero-Shot Inference"):
734
920
  gr.Markdown("""
735
921
  Uses a VLM to predict the label of the image.
active_vision/utils.py ADDED
@@ -0,0 +1,58 @@
1
+ import base64
2
+ import os
3
+ from io import BytesIO
4
+
5
+ from itables import show
6
+ from loguru import logger
7
+ from PIL import Image
8
+
9
+
10
+ def get_base64_image(filepath, width=200):
11
+ try:
12
+ with Image.open(filepath) as img:
13
+ # Convert to RGB if needed
14
+ if img.mode != "RGB":
15
+ img = img.convert("RGB")
16
+
17
+ aspect_ratio = img.height / img.width
18
+ height = int(width * aspect_ratio)
19
+ img = img.resize((width, height), Image.Resampling.LANCZOS)
20
+ buffered = BytesIO()
21
+ img.save(buffered, format="JPEG")
22
+ img_str = base64.b64encode(buffered.getvalue()).decode()
23
+ return f'<img src="data:image/jpeg;base64,{img_str}" width="{width}" alt="Sample Image">'
24
+ except Exception as e:
25
+ logger.warning(f"Failed to encode image {filepath}: {e}")
26
+ return None
27
+
28
+
29
+ def show_interactive_table(df, filepath_col="filepath", image_col="image"):
30
+ """
31
+ Display an interactive table with images from filepaths.
32
+
33
+ Args:
34
+ df: pandas DataFrame that contains a 'filepath' column
35
+ """
36
+ # Create a copy to avoid modifying the original dataframe
37
+ interactive_df = df.copy()
38
+
39
+ # Add image column by applying get_base64_image to filepath column
40
+ interactive_df[image_col] = interactive_df[filepath_col].apply(get_base64_image)
41
+
42
+ # Convert filepath to clickable link with relative path
43
+ interactive_df[filepath_col] = interactive_df[filepath_col].apply(
44
+ lambda x: f'<a href="{x.replace(os.sep, "/")}">{x}</a>'
45
+ )
46
+
47
+ # Reorder columns to show image first
48
+ cols = interactive_df.columns.tolist()
49
+ cols.remove(image_col)
50
+ interactive_df = interactive_df[[image_col] + cols]
51
+
52
+ # Display interactive table
53
+ show(
54
+ interactive_df,
55
+ columnDefs=[{"width": "200px", "targets": "_all"}],
56
+ style="width:1200px",
57
+ autoWidth=False,
58
+ )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: active-vision
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Active learning for computer vision.
5
5
  Project-URL: Homepage, https://github.com/dnth/active-vision
6
6
  Project-URL: Bug Tracker, https://github.com/dnth/active-vision/issues
@@ -12,6 +12,7 @@ Requires-Dist: fastai>=2.7.18
12
12
  Requires-Dist: gradio>=5.12.0
13
13
  Requires-Dist: ipykernel>=6.29.5
14
14
  Requires-Dist: ipywidgets>=8.1.5
15
+ Requires-Dist: itables>=2.2.4
15
16
  Requires-Dist: loguru>=0.7.3
16
17
  Requires-Dist: seaborn>=0.13.2
17
18
  Requires-Dist: timm>=1.0.13
@@ -59,7 +60,7 @@ Description-Content-Type: text/markdown
59
60
  <br />
60
61
  <a href="https://dnth.github.io/active-vision" target="_blank" rel="noopener noreferrer"><strong>Explore the docs »</strong></a>
61
62
  <br />
62
- <a href="#️-quickstart" target="_blank" rel="noopener noreferrer">Quickstart</a>
63
+ <a href="#-quickstart" target="_blank" rel="noopener noreferrer">Quickstart</a>
63
64
  ·
64
65
  <a href="https://github.com/dnth/active-vision/issues/new?assignees=&labels=Feature+Request&projects=&template=feature_request.md" target="_blank" rel="noopener noreferrer">Feature Request</a>
65
66
  ·
@@ -109,7 +110,12 @@ Get a release from PyPI
109
110
  pip install active-vision
110
111
  ```
111
112
 
112
- Install from source
113
+ Install bleeding edge from source
114
+ ```bash
115
+ pip install git+https://github.com/dnth/active-vision.git
116
+ ```
117
+
118
+ Local install
113
119
  ```bash
114
120
  git clone https://github.com/dnth/active-vision.git
115
121
  cd active-vision
@@ -135,8 +141,8 @@ pip install -e .
135
141
 
136
142
  ## 🚀 Quickstart
137
143
 
138
- [![Open In Colab][colab_badge]](https://colab.research.google.com/github/dnth/active-vision/blob/main/nbs/imagenette/quickstart.ipynb)
139
- [![Open In Kaggle][kaggle_badge]](https://kaggle.com/kernels/welcome?src=https://github.com/dnth/active-vision/blob/main/nbs/imagenette/quickstart.ipynb)
144
+ [![Open In Colab][colab_badge]](https://colab.research.google.com/github/dnth/active-vision/blob/main/docs/quickstart.ipynb)
145
+ [![Open In Kaggle][kaggle_badge]](https://kaggle.com/kernels/welcome?src=https://github.com/dnth/active-vision/blob/main/docs/quickstart.ipynb)
140
146
 
141
147
  The following are code snippets for the active learning loop in active-vision. I recommend running the quickstart notebook in Colab or Kaggle to see the full workflow.
142
148
 
@@ -150,7 +156,7 @@ al = ActiveLearner(name="cycle-1")
150
156
  al.load_model(model="resnet18", pretrained=True)
151
157
 
152
158
  # Load dataset
153
- al.load_dataset(train_df, filepath_col="filepath", label_col="label", batch_size=8)
159
+ al.load_dataset(train_df, filepath_col="filepath", label_col="label")
154
160
 
155
161
  # Train model
156
162
  al.train(epochs=10, lr=5e-3)
@@ -275,7 +281,34 @@ I decided to stop the active learning loop at 1188 labeled images because the pe
275
281
  | 1188 | 96.57% | 13 | vit-base-patch16-224 | ✓ | [Link](https://github.com/dnth/active-vision/blob/main/nbs/eurosat_rgb/02_train.ipynb) |
276
282
 
277
283
 
278
- ## ➿ Workflow
284
+ ### Beans
285
+ - num classes: 3
286
+ - num images: 1034
287
+
288
+ | #Labeled Images | Evaluation Accuracy | Train Epochs | Model | Active Learning | Source |
289
+ |----------------: |--------------------: |-------------: |---------------------- |:---------------: |-------------------------------------------------------------------------------------------- |
290
+ | 380 | 95.31% | 13 | vit_small_patch16_224 | ✓ | [Link](https://github.com/dnth/active-vision/blob/main/nbs/beans/active_learning.ipynb) |
291
+ | 1034 | 98.43% | 13 | vit_small_patch16_224 | ❌ | [Link](https://github.com/dnth/active-vision/blob/main/nbs/beans/train_all.ipynb) |
292
+
293
+
294
+ ## 🧱 Sampling Approaches
295
+
296
+ Uncertainty and diversity sampling are most effective when combined. Some recommendations:
297
+
298
+ - Least Confidence Sampling with Cluster-Based Sampling: This approach first selects a large sample of the most uncertain items using least confidence sampling, and then applies cluster-based sampling to ensure diversity within that selection. This method helps to select data points that are both uncertain and representative of different clusters in the data.
299
+
300
+ - Uncertainty Sampling with Model-Based Outliers: This strategy combines uncertainty sampling to find items near the decision boundary with model-based outlier detection to identify items with features that are relatively unknown to the current model. This approach aims to maximize the model's current confusion by selecting items that are both uncertain and different from the current training data.
301
+
302
+ - Uncertainty Sampling with Model-Based Outliers and Clustering: To address the issue of the previous approach potentially oversampling similar items, clustering can be applied after using uncertainty sampling with model-based outliers to ensure diversity.
303
+
304
+ - Representative Sampling with Cluster-Based Sampling: This method addresses the limitation of basic representative sampling by independently clustering both the training data and the unlabeled data. It then identifies clusters that are most representative of the unlabeled data and oversamples from those clusters. This leads to a more diverse set of items compared to representative sampling alone.
305
+
306
+ - Sampling from the Highest-Entropy Cluster: This method combines clustering with uncertainty by selecting the cluster with the highest average uncertainty (using entropy). This method aims to sample data points from the cluster that straddles the decision boundary most closely.
307
+
308
+ - Combining Active Learning Scores: Rather than filtering the output of one sampling strategy with another, this approach combines the scores from different sampling strategies and ranks items based on an aggregate score. This allows for a more nuanced approach to selecting items.
309
+
310
+
311
+ ## ➿ Workflows
279
312
  This section describes a more detailed workflow for active learning. There are two workflows for active learning that we can use depending on the availability of labeled data.
280
313
 
281
314
  ### With unlabeled data
@@ -340,22 +373,6 @@ graph TD
340
373
  ```
341
374
 
342
375
 
376
+ ## 📚 Acknowledgements
343
377
 
344
- ## 🧱 Sampling Approaches
345
-
346
- Recommendation 1:
347
- - 10% randomly selected from unlabeled items.
348
- - 80% selected from the lowest confidence items.
349
- - 10% selected as outliers.
350
-
351
- Recommendation 2:
352
-
353
- - Sample 100 predicted images at 10–20% confidence.
354
- - Sample 100 predicted images at 20–30% confidence.
355
- - Sample 100 predicted images at 30–40% confidence, and so on.
356
-
357
-
358
- Uncertainty and diversity sampling are most effective when combined. For instance, you could first sample the most uncertain items using an uncertainty sampling method, then apply a diversity sampling method such as clustering to select a diverse set from the uncertain items.
359
-
360
- Ultimately, the right ratios can depend on the specific task and dataset.
361
-
378
+ This project is inspired by the book [Human-in-the-Loop Machine Learning by Robert Monarch](https://www.manning.com/books/human-in-the-loop-machine-learning).
@@ -0,0 +1,7 @@
1
+ active_vision/__init__.py,sha256=ztL09ANIuHmKmue2Uaui475201zOvb4sEYHNspIzDEA,43
2
+ active_vision/core.py,sha256=9Qwmrere_ryQKqFKLm0aRp-3drz2yBzHfm6jumjWAto,47765
3
+ active_vision/utils.py,sha256=L2nIUSohqCABqV8qjKNTHmxQoiom2DXMjce5aw2_Tjc,1902
4
+ active_vision-0.4.3.dist-info/METADATA,sha256=JLNqzChj2bhefMu9a4HSrimzImsYm1ePuJhJbTNUWe8,19798
5
+ active_vision-0.4.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ active_vision-0.4.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ active_vision-0.4.3.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- active_vision/__init__.py,sha256=vauWDAlrr6fiIylIKSzErXOEopRtTsBk8G4hC9418M0,43
2
- active_vision/core.py,sha256=ZDRylM3KsoLxy9qA9bld4WxzcKcyCwH8IJ1cFxtz5mE,41607
3
- active_vision-0.4.1.dist-info/METADATA,sha256=LpgLc_E7jJVXxUHrIPv-1RZq_CEE3enyb0O2PDZMrJM,17262
4
- active_vision-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
5
- active_vision-0.4.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
- active_vision-0.4.1.dist-info/RECORD,,