active-vision 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
active_vision/__init__.py CHANGED
@@ -1,3 +1,3 @@
- __version__ = "0.2.0"
+ __version__ = "0.3.0"

  from .core import *
active_vision/core.py CHANGED
@@ -3,6 +3,7 @@ from loguru import logger
  from fastai.vision.all import *
  import torch
  import numpy as np
+ import bisect

  import warnings
  from typing import Callable
@@ -56,7 +57,6 @@ class ActiveLearner:
  learner_path: str = None,
  ):
  logger.info(f"Loading dataset from {filepath_col} and {label_col}")
- self.train_set = df.copy()

  logger.info("Creating dataloaders")
  self.dls = ImageDataLoaders.from_df(
@@ -85,6 +85,8 @@ class ActiveLearner:
  self.dls, self.model, metrics=accuracy
  ).to_fp16()

+ self.train_set = self.learn.dls.train_ds.items
+ self.valid_set = self.learn.dls.valid_ds.items
  self.class_names = self.dls.vocab
  self.num_classes = self.dls.c
  logger.info("Done. Ready to train.")
@@ -136,16 +138,24 @@ class ActiveLearner:
  """
  logger.info(f"Running inference on {len(filepaths)} samples")
  test_dl = self.dls.test_dl(filepaths, bs=batch_size)
- preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
+
+ def identity(x):
+ return x
+
+ logits, _, class_idxs = self.learn.get_preds(
+ dl=test_dl, with_decoded=True, act=identity
+ )

  self.pred_df = pd.DataFrame(
  {
  "filepath": filepaths,
- "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
- "pred_conf": torch.max(preds, dim=1)[0].numpy(),
- "pred_raw": preds.numpy().tolist(),
+ "pred_label": [self.learn.dls.vocab[i] for i in class_idxs.numpy()],
+ "pred_conf": torch.max(F.softmax(logits, dim=1), dim=1)[0].numpy(),
+ "probs": F.softmax(logits, dim=1).numpy().tolist(),
+ "logits": logits.numpy().tolist(),
  }
  )
+
  return self.pred_df

  def evaluate(
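
The rewritten predict() passes an identity function as the activation to learn.get_preds(), so the returned tensors are raw logits rather than already-softmaxed outputs; the softmax is then applied explicitly to fill the new "pred_conf" and "probs" columns while the untouched logits go into "logits". A minimal standalone sketch of that bookkeeping, using a hypothetical logits tensor rather than the package's output:

# Sketch of the logits -> probabilities relationship used by the new predict().
# `logits` is a hypothetical example tensor, not data from the package.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.5, -1.0]])        # raw model outputs for one image
probs = F.softmax(logits, dim=1)                 # what lands in the "probs" column
pred_conf, class_idx = torch.max(probs, dim=1)   # "pred_conf" and the predicted class index
print(probs.tolist(), pred_conf.item(), class_idx.item())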
@@ -193,7 +203,7 @@ class ActiveLearner:
  logger.info(
  f"Using least confidence strategy to get top {num_samples} samples"
  )
- df.loc[:, "uncertainty_score"] = 1 - (df["pred_conf"]) / (
+ df.loc[:, "score"] = 1 - (df["pred_conf"]) / (
  self.num_classes - (self.num_classes - 1)
  )

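Since self.num_classes - (self.num_classes - 1) is always 1, the least-confidence score reduces to 1 - pred_conf. A standalone check with hypothetical values:

# The least-confidence expression simplifies to 1 - pred_conf (hypothetical values).
num_classes = 3
pred_conf = 0.75
score = 1 - pred_conf / (num_classes - (num_classes - 1))
assert score == 1 - pred_conf   # denominator is always 1
print(score)                    # 0.25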
@@ -201,12 +211,12 @@ class ActiveLearner:
  logger.info(
  f"Using margin of confidence strategy to get top {num_samples} samples"
  )
- if len(df["pred_raw"].iloc[0]) < 2:
- logger.error("pred_raw has less than 2 elements")
- raise ValueError("pred_raw has less than 2 elements")
+ if len(df["probs"].iloc[0]) < 2:
+ logger.error("probs has less than 2 elements")
+ raise ValueError("probs has less than 2 elements")

  # Calculate uncertainty score as 1 - (difference between top two predictions)
- df.loc[:, "uncertainty_score"] = df["pred_raw"].apply(
+ df.loc[:, "score"] = df["probs"].apply(
  lambda x: 1 - (np.sort(x)[-1] - np.sort(x)[-2])
  )

@@ -214,12 +224,12 @@ class ActiveLearner:
  logger.info(
  f"Using ratio of confidence strategy to get top {num_samples} samples"
  )
- if len(df["pred_raw"].iloc[0]) < 2:
- logger.error("pred_raw has less than 2 elements")
- raise ValueError("pred_raw has less than 2 elements")
+ if len(df["probs"].iloc[0]) < 2:
+ logger.error("probs has less than 2 elements")
+ raise ValueError("probs has less than 2 elements")

  # Calculate uncertainty score as ratio of top two predictions
- df.loc[:, "uncertainty_score"] = df["pred_raw"].apply(
+ df.loc[:, "score"] = df["probs"].apply(
  lambda x: np.sort(x)[-2] / np.sort(x)[-1]
  )

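Both the margin and ratio strategies compare the two largest probabilities: margin scores 1 minus their gap, ratio scores the second-highest over the highest, and in both cases a value close to 1 means the model can barely separate its top two choices. A small standalone sketch over a hypothetical probs row:

# Margin and ratio of confidence for one hypothetical softmax output.
import numpy as np

probs = [0.45, 0.40, 0.15]
top1, top2 = np.sort(probs)[-1], np.sort(probs)[-2]
margin_score = 1 - (top1 - top2)   # ~0.95: top two classes nearly tied
ratio_score = top2 / top1          # ~0.89: also close to 1
print(margin_score, ratio_score)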
@@ -227,25 +237,25 @@ class ActiveLearner:
  logger.info(f"Using entropy strategy to get top {num_samples} samples")

  # Calculate uncertainty score as entropy of the prediction
- df.loc[:, "uncertainty_score"] = df["pred_raw"].apply(
- lambda x: -np.sum(x * np.log2(x))
- )
+ df.loc[:, "score"] = df["probs"].apply(lambda x: -np.sum(x * np.log2(x)))

  # Normalize the uncertainty score to be between 0 and 1 by dividing by log2 of the number of classes
- df.loc[:, "uncertainty_score"] = df["uncertainty_score"] / np.log2(
- self.num_classes
- )
+ df.loc[:, "score"] = df["score"] / np.log2(self.num_classes)

  else:
  logger.error(f"Unknown strategy: {strategy}")
  raise ValueError(f"Unknown strategy: {strategy}")

- df = df[
- ["filepath", "pred_label", "pred_conf", "uncertainty_score", "pred_raw"]
- ]
- return df.sort_values(by="uncertainty_score", ascending=False).head(num_samples)
+ df = df[["filepath", "pred_label", "pred_conf", "score", "probs", "logits"]]
+
+ df["score"] = df["score"].map("{:.4f}".format)
+ df["pred_conf"] = df["pred_conf"].map("{:.4f}".format)
+
+ return df.sort_values(by="score", ascending=False).head(num_samples)

- def sample_diverse(self, df: pd.DataFrame, num_samples: int):
+ def sample_diverse(
+ self, df: pd.DataFrame, num_samples: int, strategy: str = "model-based-outlier"
+ ):
  """
  Sample top `num_samples` diverse samples. Returns a df with filepaths and predicted labels, and confidence scores.

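The entropy strategy computes the Shannon entropy of the probability vector in bits and divides by log2(num_classes), so a uniform prediction scores 1 and a confident one scores close to 0. A standalone check with hypothetical 4-class predictions:

# Normalized-entropy score for two hypothetical 4-class predictions.
import numpy as np

num_classes = 4
uniform = np.array([0.25, 0.25, 0.25, 0.25])
confident = np.array([0.85, 0.05, 0.05, 0.05])

for probs in (uniform, confident):
    score = -np.sum(probs * np.log2(probs)) / np.log2(num_classes)
    print(round(float(score), 3))   # 1.0 for uniform, ~0.424 for the confident prediction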
@@ -253,9 +263,63 @@ class ActiveLearner:
  - model-based-outlier: Get top `num_samples` samples with lowest activation of the model's last layer.
  - cluster-based: Get top `num_samples` samples with the highest distance to the nearest neighbor.
  - representative: Get top `num_samples` samples with the highest distance to the centroid of the training set.
+
  """
- logger.error("Diverse sampling strategy not implemented")
- raise NotImplementedError("Diverse sampling strategy not implemented")
+ # Remove samples that is already in the training set
+ df = df[~df["filepath"].isin(self.train_set["filepath"])].copy()
+
+ if strategy == "model-based-outlier":
+ logger.info(
+ f"Using model-based outlier strategy to get top {num_samples} samples"
+ )
+
+ # Get the activations for all items in the validation set.
+ valid_set_preds = self.predict(self.valid_set["filepath"].tolist())
+
+ # Store logits for each class in a list instead of dict
+ validation_class_logits = [
+ sorted(
+ valid_set_preds["logits"].apply(lambda x: x[i]).tolist(),
+ reverse=True,
+ )
+ for i in range(self.num_classes)
+ ]
+
+ # Get the logits for the unlabeled set
+ unlabeled_set_preds = self.predict(df["filepath"].tolist())
+
+ # For each element in the unlabeled set logits, compare it to the validation set ranked logits and get the position in the ranked logits
+ unlabeled_set_logits = []
+ for idx, row in unlabeled_set_preds.iterrows():
+ logits = row["logits"]
+ # For each class, find where this sample's logit would rank in the validation set
+ ranks = []
+ for class_idx in range(self.num_classes):
+ class_logit = logits[class_idx]
+ ranked_logits = validation_class_logits[
+ class_idx
+ ] # Access by index instead of dict key
+ # Find position where this logit would be inserted to maintain sorted order
+ # Now using bisect_left directly since logits are sorted high to low
+ rank = bisect.bisect_left(ranked_logits, class_logit)
+ ranks.append(
+ rank / len(ranked_logits)
+ ) # Normalize rank to 0-1 range
+
+ # Average rank across all classes - lower means more outlier-like
+ avg_rank = np.mean(ranks)
+ unlabeled_set_logits.append(avg_rank)
+
+ # Add outlier scores to dataframe
+ df.loc[:, "score"] = unlabeled_set_logits
+
+ df = df[["filepath", "pred_label", "pred_conf", "score", "probs", "logits"]]
+
+ df["score"] = df["score"].map("{:.4f}".format)
+ df["pred_conf"] = df["pred_conf"].map("{:.4f}".format)
+
+ # Sort by score ascending higher rank = more outlier-like compared to the validation set
+ return df.sort_values(by="score", ascending=False).head(num_samples)

  def sample_random(self, df: pd.DataFrame, num_samples: int, seed: int = None):
  """
@@ -309,7 +373,7 @@ class ActiveLearner:
  type="filepath",
  label="Image",
  value=filepaths[0],
- height=500,
+ height=510,
  )

  # Add bar plot with top 5 predictions
@@ -320,11 +384,11 @@ class ActiveLearner:
  title="Top 5 Predictions",
  x_lim=[0, 1],
  value=None
- if "pred_raw" not in df.columns
+ if "probs" not in df.columns
  else pd.DataFrame(
  {
  "class": self.class_names,
- "probability": df["pred_raw"].iloc[0],
+ "probability": df["probs"].iloc[0],
  }
  ).nlargest(5, "probability"),
  )
@@ -332,18 +396,27 @@ class ActiveLearner:
  filename = gr.Textbox(
  label="Filename", value=filepaths[0], interactive=False
  )
-
- pred_label = gr.Textbox(
- label="Predicted Label",
- value=df["pred_label"].iloc[0]
- if "pred_label" in df.columns
- else "",
- interactive=False,
- )
- pred_conf = gr.Textbox(
- label="Confidence",
- value=f"{df['pred_conf'].iloc[0]:.2%}"
- if "pred_conf" in df.columns
+ with gr.Row():
+ pred_label = gr.Textbox(
+ label="Predicted Label",
+ value=df["pred_label"].iloc[0]
+ if "pred_label" in df.columns
+ else "",
+ interactive=False,
+ )
+
+ pred_conf = gr.Textbox(
+ label="Confidence",
+ value=df["pred_conf"].iloc[0]
+ if "pred_conf" in df.columns
+ else "",
+ interactive=False,
+ )
+
+ sample_score = gr.Textbox(
+ label="Sample Score [0-1] - Indicates how informative the sample is. Higher means more informative.",
+ value=df["score"].iloc[0]
+ if "score" in df.columns
  else "",
  interactive=False,
  )
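
The labelling UI now shows the sampling score alongside the predicted label and confidence, laying the three read-only Textboxes out in a gr.Row; each navigation callback then lists sample_score in its outputs so the box refreshes together with the image. A minimal, hypothetical Gradio sketch of that pattern (not the package's full annotation app):

# Read-only textboxes in a row, refreshed by a callback - a minimal hypothetical demo.
import gradio as gr

samples = [("cat.jpg", "cat", "0.9123", "0.2871"), ("dog.jpg", "dog", "0.6410", "0.8450")]

def show(idx):
    _, label, conf, score = samples[int(idx) % len(samples)]
    return label, conf, score

with gr.Blocks() as demo:
    idx = gr.Number(value=0, label="Index")
    with gr.Row():                        # side-by-side read-only fields
        pred_label = gr.Textbox(label="Predicted Label", interactive=False)
        pred_conf = gr.Textbox(label="Confidence", interactive=False)
        sample_score = gr.Textbox(label="Sample Score", interactive=False)
    idx.change(show, inputs=[idx], outputs=[pred_label, pred_conf, sample_score])

# demo.launch()  # uncomment to run locally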
@@ -387,6 +460,7 @@ class ActiveLearner:
  current_index,
  progress,
  pred_plot,
+ sample_score,
  ],
  )

@@ -476,11 +550,11 @@ class ActiveLearner:
  if 0 <= next_idx < len(filepaths):
  plot_data = (
  None
- if "pred_raw" not in df.columns
+ if "probs" not in df.columns
  else pd.DataFrame(
  {
  "class": self.class_names,
- "probability": df["pred_raw"].iloc[next_idx],
+ "probability": df["probs"].iloc[next_idx],
  }
  ).nlargest(5, "probability")
  )
@@ -490,7 +564,7 @@ class ActiveLearner:
  df["pred_label"].iloc[next_idx]
  if "pred_label" in df.columns
  else "",
- f"{df['pred_conf'].iloc[next_idx]:.2%}"
+ df["pred_conf"].iloc[next_idx]
  if "pred_conf" in df.columns
  else "",
  df["pred_label"].iloc[next_idx]
@@ -499,14 +573,15 @@ class ActiveLearner:
  next_idx,
  next_idx,
  plot_data,
+ df["score"].iloc[next_idx] if "score" in df.columns else "",
  )
  plot_data = (
  None
- if "pred_raw" not in df.columns
+ if "probs" not in df.columns
  else pd.DataFrame(
  {
  "class": self.class_names,
- "probability": df["pred_raw"].iloc[current_idx],
+ "probability": df["probs"].iloc[current_idx],
  }
  ).nlargest(5, "probability")
  )
@@ -516,7 +591,7 @@ class ActiveLearner:
  df["pred_label"].iloc[current_idx]
  if "pred_label" in df.columns
  else "",
- f"{df['pred_conf'].iloc[current_idx]:.2%}"
+ df["pred_conf"].iloc[current_idx]
  if "pred_conf" in df.columns
  else "",
  df["pred_label"].iloc[current_idx]
@@ -525,6 +600,7 @@ class ActiveLearner:
  current_idx,
  current_idx,
  plot_data,
+ df["score"].iloc[current_idx] if "score" in df.columns else "",
  )

  def save_and_next(current_idx, selected_category):
@@ -534,11 +610,11 @@ class ActiveLearner:
  if selected_category is None:
  plot_data = (
  None
- if "pred_raw" not in df.columns
+ if "probs" not in df.columns
  else pd.DataFrame(
  {
  "class": self.class_names,
- "probability": df["pred_raw"].iloc[current_idx],
+ "probability": df["probs"].iloc[current_idx],
  }
  ).nlargest(5, "probability")
  )
@@ -548,7 +624,7 @@ class ActiveLearner:
  df["pred_label"].iloc[current_idx]
  if "pred_label" in df.columns
  else "",
- f"{df['pred_conf'].iloc[current_idx]:.2%}"
+ df["pred_conf"].iloc[current_idx]
  if "pred_conf" in df.columns
  else "",
  df["pred_label"].iloc[current_idx]
@@ -557,6 +633,7 @@ class ActiveLearner:
  current_idx,
  current_idx,
  plot_data,
+ df["score"].iloc[current_idx] if "score" in df.columns else "",
  )

  # Save the current annotation
@@ -568,11 +645,11 @@ class ActiveLearner:
  if next_idx >= len(filepaths):
  plot_data = (
  None
- if "pred_raw" not in df.columns
+ if "probs" not in df.columns
  else pd.DataFrame(
  {
  "class": self.class_names,
- "probability": df["pred_raw"].iloc[current_idx],
+ "probability": df["probs"].iloc[current_idx],
  }
  ).nlargest(5, "probability")
  )
@@ -582,7 +659,7 @@ class ActiveLearner:
  df["pred_label"].iloc[current_idx]
  if "pred_label" in df.columns
  else "",
- f"{df['pred_conf'].iloc[current_idx]:.2%}"
+ df["pred_conf"].iloc[current_idx]
  if "pred_conf" in df.columns
  else "",
  df["pred_label"].iloc[current_idx]
@@ -591,15 +668,16 @@ class ActiveLearner:
  current_idx,
  current_idx,
  plot_data,
+ df["score"].iloc[current_idx] if "score" in df.columns else "",
  )

  plot_data = (
  None
- if "pred_raw" not in df.columns
+ if "probs" not in df.columns
  else pd.DataFrame(
  {
  "class": self.class_names,
- "probability": df["pred_raw"].iloc[next_idx],
+ "probability": df["probs"].iloc[next_idx],
  }
  ).nlargest(5, "probability")
  )
@@ -609,15 +687,14 @@ class ActiveLearner:
  df["pred_label"].iloc[next_idx]
  if "pred_label" in df.columns
  else "",
- f"{df['pred_conf'].iloc[next_idx]:.2%}"
- if "pred_conf" in df.columns
- else "",
+ df["pred_conf"].iloc[next_idx] if "pred_conf" in df.columns else "",
  df["pred_label"].iloc[next_idx]
  if "pred_label" in df.columns
  else None,
  next_idx,
  next_idx,
  plot_data,
+ df["score"].iloc[next_idx] if "score" in df.columns else "",
  )

  def convert_csv_to_parquet():
@@ -643,6 +720,7 @@ class ActiveLearner:
  current_index,
  progress,
  pred_plot,
+ sample_score,
  ],
  )

@@ -658,6 +736,7 @@ class ActiveLearner:
  current_index,
  progress,
  pred_plot,
+ sample_score,
  ],
  )

@@ -673,6 +752,7 @@ class ActiveLearner:
  current_index,
  progress,
  pred_plot,
+ sample_score,
  ],
  )

active_vision-0.2.0.dist-info/METADATA → active_vision-0.3.0.dist-info/METADATA CHANGED
@@ -1,7 +1,7 @@
  Metadata-Version: 2.2
  Name: active-vision
- Version: 0.2.0
- Summary: Active learning for edge vision.
+ Version: 0.3.0
+ Summary: Active learning for computer vision.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
@@ -53,7 +53,8 @@ Uncertainty Sampling:

  Diverse Sampling:
  - [X] Random sampling
- - [ ] Model-based outlier
+ - [X] Model-based outlier
+ - [ ] Embeddings-based outlier
  - [ ] Cluster-based
  - [ ] Representative

active_vision-0.3.0.dist-info/RECORD ADDED
@@ -0,0 +1,7 @@
+ active_vision/__init__.py,sha256=hbFzCBVh_5qm0XuZh_I07cRmmDZ_cDx5n-6mf-tFB6s,43
+ active_vision/core.py,sha256=8kYsA0cHNty1oOXg0yvvlT2Tau7m_AS9DJ7Sc0RB30k,31096
+ active_vision-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ active_vision-0.3.0.dist-info/METADATA,sha256=B8t28CcxeXFLAonjFV6zoVwAAOOR1mSn_YtJVEzKqcg,15710
+ active_vision-0.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ active_vision-0.3.0.dist-info/top_level.txt,sha256=7qUQvccN2UU63z5S9vrgJmqK-8sFGrtpf1e9Z86nihE,14
+ active_vision-0.3.0.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- active_vision/__init__.py,sha256=SxR6MPyULKlvx-86S3NIk46Tz1xlN-g_vI_aW3LitG4,43
2
- active_vision/core.py,sha256=4Nl8e3isinIlzcD6bCbG9TTGiuG0PQkKNUIvnAsbaTY,27373
3
- active_vision-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
4
- active_vision-0.2.0.dist-info/METADATA,sha256=3XvDTC1Cnxd3rIUUXyY8MwTgKGcnncN9D2VvKnkw1jQ,15675
5
- active_vision-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
6
- active_vision-0.2.0.dist-info/top_level.txt,sha256=7qUQvccN2UU63z5S9vrgJmqK-8sFGrtpf1e9Z86nihE,14
7
- active_vision-0.2.0.dist-info/RECORD,,