PyPI - active-vision - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

active-vision 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

active_vision/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
-__version__ = "0.2.0"
+__version__ = "0.3.0"
 from .core import *

active_vision/core.py CHANGED Viewed

@@ -3,6 +3,7 @@ from loguru import logger
 from fastai.vision.all import *
 import torch
 import numpy as np
+import bisect
 import warnings
 from typing import Callable
@@ -56,7 +57,6 @@ class ActiveLearner:
         learner_path: str = None,
     ):
         logger.info(f"Loading dataset from {filepath_col} and {label_col}")
-        self.train_set = df.copy()
         logger.info("Creating dataloaders")
         self.dls = ImageDataLoaders.from_df(
@@ -85,6 +85,8 @@ class ActiveLearner:
                 self.dls, self.model, metrics=accuracy
             ).to_fp16()
+        self.train_set = self.learn.dls.train_ds.items
+        self.valid_set = self.learn.dls.valid_ds.items
         self.class_names = self.dls.vocab
         self.num_classes = self.dls.c
         logger.info("Done. Ready to train.")
@@ -136,16 +138,24 @@ class ActiveLearner:
         """
         logger.info(f"Running inference on {len(filepaths)} samples")
         test_dl = self.dls.test_dl(filepaths, bs=batch_size)
-        preds, _, cls_preds = self.learn.get_preds(dl=test_dl, with_decoded=True)
+        def identity(x):
+            return x
+        logits, _, class_idxs = self.learn.get_preds(
+            dl=test_dl, with_decoded=True, act=identity
+        )
         self.pred_df = pd.DataFrame(
             {
                 "filepath": filepaths,
-                "pred_label": [self.learn.dls.vocab[i] for i in cls_preds.numpy()],
-                "pred_conf": torch.max(preds, dim=1)[0].numpy(),
-                "pred_raw": preds.numpy().tolist(),
+                "pred_label": [self.learn.dls.vocab[i] for i in class_idxs.numpy()],
+                "pred_conf": torch.max(F.softmax(logits, dim=1), dim=1)[0].numpy(),
+                "probs": F.softmax(logits, dim=1).numpy().tolist(),
+                "logits": logits.numpy().tolist(),
             }
         )
         return self.pred_df
     def evaluate(
@@ -193,7 +203,7 @@ class ActiveLearner:
             logger.info(
                 f"Using least confidence strategy to get top {num_samples} samples"
             )
-            df.loc[:, "uncertainty_score"] = 1 - (df["pred_conf"]) / (
+            df.loc[:, "score"] = 1 - (df["pred_conf"]) / (
                 self.num_classes - (self.num_classes - 1)
             )
@@ -201,12 +211,12 @@ class ActiveLearner:
             logger.info(
                 f"Using margin of confidence strategy to get top {num_samples} samples"
             )
-            if len(df["pred_raw"].iloc[0]) < 2:
-                logger.error("pred_raw has less than 2 elements")
-                raise ValueError("pred_raw has less than 2 elements")
+            if len(df["probs"].iloc[0]) < 2:
+                logger.error("probs has less than 2 elements")
+                raise ValueError("probs has less than 2 elements")
             # Calculate uncertainty score as 1 - (difference between top two predictions)
-            df.loc[:, "uncertainty_score"] = df["pred_raw"].apply(
+            df.loc[:, "score"] = df["probs"].apply(
                 lambda x: 1 - (np.sort(x)[-1] - np.sort(x)[-2])
             )
@@ -214,12 +224,12 @@ class ActiveLearner:
             logger.info(
                 f"Using ratio of confidence strategy to get top {num_samples} samples"
             )
-            if len(df["pred_raw"].iloc[0]) < 2:
-                logger.error("pred_raw has less than 2 elements")
-                raise ValueError("pred_raw has less than 2 elements")
+            if len(df["probs"].iloc[0]) < 2:
+                logger.error("probs has less than 2 elements")
+                raise ValueError("probs has less than 2 elements")
             # Calculate uncertainty score as ratio of top two predictions
-            df.loc[:, "uncertainty_score"] = df["pred_raw"].apply(
+            df.loc[:, "score"] = df["probs"].apply(
                 lambda x: np.sort(x)[-2] / np.sort(x)[-1]
             )
@@ -227,25 +237,25 @@ class ActiveLearner:
             logger.info(f"Using entropy strategy to get top {num_samples} samples")
             # Calculate uncertainty score as entropy of the prediction
-            df.loc[:, "uncertainty_score"] = df["pred_raw"].apply(
-                lambda x: -np.sum(x * np.log2(x))
-            )
+            df.loc[:, "score"] = df["probs"].apply(lambda x: -np.sum(x * np.log2(x)))
             # Normalize the uncertainty score to be between 0 and 1 by dividing by log2 of the number of classes
-            df.loc[:, "uncertainty_score"] = df["uncertainty_score"] / np.log2(
-                self.num_classes
-            )
+            df.loc[:, "score"] = df["score"] / np.log2(self.num_classes)
         else:
             logger.error(f"Unknown strategy: {strategy}")
             raise ValueError(f"Unknown strategy: {strategy}")
-        df = df[
-            ["filepath", "pred_label", "pred_conf", "uncertainty_score", "pred_raw"]
-        ]
-        return df.sort_values(by="uncertainty_score", ascending=False).head(num_samples)
+        df = df[["filepath", "pred_label", "pred_conf", "score", "probs", "logits"]]
+        df["score"] = df["score"].map("{:.4f}".format)
+        df["pred_conf"] = df["pred_conf"].map("{:.4f}".format)
+        return df.sort_values(by="score", ascending=False).head(num_samples)
-    def sample_diverse(self, df: pd.DataFrame, num_samples: int):
+    def sample_diverse(
+        self, df: pd.DataFrame, num_samples: int, strategy: str = "model-based-outlier"
+    ):
         """
         Sample top `num_samples` diverse samples. Returns a df with filepaths and predicted labels, and confidence scores.
@@ -253,9 +263,63 @@ class ActiveLearner:
         - model-based-outlier: Get top `num_samples` samples with lowest activation of the model's last layer.
         - cluster-based: Get top `num_samples` samples with the highest distance to the nearest neighbor.
         - representative: Get top `num_samples` samples with the highest distance to the centroid of the training set.
         """
-        logger.error("Diverse sampling strategy not implemented")
-        raise NotImplementedError("Diverse sampling strategy not implemented")
+        # Remove samples that is already in the training set
+        df = df[~df["filepath"].isin(self.train_set["filepath"])].copy()
+        if strategy == "model-based-outlier":
+            logger.info(
+                f"Using model-based outlier strategy to get top {num_samples} samples"
+            )
+            # Get the activations for all items in the validation set.
+            valid_set_preds = self.predict(self.valid_set["filepath"].tolist())
+            # Store logits for each class in a list instead of dict
+            validation_class_logits = [
+                sorted(
+                    valid_set_preds["logits"].apply(lambda x: x[i]).tolist(),
+                    reverse=True,
+                )
+                for i in range(self.num_classes)
+            ]
+            # Get the logits for the unlabeled set
+            unlabeled_set_preds = self.predict(df["filepath"].tolist())
+            # For each element in the unlabeled set logits, compare it to the validation set ranked logits and get the position in the ranked logits
+            unlabeled_set_logits = []
+            for idx, row in unlabeled_set_preds.iterrows():
+                logits = row["logits"]
+                # For each class, find where this sample's logit would rank in the validation set
+                ranks = []
+                for class_idx in range(self.num_classes):
+                    class_logit = logits[class_idx]
+                    ranked_logits = validation_class_logits[
+                        class_idx
+                    ]  # Access by index instead of dict key
+                    # Find position where this logit would be inserted to maintain sorted order
+                    # Now using bisect_left directly since logits are sorted high to low
+                    rank = bisect.bisect_left(ranked_logits, class_logit)
+                    ranks.append(
+                        rank / len(ranked_logits)
+                    )  # Normalize rank to 0-1 range
+                # Average rank across all classes - lower means more outlier-like
+                avg_rank = np.mean(ranks)
+                unlabeled_set_logits.append(avg_rank)
+            # Add outlier scores to dataframe
+            df.loc[:, "score"] = unlabeled_set_logits
+            df = df[["filepath", "pred_label", "pred_conf", "score", "probs", "logits"]]
+            df["score"] = df["score"].map("{:.4f}".format)
+            df["pred_conf"] = df["pred_conf"].map("{:.4f}".format)
+            # Sort by score ascending higher rank = more outlier-like compared to the validation set
+            return df.sort_values(by="score", ascending=False).head(num_samples)
     def sample_random(self, df: pd.DataFrame, num_samples: int, seed: int = None):
         """
@@ -309,7 +373,7 @@ class ActiveLearner:
                             type="filepath",
                             label="Image",
                             value=filepaths[0],
-                            height=500,
+                            height=510,
                         )
                         # Add bar plot with top 5 predictions
@@ -320,11 +384,11 @@ class ActiveLearner:
                                 title="Top 5 Predictions",
                                 x_lim=[0, 1],
                                 value=None
-                                if "pred_raw" not in df.columns
+                                if "probs" not in df.columns
                                 else pd.DataFrame(
                                     {
                                         "class": self.class_names,
-                                        "probability": df["pred_raw"].iloc[0],
+                                        "probability": df["probs"].iloc[0],
                                     }
                                 ).nlargest(5, "probability"),
                             )
@@ -332,18 +396,27 @@ class ActiveLearner:
                             filename = gr.Textbox(
                                 label="Filename", value=filepaths[0], interactive=False
                             )
-                            pred_label = gr.Textbox(
-                                label="Predicted Label",
-                                value=df["pred_label"].iloc[0]
-                                if "pred_label" in df.columns
-                                else "",
-                                interactive=False,
-                            )
-                            pred_conf = gr.Textbox(
-                                label="Confidence",
-                                value=f"{df['pred_conf'].iloc[0]:.2%}"
-                                if "pred_conf" in df.columns
+                            with gr.Row():
+                                pred_label = gr.Textbox(
+                                    label="Predicted Label",
+                                    value=df["pred_label"].iloc[0]
+                                    if "pred_label" in df.columns
+                                    else "",
+                                    interactive=False,
+                                )
+                                pred_conf = gr.Textbox(
+                                    label="Confidence",
+                                    value=df["pred_conf"].iloc[0]
+                                    if "pred_conf" in df.columns
+                                    else "",
+                                    interactive=False,
+                                )
+                            sample_score = gr.Textbox(
+                                label="Sample Score [0-1] - Indicates how informative the sample is. Higher means more informative.",
+                                value=df["score"].iloc[0]
+                                if "score" in df.columns
                                 else "",
                                 interactive=False,
                             )
@@ -387,6 +460,7 @@ class ActiveLearner:
                             current_index,
                             progress,
                             pred_plot,
+                            sample_score,
                         ],
                     )
@@ -476,11 +550,11 @@ class ActiveLearner:
                 if 0 <= next_idx < len(filepaths):
                     plot_data = (
                         None
-                        if "pred_raw" not in df.columns
+                        if "probs" not in df.columns
                         else pd.DataFrame(
                             {
                                 "class": self.class_names,
-                                "probability": df["pred_raw"].iloc[next_idx],
+                                "probability": df["probs"].iloc[next_idx],
                             }
                         ).nlargest(5, "probability")
                     )
@@ -490,7 +564,7 @@ class ActiveLearner:
                         df["pred_label"].iloc[next_idx]
                         if "pred_label" in df.columns
                         else "",
-                        f"{df['pred_conf'].iloc[next_idx]:.2%}"
+                        df["pred_conf"].iloc[next_idx]
                         if "pred_conf" in df.columns
                         else "",
                         df["pred_label"].iloc[next_idx]
@@ -499,14 +573,15 @@ class ActiveLearner:
                         next_idx,
                         next_idx,
                         plot_data,
+                        df["score"].iloc[next_idx] if "score" in df.columns else "",
                     )
                 plot_data = (
                     None
-                    if "pred_raw" not in df.columns
+                    if "probs" not in df.columns
                     else pd.DataFrame(
                         {
                             "class": self.class_names,
-                            "probability": df["pred_raw"].iloc[current_idx],
+                            "probability": df["probs"].iloc[current_idx],
                         }
                     ).nlargest(5, "probability")
                 )
@@ -516,7 +591,7 @@ class ActiveLearner:
                     df["pred_label"].iloc[current_idx]
                     if "pred_label" in df.columns
                     else "",
-                    f"{df['pred_conf'].iloc[current_idx]:.2%}"
+                    df["pred_conf"].iloc[current_idx]
                     if "pred_conf" in df.columns
                     else "",
                     df["pred_label"].iloc[current_idx]
@@ -525,6 +600,7 @@ class ActiveLearner:
                     current_idx,
                     current_idx,
                     plot_data,
+                    df["score"].iloc[current_idx] if "score" in df.columns else "",
                 )
             def save_and_next(current_idx, selected_category):
@@ -534,11 +610,11 @@ class ActiveLearner:
                 if selected_category is None:
                     plot_data = (
                         None
-                        if "pred_raw" not in df.columns
+                        if "probs" not in df.columns
                         else pd.DataFrame(
                             {
                                 "class": self.class_names,
-                                "probability": df["pred_raw"].iloc[current_idx],
+                                "probability": df["probs"].iloc[current_idx],
                             }
                         ).nlargest(5, "probability")
                     )
@@ -548,7 +624,7 @@ class ActiveLearner:
                         df["pred_label"].iloc[current_idx]
                         if "pred_label" in df.columns
                         else "",
-                        f"{df['pred_conf'].iloc[current_idx]:.2%}"
+                        df["pred_conf"].iloc[current_idx]
                         if "pred_conf" in df.columns
                         else "",
                         df["pred_label"].iloc[current_idx]
@@ -557,6 +633,7 @@ class ActiveLearner:
                         current_idx,
                         current_idx,
                         plot_data,
+                        df["score"].iloc[current_idx] if "score" in df.columns else "",
                     )
                 # Save the current annotation
@@ -568,11 +645,11 @@ class ActiveLearner:
                 if next_idx >= len(filepaths):
                     plot_data = (
                         None
-                        if "pred_raw" not in df.columns
+                        if "probs" not in df.columns
                         else pd.DataFrame(
                             {
                                 "class": self.class_names,
-                                "probability": df["pred_raw"].iloc[current_idx],
+                                "probability": df["probs"].iloc[current_idx],
                             }
                         ).nlargest(5, "probability")
                     )
@@ -582,7 +659,7 @@ class ActiveLearner:
                         df["pred_label"].iloc[current_idx]
                         if "pred_label" in df.columns
                         else "",
-                        f"{df['pred_conf'].iloc[current_idx]:.2%}"
+                        df["pred_conf"].iloc[current_idx]
                         if "pred_conf" in df.columns
                         else "",
                         df["pred_label"].iloc[current_idx]
@@ -591,15 +668,16 @@ class ActiveLearner:
                         current_idx,
                         current_idx,
                         plot_data,
+                        df["score"].iloc[current_idx] if "score" in df.columns else "",
                     )
                 plot_data = (
                     None
-                    if "pred_raw" not in df.columns
+                    if "probs" not in df.columns
                     else pd.DataFrame(
                         {
                             "class": self.class_names,
-                            "probability": df["pred_raw"].iloc[next_idx],
+                            "probability": df["probs"].iloc[next_idx],
                         }
                     ).nlargest(5, "probability")
                 )
@@ -609,15 +687,14 @@ class ActiveLearner:
                     df["pred_label"].iloc[next_idx]
                     if "pred_label" in df.columns
                     else "",
-                    f"{df['pred_conf'].iloc[next_idx]:.2%}"
-                    if "pred_conf" in df.columns
-                    else "",
+                    df["pred_conf"].iloc[next_idx] if "pred_conf" in df.columns else "",
                     df["pred_label"].iloc[next_idx]
                     if "pred_label" in df.columns
                     else None,
                     next_idx,
                     next_idx,
                     plot_data,
+                    df["score"].iloc[next_idx] if "score" in df.columns else "",
                 )
             def convert_csv_to_parquet():
@@ -643,6 +720,7 @@ class ActiveLearner:
                     current_index,
                     progress,
                     pred_plot,
+                    sample_score,
                 ],
             )
@@ -658,6 +736,7 @@ class ActiveLearner:
                     current_index,
                     progress,
                     pred_plot,
+                    sample_score,
                 ],
             )
@@ -673,6 +752,7 @@ class ActiveLearner:
                     current_index,
                     progress,
                     pred_plot,
+                    sample_score,
                 ],
             )

{active_vision-0.2.0.dist-info → active_vision-0.3.0.dist-info}/METADATA RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.2
 Name: active-vision
-Version: 0.2.0
-Summary: Active learning for edge vision.
+Version: 0.3.0
+Summary: Active learning for computer vision.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
@@ -53,7 +53,8 @@ Uncertainty Sampling:
 Diverse Sampling:
 - [X] Random sampling
-- [ ] Model-based outlier
+- [X] Model-based outlier
+- [ ] Embeddings-based outlier
 - [ ] Cluster-based
 - [ ] Representative

active_vision-0.3.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+active_vision/__init__.py,sha256=hbFzCBVh_5qm0XuZh_I07cRmmDZ_cDx5n-6mf-tFB6s,43
+active_vision/core.py,sha256=8kYsA0cHNty1oOXg0yvvlT2Tau7m_AS9DJ7Sc0RB30k,31096
+active_vision-0.3.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+active_vision-0.3.0.dist-info/METADATA,sha256=B8t28CcxeXFLAonjFV6zoVwAAOOR1mSn_YtJVEzKqcg,15710
+active_vision-0.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+active_vision-0.3.0.dist-info/top_level.txt,sha256=7qUQvccN2UU63z5S9vrgJmqK-8sFGrtpf1e9Z86nihE,14
+active_vision-0.3.0.dist-info/RECORD,,

active_vision-0.2.0.dist-info/RECORD DELETED Viewed

@@ -1,7 +0,0 @@
-active_vision/__init__.py,sha256=SxR6MPyULKlvx-86S3NIk46Tz1xlN-g_vI_aW3LitG4,43
-active_vision/core.py,sha256=4Nl8e3isinIlzcD6bCbG9TTGiuG0PQkKNUIvnAsbaTY,27373
-active_vision-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-active_vision-0.2.0.dist-info/METADATA,sha256=3XvDTC1Cnxd3rIUUXyY8MwTgKGcnncN9D2VvKnkw1jQ,15675
-active_vision-0.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-active_vision-0.2.0.dist-info/top_level.txt,sha256=7qUQvccN2UU63z5S9vrgJmqK-8sFGrtpf1e9Z86nihE,14
-active_vision-0.2.0.dist-info/RECORD,,

{active_vision-0.2.0.dist-info → active_vision-0.3.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{active_vision-0.2.0.dist-info → active_vision-0.3.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{active_vision-0.2.0.dist-info → active_vision-0.3.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

active-vision 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

active-vision 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl