britekit 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- britekit/__about__.py +1 -1
- britekit/__init__.py +6 -2
- britekit/cli.py +6 -1
- britekit/commands/__init__.py +2 -1
- britekit/commands/_analyze.py +12 -10
- britekit/commands/_audioset.py +8 -8
- britekit/commands/_calibrate.py +8 -8
- britekit/commands/_ckpt_ops.py +6 -6
- britekit/commands/_db_add.py +12 -12
- britekit/commands/_db_delete.py +15 -15
- britekit/commands/_embed.py +4 -4
- britekit/commands/_ensemble.py +7 -7
- britekit/commands/_extract.py +158 -19
- britekit/commands/_find_dup.py +5 -5
- britekit/commands/_inat.py +4 -4
- britekit/commands/_init.py +1 -1
- britekit/commands/_pickle.py +7 -7
- britekit/commands/_plot.py +26 -26
- britekit/commands/_reextract.py +6 -6
- britekit/commands/_reports.py +41 -27
- britekit/commands/_search.py +12 -12
- britekit/commands/_train.py +6 -6
- britekit/commands/_tune.py +12 -12
- britekit/commands/_wav2mp3.py +2 -2
- britekit/commands/_xeno.py +7 -7
- britekit/commands/_youtube.py +3 -3
- britekit/core/analyzer.py +8 -8
- britekit/core/audio.py +14 -14
- britekit/core/data_module.py +2 -2
- britekit/core/plot.py +8 -8
- britekit/core/predictor.py +21 -21
- britekit/core/reextractor.py +6 -6
- britekit/core/util.py +8 -8
- britekit/models/base_model.py +1 -0
- britekit/occurrence_db/occurrence_data_provider.py +13 -13
- britekit/testing/{per_minute_tester.py → per_block_tester.py} +39 -36
- britekit/training_db/extractor.py +65 -30
- britekit/training_db/training_data_provider.py +1 -1
- britekit/training_db/training_db.py +97 -100
- britekit-0.1.4.dist-info/METADATA +299 -0
- {britekit-0.1.2.dist-info → britekit-0.1.4.dist-info}/RECORD +44 -44
- britekit-0.1.2.dist-info/METADATA +0 -290
- {britekit-0.1.2.dist-info → britekit-0.1.4.dist-info}/WHEEL +0 -0
- {britekit-0.1.2.dist-info → britekit-0.1.4.dist-info}/entry_points.txt +0 -0
- {britekit-0.1.2.dist-info → britekit-0.1.4.dist-info}/licenses/LICENSE.txt +0 -0
britekit/core/reextractor.py
CHANGED
@@ -22,12 +22,12 @@ class Reextractor:
         updating the database.
 
     Args:
-
-
-
-
-
-
+    - cfg_path (str, optional): Path to YAML file defining configuration overrides.
+    - db_path (str, optional): Path to the training database. Defaults to cfg.train.training_db.
+    - class_name (str, optional): Name of a specific class to reextract. If omitted, processes all classes.
+    - classes_path (str, optional): Path to CSV file listing classes to reextract. Alternative to class_name.
+    - check (bool): If True, only check that all recording paths are accessible without updating database.
+    - spec_group (str): Spectrogram group name for storing the extracted spectrograms. Defaults to 'default'.
     """
 
     def __init__(
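Based on the documented arguments above, a minimal usage sketch; the keyword names come from the docstring, but the values and the exact call pattern are illustrative, not taken from this diff:

    from britekit.core.reextractor import Reextractor

    # Sketch only: all values are hypothetical.
    reext = Reextractor(
        cfg_path="overrides.yaml",   # optional YAML config overrides
        db_path="training.db",       # defaults to cfg.train.training_db when omitted
        class_name="robin",          # or classes_path="classes.csv" to list several classes
        check=True,                  # only verify that recording paths are accessible
        spec_group="default",        # spectrogram group for the extracted spectrograms
    )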
britekit/core/util.py
CHANGED
@@ -166,7 +166,7 @@ def cfg_to_pure(obj: Any) -> JSONValue:
     str, int, float, bool) that can be safely serialized.
 
     Args:
-
+    - obj: Any object to convert to JSON-serializable format
 
     Returns:
         JSON-serializable representation of the input object
@@ -284,8 +284,8 @@ def get_audio_files(path: str, short_names: bool = False) -> List[str]:
     Return list of audio files in the given directory.
 
     Args:
-
-
+    - path (str): Directory path
+    - short_names (bool): If true, return file names, else return full paths
 
     Returns:
         List of audio files in the given directory
@@ -325,8 +325,8 @@ def get_file_lines(path: str, encoding: str = "utf-8") -> List[str]:
     and lines that start with #.
 
     Args:
-
-
+    - path: Path to text file
+    - encoding: File encoding (default: utf-8)
 
     Returns:
         List of lines
@@ -354,7 +354,7 @@ def get_source_name(filename: str) -> str:
     Return a source name given a recording file name.
 
     Args:
-
+    - filename: Recording file name
 
     Returns:
         Source name
@@ -390,7 +390,7 @@ def compress_spectrogram(spec) -> bytes:
     Compress a spectrogram in preparation for inserting into database.
 
     Args:
-
+    - spec: Uncompressed spectrogram
 
     Returns:
         Compressed spectrogram
@@ -421,7 +421,7 @@ def expand_spectrogram(spec: bytes):
     Decompress a spectrogram, then convert from bytes to floats and reshape it.
 
     Args:
-
+    - spec: Compressed spectrogram
 
     Returns:
         Uncompressed spectrogram
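The compress/expand helpers are documented as a round trip: compress_spectrogram produces bytes for database insertion, and expand_spectrogram decompresses, converts back to floats, and reshapes. A usage sketch (the spectrogram shape here is arbitrary, and whether the round trip is lossless depends on the encoding, which this diff does not show):

    import numpy as np
    from britekit.core import util

    spec = np.random.rand(128, 256).astype(np.float32)  # stand-in spectrogram
    blob = util.compress_spectrogram(spec)    # compressed bytes, ready for the database
    restored = util.expand_spectrogram(blob)  # decompressed, converted to floats, reshaped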
britekit/models/base_model.py
CHANGED
@@ -252,6 +252,7 @@ class BaseModel(pl.LightningModule):
         }
 
     def on_save_checkpoint(self, checkpoint):
+        print("on_save_checkpoint")
         """Save model metadata to checkpoint."""
         if not hasattr(self, "identifier"):
             self.identifier = str(uuid.uuid4()).upper()
britekit/occurrence_db/occurrence_data_provider.py
CHANGED
@@ -10,7 +10,7 @@ class OccurrenceDataProvider:
     you must call the refresh method.
 
     Args:
-
+    - db (OccurrenceDatabase): The database object.
     """
 
     def __init__(self, db: OccurrenceDatabase):
@@ -31,8 +31,8 @@ class OccurrenceDataProvider:
         Return county info for a given latitude/longitude, or None if not found.
 
         Args:
-
-
+        - latitude (float): Latitude.
+        - longitude (float): Longitude.
 
         Returns:
             County object, or None if not found.
@@ -54,8 +54,8 @@ class OccurrenceDataProvider:
         For each week, return the maximum of it and the adjacent weeks.
 
         Args:
-
-
+        - county_code (str): County code
+        - class_name (str): Class name
 
        Returns:
            List of smoothed occurrence values.
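The smoothing rule stated above is simple: each week's value becomes the maximum of itself and its two neighbours. A minimal standalone sketch of that rule (the edge handling is an assumption; this is not britekit's implementation):

    import numpy as np

    def smooth_adjacent_max(weekly: np.ndarray) -> np.ndarray:
        # For each week, take the maximum of it and the adjacent weeks.
        padded = np.pad(weekly, 1, mode="edge")
        return np.maximum(np.maximum(padded[:-2], padded[1:-1]), padded[2:])

    print(smooth_adjacent_max(np.array([0.0, 0.2, 0.1, 0.0])))  # [0.2 0.2 0.2 0.1]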
@@ -75,8 +75,8 @@ class OccurrenceDataProvider:
         Return list of occurrence values for given county code and class name.
 
         Args:
-
-
+        - county_code (str): County code
+        - class_name (str): Class name
 
         Returns:
             List of occurrence values.
@@ -97,9 +97,9 @@ class OccurrenceDataProvider:
         If area_weight = True, weight each county by its area.
 
         Args:
-
-
-
+        - county_prefix (str): County code prefix
+        - class_name (str): Class name
+        - area_weight (bool, Optional): If true, weight by county area (default = False)
 
         Returns:
             Numpy array of 48 average occurrence values (one per week, using 4-week months).
@@ -139,9 +139,9 @@ class OccurrenceDataProvider:
         county don't occur in the same week.
 
         Args:
-
-
-
+        - county_prefix (str): County code prefix
+        - class_name (str): Class name
+        - area_weight (bool, Optional): If true, weight by county area (default = False)
 
         Returns:
             Numpy average maximum occurrence value.
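The two methods above aggregate 48 weekly values across counties, optionally weighting each county by its area. A sketch of that aggregation (names and data layout are illustrative; britekit's database query layer is not shown):

    import numpy as np

    def average_occurrence(per_county: dict[str, np.ndarray],
                           areas: dict[str, float],
                           area_weight: bool = False) -> np.ndarray:
        # per_county maps county code -> 48 weekly occurrence values
        values = np.stack(list(per_county.values()))   # shape (n_counties, 48)
        if area_weight:
            weights = np.array([areas[c] for c in per_county])
            return values.T @ weights / weights.sum()  # area-weighted mean per week
        return values.mean(axis=0)                     # unweighted mean per week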
britekit/testing/{per_minute_tester.py → per_block_tester.py}
RENAMED
@@ -17,16 +17,16 @@ class Annotation:
         return f"{self.class_code}: {self.start_time}-{self.end_time}"
 
 
-class PerMinuteTester(BaseTester):
+class PerBlockTester(BaseTester):
     """
-    Calculate test metrics when annotations are specified per
-    each recording, a list of classes known to be present is given,
-    those
+    Calculate test metrics when annotations are specified per block, where a block is a fixed length, such
+    as a minute. That is, for selected blocks of each recording, a list of classes known to be present is given,
+    and we are to calculate metrics for those blocks only.
 
-    Annotations are read as a CSV with three columns: "recording", "
+    Annotations are read as a CSV with three columns: "recording", "block", and "classes".
     The recording column is the file name without the path or type suffix, e.g. "recording1".
-    The
-
+    The block column contains 1 for the first block, 2 for the second block etc. and may exclude some blocks.
+    The classes column contains a comma-separated list of codes for the classes found in the corresponding block.
     If your annotations are in a different format, simply convert to this format to use this script.
 
     Classifiers should be run with a threshold of 0, and with label merging disabled so segment-specific scores are retained.
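Per that docstring, a hypothetical annotations file might look like this (recording names and class codes are invented; blocks not listed, such as block 2 of recording1, are simply excluded from the metrics):

    recording,block,classes
    recording1,1,"AMRO,BCCH"
    recording1,3,AMRO
    recording2,2,BCCH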
@@ -37,6 +37,7 @@ class PerMinuteTester(BaseTester):
         label_dir (str): Directory containing Audacity labels.
         output_dir (str): Output directory, where reports will be written.
         threshold (float): Score threshold for precision/recall reporting.
+        block_size (int, optional): block_size in seconds (default=60).
         gen_pr_table (bool, optional): If true, generate a PR table, which may be slow (default = False).
     """
 
@@ -47,10 +48,11 @@ class PerMinuteTester(BaseTester):
         label_dir: str,
         output_dir: str,
         threshold: float,
+        block_size: int = 60,
         gen_pr_table: bool = False,
     ):
         """
-        Initialize the PerMinuteTester.
+        Initialize the PerBlockTester.
 
         See class docstring for detailed parameter descriptions and usage information.
         """
@@ -60,6 +62,7 @@ class PerMinuteTester(BaseTester):
         self.label_dir = label_dir
         self.output_dir = output_dir
         self.threshold = threshold
+        self.block_size = block_size
         self.gen_pr_table = gen_pr_table
 
         self.cfg = get_config()
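Combining the signature changes above, an instantiation sketch; the leading constructor argument(s) are not visible in this diff, so the annotations argument is assumed:

    from britekit.testing.per_block_tester import PerBlockTester

    tester = PerBlockTester(
        "annotations.csv",      # leading argument assumed; not shown in this diff
        label_dir="labels/",    # Audacity labels produced by the classifier
        output_dir="reports/",
        threshold=0.5,
        block_size=60,          # new in 0.1.4: block length in seconds (default 60)
        gen_pr_table=False,
    )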
@@ -119,8 +122,8 @@ class PerMinuteTester(BaseTester):
         Load annotation data from CSV file and process into internal format.
 
         This method reads a CSV file containing ground truth annotations where each row
-        represents a recording,
-        "recording" (filename without path/extension), "
+        represents a recording, block, and its associated classes. The CSV should have columns:
+        "recording" (filename without path/extension), "block" (block number starting from 1),
         and "classes" (comma-separated class codes).
 
         The method processes the annotations, handles class code mapping, filters out
@@ -151,10 +154,10 @@ class PerMinuteTester(BaseTester):
                 self.annotations[recording] = {}
                 self.segments_per_recording[recording] = []
 
-
-            if
-                self.annotations[recording][
-            self.segments_per_recording[recording].append(
+            block = row["block"]
+            if block not in self.annotations[recording]:
+                self.annotations[recording][block] = []
+                self.segments_per_recording[recording].append(block - 1)
 
             input_class_list = []
             for code in row["classes"].split(","):
@@ -176,7 +179,7 @@ class PerMinuteTester(BaseTester):
                     continue  # exclude from saved annotations
 
                 if class_code:
-                    self.annotations[recording][
+                    self.annotations[recording][block].append(class_code)
                     self.annotated_class_set.add(class_code)
 
         self.annotated_classes = sorted(list(self.annotated_class_set))
@@ -192,12 +195,12 @@ class PerMinuteTester(BaseTester):
 
         This method evaluates precision and recall metrics at different threshold values
         (0.01 to 1.00 in 0.01 increments) to create comprehensive precision-recall curves.
-        It calculates both per-
+        It calculates both per-block granularity metrics and per-second granularity metrics.
 
         Returns:
             dict: Dictionary containing precision-recall data with keys:
             - annotated_thresholds: List of threshold values for annotated classes
-            -
+            - annotated_precisions_blocks: List of precision values (blocks) for annotated classes
             - annotated_precisions_seconds: List of precision values (seconds) for annotated classes
             - annotated_recalls: List of recall values for annotated classes
             - trained_thresholds: List of threshold values for trained classes
@@ -219,20 +222,20 @@ class PerMinuteTester(BaseTester):
 
         # use the looping method so we get per_second precision
         thresholds = []
-        recall_annotated,
+        recall_annotated, precision_annotated_blocks, precision_annotated_seconds = (
            [],
            [],
            [],
        )
-        recall_trained,
+        recall_trained, precision_trained_blocks = [], []
        for threshold in np.arange(0.01, 1.01, 0.01):
            info = self.get_precision_recall(threshold)
            thresholds.append(threshold)
            recall_annotated.append(info["recall_annotated"])
-
+            precision_annotated_blocks.append(info["precision_annotated"])
            precision_annotated_seconds.append(info["precision_secs"])
            recall_trained.append(info["recall_trained"])
-
+            precision_trained_blocks.append(info["precision_trained"])
            logging.info(
                f"\rPercent complete: {int(threshold * 100)}%", end="", flush=True
            )
@@ -240,12 +243,12 @@ class PerMinuteTester(BaseTester):
         logging.info("")
         pr_table_dict = {}
         pr_table_dict["annotated_thresholds"] = thresholds
-        pr_table_dict["
+        pr_table_dict["annotated_precisions_blocks"] = precision_annotated_blocks
         pr_table_dict["annotated_precisions_seconds"] = precision_annotated_seconds
         pr_table_dict["annotated_recalls"] = recall_annotated
 
         pr_table_dict["trained_thresholds"] = thresholds
-        pr_table_dict["trained_precisions"] =
+        pr_table_dict["trained_precisions"] = precision_trained_blocks
         pr_table_dict["trained_recalls"] = recall_trained
 
         # use this method for more granular results without per_second precision
@@ -303,7 +306,7 @@ class PerMinuteTester(BaseTester):
         if self.gen_pr_table:
             # calculate and output precision/recall per threshold
             threshold_annotated = self.pr_table_dict["annotated_thresholds"]
-            precision_annotated = self.pr_table_dict["
+            precision_annotated = self.pr_table_dict["annotated_precisions_blocks"]
             precision_annotated_secs = self.pr_table_dict[
                 "annotated_precisions_seconds"
             ]
@@ -401,13 +404,13 @@ class PerMinuteTester(BaseTester):
         )
         rpt.append(f" For threshold = {self.threshold}:\n")
         rpt.append(
-            f" Precision (
+            f" Precision (blocks) = {100 * self.details_dict['precision_annotated']:.2f}%\n"
         )
         rpt.append(
             f" Precision (seconds) = {100 * self.details_dict['precision_secs']:.2f}%\n"
         )
         rpt.append(
-            f" Recall (
+            f" Recall (blocks) = {100 * self.details_dict['recall_annotated']:.2f}%\n"
         )
 
         rpt.append("\n")
@@ -420,10 +423,10 @@ class PerMinuteTester(BaseTester):
         )
         rpt.append(f" For threshold = {self.threshold}:\n")
         rpt.append(
-            f" Precision (
+            f" Precision (blocks) = {100 * self.details_dict['precision_trained']:.2f}%\n"
         )
         rpt.append(
-            f" Recall (
+            f" Recall (blocks) = {100 * self.details_dict['recall_trained']:.2f}%\n"
         )
         logging.info("")
         with open(os.path.join(self.output_dir, "summary_report.txt"), "w") as summary:
@@ -551,7 +554,7 @@ class PerMinuteTester(BaseTester):
 
         # initialize y_true and y_pred and save them as CSV files
         logging.info("Initializing")
-        self.get_labels(self.label_dir, segment_len=
+        self.get_labels(self.label_dir, segment_len=self.block_size, overlap=0)
         self.get_annotations()
         self._init_y_true()
         self.init_y_pred(segments_per_recording=self.segments_per_recording)
@@ -573,7 +576,7 @@ class PerMinuteTester(BaseTester):
 
     def _init_y_true(self):
         """
-        Create a dataframe representing the ground truth data, with recordings segmented into 1-
+        Create a dataframe representing the ground truth data, with recordings segmented into 1-block segments
         """
         import pandas as pd
 
@@ -582,11 +585,11 @@ class PerMinuteTester(BaseTester):
         self.recordings = []  # base class needs array with recording per row
         rows = []
         for recording in sorted(self.annotations.keys()):
-            for
+            for block in sorted(self.annotations[recording].keys()):
                 self.recordings.append(recording)
-                row = [f"{recording}-{
+                row = [f"{recording}-{block - 1}"]
                 row.extend([0 for class_code in self.trained_classes])
-                for class_code in self.annotations[recording][
+                for class_code in self.annotations[recording][block]:
                     if class_code in self.trained_class_indexes:
                         row[self.trained_class_indexes[class_code] + 1] = 1
 
@@ -618,8 +621,8 @@ class PerMinuteTester(BaseTester):
 
         df = pd.DataFrame()
         df["threshold"] = pd.Series(threshold)
-        df["recall (
-        df["precision (
+        df["recall (blocks)"] = pd.Series(recall)
+        df["precision (blocks)"] = pd.Series(precision)
         if precision_secs is not None:
             df["precision (seconds)"] = pd.Series(precision_secs)
 
@@ -631,7 +634,7 @@ class PerMinuteTester(BaseTester):
 
         plt.clf()
         plt.plot(recall, label="Recall")
-        plt.plot(precision, label="Precision (
+        plt.plot(precision, label="Precision (blocks)")
         if precision_secs is not None:
             plt.plot(precision_secs, label="Precision (Seconds)")
 
britekit/training_db/extractor.py
CHANGED
@@ -109,13 +109,45 @@ class Extractor:
 
         return offsets_per_file
 
+    def _insert_by_dict(self, recording_dir, destination_dir, offsets_per_file):
+        """
+        Given a recording directory and a dict from recording stems to offsets,
+        insert the corresponding spectrograms.
+        """
+        num_inserted = 0
+        recording_paths = util.get_audio_files(recording_dir)
+        for recording_dir in recording_paths:
+            filename = Path(recording_dir).stem
+            if filename not in offsets_per_file:
+                continue
+
+            if destination_dir is not None:
+                dest_path = os.path.join(destination_dir, Path(recording_dir).name)
+                if not os.path.exists(dest_path):
+                    shutil.copy(recording_dir, dest_path)
+
+                recording_dir = dest_path
+
+            logging.info(f"Processing {recording_dir}")
+            try:
+                self.audio.load(recording_dir)
+            except Exception as e:
+                logging.error(f"Caught exception: {e}")
+                continue
+
+            num_inserted += self.insert_spectrograms(
+                recording_dir, offsets_per_file[filename]
+            )
+
+        return num_inserted
+
     def insert_spectrograms(self, recording_path, offsets):
         """
         Insert a spectrogram at each of the given offsets of the specified file.
 
         Args:
-
-
+        - recording_path (str): Path to audio recording.
+        - offsets (list[float]): List of offsets, where each represents number of seconds to start of spectrogram.
 
         Returns:
             Number of spectrograms inserted.
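As the loop above shows, offsets_per_file is keyed by recording stem (file name without path or extension), with each value a list of start offsets in seconds. For example (hypothetical values):

    offsets_per_file = {
        "recording1": [0.0, 3.0, 12.5],
        "recording2": [7.0],
    }

Factoring this loop into _insert_by_dict lets extract_by_csv and extract_by_image (below) share the copy/load/insert logic.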
@@ -156,7 +188,7 @@ class Extractor:
         Extract spectrograms for all recordings in the given directory.
 
         Args:
-
+        - dir_path (str): Directory containing recordings.
 
         Returns:
             Number of spectrograms inserted.
@@ -187,45 +219,48 @@ class Extractor:
 
         return num_inserted
 
-    def
-        self, rec_dir: str,
+    def extract_by_csv(
+        self, rec_dir: str, csv_path: str, dest_dir: Optional[str] = None
     ):
         """
         Extract spectrograms that match names of spectrogram images in a given directory.
         Typically the spectrograms were generated using the 'search' or 'plot-db' commands.
 
         Args:
-
-
-
+        - rec_dir (str): Directory containing recordings.
+        - csv_path (str): Path to CSV file containing two columns (recording and offset) to identify segments to extract.
+        - dest_dir (str, optional): Optionally copy used recordings to this directory.
 
         Returns:
             Number of spectrograms inserted.
         """
-
-        num_inserted = 0
-        recording_paths = util.get_audio_files(rec_dir)
-        for recording_path in recording_paths:
-            filename = Path(recording_path).stem
-            if filename not in offsets_per_file:
-                continue
+        import pandas as pd
 
-
-
-
-
+        df = pd.read_csv(csv_path)
+        offsets_per_file: dict[str, list] = {}
+        for i, row in df.iterrows():
+            recording = row["recording"]
+            if recording not in offsets_per_file:
+                offsets_per_file[recording] = []
 
-
+            offsets_per_file[recording].append(row["offset"])
 
-
-        try:
-            self.audio.load(recording_path)
-        except Exception as e:
-            logging.error(f"Caught exception: {e}")
-            continue
+        return self._insert_by_dict(rec_dir, dest_dir, offsets_per_file)
 
-
-
-
+    def extract_by_image(
+        self, rec_dir: str, spec_dir: str, dest_dir: Optional[str] = None
+    ):
+        """
+        Extract spectrograms that match names of spectrogram images in a given directory.
+        Typically the spectrograms were generated using the 'search' or 'plot-db' commands.
 
-
+        Args:
+        - rec_dir (str): Directory containing recordings.
+        - spec_dir (str): Directory containing spectrogram images.
+        - dest_dir (str, optional): Optionally copy used recordings to this directory.
+
+        Returns:
+            Number of spectrograms inserted.
+        """
+        offsets_per_file = self._process_image_dir(spec_dir)
+        return self._insert_by_dict(rec_dir, dest_dir, offsets_per_file)