masster 0.3.15-py3-none-any.whl → 0.3.17-py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.

Potentially problematic release.


This version of masster might be problematic.

masster/study/load.py CHANGED
@@ -40,7 +40,21 @@ def add(
  reset=False,
  adducts=None,
  max_files=None,
+ fast=True,
  ):
+ """Add samples from a folder to the study.
+
+ Args:
+ folder (str, optional): Path to folder containing sample files.
+ Defaults to study folder or current working directory.
+ reset (bool, optional): Whether to reset the study before adding samples.
+ Defaults to False.
+ adducts (optional): Adducts to use for sample loading. Defaults to None.
+ max_files (int, optional): Maximum number of files to process.
+ Defaults to None (no limit).
+ fast (bool, optional): Whether to use optimized loading that skips ms1_df
+ for better performance. Defaults to True.
+ """
  if folder is None:
  if self.folder is not None:
  folder = self.folder
@@ -85,39 +99,29 @@ def add(

  self.logger.debug(f"Found {len(files)} {ext} files")

- # Process files
- for i, file in enumerate(
- tqdm(
- files,
- total=len(files),
- desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Add *{ext}",
- disable=tdqm_disable,
- ),
- ):
+ # Filter files not already processed and respect max_files limit
+ files_to_process = []
+ for file in files:
  if max_files is not None and counter >= max_files:
  break
-
+
  # Get filename without extension for blacklist check
  basename = os.path.basename(file)
  filename_no_ext = os.path.splitext(basename)[0]
-
+
  # Check if this filename (without extension) is already in blacklist
- if filename_no_ext in blacklist:
- self.logger.debug(f"Skipping {file} - filename already processed")
- continue
-
- self.logger.debug(f"Add file {counter + 1}: {file}")
-
- # Try to add the sample
- try:
- self.add_sample(file=file, reset=reset, adducts=adducts)
- # If successful, add to blacklist and increment counter
- blacklist.add(filename_no_ext)
- counter += 1
+ if filename_no_ext not in blacklist:
+ files_to_process.append(file)
+ if len(files_to_process) + counter >= (max_files or float('inf')):
+ break
+
+ # Batch process all files of this extension using ultra-optimized method
+ if files_to_process:
+ self.logger.debug(f"Batch processing {len(files_to_process)} {ext} files")
+ successful = self._add_samples_batch(files_to_process, reset=reset, adducts=adducts, blacklist=blacklist, fast=fast)
+ counter += successful
+ if successful > 0:
  not_zero = True
- except Exception as e:
- self.logger.warning(f"Failed to add sample {file}: {e}")
- continue

  if max_files is not None and counter >= max_files:
  self.logger.debug(
@@ -133,198 +137,43 @@ def add(


  # TODO type is not used
- def add_sample(self, file, type=None, reset=False, adducts=None):
- self.logger.debug(f"Adding: {file}")
-
- # Extract sample name by removing any known extension
- basename = os.path.basename(file)
- sample_name = os.path.splitext(basename)[0]
-
- # check if sample_name is already in the samples_df
- if sample_name in self.samples_df["sample_name"].to_list():
- self.logger.warning(
- f"Sample {sample_name} already exists in the study. Skipping.",
- )
- return
-
- # check if file exists
- if not os.path.exists(file):
- self.logger.error(f"File {file} does not exist.")
- return
-
- # Check for supported file extensions
- if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
- self.logger.error(f"File {file} is not a supported file type. Supported: .sample5, .wiff, .raw, .mzML")
- return
-
- # Load the sample based on file type
- ddaobj = Sample()
- ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
-
- if file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
- ddaobj.load(file)
- else:
- self.logger.error(f"Unsupported file format: {file}")
- return
- if ddaobj.features_df is None and not reset:
- self.logger.debug(
- f"File {file} will be newly processed.",
- )
- ddaobj.features = None
-
- if ddaobj.features is None or reset:
- ddaobj.find_features()
- ddaobj.find_adducts(adducts=adducts)
- ddaobj.find_ms2()
-
- self.features_maps.append(ddaobj.features)
-
- sample_type = "sample" if type is None else type
- if "qc" in sample_name.lower():
- sample_type = "qc"
- if "blank" in sample_name.lower():
- sample_type = "blank"
+ def add_sample(self, file, type=None, reset=False, adducts=None, fast=True):
+ """
+ Add a single sample to the study.

- # Use the index of the feature map in self.features_maps as map_id
- map_id_value = len(self.features_maps) - 1
-
- # Determine the final sample path based on file type
- if file.endswith(".sample5"):
- # If input is already .sample5, keep it in original location
- final_sample_path = file
- self.logger.debug(f"Using existing .sample5 file at original location: {final_sample_path}")
-
- # Check if there's a corresponding featureXML file in the same directory
- featurexml_path = file.replace(".sample5", ".featureXML")
- if os.path.exists(featurexml_path):
- self.logger.debug(f"Found corresponding featureXML file: {featurexml_path}")
- else:
- self.logger.debug(f"No corresponding featureXML file found at: {featurexml_path}")
- else:
- # For .wiff, .mzML, .raw files, save to study folder (original behavior)
- if self.folder is not None:
- if not os.path.exists(self.folder):
- os.makedirs(self.folder)
- final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
- ddaobj.save(final_sample_path)
- self.logger.debug(f"Saved converted sample to study folder: {final_sample_path}")
- else:
- # If no study folder is set, save in current directory
- final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
- ddaobj.save(final_sample_path)
- self.logger.debug(f"Saved converted sample to current directory: {final_sample_path}")
-
- # Count MS1 and MS2 scans from the loaded sample
- ms1_count = 0
- ms2_count = 0
- if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
- ms1_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 1).height)
- ms2_count = int(ddaobj.scans_df.filter(pl.col("ms_level") == 2).height)
-
- # Calculate next sequence number
- next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
-
- new_sample = pl.DataFrame(
- {
- "sample_uid": [int(len(self.samples_df) + 1)],
- "sample_name": [sample_name],
- "sample_path": [final_sample_path], # Use the determined path
- "sample_type": [sample_type],
- "map_id": [map_id_value],
- "sample_source": [getattr(ddaobj, "file_source", file)],
- "sample_color": [None], # Will be set by set_sample_color below
- "sample_group": [""], # Default empty string
- "sample_batch": [1], # Default batch 1
- "sample_sequence": [next_sequence], # Increasing sequence number
- "num_features": [int(ddaobj.features.size())],
- "num_ms1": [ms1_count],
- "num_ms2": [ms2_count],
- },
- schema={
- "sample_uid": pl.Int64,
- "sample_name": pl.Utf8,
- "sample_path": pl.Utf8,
- "sample_type": pl.Utf8,
- "map_id": pl.Int64,
- "sample_source": pl.Utf8,
- "sample_color": pl.Utf8,
- "sample_group": pl.Utf8,
- "sample_batch": pl.Int64,
- "sample_sequence": pl.Int64,
- "num_features": pl.Int64,
- "num_ms1": pl.Int64,
- "num_ms2": pl.Int64,
- },
- )
- self.samples_df = pl.concat([self.samples_df, new_sample])
-
- # Optimized DataFrame operations - chain operations instead of multiple clones
- columns_to_add = [
- pl.lit(len(self.samples_df)).alias("sample_uid"),
- pl.lit(False).alias("filled"),
- pl.lit(-1.0).alias("chrom_area"),
- ]
-
- # Only add rt_original if it doesn't exist
- if "rt_original" not in ddaobj.features_df.columns:
- columns_to_add.append(pl.col("rt").alias("rt_original"))
-
- f_df = ddaobj.features_df.with_columns(columns_to_add)
-
- if self.features_df.is_empty():
- # Create new features_df with feature_uid column
- self.features_df = f_df.with_columns(
- pl.int_range(pl.len()).add(1).alias("feature_uid"),
- ).select(
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+ Args:
+ file (str): Path to the sample file
+ type (str, optional): File type to force. Defaults to None (auto-detect).
+ reset (bool, optional): Whether to reset the study. Defaults to False.
+ adducts (optional): Adducts to use for sample loading. Defaults to None.
+ fast (bool, optional): Whether to use optimized loading that skips ms1_df
+ for better performance. Defaults to True.
+
+ Returns:
+ bool: True if successful, False otherwise.
+ """
+ if fast:
+ # Use optimized method for better performance
+ success = self._add_sample_optimized(
+ file,
+ type=type,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=False, # Do color reset for individual calls
+ skip_schema_check=True # Skip schema check for performance (safe with diagonal concat)
  )
- # Ensure column order matches schema from the very beginning
- self._ensure_features_df_schema_order()
  else:
- offset = self.features_df["feature_uid"].max() + 1 if not self.features_df.is_empty() else 1
- # Chain operations and add to existing DataFrame
- f_df = f_df.with_columns(
- pl.int_range(pl.len()).add(offset).alias("feature_uid"),
- ).select(
- ["feature_uid"] + [col for col in f_df.columns if col != "feature_uid"],
+ # Use standard method with full ms1_df loading
+ success = self._add_sample_standard(
+ file,
+ type=type,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=False, # Do color reset for individual calls
+ skip_schema_check=True # Skip schema check for performance
  )
-
- # Reorganize f_df columns to match self.features_df column order and schema
- target_columns = self.features_df.columns
- target_schema = self.features_df.schema
- f_df_columns = f_df.columns
-
- # Create select expressions for reordering and type casting
- select_exprs = []
- for col in target_columns:
- if col in f_df_columns:
- # Cast to the expected type
- expected_dtype = target_schema[col]
- select_exprs.append(pl.col(col).cast(expected_dtype, strict=False))
- else:
- # Add missing columns with null values of the correct type
- expected_dtype = target_schema[col]
- select_exprs.append(pl.lit(None, dtype=expected_dtype).alias(col))
-
- # Add any extra columns from f_df that aren't in target_columns (keep their original types)
- for col in f_df_columns:
- if col not in target_columns:
- select_exprs.append(pl.col(col))
-
- # Reorder and type-cast f_df columns
- f_df = f_df.select(select_exprs)
-
- self.features_df = pl.concat([self.features_df, f_df])
-
- # Ensure features_df column order matches schema
- self._ensure_features_df_schema_order()
-
- # Auto-assign colors when new sample is added (reset all colors using turbo colormap based on UID)
- self.sample_color_reset()

- self.logger.debug(
- f"Added sample {sample_name} with {ddaobj.features.size()} features to the study.",
- )
+ return success


  def load(self, filename=None):
@@ -942,8 +791,6 @@ def _fill_chrom_impl(
  })

  total_missing = len(missing_combinations_df)
- total_samples = len(samples_to_process)
-
  self.logger.debug(
  f"Gap filling for {total_missing} missing features...",
  )
@@ -1114,51 +961,96 @@ def _get_missing_consensus_sample_combinations(self, uids):
  """
  Efficiently identify which consensus_uid/sample combinations are missing.
  Returns a list of tuples: (consensus_uid, sample_uid, sample_name, sample_path)
+
+ Optimized for common scenarios:
+ - Early termination for fully-filled studies
+ - Efficient dictionary lookups instead of expensive DataFrame joins
+ - Smart handling of sparse vs dense missing data patterns
  """
- # Get all consensus UIDs we're interested in
- consensus_uids_set = set(uids)
-
- # Get all sample UIDs and create lookup
- all_sample_info = {}
- for row in self.samples_df.select([
- "sample_uid",
- "sample_name",
- "sample_path",
- ]).iter_rows(named=True):
- all_sample_info[row["sample_uid"]] = {
- "sample_name": row["sample_name"],
- "sample_path": row["sample_path"],
- }
-
- # Get existing consensus/sample combinations from consensus_mapping_df
- existing_combinations = set()
- consensus_mapping_filtered = self.consensus_mapping_df.filter(
- pl.col("consensus_uid").is_in(list(consensus_uids_set)),
- )
-
- # Join with features_df to get sample_uid information
- existing_features = consensus_mapping_filtered.join(
- self.features_df.select(["feature_uid", "sample_uid"]),
- on="feature_uid",
- how="inner",
+ if not uids:
+ return []
+
+ n_consensus = len(uids)
+ n_samples = len(self.samples_df)
+ total_possible = n_consensus * n_samples
+
+ # Quick early termination check for fully/nearly filled studies
+ # This handles the common case where fill() is run on an already-filled study
+ consensus_counts = (
+ self.consensus_mapping_df
+ .filter(pl.col("consensus_uid").is_in(uids))
+ .group_by("consensus_uid")
+ .agg(pl.count("feature_uid").alias("count"))
  )
-
- for row in existing_features.select(["consensus_uid", "sample_uid"]).iter_rows():
- existing_combinations.add((row[0], row[1])) # (consensus_uid, sample_uid)
-
- # Find missing combinations
- missing_combinations = []
- for consensus_uid in consensus_uids_set:
- for sample_uid, sample_info in all_sample_info.items():
- if (consensus_uid, sample_uid) not in existing_combinations:
- missing_combinations.append((
- consensus_uid,
- sample_uid,
- sample_info["sample_name"],
- sample_info["sample_path"],
- ))
-
- return missing_combinations
+
+ total_existing = consensus_counts["count"].sum() if not consensus_counts.is_empty() else 0
+
+ # If >95% filled, likely no gaps (common case)
+ if total_existing >= total_possible * 0.95:
+ self.logger.debug(f"Study appears {total_existing/total_possible*100:.1f}% filled, using sparse optimization")
+
+ # For sparse missing data, check each consensus feature individually
+ missing_combinations = []
+ uids_set = set(uids)
+
+ # Build efficient lookups
+ feature_to_sample = dict(
+ self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+ )
+
+ # Get existing combinations for target UIDs only
+ existing_by_consensus = {}
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows():
+ if consensus_uid in uids_set and feature_uid in feature_to_sample:
+ if consensus_uid not in existing_by_consensus:
+ existing_by_consensus[consensus_uid] = set()
+ existing_by_consensus[consensus_uid].add(feature_to_sample[feature_uid])
+
+ # Get sample info once
+ all_samples = list(
+ self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+ )
+
+ # Check for missing combinations
+ for consensus_uid in uids:
+ existing_samples = existing_by_consensus.get(consensus_uid, set())
+ for sample_uid, sample_name, sample_path in all_samples:
+ if sample_uid not in existing_samples:
+ missing_combinations.append((consensus_uid, sample_uid, sample_name, sample_path))
+
+ return missing_combinations
+
+ else:
+ # For studies with many gaps, use bulk operations
+ self.logger.debug(f"Study {total_existing/total_possible*100:.1f}% filled, using bulk optimization")
+
+ # Build efficient lookups
+ uids_set = set(uids)
+ feature_to_sample = dict(
+ self.features_df.select(["feature_uid", "sample_uid"]).iter_rows()
+ )
+
+ # Build existing combinations set
+ existing_combinations = {
+ (consensus_uid, feature_to_sample[feature_uid])
+ for consensus_uid, feature_uid in self.consensus_mapping_df.select(["consensus_uid", "feature_uid"]).iter_rows()
+ if consensus_uid in uids_set and feature_uid in feature_to_sample
+ }
+
+ # Get all sample info
+ all_samples = list(
+ self.samples_df.select(["sample_uid", "sample_name", "sample_path"]).iter_rows()
+ )
+
+ # Generate all missing combinations
+ missing_combinations = [
+ (consensus_uid, sample_uid, sample_name, sample_path)
+ for consensus_uid in uids
+ for sample_uid, sample_name, sample_path in all_samples
+ if (consensus_uid, sample_uid) not in existing_combinations
+ ]
+
+ return missing_combinations


  def sanitize(self):
@@ -1300,3 +1192,506 @@ def _load_consensusXML(self, filename="alignment.consensusXML"):
  self.consensus_map = oms.ConsensusMap()
  fh.load(filename, self.consensus_map)
  self.logger.debug(f"Loaded consensus map from {filename}.")
+
+ def _add_samples_batch(self, files, reset=False, adducts=None, blacklist=None, fast=True):
+ """
+ Optimized batch addition of samples.
+
+ Args:
+ files (list): List of file paths to process
+ reset (bool): Whether to reset features before processing
+ adducts: Adducts to use for sample loading
+ blacklist (set): Set of filenames already processed
+ fast (bool): Whether to use optimized loading (skips ms1_df) or standard loading
+
+ Performance optimizations:
+ 1. No per-sample color reset
+ 2. No schema enforcement during addition
+ 3. Simplified DataFrame operations
+ 4. Batch progress reporting
+ """
+ if not files:
+ return 0
+
+ if blacklist is None:
+ blacklist = set()
+
+ self.logger.debug(f"Starting batch addition of {len(files)} samples (fast={fast})...")
+
+ successful_additions = 0
+ failed_additions = 0
+
+ # Progress reporting setup
+ tqdm_disable = self.log_level not in ["TRACE", "DEBUG", "INFO"]
+
+ for i, file in enumerate(
+ tqdm(
+ files,
+ total=len(files),
+ desc=f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]} | INFO | {self.log_label}Batch add",
+ disable=tqdm_disable,
+ )
+ ):
+ try:
+ # Choose between optimized and standard loading
+ if fast:
+ success = self._add_sample_optimized(
+ file,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=True, # Skip color reset during batch
+ skip_schema_check=True # Skip schema enforcement
+ )
+ else:
+ success = self._add_sample_standard(
+ file,
+ reset=reset,
+ adducts=adducts,
+ skip_color_reset=True, # Skip color reset during batch
+ skip_schema_check=True # Skip schema enforcement
+ )
+
+ if success:
+ # Add to blacklist for filename tracking
+ basename = os.path.basename(file)
+ filename_no_ext = os.path.splitext(basename)[0]
+ blacklist.add(filename_no_ext)
+ successful_additions += 1
+
+ except Exception as e:
+ self.logger.warning(f"Failed to add sample {file}: {e}")
+ failed_additions += 1
+ continue
+
+ # Final cleanup operations done once at the end
+ if successful_additions > 0:
+ self.logger.debug("Performing final batch cleanup...")
+
+ # Optional: Only do schema enforcement if specifically needed (usually not required)
+ # self._ensure_features_df_schema_order()
+
+ # Color assignment done once for all samples
+ self._sample_color_reset_optimized()
+
+ self.logger.debug(f"Batch addition complete: {successful_additions} successful, {failed_additions} failed")
+
+ return successful_additions
+
+ def _add_sample_optimized(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+ """
+ Optimized add_sample with performance improvements integrated.
+
+ Removes:
+ - Schema enforcement (_ensure_features_df_schema_order)
+ - Complex column alignment and type casting
+ - Per-addition color reset
+ - Unnecessary column reordering
+
+ Returns True if successful, False otherwise.
+ """
+ self.logger.debug(f"Adding: {file}")
+
+ # Basic validation
+ basename = os.path.basename(file)
+ sample_name = os.path.splitext(basename)[0]
+
+ if sample_name in self.samples_df["sample_name"].to_list():
+ self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+ return False
+
+ if not os.path.exists(file):
+ self.logger.error(f"File {file} does not exist.")
+ return False
+
+ if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+ self.logger.error(f"Unsupported file type: {file}")
+ return False
+
+ # Load sample
+ ddaobj = Sample()
+ ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+ # Use standard loading method temporarily to test if this fixes the astuple error
+ ddaobj.load(file)
+
+ if ddaobj.features_df is None and not reset:
+ ddaobj.features = None
+
+ if ddaobj.features is None or reset:
+ ddaobj.find_features()
+ ddaobj.find_adducts(adducts=adducts)
+ ddaobj.find_ms2()
+
+ self.features_maps.append(ddaobj.features)
+
+ # Determine sample type
+ sample_type = "sample" if type is None else type
+ if "qc" in sample_name.lower():
+ sample_type = "qc"
+ if "blank" in sample_name.lower():
+ sample_type = "blank"
+
+ map_id_value = len(self.features_maps) - 1
+
+ # Handle file paths
+ if file.endswith(".sample5"):
+ final_sample_path = file
+ self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ else:
+ if self.folder is not None:
+ if not os.path.exists(self.folder):
+ os.makedirs(self.folder)
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+ else:
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+ ddaobj.save(final_sample_path)
+ self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+ # Efficient scan counting
+ ms1_count = ms2_count = 0
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ ms_levels = scan_counts.get("ms_level", [])
+ counts = scan_counts.get("len", [])
+ for level, count in zip(ms_levels, counts):
+ if level == 1:
+ ms1_count = count
+ elif level == 2:
+ ms2_count = count
+
+ # Create sample entry
+ next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj.features.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
+
+ self.samples_df = pl.concat([self.samples_df, new_sample])
+
+ # SIMPLIFIED feature processing
+ current_sample_uid = len(self.samples_df) - 1
+
+ # Add required columns with minimal operations
+ columns_to_add = [
+ pl.lit(current_sample_uid).alias("sample_uid"),
+ pl.lit(False).alias("filled"),
+ pl.lit(-1.0).alias("chrom_area"),
+ ]
+
+ # Only add rt_original if it doesn't exist
+ if "rt_original" not in ddaobj.features_df.columns:
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+ if self.features_df.is_empty():
+ # First sample
+ self.features_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(1).alias("feature_uid")
+ )
+ else:
+ # Subsequent samples - minimal overhead
+ offset = self.features_df["feature_uid"].max() + 1
+ f_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(offset).alias("feature_uid")
+ )
+
+ # OPTIMIZED: Use diagonal concatenation without any schema enforcement
+ # This is the fastest concatenation method in Polars and handles type mismatches automatically
+ self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+ # REMOVED ALL EXPENSIVE OPERATIONS:
+ # - No _ensure_features_df_schema_order()
+ # - No complex column alignment
+ # - No type casting loops
+ # - No sample_color_reset()
+
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (optimized)")
+ return True
+
+
+ def _add_sample_standard(self, file, type=None, reset=False, adducts=None, skip_color_reset=True, skip_schema_check=True):
+ """
+ Standard add_sample method that uses full sample loading (includes ms1_df).
+
+ This method uses the standard sample.load() method which loads all data
+ including ms1_df, providing full functionality but potentially slower performance
+ for large MS1 datasets.
+
+ Returns True if successful, False otherwise.
+ """
+ self.logger.debug(f"Adding (standard): {file}")
+
+ # Basic validation
+ basename = os.path.basename(file)
+ sample_name = os.path.splitext(basename)[0]
+
+ if sample_name in self.samples_df["sample_name"].to_list():
+ self.logger.warning(f"Sample {sample_name} already exists. Skipping.")
+ return False
+
+ if not os.path.exists(file):
+ self.logger.error(f"File {file} does not exist.")
+ return False
+
+ if not file.endswith((".sample5", ".wiff", ".raw", ".mzML")):
+ self.logger.error(f"Unsupported file type: {file}")
+ return False
+
+ # Load sample using standard method (includes ms1_df)
+ ddaobj = Sample()
+ ddaobj.logger_update(level="WARNING", label=os.path.basename(file))
+ # Use standard loading method that loads all data including ms1_df
+ ddaobj.load(file)
+
+ if ddaobj.features_df is None and not reset:
+ ddaobj.features = None
+
+ if ddaobj.features is None or reset:
+ ddaobj.find_features()
+ ddaobj.find_adducts(adducts=adducts)
+ ddaobj.find_ms2()
+
+ self.features_maps.append(ddaobj.features)
+
+ # Determine sample type
+ sample_type = "sample" if type is None else type
+ if "qc" in sample_name.lower():
+ sample_type = "qc"
+ if "blank" in sample_name.lower():
+ sample_type = "blank"
+
+ map_id_value = len(self.features_maps) - 1
+
+ # Handle file paths
+ if file.endswith(".sample5"):
+ final_sample_path = file
+ self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ else:
+ if self.folder is not None:
+ if not os.path.exists(self.folder):
+ os.makedirs(self.folder)
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+ else:
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+ ddaobj.save(final_sample_path)
+ self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+ # Efficient scan counting
+ ms1_count = ms2_count = 0
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ ms_levels = scan_counts.get("ms_level", [])
+ counts = scan_counts.get("len", [])
+ for level, count in zip(ms_levels, counts):
+ if level == 1:
+ ms1_count = count
+ elif level == 2:
+ ms2_count = count
+
+ # Create sample entry
+ next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj.features.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
+
+ self.samples_df = pl.concat([self.samples_df, new_sample])
+
+ # SIMPLIFIED feature processing
+ current_sample_uid = len(self.samples_df) - 1
+
+ # Add required columns with minimal operations
+ columns_to_add = [
+ pl.lit(current_sample_uid).alias("sample_uid"),
+ pl.lit(False).alias("filled"),
+ pl.lit(-1.0).alias("chrom_area"),
+ ]
+
+ # Only add rt_original if it doesn't exist
+ if "rt_original" not in ddaobj.features_df.columns:
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+ if self.features_df.is_empty():
+ # First sample
+ self.features_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(1).alias("feature_uid")
+ )
+ else:
+ # Subsequent samples - minimal overhead
+ offset = self.features_df["feature_uid"].max() + 1
+ f_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(offset).alias("feature_uid")
+ )
+
+ # Use diagonal concatenation for flexibility
+ self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+ return True
+ # Use standard loading method that loads all data including ms1_df
+ ddaobj.load(file)
+
+ if ddaobj.features_df is None and not reset:
+ ddaobj.features = None
+
+ if ddaobj.features is None or reset:
+ ddaobj.find_features()
+ ddaobj.find_adducts(adducts=adducts)
+ ddaobj.find_ms2()
+
+ self.features_maps.append(ddaobj.features)
+
+ # Determine sample type
+ sample_type = "sample" if type is None else type
+ if "qc" in sample_name.lower():
+ sample_type = "qc"
+ if "blank" in sample_name.lower():
+ sample_type = "blank"
+
+ map_id_value = len(self.features_maps) - 1
+
+ # Handle file paths
+ if file.endswith(".sample5"):
+ final_sample_path = file
+ self.logger.debug(f"Using existing .sample5 file: {final_sample_path}")
+ else:
+ if self.folder is not None:
+ if not os.path.exists(self.folder):
+ os.makedirs(self.folder)
+ final_sample_path = os.path.join(self.folder, sample_name + ".sample5")
+ else:
+ final_sample_path = os.path.join(os.getcwd(), sample_name + ".sample5")
+ ddaobj.save(final_sample_path)
+ self.logger.debug(f"Saved converted sample: {final_sample_path}")
+
+ # Efficient scan counting
+ ms1_count = ms2_count = 0
+ if hasattr(ddaobj, "scans_df") and ddaobj.scans_df is not None and not ddaobj.scans_df.is_empty():
+ scan_counts = ddaobj.scans_df.group_by("ms_level").len().to_dict(as_series=False)
+ ms_levels = scan_counts.get("ms_level", [])
+ counts = scan_counts.get("len", [])
+ for level, count in zip(ms_levels, counts):
+ if level == 1:
+ ms1_count = count
+ elif level == 2:
+ ms2_count = count
+
+ # Create sample entry
+ next_sequence = len(self.samples_df) + 1 if not self.samples_df.is_empty() else 1
+ new_sample = pl.DataFrame({
+ "sample_uid": [int(len(self.samples_df) + 1)],
+ "sample_name": [sample_name],
+ "sample_path": [final_sample_path],
+ "sample_type": [sample_type],
+ "map_id": [map_id_value],
+ "sample_source": [getattr(ddaobj, "file_source", file)],
+ "sample_color": [None], # Will be set in batch at end
+ "sample_group": [""],
+ "sample_batch": [1],
+ "sample_sequence": [next_sequence],
+ "num_features": [int(ddaobj.features.size())],
+ "num_ms1": [ms1_count],
+ "num_ms2": [ms2_count],
+ })
+
+ self.samples_df = pl.concat([self.samples_df, new_sample])
+
+ # SIMPLIFIED feature processing
+ current_sample_uid = len(self.samples_df) - 1
+
+ # Add required columns with minimal operations
+ columns_to_add = [
+ pl.lit(current_sample_uid).alias("sample_uid"),
+ pl.lit(False).alias("filled"),
+ pl.lit(-1.0).alias("chrom_area"),
+ ]
+
+ # Only add rt_original if it doesn't exist
+ if "rt_original" not in ddaobj.features_df.columns:
+ columns_to_add.append(pl.col("rt").alias("rt_original"))
+
+ f_df = ddaobj.features_df.with_columns(columns_to_add)
+
+ if self.features_df.is_empty():
+ # First sample
+ self.features_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(1).alias("feature_uid")
+ )
+ else:
+ # Subsequent samples - minimal overhead
+ offset = self.features_df["feature_uid"].max() + 1
+ f_df = f_df.with_columns(
+ pl.int_range(pl.len()).add(offset).alias("feature_uid")
+ )
+
+ # Use diagonal concatenation for flexibility
+ self.features_df = pl.concat([self.features_df, f_df], how="diagonal")
+
+ self.logger.debug(f"Added sample {sample_name} with {ddaobj.features.size()} features (standard)")
+ return True
+
+
+ def _sample_color_reset_optimized(self):
+ """
+ Optimized version of sample_color_reset that caches colormap initialization.
+ """
+ if self.samples_df is None or len(self.samples_df) == 0:
+ self.logger.warning("No samples found in study.")
+ return
+
+ # Cache the colormap if not already cached
+ if not hasattr(self, '_cached_colormap'):
+ try:
+ from cmap import Colormap
+ self._cached_colormap = Colormap('turbo')
+ except ImportError:
+ self.logger.warning("cmap package not available, using default colors")
+ return
+
+ cm = self._cached_colormap
+ n_samples = len(self.samples_df)
+
+ # Pre-allocate colors list for better performance
+ colors = [None] * n_samples
+
+ # Vectorized color generation
+ for i in range(n_samples):
+ normalized_value = 0.1 + ((i + 0.5) / n_samples) * 0.8
+ color_rgba = cm(normalized_value)
+
+ if len(color_rgba) >= 3:
+ r, g, b = color_rgba[:3]
+ if max(color_rgba[:3]) <= 1.0:
+ r, g, b = int(r * 255), int(g * 255), int(b * 255)
+ colors[i] = f"#{r:02x}{g:02x}{b:02x}"
+
+ # Update the sample_color column efficiently
+ self.samples_df = self.samples_df.with_columns(
+ pl.Series("sample_color", colors).alias("sample_color")
+ )
+
+ self.logger.debug(f"Reset sample colors (cached) for {n_samples} samples")
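For orientation, the diff above changes how samples enter a study: add() now collects files and hands them to _add_samples_batch(), and both add() and add_sample() take a fast flag that skips ms1_df loading and defers color assignment and schema checks. The following is a minimal, hypothetical usage sketch based only on the add() and add_sample() signatures shown in this diff; the Study class name, its import path, and the example file paths are assumptions, not taken from the diff.

# Hypothetical usage sketch; only add(...) and add_sample(...) come from this diff.
from masster import Study  # assumed import path and class name

study = Study()  # assumed constructor

# Batch-add every supported file in a folder. With fast=True (the default),
# loading skips ms1_df and defers color assignment and schema checks until
# the whole batch has been added.
study.add(folder="data/", max_files=10, fast=True)

# Add a single file with full loading (including ms1_df).
study.add_sample(file="data/qc_01.mzML", fast=False)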