masster 0.4.19__py3-none-any.whl → 0.4.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of masster might be problematic. Click here for more details.

masster/study/helpers.py CHANGED
@@ -509,8 +509,9 @@ def get_consensus(self, quant="chrom_area"):
509
509
  # Convert Polars DataFrame to pandas for this operation since the result is used for export
510
510
  df1 = self.consensus_df.to_pandas().copy()
511
511
 
512
- # set consensus_id as uint64
513
- df1["consensus_id"] = df1["consensus_id"].astype("uint64")
512
+ # Keep consensus_id as string (UUID format)
513
+ # Note: consensus_id is now a 16-character UUID string, not an integer
514
+ df1["consensus_id"] = df1["consensus_id"].astype("string")
514
515
  # set consensus_id as index
515
516
  df1.set_index("consensus_uid", inplace=True)
516
517
  # sort by consensus_id
@@ -640,21 +641,61 @@ def get_gaps_stats(self, uids=None):
640
641
  return gaps_stats
641
642
 
642
643
 
643
- # TODO is uid not supposed to be a list anymore?
644
- def get_consensus_matches(self, uids=None):
644
+ def get_consensus_matches(self, uids=None, filled=True):
645
+ """
646
+ Get feature matches for consensus UIDs with optimized join operation.
647
+
648
+ Parameters:
649
+ uids: Consensus UID(s) to get matches for. Can be:
650
+ - None: get matches for all consensus features
651
+ - int: single consensus UID (converted to list)
652
+ - list: multiple consensus UIDs
653
+ filled (bool): Whether to include filled rows (True) or exclude them (False).
654
+ Default is True to maintain backward compatibility.
655
+
656
+ Returns:
657
+ pl.DataFrame: Feature matches for the specified consensus UIDs
658
+ """
659
+ # Handle single int by converting to list
660
+ if isinstance(uids, int):
661
+ uids = [uids]
662
+
645
663
  uids = self._get_consensus_uids(uids)
646
-
647
- # find all rows in consensus_mapping_df with consensus_id=id - use Polars filtering
648
- fid = (
649
- self.consensus_mapping_df.filter(
650
- pl.col("consensus_uid").is_in(uids),
664
+
665
+ if not uids:
666
+ return pl.DataFrame()
667
+
668
+ # Early validation checks
669
+ if self.consensus_mapping_df is None or self.consensus_mapping_df.is_empty():
670
+ self.logger.warning("No consensus mapping data available")
671
+ return pl.DataFrame()
672
+
673
+ if self.features_df is None or self.features_df.is_empty():
674
+ self.logger.warning("No feature data available")
675
+ return pl.DataFrame()
676
+
677
+ # Build the query with optional filled filter
678
+ features_query = self.features_df.lazy()
679
+
680
+ # Apply filled filter if specified
681
+ if not filled and "filled" in self.features_df.columns:
682
+ features_query = features_query.filter(~pl.col("filled"))
683
+
684
+ # Optimized single-pass operation using join instead of two separate filters
685
+ # This avoids creating intermediate Python lists and leverages Polars' optimized joins
686
+ matches = (
687
+ features_query
688
+ .join(
689
+ self.consensus_mapping_df
690
+ .lazy()
691
+ .filter(pl.col("consensus_uid").is_in(uids))
692
+ .select("feature_uid"), # Only select what we need for the join
693
+ on="feature_uid",
694
+ how="inner"
651
695
  )
652
- .select("feature_uid")
653
- .to_series()
654
- .to_list()
696
+ .collect(streaming=True) # Use streaming for memory efficiency with large datasets
655
697
  )
656
- # select all rows in features_df with uid in fid
657
- matches = self.features_df.filter(pl.col("feature_uid").is_in(fid)).clone()
698
+
658
699
  return matches
659
700
 
660
701