spacr 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
spacr/sequencing.py CHANGED
@@ -10,6 +10,23 @@ from IPython.display import display
10
10
 
11
11
  # Function to map sequences to names (same as your original)
12
12
  def map_sequences_to_names(csv_file, sequences, rc):
13
+ """
14
+ Maps DNA sequences to their corresponding names based on a CSV file.
15
+
16
+ Args:
17
+ csv_file (str): Path to the CSV file containing 'sequence' and 'name' columns.
18
+ sequences (list of str): List of DNA sequences to map.
19
+ rc (bool): If True, reverse complement the sequences in the CSV before mapping.
20
+
21
+ Returns:
22
+ list: A list of names corresponding to the input sequences. If a sequence is not found,
23
+ `pd.NA` is returned in its place.
24
+
25
+ Notes:
26
+ - The CSV file must contain columns named 'sequence' and 'name'.
27
+ - If `rc` is True, sequences in the CSV will be reverse complemented prior to mapping.
28
+ - Sequences in `sequences` are not altered—only sequences in the CSV are reverse complemented.
29
+ """
13
30
  def rev_comp(dna_sequence):
14
31
  complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
15
32
  reverse_seq = dna_sequence[::-1]
@@ -24,6 +41,24 @@ def map_sequences_to_names(csv_file, sequences, rc):
24
41
 
25
42
  # Functions to save data (same as your original)
26
43
  def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
44
+ """
45
+ Saves a pandas DataFrame to an HDF5 file, optionally appending to an existing dataset.
46
+
47
+ Args:
48
+ df (pd.DataFrame): The DataFrame to save.
49
+ hdf5_file (str): Path to the target HDF5 file.
50
+ key (str, optional): Key under which to store the DataFrame. Defaults to 'df'.
51
+ comp_type (str, optional): Compression algorithm to use (e.g., 'zlib', 'bzip2', 'blosc'). Defaults to 'zlib'.
52
+ comp_level (int, optional): Compression level (0–9). Higher values yield better compression at the cost of speed. Defaults to 5.
53
+
54
+ Returns:
55
+ None
56
+
57
+ Notes:
58
+ - If the specified key already exists in the HDF5 file, the new DataFrame is appended to it.
59
+ - The combined DataFrame is saved in 'table' format to support appending and querying.
60
+ - Errors encountered during saving are printed to standard output.
61
+ """
27
62
  try:
28
63
  with pd.HDFStore(hdf5_file, 'a', complib=comp_type, complevel=comp_level) as store:
29
64
  if key in store:
@@ -34,6 +69,23 @@ def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
34
69
  print(f"Error while saving DataFrame to HDF5: {e}")
35
70
 
36
71
  def save_unique_combinations_to_csv(unique_combinations, csv_file):
72
+ """
73
+ Saves or appends a DataFrame of unique gRNA combinations to a CSV file, aggregating duplicates.
74
+
75
+ Args:
76
+ unique_combinations (pd.DataFrame): DataFrame containing 'rowID', 'columnID', and 'grna_name' columns,
77
+ along with associated count or metric columns.
78
+ csv_file (str): Path to the CSV file where data will be saved.
79
+
80
+ Returns:
81
+ None
82
+
83
+ Notes:
84
+ - If the file exists, it reads the existing contents and appends the new data.
85
+ - Duplicate combinations (same 'rowID', 'columnID', 'grna_name') are summed.
86
+ - The resulting DataFrame is saved with index included.
87
+ - Any exception during the process is caught and printed to stdout.
88
+ """
37
89
  try:
38
90
  try:
39
91
  existing_df = pd.read_csv(csv_file)
@@ -50,6 +102,22 @@ def save_unique_combinations_to_csv(unique_combinations, csv_file):
50
102
  print(f"Error while saving unique combinations to CSV: {e}")
51
103
 
52
104
  def save_qc_df_to_csv(qc_df, qc_csv_file):
105
+ """
106
+ Saves or appends a QC (quality control) DataFrame to a CSV file by summing overlapping entries.
107
+
108
+ Args:
109
+ qc_df (pd.DataFrame): DataFrame containing numeric QC metrics (e.g., counts, read stats).
110
+ qc_csv_file (str): Path to the CSV file where the QC data will be saved.
111
+
112
+ Returns:
113
+ None
114
+
115
+ Notes:
116
+ - If the file exists, it reads the existing QC data and adds the new values to it (element-wise).
117
+ - If the file doesn't exist, it creates a new one.
118
+ - The final DataFrame is saved without including the index.
119
+ - Any exception is caught and logged to stdout.
120
+ """
53
121
  try:
54
122
  try:
55
123
  existing_qc_df = pd.read_csv(qc_csv_file)
@@ -64,9 +132,34 @@ def save_qc_df_to_csv(qc_df, qc_csv_file):
64
132
  print(f"Error while saving QC DataFrame to CSV: {e}")
65
133
 
66
134
  def extract_sequence_and_quality(sequence, quality, start, end):
135
+ """
136
+ Extracts a subsequence and its corresponding quality scores.
137
+
138
+ Args:
139
+ sequence (str): DNA sequence string.
140
+ quality (str): Quality string corresponding to the sequence.
141
+ start (int): Start index of the region to extract.
142
+ end (int): End index of the region to extract (exclusive).
143
+
144
+ Returns:
145
+ tuple: (subsequence, subquality) as strings.
146
+ """
67
147
  return sequence[start:end], quality[start:end]
68
148
 
69
149
  def create_consensus(seq1, qual1, seq2, qual2):
150
+ """
151
+ Constructs a consensus DNA sequence from two reads with associated quality scores.
152
+
153
+ Args:
154
+ seq1 (str): First DNA sequence.
155
+ qual1 (str): Quality scores for `seq1` (as ASCII characters or integer-encoded).
156
+ seq2 (str): Second DNA sequence.
157
+ qual2 (str): Quality scores for `seq2`.
158
+
159
+ Returns:
160
+ str: Consensus sequence, selecting the base with the highest quality at each position.
161
+ If one base is 'N', the non-'N' base is chosen regardless of quality.
162
+ """
70
163
  consensus_seq = []
71
164
  for i in range(len(seq1)):
72
165
  bases = [(seq1[i], qual1[i]), (seq2[i], qual2[i])]
@@ -74,6 +167,15 @@ def create_consensus(seq1, qual1, seq2, qual2):
74
167
  return ''.join(consensus_seq)
75
168
 
76
169
  def get_consensus_base(bases):
170
+ """
171
+ Selects the most reliable base from a list of two base-quality pairs.
172
+
173
+ Args:
174
+ bases (list of tuples): Each tuple contains (base, quality_score), expected length is 2.
175
+
176
+ Returns:
177
+ str: The consensus base. Prefers non-'N' bases and higher quality scores.
178
+ """
77
179
  # Prefer non-'N' bases, if 'N' exists, pick the other one.
78
180
  if bases[0][0] == 'N':
79
181
  return bases[1][0]
@@ -84,13 +186,78 @@ def get_consensus_base(bases):
84
186
  return bases[0][0] if bases[0][1] >= bases[1][1] else bases[1][0]
85
187
 
86
188
  def reverse_complement(seq):
189
+ """
190
+ Computes the reverse complement of a DNA sequence.
191
+
192
+ Args:
193
+ seq (str): Input DNA sequence.
194
+
195
+ Returns:
196
+ str: Reverse complement of the input sequence.
197
+ """
87
198
  return str(Seq(seq).reverse_complement())
88
199
 
89
200
  # Core logic for processing a chunk (same as your original)
90
201
  def process_chunk(chunk_data):
91
-
202
+ """
203
+ Processes a chunk of sequencing reads to extract and map barcode sequences to corresponding names.
204
+
205
+ This function handles both single-end and paired-end FASTQ data. It searches for a target barcode
206
+ sequence in each read, extracts a consensus region around it, applies a regex to extract barcodes,
207
+ and maps those to known IDs using reference CSVs. Quality control data and unique combinations are
208
+ also computed.
209
+
210
+ Args:
211
+ chunk_data (tuple): Contains either 9 or 10 elements:
212
+
213
+ For paired-end mode (10 elements):
214
+ - r1_chunk (list): List of strings, each 4-line block from R1 FASTQ.
215
+ - r2_chunk (list): List of strings, each 4-line block from R2 FASTQ.
216
+ - regex (str): Regex pattern with named groups ('rowID', 'columnID', 'grna').
217
+ - target_sequence (str): Sequence to anchor barcode extraction.
218
+ - offset_start (int): Offset from target_sequence to start consensus extraction.
219
+ - expected_end (int): Length of the region to extract.
220
+ - column_csv (str): Path to column barcode reference CSV.
221
+ - grna_csv (str): Path to gRNA barcode reference CSV.
222
+ - row_csv (str): Path to row barcode reference CSV.
223
+ - fill_na (bool): Whether to fill unmapped names with raw barcode sequences.
224
+
225
+ For single-end mode (9 elements):
226
+ - Same as above, but r2_chunk is omitted.
227
+
228
+ Returns:
229
+ tuple:
230
+ - df (pd.DataFrame): Full dataframe with columns:
231
+ ['read', 'column_sequence', 'columnID', 'row_sequence', 'rowID',
232
+ 'grna_sequence', 'grna_name']
233
+ - unique_combinations (pd.DataFrame): Count of each unique (rowID, columnID, grna_name) triplet.
234
+ - qc_df (pd.DataFrame): Summary of missing values and total reads.
235
+ """
92
236
  def paired_find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, regex):
93
-
237
+ """
238
+ Processes paired-end FASTQ read chunks to extract consensus barcode sequences and decode them
239
+ using a regex pattern.
240
+
241
+ For each R1–R2 read pair, this function identifies the `target_sequence`, extracts a window of
242
+ defined length with an offset, computes a consensus sequence using base quality scores, and
243
+ applies a regex to extract barcode components.
244
+
245
+ Args:
246
+ r1_chunk (list of str): List of 4-line strings for each R1 read in the chunk.
247
+ r2_chunk (list of str): List of 4-line strings for each R2 read in the chunk.
248
+ target_sequence (str): Nucleotide sequence used as anchor for barcode extraction.
249
+ offset_start (int): Position offset from `target_sequence` to begin extracting barcode.
250
+ expected_end (int): Total length of region to extract after offset.
251
+ regex (str): Regular expression with named groups ('rowID', 'columnID', 'grna')
252
+ to parse barcodes from the extracted consensus sequence.
253
+
254
+ Returns:
255
+ tuple:
256
+ consensus_sequences (list of str): Consensus DNA sequences extracted from read pairs.
257
+ columns (list of str): Extracted column barcode sequences.
258
+ grnas (list of str): Extracted gRNA barcode sequences.
259
+ rows (list of str): Extracted row barcode sequences.
260
+ """
94
261
  consensus_sequences, columns, grnas, rows = [], [], [], []
95
262
  consensus_seq = None
96
263
 
@@ -154,6 +321,26 @@ def process_chunk(chunk_data):
154
321
  return consensus_sequences, columns, grnas, rows
155
322
 
156
323
  def single_find_sequence_in_chunk_reads(r1_chunk, target_sequence, offset_start, expected_end, regex):
324
+ """
325
+ Processes single-end FASTQ read chunks to extract barcode sequences using a target motif and regex pattern.
326
+
327
+ For each R1 read, the function identifies the `target_sequence`, extracts a region starting at an offset
328
+ and of fixed length, pads if necessary, and applies a regex with named groups to decode barcodes.
329
+
330
+ Args:
331
+ r1_chunk (list of str): List of 4-line strings for each R1 read in the chunk.
332
+ target_sequence (str): Anchor sequence to locate the barcode region in R1.
333
+ offset_start (int): Position offset from the end of `target_sequence` to start barcode extraction.
334
+ expected_end (int): Total length of the barcode region to extract.
335
+ regex (str): Regular expression with named groups ('rowID', 'columnID', 'grna') to extract barcodes.
336
+
337
+ Returns:
338
+ tuple:
339
+ consensus_sequences (list of str): Extracted sequences used as barcode consensus (R1 only).
340
+ columns (list of str): Extracted column barcode subsequences.
341
+ grnas (list of str): Extracted gRNA barcode subsequences.
342
+ rows (list of str): Extracted row barcode subsequences.
343
+ """
157
344
 
158
345
  consensus_sequences, columns, grnas, rows = [], [], [], []
159
346
 
@@ -251,6 +438,20 @@ def process_chunk(chunk_data):
251
438
 
252
439
  # Function to save data from the queue
253
440
  def saver_process(save_queue, hdf5_file, save_h5, unique_combinations_csv, qc_csv_file, comp_type, comp_level):
441
+ """
442
+ Continuously reads data from a multiprocessing queue and saves it to disk in various formats.
443
+
444
+ This function is intended to run in a separate process. It terminates when it receives the "STOP" sentinel value.
445
+
446
+ Args:
447
+ save_queue (multiprocessing.Queue): Queue containing tuples of (df, unique_combinations, qc_df).
448
+ hdf5_file (str): Path to the HDF5 file to store full reads (only used if save_h5 is True).
449
+ save_h5 (bool): Whether to save the full reads DataFrame to HDF5.
450
+ unique_combinations_csv (str): Path to the CSV file for aggregated barcode combinations.
451
+ qc_csv_file (str): Path to the CSV file for quality control statistics.
452
+ comp_type (str): Compression algorithm for HDF5 (e.g., 'zlib').
453
+ comp_level (int): Compression level for HDF5.
454
+ """
254
455
  while True:
255
456
  item = save_queue.get()
256
457
  if item == "STOP":
@@ -262,7 +463,33 @@ def saver_process(save_queue, hdf5_file, save_h5, unique_combinations_csv, qc_cs
262
463
  save_qc_df_to_csv(qc_df, qc_csv_file)
263
464
 
264
465
  def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):
265
-
466
+ """
467
+ Processes paired-end FASTQ files in chunks to extract barcoded sequences and generate consensus reads.
468
+
469
+ This function identifies sequences matching a regular expression in both R1 and R2 reads, extracts barcodes,
470
+ and maps them to user-defined identifiers. Processed data is saved incrementally using a separate process.
471
+
472
+ Args:
473
+ r1_file (str): Path to the gzipped R1 FASTQ file.
474
+ r2_file (str): Path to the gzipped R2 FASTQ file.
475
+ regex (str): Regular expression with named capture groups: 'rowID', 'columnID', and 'grna'.
476
+ target_sequence (str): Anchor sequence to align from.
477
+ offset_start (int): Offset from anchor to start consensus extraction.
478
+ expected_end (int): Length of the consensus region to extract.
479
+ column_csv (str): Path to CSV file mapping column barcode sequences to IDs.
480
+ grna_csv (str): Path to CSV file mapping gRNA barcode sequences to names.
481
+ row_csv (str): Path to CSV file mapping row barcode sequences to IDs.
482
+ save_h5 (bool): Whether to save the full reads DataFrame to HDF5.
483
+ comp_type (str): Compression algorithm for HDF5 (e.g., 'zlib').
484
+ comp_level (int): Compression level for HDF5.
485
+ hdf5_file (str): Path to the HDF5 output file.
486
+ unique_combinations_csv (str): Path to CSV file for saving unique row/column/gRNA combinations.
487
+ qc_csv_file (str): Path to CSV file for saving QC summary (e.g., NaN counts).
488
+ chunk_size (int, optional): Number of reads per batch. Defaults to 10000.
489
+ n_jobs (int, optional): Number of parallel workers. Defaults to cpu_count() - 3.
490
+ test (bool, optional): If True, processes only a single chunk and prints the result. Defaults to False.
491
+ fill_na (bool, optional): If True, fills unmapped IDs with raw barcode sequences. Defaults to False.
492
+ """
266
493
  from .utils import count_reads_in_fastq, print_progress
267
494
 
268
495
  # Use cpu_count minus 3 cores if n_jobs isn't specified
@@ -344,7 +571,34 @@ def paired_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
344
571
  save_process.join()
345
572
 
346
573
  def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None, test=False, fill_na=False):
347
-
574
+ """
575
+ Processes single-end FASTQ data in chunks to extract barcoded sequences and map them to identifiers.
576
+
577
+ This function reads gzipped R1 FASTQ data, detects barcode-containing sequences using a target anchor and regex,
578
+ and maps row, column, and gRNA barcodes to user-defined identifiers. Results are processed in parallel
579
+ and saved incrementally via a background process.
580
+
581
+ Args:
582
+ r1_file (str): Path to gzipped R1 FASTQ file.
583
+ r2_file (str): Placeholder for interface consistency; not used in single-end mode.
584
+ regex (str): Regular expression with named capture groups: 'rowID', 'columnID', and 'grna'.
585
+ target_sequence (str): Anchor sequence used to locate barcode region.
586
+ offset_start (int): Offset from anchor to start barcode parsing.
587
+ expected_end (int): Length of the barcode region to extract.
588
+ column_csv (str): Path to CSV file mapping column barcode sequences to IDs.
589
+ grna_csv (str): Path to CSV file mapping gRNA barcode sequences to names.
590
+ row_csv (str): Path to CSV file mapping row barcode sequences to IDs.
591
+ save_h5 (bool): Whether to save the full reads DataFrame to HDF5 format.
592
+ comp_type (str): Compression algorithm for HDF5 (e.g., 'zlib').
593
+ comp_level (int): Compression level for HDF5.
594
+ hdf5_file (str): Output HDF5 file path.
595
+ unique_combinations_csv (str): Output path for CSV summarizing row/column/gRNA combinations.
596
+ qc_csv_file (str): Output path for CSV summarizing missing values and total reads.
597
+ chunk_size (int, optional): Number of reads per batch. Defaults to 10000.
598
+ n_jobs (int, optional): Number of parallel worker processes. Defaults to cpu_count() - 3.
599
+ test (bool, optional): If True, processes only the first chunk and prints its result. Defaults to False.
600
+ fill_na (bool, optional): If True, fills missing mapped IDs with their corresponding barcode sequences. Defaults to False.
601
+ """
348
602
  from .utils import count_reads_in_fastq, print_progress
349
603
 
350
604
  # Use cpu_count minus 3 cores if n_jobs isn't specified
@@ -422,7 +676,40 @@ def single_read_chunked_processing(r1_file, r2_file, regex, target_sequence, off
422
676
  save_process.join()
423
677
 
424
678
  def generate_barecode_mapping(settings={}):
425
-
679
+ """
680
+ Orchestrates barcode extraction and mapping from gzipped sequencing data using user-defined or default settings.
681
+
682
+ This function parses sequencing reads from single-end or paired-end FASTQ (.gz) files, extracts barcode regions
683
+ using a regular expression, maps them to row, column, and gRNA identifiers, and saves the results to disk.
684
+ Results include the full annotated reads (optional), barcode combination counts, and a QC summary.
685
+
686
+ Args:
687
+ settings (dict, optional): Dictionary containing parameters required for barcode mapping. If not provided,
688
+ default values will be applied. Important keys include:
689
+ - 'src' (str): Source directory containing gzipped FASTQ files.
690
+ - 'mode' (str): Either 'single' or 'paired' for single-end or paired-end processing.
691
+ - 'single_direction' (str): If 'single', specifies which read to use ('R1' or 'R2').
692
+ - 'regex' (str): Regular expression with capture groups 'rowID', 'columnID', and 'grna'.
693
+ - 'target_sequence' (str): Anchor sequence to locate barcode start position.
694
+ - 'offset_start' (int): Offset from the anchor to the barcode start.
695
+ - 'expected_end' (int): Expected barcode region length.
696
+ - 'column_csv' (str): CSV file mapping column barcodes to names.
697
+ - 'grna_csv' (str): CSV file mapping gRNA barcodes to names.
698
+ - 'row_csv' (str): CSV file mapping row barcodes to names.
699
+ - 'save_h5' (bool): Whether to save annotated reads to HDF5.
700
+ - 'comp_type' (str): Compression algorithm for HDF5.
701
+ - 'comp_level' (int): Compression level for HDF5.
702
+ - 'chunk_size' (int): Number of reads to process per batch.
703
+ - 'n_jobs' (int): Number of parallel processes for barcode mapping.
704
+ - 'test' (bool): If True, only processes the first chunk for testing.
705
+ - 'fill_na' (bool): If True, fills unmapped barcodes with raw sequence instead of NaN.
706
+
707
+ Side Effects:
708
+ Saves the following files in the output directory:
709
+ - `annotated_reads.h5` (optional): Annotated read information in HDF5 format.
710
+ - `unique_combinations.csv`: Count table of (rowID, columnID, grna_name) triplets.
711
+ - `qc.csv`: Summary of missing values and read counts.
712
+ """
426
713
  from .settings import set_default_generate_barecode_mapping
427
714
  from .utils import save_settings
428
715
  from .io import parse_gz_files
@@ -490,7 +777,23 @@ def generate_barecode_mapping(settings={}):
490
777
 
491
778
  # Function to read the CSV, compute reverse complement, and save it
492
779
  def barecodes_reverse_complement(csv_file):
780
+ """
781
+ Reads a barcode CSV file, computes the reverse complement of each sequence, and saves the result to a new CSV.
782
+
783
+ This function assumes the input CSV contains a column named 'sequence' with DNA barcodes. It computes the
784
+ reverse complement for each sequence and saves the modified DataFrame to a new file with '_RC' appended
785
+ to the original filename.
786
+
787
+ Args:
788
+ csv_file (str): Path to the input CSV file. Must contain a column named 'sequence'.
789
+
790
+ Side Effects:
791
+ - Saves a new CSV file in the same directory with reverse-complemented sequences.
792
+ - Prints the path of the saved file.
493
793
 
794
+ Output:
795
+ New file path format: <original_filename_without_extension>_RC.csv
796
+ """
494
797
  def reverse_complement(sequence):
495
798
  complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
496
799
  return ''.join(complement[base] for base in reversed(sequence))
@@ -512,10 +815,48 @@ def barecodes_reverse_complement(csv_file):
512
815
  print(f"Reverse complement file saved as {new_filename}")
513
816
 
514
817
  def graph_sequencing_stats(settings):
515
-
818
+ """
819
+ Analyze and visualize sequencing quality metrics to determine an optimal fraction threshold
820
+ that maximizes unique gRNA representation per well across plates.
821
+
822
+ This function reads one or more CSV files containing count data, filters out control wells,
823
+ calculates the fraction of reads per gRNA in each well, and identifies the minimum fraction
824
+ required to recover a target average number of unique gRNAs per well. It generates plots to
825
+ help visualize the chosen threshold and spatial distribution of unique gRNA counts.
826
+
827
+ Args:
828
+ settings (dict): Dictionary containing the following keys:
829
+ - 'count_data' (str or list of str): Paths to CSV file(s) with 'grna', 'count', 'rowID', 'columnID' columns.
830
+ - 'target_unique_count' (int): Target number of unique gRNAs per well to recover.
831
+ - 'filter_column' (str): Column name to filter out control wells.
832
+ - 'control_wells' (list): List of control well labels to exclude.
833
+ - 'log_x' (bool): Whether to log-scale the x-axis in the threshold plot.
834
+ - 'log_y' (bool): Whether to log-scale the y-axis in the threshold plot.
835
+
836
+ Returns:
837
+ float: Closest fraction threshold that approximates the target unique gRNA count per well.
838
+
839
+ Side Effects:
840
+ - Saves a PDF plot of unique gRNA count vs fraction threshold.
841
+ - Saves a spatial plate map of unique gRNA counts.
842
+ - Prints threshold and summary statistics.
843
+ - Displays intermediate DataFrames for inspection.
844
+ """
516
845
  from .utils import correct_metadata_column_names, correct_metadata
517
846
 
518
847
  def _plot_density(df, dependent_variable, dst=None):
848
+ """
849
+ Plot a kernel density estimate (KDE) of a specified variable from a DataFrame.
850
+
851
+ Args:
852
+ df (pd.DataFrame): DataFrame containing the data.
853
+ dependent_variable (str): Name of the column to plot.
854
+ dst (str, optional): Directory to save the plot. If None, the plot is not saved.
855
+
856
+ Side Effects:
857
+ - Displays the KDE plot.
858
+ - Saves the plot as 'dependent_variable_density.pdf' in the specified directory if dst is provided.
859
+ """
519
860
  """Plot a density plot of the dependent variable."""
520
861
  plt.figure(figsize=(10, 10))
521
862
  sns.kdeplot(df[dependent_variable], fill=True, alpha=0.6)
@@ -530,8 +871,22 @@ def graph_sequencing_stats(settings):
530
871
 
531
872
  def find_and_visualize_fraction_threshold(df, target_unique_count=5, log_x=False, log_y=False, dst=None):
532
873
  """
533
- Find the fraction threshold where the recalculated unique count matches the target value,
534
- and visualize the relationship between fraction thresholds and unique counts.
874
+ Identify the optimal fraction threshold that yields an average number of unique gRNAs per well
875
+ closest to a specified target, and visualize the relationship between threshold and unique count.
876
+
877
+ Args:
878
+ df (pd.DataFrame): Input DataFrame containing 'fraction', 'plateID', 'rowID', 'columnID', and 'grna' columns.
879
+ target_unique_count (int, optional): Desired average number of unique gRNAs per well. Default is 5.
880
+ log_x (bool, optional): Whether to apply a log scale to the x-axis in the plot.
881
+ log_y (bool, optional): Whether to apply a log scale to the y-axis in the plot.
882
+ dst (str, optional): Directory where the plot will be saved. If None, the plot is not saved.
883
+
884
+ Returns:
885
+ float: The fraction threshold value closest to achieving the target_unique_count.
886
+
887
+ Side Effects:
888
+ - Displays a line plot of unique gRNA counts vs. fraction thresholds.
889
+ - Saves the plot as 'fraction_threshold.pdf' in a subdirectory 'results/' under `dst` if provided.
535
890
  """
536
891
 
537
892
  def _line_plot(df, x='fraction_threshold', y='unique_count', log_x=False, log_y=False):