spacr 0.2.68__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. spacr/__init__.py +2 -1
  2. spacr/core.py +107 -12
  3. spacr/gui.py +3 -2
  4. spacr/gui_core.py +160 -109
  5. spacr/gui_elements.py +190 -18
  6. spacr/gui_utils.py +4 -1
  7. spacr/io.py +1 -1
  8. spacr/measure.py +4 -4
  9. spacr/mediar.py +366 -0
  10. spacr/plot.py +4 -1
  11. spacr/resources/MEDIAR/.git +1 -0
  12. spacr/resources/MEDIAR/.gitignore +18 -0
  13. spacr/resources/MEDIAR/LICENSE +21 -0
  14. spacr/resources/MEDIAR/README.md +189 -0
  15. spacr/resources/MEDIAR/SetupDict.py +39 -0
  16. spacr/resources/MEDIAR/config/baseline.json +60 -0
  17. spacr/resources/MEDIAR/config/mediar_example.json +72 -0
  18. spacr/resources/MEDIAR/config/pred/pred_mediar.json +17 -0
  19. spacr/resources/MEDIAR/config/step1_pretraining/phase1.json +55 -0
  20. spacr/resources/MEDIAR/config/step1_pretraining/phase2.json +58 -0
  21. spacr/resources/MEDIAR/config/step2_finetuning/finetuning1.json +66 -0
  22. spacr/resources/MEDIAR/config/step2_finetuning/finetuning2.json +66 -0
  23. spacr/resources/MEDIAR/config/step3_prediction/base_prediction.json +16 -0
  24. spacr/resources/MEDIAR/config/step3_prediction/ensemble_tta.json +23 -0
  25. spacr/resources/MEDIAR/core/BasePredictor.py +120 -0
  26. spacr/resources/MEDIAR/core/BaseTrainer.py +240 -0
  27. spacr/resources/MEDIAR/core/Baseline/Predictor.py +59 -0
  28. spacr/resources/MEDIAR/core/Baseline/Trainer.py +113 -0
  29. spacr/resources/MEDIAR/core/Baseline/__init__.py +2 -0
  30. spacr/resources/MEDIAR/core/Baseline/utils.py +80 -0
  31. spacr/resources/MEDIAR/core/MEDIAR/EnsemblePredictor.py +105 -0
  32. spacr/resources/MEDIAR/core/MEDIAR/Predictor.py +234 -0
  33. spacr/resources/MEDIAR/core/MEDIAR/Trainer.py +172 -0
  34. spacr/resources/MEDIAR/core/MEDIAR/__init__.py +3 -0
  35. spacr/resources/MEDIAR/core/MEDIAR/utils.py +429 -0
  36. spacr/resources/MEDIAR/core/__init__.py +2 -0
  37. spacr/resources/MEDIAR/core/utils.py +40 -0
  38. spacr/resources/MEDIAR/evaluate.py +71 -0
  39. spacr/resources/MEDIAR/generate_mapping.py +121 -0
  40. spacr/resources/MEDIAR/image/examples/img1.tiff +0 -0
  41. spacr/resources/MEDIAR/image/examples/img2.tif +0 -0
  42. spacr/resources/MEDIAR/image/failure_cases.png +0 -0
  43. spacr/resources/MEDIAR/image/mediar_framework.png +0 -0
  44. spacr/resources/MEDIAR/image/mediar_model.PNG +0 -0
  45. spacr/resources/MEDIAR/image/mediar_results.png +0 -0
  46. spacr/resources/MEDIAR/main.py +125 -0
  47. spacr/resources/MEDIAR/predict.py +70 -0
  48. spacr/resources/MEDIAR/requirements.txt +14 -0
  49. spacr/resources/MEDIAR/train_tools/__init__.py +3 -0
  50. spacr/resources/MEDIAR/train_tools/data_utils/__init__.py +1 -0
  51. spacr/resources/MEDIAR/train_tools/data_utils/custom/CellAware.py +88 -0
  52. spacr/resources/MEDIAR/train_tools/data_utils/custom/LoadImage.py +161 -0
  53. spacr/resources/MEDIAR/train_tools/data_utils/custom/NormalizeImage.py +77 -0
  54. spacr/resources/MEDIAR/train_tools/data_utils/custom/__init__.py +3 -0
  55. spacr/resources/MEDIAR/train_tools/data_utils/custom/modalities.pkl +0 -0
  56. spacr/resources/MEDIAR/train_tools/data_utils/datasetter.py +208 -0
  57. spacr/resources/MEDIAR/train_tools/data_utils/transforms.py +148 -0
  58. spacr/resources/MEDIAR/train_tools/data_utils/utils.py +84 -0
  59. spacr/resources/MEDIAR/train_tools/measures.py +200 -0
  60. spacr/resources/MEDIAR/train_tools/models/MEDIARFormer.py +102 -0
  61. spacr/resources/MEDIAR/train_tools/models/__init__.py +1 -0
  62. spacr/resources/MEDIAR/train_tools/utils.py +70 -0
  63. spacr/resources/MEDIAR_weights/.DS_Store +0 -0
  64. spacr/resources/icons/.DS_Store +0 -0
  65. spacr/resources/icons/plaque.png +0 -0
  66. spacr/resources/images/plate1_E01_T0001F001L01A01Z01C02.tif +0 -0
  67. spacr/resources/images/plate1_E01_T0001F001L01A02Z01C01.tif +0 -0
  68. spacr/resources/images/plate1_E01_T0001F001L01A03Z01C03.tif +0 -0
  69. spacr/sequencing.py +234 -422
  70. spacr/settings.py +16 -10
  71. spacr/utils.py +14 -11
  72. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/METADATA +10 -2
  73. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/RECORD +77 -18
  74. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/LICENSE +0 -0
  75. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/WHEEL +0 -0
  76. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/entry_points.txt +0 -0
  77. {spacr-0.2.68.dist-info → spacr-0.3.0.dist-info}/top_level.txt +0 -0
spacr/sequencing.py CHANGED
@@ -25,6 +25,18 @@ from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord

+from collections import defaultdict
+
+import gzip, re
+from Bio.Seq import Seq
+import pandas as pd
+import numpy as np
+import gzip, re
+from Bio.Seq import Seq
+import pandas as pd
+import numpy as np
+from multiprocessing import Pool, cpu_count
+
 def parse_gz_files(folder_path):
     """
     Parses the .fastq.gz files in the specified folder path and returns a dictionary
@@ -55,474 +67,274 @@ def parse_gz_files(folder_path):
             samples_dict[sample_name]['R2'] = os.path.join(folder_path, gz_file)
     return samples_dict

-def process_chunk_for_consensus(r1_chunk, r2_chunk):
-    """
-    Process a chunk of paired-end sequencing reads to generate consensus sequences.
-
-    Args:
-        r1_chunk (list): List of SeqRecord objects representing the first read in each pair.
-        r2_chunk (list): List of SeqRecord objects representing the second read in each pair.
-
-    Returns:
-        list: List of SeqRecord objects representing the consensus sequences.
-
-    """
-    consensus_records = []
+# Function to map sequences to names (same as your original)
+def map_sequences_to_names(csv_file, sequences, rc):
+    def rev_comp(dna_sequence):
+        complement_dict = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
+        reverse_seq = dna_sequence[::-1]
+        return ''.join([complement_dict[base] for base in reverse_seq])

-    for r1_record, r2_record in zip(r1_chunk, r2_chunk):
-        best_sequence = []
-        best_quality = []
-        for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
-            if qual1 >= qual2:
-                best_sequence.append(base1)
-                best_quality.append(qual1)
-            else:
-                best_sequence.append(base2)
-                best_quality.append(qual2)
-
-        consensus_seq = Seq("".join(best_sequence))
-
-        # Create a new SeqRecord for the consensus sequence
-        consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
-
-        # Add the consensus record to the list
-        consensus_records.append(consensus_record)
+    df = pd.read_csv(csv_file)
+    if rc:
+        df['sequence'] = df['sequence'].apply(rev_comp)

-    return consensus_records
-
-def consensus_sequence(fastq_r1, fastq_r2, output_file, chunk_size=1000000, n_jobs=None):
-    """
-    Calculate the consensus sequence from two FASTQ files (R1 and R2) and write the result to an output file.
-
-    Parameters:
-    - fastq_r1 (str): Path to the R1 FASTQ file.
-    - fastq_r2 (str): Path to the R2 FASTQ file.
-    - output_file (str): Path to the output file where the consensus sequence will be written.
-    - chunk_size (int): Number of reads to process in each chunk. Default is 1000000.
-    - n_jobs (int): Number of parallel processes to use. If None, it will use the number of available CPUs minus 2.
-
-    Returns:
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    print(f'Calculating read count for {fastq_r1} ...')
-    total_reads = count_reads_in_fastq(fastq_r1)
-    chunks_nr = (int(total_reads / chunk_size) + 1) // (n_jobs if n_jobs else cpu_count())
-
-    total_reads_processed = 0
-    chunk_count = 0
-    time_ls = []
-
-    if n_jobs is None:
-        n_jobs = cpu_count() - 2
+    csv_sequences = pd.Series(df['name'].values, index=df['sequence']).to_dict()
+    return [csv_sequences.get(sequence, pd.NA) for sequence in sequences]

-    with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
-        r1_iter = SeqIO.parse(r1_handle, "fastq")
-        r2_iter = SeqIO.parse(r2_handle, "fastq")
-        pool = Pool(processes=n_jobs)
+# Functions to save data (same as your original)
+def save_df_to_hdf5(df, hdf5_file, key='df', comp_type='zlib', comp_level=5):
+    try:
+        with pd.HDFStore(hdf5_file, 'a', complib=comp_type, complevel=comp_level) as store:
+            if key in store:
+                existing_df = store[key]
+                df = pd.concat([existing_df, df], ignore_index=True)
+            store.put(key, df, format='table')
+    except Exception as e:
+        print(f"Error while saving DataFrame to HDF5: {e}")
+
+def save_unique_combinations_to_csv(unique_combinations, csv_file):
+    try:
+        try:
+            existing_df = pd.read_csv(csv_file)
+        except FileNotFoundError:
+            existing_df = pd.DataFrame()

-        while True:
-            start_time = time.time()
+        if not existing_df.empty:
+            unique_combinations = pd.concat([existing_df, unique_combinations])
+            unique_combinations = unique_combinations.groupby(
+                ['row_name', 'column_name', 'grna_name'], as_index=False).sum()

-            r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(n_jobs * chunk_size)) if rec is not None]
-            r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(n_jobs * chunk_size)) if rec is not None]
-
-            # If either chunk is empty, we have reached the end of one or both files
-            if not r1_chunk or not r2_chunk:
-                break
+        unique_combinations.to_csv(csv_file, index=False)
+    except Exception as e:
+        print(f"Error while saving unique combinations to CSV: {e}")

-            chunk_count += 1
-            total_reads_processed += len(r1_chunk)
-
-            # Split the records into chunks to be processed by each core
-            r1_chunked = [r1_chunk[i:i + chunk_size] for i in range(0, len(r1_chunk), chunk_size)]
-            r2_chunked = [r2_chunk[i:i + chunk_size] for i in range(0, len(r2_chunk), chunk_size)]
-
-            # Process each chunk in parallel
-            results = pool.starmap(process_chunk_for_consensus, zip(r1_chunked, r2_chunked))
-
-            # Write the results to the output file
-            for consensus_records in results:
-                SeqIO.write(consensus_records, output_handle, "fastq")
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
-
-    pool.close()
-    pool.join()
-
-def consensus_sequence_v1(fastq_r1, fastq_r2, output_file, chunk_size=1000000):
-    """
-    Generate a consensus sequence from paired-end FASTQ files.
-
-    Args:
-        fastq_r1 (str): Path to the first input FASTQ file.
-        fastq_r2 (str): Path to the second input FASTQ file.
-        output_file (str): Path to the output FASTQ file.
-        chunk_size (int, optional): Number of reads to process in each iteration. Defaults to 1000000.
-
-    Returns:
-        None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    print(f'Calculating read count for {fastq_r1} ...')
-    total_reads = count_reads_in_fastq(fastq_r1)
-    chunks_nr = int(total_reads/chunk_size) + 1
+def save_qc_df_to_csv(qc_df, qc_csv_file):
+    try:
+        try:
+            existing_qc_df = pd.read_csv(qc_csv_file)
+        except FileNotFoundError:
+            existing_qc_df = pd.DataFrame()
+
+        if not existing_qc_df.empty:
+            qc_df = qc_df.add(existing_qc_df, fill_value=0)
+
+        qc_df.to_csv(qc_csv_file, index=False)
+    except Exception as e:
+        print(f"Error while saving QC DataFrame to CSV: {e}")
+
+def extract_sequence_and_quality(sequence, quality, start, end):
+    return sequence[start:end], quality[start:end]
+
+def create_consensus(seq1, qual1, seq2, qual2):
+    consensus_seq = []
+    for i in range(len(seq1)):
+        bases = [(seq1[i], qual1[i]), (seq2[i], qual2[i])]
+        consensus_seq.append(get_consensus_base(bases))
+    return ''.join(consensus_seq)
+
+def get_consensus_base(bases):
+    # Prefer non-'N' bases, if 'N' exists, pick the other one.
+    if bases[0][0] == 'N':
+        return bases[1][0]
+    elif bases[1][0] == 'N':
+        return bases[0][0]
+    else:
+        # Return the base with the highest quality score
+        return bases[0][0] if bases[0][1] >= bases[1][1] else bases[1][0]

-    total_reads = 0
-    chunk_count = 0
-    time_ls = []
+def reverse_complement(seq):
+    return str(Seq(seq).reverse_complement())

-    with gzip.open(fastq_r1, "rt") as r1_handle, gzip.open(fastq_r2, "rt") as r2_handle, gzip.open(output_file, "wt") as output_handle:
-        r1_iter = SeqIO.parse(r1_handle, "fastq")
-        r2_iter = SeqIO.parse(r2_handle, "fastq")
+# Core logic for processing a chunk (same as your original)
+def process_chunk(chunk_data):
+
+    def find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end):
+        i = 0
+        fail_count = 0
+        failed_cases = []
+        regex = r"^(?P<column>.{8})TGCTG.*TAAAC(?P<grna>.{20,21})AACTT.*AGAAG(?P<row>.{8}).*"
+        consensus_sequences, columns, grnas, rows = [], [], [], []

-        while True:
-            start_time = time.time()
-
-            r1_chunk = [rec for rec in (next(r1_iter, None) for _ in range(chunk_size)) if rec is not None]
-            r2_chunk = [rec for rec in (next(r2_iter, None) for _ in range(chunk_size)) if rec is not None]
-
-            # If either chunk is empty, we have reached the end of one or both files
-            if not r1_chunk or not r2_chunk:
-                break
-
-            chunk_count += 1
-            total_reads += len(r1_chunk)
-
-            for r1_record, r2_record in zip(r1_chunk, r2_chunk):
-                best_sequence = []
-                best_quality = []
-                for base1, base2, qual1, qual2 in zip(r1_record.seq, r2_record.seq, r1_record.letter_annotations["phred_quality"], r2_record.letter_annotations["phred_quality"]):
-                    if qual1 >= qual2:
-                        best_sequence.append(base1)
-                        best_quality.append(qual1)
-                    else:
-                        best_sequence.append(base2)
-                        best_quality.append(qual2)
-
-                consensus_seq = Seq("".join(best_sequence))
-
-                # Create a new SeqRecord for the consensus sequence
-                consensus_record = SeqRecord(consensus_seq, id=r1_record.id, description="", letter_annotations={"phred_quality": best_quality})
-
-                # Write the consensus sequence to the output file
-                SeqIO.write(consensus_record, output_handle, "fastq")
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=1, time_ls=time_ls, batch_size=chunk_size, operation_type=" Consensus sequence from R1 & R2")
-
-def save_to_hdf(queue, output_file, complevel=9, compression='zlib'):
-    """
-    Save data from a queue to an HDF file.
-
-    Parameters:
-    - queue: Queue object containing chunks of data to be saved
-    - output_file: Path to the output HDF file
-    - complevel: Compression level (default: 9)
-    - compression: Compression algorithm (default: 'zlib')
-
-    Returns:
-    None
-    """
-    with pd.HDFStore(output_file, mode='a', complevel=complevel, complib=compression) as store:
-        while True:
-            chunk_count, df = queue.get()
-            if df is None:
-                break
-            print(f'Writing chunks to H5PY ...')
-            store.append(f'chunk_{chunk_count}', df, format='table', data_columns=True)
-
-def get_top_two_matches(seq, barcode_dict):
-    """
-    Finds the top two closest matches for a given sequence in a barcode dictionary.
-
-    Args:
-        seq (str): The sequence to find the closest matches for.
-        barcode_dict (dict): A dictionary containing barcodes as keys and their corresponding values.
-
-    Returns:
-        list of tuples: A list containing up to two tuples, each with a barcode match and its score.
-    """
-    results = process.extract(seq, barcode_dict.keys(), scorer=fuzz.ratio, limit=2)
-    matches = [(barcode_dict[result[0]], result[1] / 100.0) for result in results]
-    # Pad the matches list if there are fewer than two results
-    if len(matches) < 2:
-        matches.append((None, 0.0))
-    return matches
-
-def process_chunk_for_mapping(records, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements):
-    """
-    Process a chunk of records for barcode mapping, including highest and second-highest scores.
-
-    Args:
-        records (list): A list of records to process.
-        barcode_mapping (dict): A dictionary mapping barcodes to their corresponding keys.
-        barcode_dicts (dict): A dictionary of barcode dictionaries.
-        barcode_coordinates (dict): A dictionary mapping barcode keys to their start and end coordinates.
-        reverse_complements (dict): A dictionary indicating whether to reverse complement the extracted sequences for each barcode key.
+        for r1_lines, r2_lines in zip(r1_chunk, r2_chunk):
+            r1_header, r1_sequence, r1_plus, r1_quality = r1_lines.split('\n')
+            r2_header, r2_sequence, r2_plus, r2_quality = r2_lines.split('\n')
+            r2_sequence = reverse_complement(r2_sequence)

-    Returns:
-        pandas.DataFrame: A DataFrame containing the processed data.
-    """
-    data = {key: [] for key in barcode_mapping.keys()}
-    seq_data = {f"{key}_seq": [] for key in barcode_mapping.keys()}
-    score_data_1 = {f"{key}_score_1": [] for key in barcode_mapping.keys()}
-    score_data_2 = {f"{key}_score_2": [] for key in barcode_mapping.keys()}
-    sequences = []
+            r1_pos = r1_sequence.find(target_sequence)
+            r2_pos = r2_sequence.find(target_sequence)
+
+            if r1_pos != -1 and r2_pos != -1:
+                r1_start = max(r1_pos + offset_start, 0)
+                r1_end = min(r1_start + expected_end, len(r1_sequence))
+                r2_start = max(r2_pos + offset_start, 0)
+                r2_end = min(r2_start + expected_end, len(r2_sequence))
+
+                r1_seq, r1_qual = extract_sequence_and_quality(r1_sequence, r1_quality, r1_start, r1_end)
+                r2_seq, r2_qual = extract_sequence_and_quality(r2_sequence, r2_quality, r2_start, r2_end)
+
+                if len(r1_seq) < expected_end:
+                    r1_seq += 'N' * (expected_end - len(r1_seq))
+                    r1_qual += '!' * (expected_end - len(r1_qual))
+
+                if len(r2_seq) < expected_end:
+                    r2_seq += 'N' * (expected_end - len(r2_seq))
+                    r2_qual += '!' * (expected_end - len(r2_qual))
+
+                consensus_seq = create_consensus(r1_seq, r1_qual, r2_seq, r2_qual)
+                if len(consensus_seq) >= expected_end:
+                    match = re.match(regex, consensus_seq)
+                    if match:
+                        consensus_sequences.append(consensus_seq)
+                        column_sequence = match.group('column')
+                        grna_sequence = match.group('grna')
+                        row_sequence = match.group('row')
+                        columns.append(column_sequence)
+                        grnas.append(grna_sequence)
+                        rows.append(row_sequence)
+
+        return consensus_sequences, columns, grnas, rows, fail_count
+
+    r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv = chunk_data
+    consensus_sequences, columns, grnas, rows, _ = find_sequence_in_chunk_reads(r1_chunk, r2_chunk, target_sequence, offset_start, expected_end)

-    for record in records:
-        sequences.append(str(record.seq))
-        for key, coord in barcode_coordinates.items():
-            start, end = coord
-            extracted_seq = str(record.seq[start:end])
-
-            if reverse_complements[key]:
-                extracted_seq = str(Seq(extracted_seq).reverse_complement())
-
-            seq_data[f"{key}_seq"].append(extracted_seq)
-
-            if key in barcode_dicts:
-                exact_match = barcode_dicts[key].get(extracted_seq, None)
-                if exact_match:
-                    data[key].append(exact_match)
-                    score_data_1[f"{key}_score_1"].append(1.0)
-                    score_data_2[f"{key}_score_2"].append(0.0)
-                else:
-                    matches = get_top_two_matches(extracted_seq, barcode_dicts[key])
-                    data[key].append(matches[0][0])
-                    score_data_1[f"{key}_score_1"].append(matches[0][1])
-                    score_data_2[f"{key}_score_2"].append(matches[1][1])
-            else:
-                data[key].append(extracted_seq)
-                score_data_1[f"{key}_score_1"].append(0.0)
-                score_data_2[f"{key}_score_2"].append(0.0)
-
-    df = pd.DataFrame(data)
-    df_seq = pd.DataFrame(seq_data)
-    df_score_1 = pd.DataFrame(score_data_1)
-    df_score_2 = pd.DataFrame(score_data_2)
-    df['sequence'] = sequences
-    df = pd.concat([df, df_seq, df_score_1, df_score_2], axis=1)
-    return df
-
-def extract_barcodes_from_fastq(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
-    """
-    Extracts barcodes from a FASTQ file and maps them based on a barcode mapping.
+    column_names = map_sequences_to_names(column_csv, columns, rc=False)
+    grna_names = map_sequences_to_names(grna_csv, grnas, rc=True)
+    row_names = map_sequences_to_names(row_csv, rows, rc=True)
+
+    df = pd.DataFrame({
+        'read': consensus_sequences,
+        'column_sequence': columns,
+        'column_name': column_names,
+        'row_sequence': rows,
+        'row_name': row_names,
+        'grna_sequence': grnas,
+        'grna_name': grna_names
+    })
+
+    qc_df = df.isna().sum().to_frame().T
+    qc_df.columns = df.columns
+    qc_df.index = ["NaN_Counts"]
+    qc_df['total_reads'] = len(df)
+
+    unique_combinations = df.groupby(['row_name', 'column_name', 'grna_name']).size().reset_index(name='count')
+    return df, unique_combinations, qc_df
+
+# Function to save data from the queue
+def saver_process(save_queue, hdf5_file, unique_combinations_csv, qc_csv_file, comp_type, comp_level):
+    while True:
+        item = save_queue.get()
+        if item == "STOP":
+            break
+        df, unique_combinations, qc_df = item
+        save_df_to_hdf5(df, hdf5_file, key='df', comp_type=comp_type, comp_level=comp_level)
+        save_unique_combinations_to_csv(unique_combinations, unique_combinations_csv)
+        save_qc_df_to_csv(qc_df, qc_csv_file)

-    Args:
-        fastq (str): Path to the input FASTQ file.
-        output_file (str): Path to the output file where the mapped barcodes will be saved.
-        chunk_size (int): Number of records to process in each chunk.
-        barcode_mapping (dict): Dictionary containing barcode mapping information.
-            The keys are the names of the barcode sets, and the values are tuples
-            containing the path to the CSV file, barcode coordinates, and reverse complement flag.
-        n_jobs (int, optional): Number of parallel processes to use for mapping. Defaults to None.
-        compression (str, optional): Compression algorithm to use for saving the output file. Defaults to 'zlib'.
-        complevel (int, optional): Compression level to use for saving the output file. Defaults to 9.
+# Updated chunked_processing with improved multiprocessing logic
+def chunked_processing(r1_file, r2_file, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv, save_h5, comp_type, comp_level, hdf5_file, unique_combinations_csv, qc_csv_file, chunk_size=10000, n_jobs=None):

-    Returns:
-    None
-    """
-    from .utils import print_progress, count_reads_in_fastq
-
-    # Ensure the file is deleted before starting
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    # Validate and process barcode mapping
-    barcode_dicts = {}
-    barcode_coordinates = {}
-    reverse_complements = {}
-
-    for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
-        df = pd.read_csv(csv_path)
-        if 'name' not in df.columns or 'sequence' not in df.columns:
-            print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
-            return
-        barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
-        barcode_coordinates[key] = coordinates
-        reverse_complements[key] = reverse_comp
+    from .utils import count_reads_in_fastq, print_progress

+    # Use cpu_count minus 3 cores if n_jobs isn't specified
     if n_jobs is None:
-        n_jobs = cpu_count() - 3 # Reserve one core for saving
-
+        n_jobs = cpu_count() - 3
+
     analyzed_chunks = 0
     chunk_count = 0
     time_ls = []
-
-    print(f'Calculating read count for {fastq} ...')
-    total_reads = count_reads_in_fastq(fastq)
-    chunks_nr = int(total_reads/chunk_size)
-
-    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
-
-    # Create a queue to hold dataframes to be saved
+
+    print(f'Calculating read count for {r1_file}...')
+    total_reads = count_reads_in_fastq(r1_file)
+    chunks_nr = int(total_reads / chunk_size)
+    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {r1_file}...')
+
+    # Queue for saving
     save_queue = Queue()

-    # Start a separate process for saving the data
-    save_process = Process(target=save_to_hdf, args=(save_queue, output_file, complevel, compression))
+    # Start the saving process
+    save_process = Process(target=saver_process, args=(save_queue, hdf5_file, unique_combinations_csv, qc_csv_file, comp_type, comp_level))
     save_process.start()

-    with gzip.open(fastq, "rt") as handle:
-        fastq_iter = SeqIO.parse(handle, "fastq")
-        pool = Pool(processes=n_jobs)
+    pool = Pool(n_jobs)

+    with gzip.open(r1_file, 'rt') as r1, gzip.open(r2_file, 'rt') as r2:
+        fastq_iter = zip(r1, r2)
         while True:
-            # Read n_jobs * chunk_size records into memory
-            records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
-
-            if not records:
-                break
-
-            analyzed_chunks_1 = analyzed_chunks
             start_time = time.time()
-            chunk_count += 1
-            analyzed_chunks = int(chunk_count*n_jobs)
-            analyzed_chunks_ls = list(range(analyzed_chunks_1, analyzed_chunks))
-
-            # Split the records into chunks to be processed by each core
-            chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
-
-            # Process each chunk in parallel
-            dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
-
-            # Queue the dataframes to be saved
-            df = pd.concat(dfs, ignore_index=True)
-            save_queue.put((chunk_count, df))
-
-            end_time = time.time()
-            chunk_time = end_time - start_time
-            time_ls.append(chunk_time)
-
-            for az_chunks in analyzed_chunks_ls:
-                print_progress(files_processed=az_chunks, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type=" Mapping Barcodes")
-
-            del records, chunked_records, dfs, df
-
-    pool.close()
-    pool.join()
-
-    # Send a sentinel value to indicate the saving process should stop
-    save_queue.put((None, None))
-    save_process.join()
-
-def extract_barcodes_from_fastq_v1(fastq, output_file, chunk_size, barcode_mapping, n_jobs=None, compression='zlib', complevel=9):
-    """
-    Extracts barcodes from a FASTQ file and saves the results to an output file.
-
-    Parameters:
-    - fastq (str): Path to the input FASTQ file.
-    - output_file (str): Path to the output file where the barcode data will be saved.
-    - chunk_size (int): Number of records to process in each chunk.
-    - barcode_mapping (dict): Mapping of barcode keys to CSV file paths, barcode coordinates, and reverse complement flags.
-    - n_jobs (int, optional): Number of parallel processes to use for barcode mapping. Defaults to None.
-    - compression (str, optional): Compression algorithm to use for the output file. Defaults to 'zlib'.
-    - complevel (int, optional): Compression level to use for the output file. Defaults to 9.
-    """
-
-    from .utils import print_progress, count_reads_in_fastq
-
-    # Ensure the file is deleted before starting
-    if os.path.exists(output_file):
-        os.remove(output_file)
-
-    # Validate and process barcode mapping
-    barcode_dicts = {}
-    barcode_coordinates = {}
-    reverse_complements = {}
-
-    for key, (csv_path, coordinates, reverse_comp) in barcode_mapping.items():
-        df = pd.read_csv(csv_path)
-        if 'name' not in df.columns or 'sequence' not in df.columns:
-            print(f"Warning: CSV file {csv_path} does not have required columns 'name' and 'sequence'. Aborting.")
-            return
-        barcode_dicts[key] = df.set_index('sequence')['name'].to_dict()
-        barcode_coordinates[key] = coordinates
-        reverse_complements[key] = reverse_comp
-
-    if n_jobs is None:
-        n_jobs = cpu_count() - 2
-
-    chunk_count = 0
-    time_ls = []
-
-    print(f'Calculating read count for {fastq} ...')
-    total_reads = count_reads_in_fastq(fastq)
-    chunks_nr = (int(total_reads/chunk_size) + 1)
-
-    print(f'Mapping barcodes for {total_reads} reads in {chunks_nr} batches for {fastq} ...')
-    with gzip.open(fastq, "rt") as handle:
-        fastq_iter = SeqIO.parse(handle, "fastq")
-        pool = Pool(processes=n_jobs)
-
-        while True:
-            # Read n_jobs * chunk_size records into memory
-            records = [record for _, record in zip(range(n_jobs * chunk_size), fastq_iter)]
+            r1_chunk = []
+            r2_chunk = []
+
+            for _ in range(chunk_size):
+                try:
+                    r1_lines = [r1.readline().strip() for _ in range(4)]
+                    r2_lines = [r2.readline().strip() for _ in range(4)]
+                    r1_chunk.append('\n'.join(r1_lines))
+                    r2_chunk.append('\n'.join(r2_lines))
+                except StopIteration:
+                    break

-            if not records:
+            if not r1_chunk:
                 break

-            start_time = time.time()
             chunk_count += 1
+            chunk_data = (r1_chunk, r2_chunk, target_sequence, offset_start, expected_end, column_csv, grna_csv, row_csv)

-            # Split the records into chunks to be processed by each core
-            chunked_records = [records[i:i + chunk_size] for i in range(0, len(records), chunk_size)]
-
-            # Process each chunk in parallel
-            dfs = pool.starmap(process_chunk_for_mapping, [(chunk, barcode_mapping, barcode_dicts, barcode_coordinates, reverse_complements) for chunk in chunked_records])
+            # Process chunks in parallel
+            result = pool.apply_async(process_chunk, (chunk_data,))
+            df, unique_combinations, qc_df = result.get()

-            # Join the results
-            df = pd.concat(dfs, ignore_index=True)
-
-            # Save to HDF5 with compression
-            print(f'Writing chunk {chunk_count} to H5PY ...')
-            df.to_hdf(output_file, key=f'chunk_{chunk_count}', mode='a', format='table', complevel=complevel, complib=compression)
+            # Queue the results for saving
+            save_queue.put((df, unique_combinations, qc_df))

             end_time = time.time()
             chunk_time = end_time - start_time
             time_ls.append(chunk_time)
-            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=None, operation_type=" Mapping Barcodes")
+            print_progress(files_processed=chunk_count, files_to_process=chunks_nr, n_jobs=n_jobs, time_ls=time_ls, batch_size=chunk_size, operation_type="Mapping Barcodes")

-            del records, chunked_records, dfs, df
+    # Cleanup the pool
+    pool.close()
+    pool.join()

-        pool.close()
-        pool.join()
+    # Send stop signal to saver process
+    save_queue.put("STOP")
+    save_process.join()

 def generate_barecode_mapping(settings={}):
+
     from .settings import set_default_generate_barecode_mapping

     settings = set_default_generate_barecode_mapping(settings)

     samples_dict = parse_gz_files(settings['src'])
+
     for key in samples_dict:
-        if samples_dict[key]['R1'] and samples_dict[key]['R2']:
-            R1 = samples_dict[key]['R1']
-            R2 = samples_dict[key]['R2']
-            consensus_dir = os.path.join(os.path.dirname(R1), 'consensus')
-            os.makedirs(consensus_dir, exist_ok=True)
-            consensus = os.path.join(consensus_dir, f"{key}_consensus.fastq.gz")
-            h5 = os.path.join(consensus_dir, f"{key}_barecodes.h5")
-
-            if not os.path.exists(consensus):
-                consensus_sequence(R1, R2, consensus, settings['chunk_size'])
-            else:
-                print(f"Consensus file {consensus} already exists. Mapping barecodes.")
-
-            extract_barcodes_from_fastq(fastq=consensus,
-                                        output_file=h5,
-                                        chunk_size=settings['chunk_size'],
-                                        barcode_mapping=settings['barcode_mapping'],
-                                        n_jobs=settings['n_jobs'],
-                                        compression=settings['compression'],
-                                        complevel=settings['complevel'])

+        if samples_dict[key]['R1'] and samples_dict[key]['R2']:
+
+            dst = os.path.join(settings['src'], key)
+            hdf5_file = os.path.join(dst, 'annotated_reads.h5')
+            unique_combinations_csv = os.path.join(dst, 'unique_combinations.csv')
+            qc_csv_file = os.path.join(dst, 'qc.csv')
+            os.makedirs(dst, exist_ok=True)
+
+            print(f'Analyzing reads from sample {key}')
+
+            chunked_processing(r1_file=samples_dict[key]['R1'],
+                               r2_file=samples_dict[key]['R2'],
+                               target_sequence=settings['target_sequence'],
+                               offset_start=settings['offset_start'],
+                               expected_end=settings['expected_end'],
+                               column_csv=settings['column_csv'],
+                               grna_csv=settings['grna_csv'],
+                               row_csv=settings['row_csv'],
+                               save_h5 = settings['save_h5'],
+                               comp_type = settings['comp_type'],
+                               comp_level=settings['comp_level'],
+                               hdf5_file=hdf5_file,
+                               unique_combinations_csv=unique_combinations_csv,
+                               qc_csv_file=qc_csv_file,
+                               chunk_size=settings['chunk_size'],
+                               n_jobs=settings['n_jobs'])
