phykit-2.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phykit/__init__.py +0 -0
- phykit/__main__.py +6 -0
- phykit/helpers/__init__.py +0 -0
- phykit/helpers/boolean_argument_parsing.py +12 -0
- phykit/helpers/caching.py +201 -0
- phykit/helpers/files.py +125 -0
- phykit/helpers/parallel.py +305 -0
- phykit/helpers/stats_summary.py +64 -0
- phykit/helpers/streaming.py +152 -0
- phykit/phykit.py +2862 -0
- phykit/services/__init__.py +0 -0
- phykit/services/alignment/__init__.py +17 -0
- phykit/services/alignment/alignment_length.py +16 -0
- phykit/services/alignment/alignment_length_no_gaps.py +69 -0
- phykit/services/alignment/alignment_recoding.py +89 -0
- phykit/services/alignment/base.py +103 -0
- phykit/services/alignment/column_score.py +66 -0
- phykit/services/alignment/compositional_bias_per_site.py +98 -0
- phykit/services/alignment/create_concatenation_matrix.py +254 -0
- phykit/services/alignment/dna_threader.py +145 -0
- phykit/services/alignment/evolutionary_rate_per_site.py +85 -0
- phykit/services/alignment/faidx.py +21 -0
- phykit/services/alignment/gc_content.py +94 -0
- phykit/services/alignment/pairwise_identity.py +159 -0
- phykit/services/alignment/parsimony_informative_sites.py +81 -0
- phykit/services/alignment/rcv.py +14 -0
- phykit/services/alignment/rcvt.py +47 -0
- phykit/services/alignment/rename_fasta_entries.py +53 -0
- phykit/services/alignment/sum_of_pairs_score.py +157 -0
- phykit/services/alignment/variable_sites.py +54 -0
- phykit/services/base.py +9 -0
- phykit/services/tree/__init__.py +29 -0
- phykit/services/tree/base.py +178 -0
- phykit/services/tree/bipartition_support_stats.py +48 -0
- phykit/services/tree/branch_length_multiplier.py +37 -0
- phykit/services/tree/collapse_branches.py +27 -0
- phykit/services/tree/covarying_evolutionary_rates.py +272 -0
- phykit/services/tree/dvmc.py +37 -0
- phykit/services/tree/evolutionary_rate.py +17 -0
- phykit/services/tree/hidden_paralogy_check.py +128 -0
- phykit/services/tree/internal_branch_stats.py +77 -0
- phykit/services/tree/internode_labeler.py +33 -0
- phykit/services/tree/last_common_ancestor_subtree.py +35 -0
- phykit/services/tree/lb_score.py +196 -0
- phykit/services/tree/monophyly_check.py +106 -0
- phykit/services/tree/nearest_neighbor_interchange.py +140 -0
- phykit/services/tree/patristic_distances.py +113 -0
- phykit/services/tree/polytomy_test.py +546 -0
- phykit/services/tree/print_tree.py +28 -0
- phykit/services/tree/prune_tree.py +40 -0
- phykit/services/tree/rename_tree_tips.py +64 -0
- phykit/services/tree/rf_distance.py +136 -0
- phykit/services/tree/root_tree.py +35 -0
- phykit/services/tree/saturation.py +209 -0
- phykit/services/tree/spurious_sequence.py +75 -0
- phykit/services/tree/terminal_branch_stats.py +87 -0
- phykit/services/tree/tip_labels.py +18 -0
- phykit/services/tree/tip_to_tip_distance.py +41 -0
- phykit/services/tree/tip_to_tip_node_distance.py +41 -0
- phykit/services/tree/total_tree_length.py +25 -0
- phykit/services/tree/treeness.py +16 -0
- phykit/services/tree/treeness_over_rcv.py +40 -0
- phykit/version.py +1 -0
- phykit-2.1.0.dist-info/METADATA +150 -0
- phykit-2.1.0.dist-info/RECORD +69 -0
- phykit-2.1.0.dist-info/WHEEL +5 -0
- phykit-2.1.0.dist-info/entry_points.txt +121 -0
- phykit-2.1.0.dist-info/licenses/LICENSE.md +7 -0
- phykit-2.1.0.dist-info/top_level.txt +1 -0
phykit/__init__.py
ADDED
File without changes

phykit/__main__.py
ADDED
File without changes
phykit/helpers/caching.py
ADDED
@@ -0,0 +1,201 @@

"""
Caching utilities for expensive computations
"""

import hashlib
import pickle
import os
import tempfile
from functools import wraps, lru_cache
from typing import Any, Callable, Optional
import json


class ResultCache:
    """
    File-based cache for expensive computation results.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        """
        Initialize cache.

        Args:
            cache_dir: Directory for cache files (uses temp dir if None)
        """
        if cache_dir is None:
            cache_dir = os.path.join(tempfile.gettempdir(), 'phykit_cache')

        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

    def _get_cache_key(self, *args, **kwargs) -> str:
        """Generate a unique cache key from function arguments."""
        # Create a string representation of arguments
        key_parts = []

        for arg in args:
            if isinstance(arg, (str, int, float, bool)):
                key_parts.append(str(arg))
            elif hasattr(arg, '__dict__'):
                # For objects, use their attributes
                key_parts.append(json.dumps(vars(arg), sort_keys=True, default=str))
            else:
                key_parts.append(str(arg))

        for k, v in sorted(kwargs.items()):
            key_parts.append(f"{k}={v}")

        key_string = "_".join(key_parts)
        return hashlib.md5(key_string.encode()).hexdigest()

    def get(self, cache_key: str) -> Any:
        """Retrieve cached result, or None on a miss."""
        cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'rb') as f:
                    return pickle.load(f)
            except Exception:
                # Cache corrupted, remove it
                os.remove(cache_file)

        return None

    def set(self, cache_key: str, value: Any) -> None:
        """Store result in cache."""
        cache_file = os.path.join(self.cache_dir, f"{cache_key}.pkl")

        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(value, f)
        except Exception:
            # Caching failed, continue without caching
            pass

    def clear(self) -> None:
        """Clear all cached results."""
        for file in os.listdir(self.cache_dir):
            if file.endswith('.pkl'):
                os.remove(os.path.join(self.cache_dir, file))


def cached_computation(cache_instance: Optional[ResultCache] = None):
    """
    Decorator for caching expensive computation results.

    Usage:
        @cached_computation()
        def expensive_function(param1, param2):
            # Expensive computation
            return result
    """
    if cache_instance is None:
        cache_instance = ResultCache()

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(*args, **kwargs):
            # Generate cache key
            cache_key = cache_instance._get_cache_key(func.__name__, *args, **kwargs)

            # Try to get cached result
            cached_result = cache_instance.get(cache_key)
            if cached_result is not None:
                return cached_result

            # Compute result
            result = func(*args, **kwargs)

            # Cache result
            cache_instance.set(cache_key, result)

            return result

        # Add method to clear cache for this function
        wrapper.clear_cache = cache_instance.clear

        return wrapper

    return decorator


# Specialized caching for tree operations
@lru_cache(maxsize=128)
def cached_tree_distance(tree_pickle: bytes, tip1: str, tip2: str) -> float:
    """
    Cache tree distance calculations.

    Args:
        tree_pickle: Pickled tree object
        tip1: First tip name
        tip2: Second tip name

    Returns:
        Distance between tips
    """
    tree = pickle.loads(tree_pickle)
    return tree.distance(tip1, tip2)


# Specialized caching for alignment operations
class AlignmentCache:
    """
    Specialized cache for alignment operations.

    The dict lookups below already memoize results; wrapping the getters
    in functools.lru_cache would return stale entries after set_column/
    set_stats, so the getters are plain methods.
    """

    def __init__(self):
        self._column_cache = {}
        self._stats_cache = {}

    def get_column(self, alignment_hash: str, column_idx: int) -> str:
        """
        Get cached alignment column.
        """
        return self._column_cache.get(f"{alignment_hash}_{column_idx}")

    def set_column(self, alignment_hash: str, column_idx: int, column: str) -> None:
        """
        Cache alignment column.
        """
        self._column_cache[f"{alignment_hash}_{column_idx}"] = column

    def get_stats(self, alignment_hash: str, stat_type: str) -> Any:
        """
        Get cached alignment statistics.
        """
        return self._stats_cache.get(f"{alignment_hash}_{stat_type}")

    def set_stats(self, alignment_hash: str, stat_type: str, stats: Any) -> None:
        """
        Cache alignment statistics.
        """
        self._stats_cache[f"{alignment_hash}_{stat_type}"] = stats

    def clear(self) -> None:
        """
        Clear all caches.
        """
        self._column_cache.clear()
        self._stats_cache.clear()


# Global cache instances
_result_cache = ResultCache()
_alignment_cache = AlignmentCache()


def get_result_cache() -> ResultCache:
    """Get global result cache instance."""
    return _result_cache


def get_alignment_cache() -> AlignmentCache:
    """Get global alignment cache instance."""
    return _alignment_cache
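A minimal usage sketch for the decorator above. The function slow_sum and its arguments are hypothetical stand-ins, not part of the package; results are pickled under the phykit_cache temp directory.

# Hypothetical usage of phykit.helpers.caching (illustrative only)
from phykit.helpers.caching import cached_computation

@cached_computation()
def slow_sum(values: tuple) -> float:
    # stands in for an expensive computation
    return sum(values)

first = slow_sum((1.0, 2.0, 3.0))   # computed, then written to the cache dir
second = slow_sum((1.0, 2.0, 3.0))  # same key, served from the on-disk cache
slow_sum.clear_cache()              # removes every .pkl file in the cache dir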
phykit/helpers/files.py
ADDED
@@ -0,0 +1,125 @@

from enum import Enum
import sys
from typing import Tuple, Optional
from functools import lru_cache
import hashlib
import os

from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment


class FileFormat(Enum):
    fasta = "fasta"
    clustal = "clustal"
    maf = "maf"
    mauve = "mauve"
    phylip = "phylip"
    phylip_seq = "phylip-sequential"
    phylip_rel = "phylip-relaxed"
    stockholm = "stockholm"


def _get_file_hash(file_path: str) -> str:
    """Calculate a hash for file content to use as cache key."""
    # Use file path, size, and modification time for cache key
    # This is faster than hashing file contents
    stat = os.stat(file_path)
    cache_key = f"{file_path}_{stat.st_size}_{stat.st_mtime}"
    return hashlib.md5(cache_key.encode()).hexdigest()


def _detect_format_by_content(file_path: str) -> Optional[str]:
    """Attempt to detect file format by examining file content."""
    with open(file_path, 'r') as f:
        first_line = f.readline().strip()

    # Quick format detection based on first line
    if first_line.startswith('>'):
        return 'fasta'
    elif first_line.startswith('CLUSTAL'):
        return 'clustal'
    elif first_line.startswith('#'):
        # Could be Stockholm
        if 'STOCKHOLM' in first_line:
            return 'stockholm'
    elif first_line.isdigit() or (len(first_line.split()) == 2 and
                                  first_line.split()[0].isdigit()):
        return 'phylip'

    return None


@lru_cache(maxsize=32)
def _cached_alignment_read(file_hash: str, file_path: str, file_format: str) -> Tuple[MultipleSeqAlignment, bool]:
    """Cached reading of alignment files."""
    with open(file_path) as f:
        alignment = AlignIO.read(f, file_format)
    return alignment, is_protein_alignment(alignment)


def get_alignment_and_format(
    alignment_file_path: str
) -> Tuple[MultipleSeqAlignment, str, bool]:
    # Check if file exists first
    if not os.path.exists(alignment_file_path):
        print(f"{alignment_file_path} corresponds to no such file.")
        print("Please check file name and pathing")
        sys.exit(2)

    # Try to detect format by content first
    detected_format = _detect_format_by_content(alignment_file_path)

    # Get file hash for caching
    file_hash = _get_file_hash(alignment_file_path)

    # If format was detected, try it first
    if detected_format:
        try:
            alignment, is_protein = _cached_alignment_read(
                file_hash, alignment_file_path, detected_format
            )
            return alignment, detected_format, is_protein
        except (ValueError, AssertionError):
            pass

    # Fall back to trying all formats
    for fileFormat in FileFormat:
        # Skip the already tried format
        if detected_format and fileFormat.value == detected_format:
            continue

        try:
            alignment, is_protein = _cached_alignment_read(
                file_hash, alignment_file_path, fileFormat.value
            )
            return alignment, fileFormat.value, is_protein
        except (ValueError, AssertionError):
            continue

    # If we get here, no format worked
    print(f"Could not determine format for {alignment_file_path}")
    print("Please ensure the file is in a supported format")
    sys.exit(2)


def is_protein_alignment(alignment: MultipleSeqAlignment) -> bool:
    nucleotide_set = {
        "A", "C", "G", "T", "U", "-", "N", "?", "*"
    }

    for record in alignment:
        seq_set = set(record.seq.upper())
        if seq_set - nucleotide_set:
            # if there are chars that are not in the nucl set,
            # it's likely a protein sequence
            return True

    return False


def read_single_column_file_to_list(single_col_file_path: str) -> list:
    try:
        with open(single_col_file_path) as f:
            return [line.rstrip("\n").strip() for line in f]
    except FileNotFoundError:
        print(f"{single_col_file_path} corresponds to no such file or directory.")
        print("Please check file name and pathing")
        sys.exit(2)
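A short, hypothetical call into the helper above; "my_alignment.fa" is a placeholder path, and the printed fields come straight from the returned triple.

# Illustrative usage of phykit.helpers.files (placeholder file path)
from phykit.helpers.files import get_alignment_and_format

alignment, fmt, is_protein = get_alignment_and_format("my_alignment.fa")
print(fmt)  # e.g. "fasta", detected from the first line of the file
print(len(alignment), alignment.get_alignment_length())  # taxa, columns
print(is_protein)  # True if any residue falls outside the nucleotide set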
phykit/helpers/parallel.py
ADDED
@@ -0,0 +1,305 @@

"""
Parallel processing utilities for batch operations
"""

import multiprocessing as mp
from functools import partial
from typing import List, Any, Callable, Optional, Tuple
import numpy as np
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import sys


class ParallelProcessor:
    """
    Utility class for parallel processing of batch operations.
    """

    @staticmethod
    def get_optimal_workers(data_size: int, min_chunk_size: int = 10) -> int:
        """
        Determine optimal number of workers based on data size.

        Args:
            data_size: Size of data to process
            min_chunk_size: Minimum size per chunk

        Returns:
            Optimal number of workers
        """
        max_workers = mp.cpu_count()
        optimal_workers = min(max_workers, max(1, data_size // min_chunk_size))
        return min(optimal_workers, 8)  # Cap at 8 to avoid overhead

    @staticmethod
    def chunk_data(data: List[Any], num_chunks: int) -> List[List[Any]]:
        """
        Split data into chunks for parallel processing.

        Args:
            data: Data to split
            num_chunks: Number of chunks

        Returns:
            List of data chunks
        """
        chunk_size = max(1, len(data) // num_chunks)
        chunks = []

        for i in range(0, len(data), chunk_size):
            chunk = data[i:i + chunk_size]
            if chunk:
                chunks.append(chunk)

        return chunks

    @staticmethod
    def parallel_map(
        func: Callable,
        data: List[Any],
        num_workers: Optional[int] = None,
        use_threads: bool = False,
        show_progress: bool = False
    ) -> List[Any]:
        """
        Apply function to data in parallel.

        Note: with use_threads=False, func must be picklable (defined at
        module level), since ProcessPoolExecutor is used.

        Args:
            func: Function to apply
            data: Data to process
            num_workers: Number of workers (auto-determined if None)
            use_threads: Use threads instead of processes
            show_progress: Show progress bar

        Returns:
            List of results
        """
        if not data:
            return []

        # Determine number of workers
        if num_workers is None:
            num_workers = ParallelProcessor.get_optimal_workers(len(data))

        # For small datasets, use sequential processing
        if len(data) < 20 or num_workers == 1:
            return [func(item) for item in data]

        # Choose executor type
        executor_class = ThreadPoolExecutor if use_threads else ProcessPoolExecutor

        results = []
        with executor_class(max_workers=num_workers) as executor:
            if show_progress and sys.stderr.isatty():
                try:
                    from tqdm import tqdm
                    futures = {executor.submit(func, item): i for i, item in enumerate(data)}
                    results = [None] * len(data)

                    for future in tqdm(as_completed(futures), total=len(data), desc="Processing"):
                        idx = futures[future]
                        results[idx] = future.result()
                except ImportError:
                    # Fallback without progress bar
                    results = list(executor.map(func, data))
            else:
                results = list(executor.map(func, data))

        return results

    @staticmethod
    def parallel_reduce(
        func: Callable,
        data: List[Any],
        reduce_func: Callable,
        initial_value: Any = None,
        num_workers: Optional[int] = None
    ) -> Any:
        """
        Apply function to data in parallel and reduce results.

        Args:
            func: Function to apply to each item
            data: Data to process
            reduce_func: Function to reduce results
            initial_value: Initial value for reduction
            num_workers: Number of workers

        Returns:
            Reduced result
        """
        # Apply function in parallel
        results = ParallelProcessor.parallel_map(func, data, num_workers)

        # Reduce results
        if initial_value is not None:
            result = initial_value
            for item in results:
                result = reduce_func(result, item)
        else:
            if not results:
                return None
            result = results[0]
            for item in results[1:]:
                result = reduce_func(result, item)

        return result


class BatchFileProcessor:
    """
    Process multiple files in parallel.
    """

    @staticmethod
    def process_files(
        file_paths: List[str],
        processing_func: Callable,
        num_workers: Optional[int] = None,
        aggregate_func: Optional[Callable] = None
    ) -> Any:
        """
        Process multiple files in parallel.

        Args:
            file_paths: List of file paths
            processing_func: Function to process each file
            num_workers: Number of workers
            aggregate_func: Function to aggregate results

        Returns:
            Processed results or aggregated result
        """
        if not file_paths:
            return []

        # Process files in parallel
        results = ParallelProcessor.parallel_map(
            processing_func,
            file_paths,
            num_workers,
            show_progress=True
        )

        # Aggregate results if function provided
        if aggregate_func:
            return aggregate_func(results)

        return results

    @staticmethod
    def process_file_pairs(
        file_pairs: List[Tuple[str, str]],
        processing_func: Callable,
        num_workers: Optional[int] = None
    ) -> List[Any]:
        """
        Process pairs of files in parallel.

        Args:
            file_pairs: List of file path pairs
            processing_func: Function to process each pair
            num_workers: Number of workers

        Returns:
            List of results
        """
        def process_pair(pair):
            return processing_func(pair[0], pair[1])

        # process_pair is a closure and cannot be pickled for worker
        # processes, so run it on the thread pool
        return ParallelProcessor.parallel_map(
            process_pair,
            file_pairs,
            num_workers,
            use_threads=True
        )


class NumpyParallel:
    """
    Utilities for parallel NumPy operations.
    """

    @staticmethod
    def parallel_apply_along_axis(
        func: Callable,
        axis: int,
        array: np.ndarray,
        num_workers: Optional[int] = None
    ) -> np.ndarray:
        """
        Apply function along axis in parallel.

        Args:
            func: Function to apply
            axis: Axis along which to apply function
            array: NumPy array
            num_workers: Number of workers

        Returns:
            Result array
        """
        if axis == 0:
            # Process columns (lambdas cannot be pickled for worker
            # processes, so use the thread pool)
            results = ParallelProcessor.parallel_map(
                lambda col: func(array[:, col]),
                list(range(array.shape[1])),
                num_workers,
                use_threads=True
            )
            return np.array(results).T
        elif axis == 1:
            # Process rows
            results = ParallelProcessor.parallel_map(
                lambda row: func(array[row, :]),
                list(range(array.shape[0])),
                num_workers,
                use_threads=True
            )
            return np.array(results)
        else:
            raise ValueError(f"Unsupported axis: {axis}")

    @staticmethod
    def parallel_pairwise_operation(
        items: List[Any],
        operation_func: Callable,
        num_workers: Optional[int] = None,
        symmetric: bool = True
    ) -> np.ndarray:
        """
        Perform pairwise operations in parallel.

        Args:
            items: List of items
            operation_func: Function to apply to pairs
            num_workers: Number of workers
            symmetric: Whether operation is symmetric

        Returns:
            Matrix of pairwise results
        """
        n = len(items)
        result_matrix = np.zeros((n, n))

        # Generate pairs
        pairs = []
        for i in range(n):
            for j in range(i + 1 if symmetric else 0, n):
                pairs.append((i, j, items[i], items[j]))

        # Process pairs in parallel (nested function, so use threads)
        def process_pair(pair_data):
            i, j, item1, item2 = pair_data
            return i, j, operation_func(item1, item2)

        results = ParallelProcessor.parallel_map(
            process_pair,
            pairs,
            num_workers,
            use_threads=True
        )

        # Fill result matrix
        for i, j, value in results:
            result_matrix[i, j] = value
            if symmetric:
                result_matrix[j, i] = value

        return result_matrix
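A hypothetical driver for the utilities above. sequence_length is a stand-in workload; with use_threads=False the callable must be defined at module level so ProcessPoolExecutor can pickle it, and the multiprocessing entry point belongs under a __main__ guard.

# Illustrative usage of phykit.helpers.parallel (stand-in workload)
from phykit.helpers.parallel import ParallelProcessor

def sequence_length(seq: str) -> int:
    return len(seq)

if __name__ == "__main__":
    seqs = ["ACGT" * n for n in range(1, 101)]
    # executor.map preserves input order, so lengths[i] matches seqs[i]
    lengths = ParallelProcessor.parallel_map(sequence_length, seqs)
    print(lengths[:5])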