RNApolis 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnapolis/distiller.py +118 -50
- {rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/METADATA +1 -1
- {rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/RECORD +7 -7
- {rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/WHEEL +0 -0
- {rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/entry_points.txt +0 -0
- {rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/licenses/LICENSE +0 -0
- {rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/top_level.txt +0 -0
rnapolis/distiller.py
CHANGED
@@ -35,7 +35,10 @@ def parse_arguments():
|
|
35
35
|
)
|
36
36
|
|
37
37
|
parser.add_argument(
|
38
|
-
"files",
|
38
|
+
"files",
|
39
|
+
nargs="*",
|
40
|
+
type=Path,
|
41
|
+
help="Input mmCIF or PDB files to analyze (use '-' or omit to read paths from stdin)",
|
39
42
|
)
|
40
43
|
|
41
44
|
parser.add_argument(
|
@@ -82,16 +85,21 @@ def parse_arguments():
|
|
82
85
|
parser.add_argument(
|
83
86
|
"--mode",
|
84
87
|
choices=["exact", "approximate"],
|
85
|
-
default="
|
86
|
-
help="Clustering mode switch: --mode
|
87
|
-
"--mode
|
88
|
+
default="approximate",
|
89
|
+
help="Clustering mode switch: --mode approximate (default) performs faster feature-based PCA + FAISS clustering, "
|
90
|
+
"--mode exact performs rigorous nRMSD clustering",
|
88
91
|
)
|
89
92
|
|
90
93
|
parser.add_argument(
|
91
94
|
"--radius",
|
92
95
|
type=float,
|
93
|
-
|
94
|
-
|
96
|
+
action="append",
|
97
|
+
default=[1.0, 2.0, 4.0, 8.0],
|
98
|
+
help=(
|
99
|
+
"Radius in PCA-reduced space for redundancy detection "
|
100
|
+
"(approximate mode). Can be supplied multiple times; "
|
101
|
+
"results will be produced for each value (default: 1, 2, 4, 8)."
|
102
|
+
),
|
95
103
|
)
|
96
104
|
|
97
105
|
return parser.parse_args()
|
@@ -245,7 +253,12 @@ def validate_nucleotide_counts(
|
|
245
253
|
"""
|
246
254
|
nucleotide_counts = []
|
247
255
|
|
248
|
-
for structure, file_path in
|
256
|
+
for structure, file_path in tqdm(
|
257
|
+
zip(structures, file_paths),
|
258
|
+
total=len(structures),
|
259
|
+
desc="Validating nucleotide counts",
|
260
|
+
unit="structure",
|
261
|
+
):
|
249
262
|
nucleotide_residues = [
|
250
263
|
residue for residue in structure.residues if residue.is_nucleotide
|
251
264
|
]
|
@@ -521,13 +534,28 @@ def featurize_structure(structure: Structure) -> np.ndarray:
|
|
521
534
|
return np.asarray(feats, dtype=np.float32)
|
522
535
|
|
523
536
|
|
524
|
-
def
|
537
|
+
def run_approximate_multiple(
|
538
|
+
structures: List[Structure],
|
539
|
+
file_paths: List[Path],
|
540
|
+
radii: List[float],
|
541
|
+
output_json: Optional[str],
|
542
|
+
) -> None:
|
525
543
|
"""
|
526
|
-
Approximate mode:
|
544
|
+
Approximate mode (multi-radius): compute PCA once, then perform clustering
|
545
|
+
for each radius value provided. This avoids redundant dimensionality reduction.
|
527
546
|
"""
|
528
|
-
|
547
|
+
if not radii:
|
548
|
+
print("Error: No radius values supplied", file=sys.stderr)
|
549
|
+
sys.exit(1)
|
529
550
|
|
530
|
-
|
551
|
+
# ------------------------------------------------------------------
|
552
|
+
# 1. Feature extraction
|
553
|
+
# ------------------------------------------------------------------
|
554
|
+
print("\nRunning approximate mode (feature-based PCA + FAISS)")
|
555
|
+
feature_vectors = [
|
556
|
+
featurize_structure(s)
|
557
|
+
for s in tqdm(structures, desc="Featurizing", unit="structure")
|
558
|
+
]
|
531
559
|
feature_lengths = {len(v) for v in feature_vectors}
|
532
560
|
if len(feature_lengths) != 1:
|
533
561
|
print("Error: Inconsistent feature lengths among structures", file=sys.stderr)
|
@@ -536,48 +564,74 @@ def run_approximate(structures: List[Structure], file_paths: List[Path], args) -
|
|
536
564
|
X = np.stack(feature_vectors).astype(np.float32)
|
537
565
|
print(f"Feature matrix shape: {X.shape}")
|
538
566
|
|
567
|
+
# ------------------------------------------------------------------
|
568
|
+
# 2. PCA transformation (fit once)
|
569
|
+
# ------------------------------------------------------------------
|
539
570
|
pca = PCA(n_components=0.95, svd_solver="full", random_state=0)
|
540
571
|
X_red = pca.fit_transform(X).astype(np.float32)
|
541
572
|
d = X_red.shape[1]
|
542
573
|
print(f"PCA reduced to {d} dimensions (95 % variance)")
|
543
574
|
|
575
|
+
# ------------------------------------------------------------------
|
576
|
+
# 3. Build FAISS index once
|
577
|
+
# ------------------------------------------------------------------
|
544
578
|
index = faiss.IndexFlatL2(d)
|
545
579
|
index.add(X_red)
|
546
|
-
radius_sq = args.radius**2
|
547
|
-
|
548
|
-
visited: set[int] = set()
|
549
|
-
clusters: List[List[int]] = []
|
550
|
-
|
551
|
-
for idx in range(len(structures)):
|
552
|
-
if idx in visited:
|
553
|
-
continue
|
554
|
-
D, I = index.search(X_red[idx : idx + 1], len(structures))
|
555
|
-
cluster = [int(i) for dist, i in zip(D[0], I[0]) if dist <= radius_sq]
|
556
|
-
clusters.append(cluster)
|
557
|
-
visited.update(cluster)
|
558
|
-
|
559
|
-
print(f"\nIdentified {len(clusters)} representatives with radius {args.radius}")
|
560
|
-
for cluster in clusters:
|
561
|
-
rep = cluster[0]
|
562
|
-
redundants = cluster[1:]
|
563
|
-
print(f"Representative: {file_paths[rep]}")
|
564
|
-
for r in redundants:
|
565
|
-
print(f" Redundant: {file_paths[r]}")
|
566
580
|
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
581
|
+
# ------------------------------------------------------------------
|
582
|
+
# 4. Cluster for each radius
|
583
|
+
# ------------------------------------------------------------------
|
584
|
+
results_for_json: List[dict] = []
|
585
|
+
for radius in radii:
|
586
|
+
radius_sq = radius**2
|
587
|
+
visited: set[int] = set()
|
588
|
+
clusters: List[List[int]] = []
|
589
|
+
|
590
|
+
for idx in range(len(structures)):
|
591
|
+
if idx in visited:
|
592
|
+
continue
|
593
|
+
D, I = index.search(X_red[idx : idx + 1], len(structures))
|
594
|
+
cluster = [int(i) for dist, i in zip(D[0], I[0]) if dist <= radius_sq]
|
595
|
+
clusters.append(cluster)
|
596
|
+
visited.update(cluster)
|
597
|
+
|
598
|
+
print(f"\nIdentified {len(clusters)} representatives with radius {radius}")
|
599
|
+
if output_json is None:
|
600
|
+
for cluster in clusters:
|
601
|
+
rep = cluster[0]
|
602
|
+
redundants = cluster[1:]
|
603
|
+
print(f"Representative: {file_paths[rep]}")
|
604
|
+
for r in redundants:
|
605
|
+
print(f" Redundant: {file_paths[r]}")
|
606
|
+
|
607
|
+
if output_json is not None:
|
608
|
+
results_for_json.append(
|
571
609
|
{
|
572
|
-
"
|
573
|
-
"
|
610
|
+
"radius": radius,
|
611
|
+
"n_clusters": len(clusters),
|
612
|
+
"clusters": [
|
613
|
+
{
|
614
|
+
"representative": str(file_paths[c[0]]),
|
615
|
+
"members": [str(file_paths[m]) for m in c[1:]],
|
616
|
+
}
|
617
|
+
for c in clusters
|
618
|
+
],
|
574
619
|
}
|
575
|
-
|
576
|
-
|
620
|
+
)
|
621
|
+
|
622
|
+
# Write combined JSON once after processing all radii
|
623
|
+
if output_json and results_for_json:
|
624
|
+
combined = {
|
625
|
+
"parameters": {
|
626
|
+
"mode": "approximate",
|
627
|
+
"radii": radii,
|
628
|
+
"n_structures": len(structures),
|
629
|
+
},
|
630
|
+
"results": results_for_json,
|
577
631
|
}
|
578
|
-
with open(
|
579
|
-
json.dump(
|
580
|
-
print(f"\nApproximate clustering saved to {
|
632
|
+
with open(output_json, "w") as f:
|
633
|
+
json.dump(combined, f, indent=2)
|
634
|
+
print(f"\nApproximate clustering for all radii saved to {output_json}")
|
581
635
|
|
582
636
|
return
|
583
637
|
|
@@ -1074,8 +1128,18 @@ def main():
|
|
1074
1128
|
"""Main entry point for the distiller CLI tool."""
|
1075
1129
|
args = parse_arguments()
|
1076
1130
|
|
1131
|
+
# Combine file paths from CLI arguments and/or stdin
|
1132
|
+
file_paths: List[Path] = []
|
1133
|
+
cli_paths = [p for p in args.files if str(p) != "-"]
|
1134
|
+
file_paths.extend(cli_paths)
|
1135
|
+
|
1136
|
+
# If no CLI paths provided or '-' sentinel present, read from stdin
|
1137
|
+
if not args.files or any(str(p) == "-" for p in args.files):
|
1138
|
+
stdin_paths = [Path(line.strip()) for line in sys.stdin if line.strip()]
|
1139
|
+
file_paths.extend(stdin_paths)
|
1140
|
+
|
1077
1141
|
# Validate input files
|
1078
|
-
valid_files = validate_input_files(
|
1142
|
+
valid_files = validate_input_files(file_paths)
|
1079
1143
|
|
1080
1144
|
if not valid_files:
|
1081
1145
|
print("Error: No valid input files found", file=sys.stderr)
|
@@ -1085,22 +1149,26 @@ def main():
|
|
1085
1149
|
|
1086
1150
|
# Parse all structure files
|
1087
1151
|
print("Parsing structure files...")
|
1088
|
-
structures = []
|
1089
|
-
|
1152
|
+
structures: List[Structure] = []
|
1153
|
+
parsed_files: List[Path] = []
|
1154
|
+
|
1155
|
+
for file_path in tqdm(valid_files, desc="Parsing", unit="file"):
|
1090
1156
|
try:
|
1091
1157
|
structure = parse_structure_file(file_path)
|
1092
1158
|
structures.append(structure)
|
1093
|
-
|
1159
|
+
parsed_files.append(file_path)
|
1094
1160
|
except Exception:
|
1161
|
+
# Keep reporting failures explicitly
|
1095
1162
|
print(f" Failed to parse {file_path}, skipping", file=sys.stderr)
|
1096
|
-
|
1163
|
+
|
1164
|
+
# Replace the original list with the successfully parsed ones
|
1165
|
+
valid_files = parsed_files
|
1097
1166
|
|
1098
1167
|
if not structures:
|
1099
1168
|
print("Error: No structures could be parsed", file=sys.stderr)
|
1100
1169
|
sys.exit(1)
|
1101
1170
|
|
1102
|
-
#
|
1103
|
-
valid_files = valid_files[: len(structures)]
|
1171
|
+
# valid_files already filtered to successfully parsed structures above
|
1104
1172
|
|
1105
1173
|
# Validate nucleotide counts
|
1106
1174
|
print("\nValidating nucleotide counts...")
|
@@ -1108,7 +1176,7 @@ def main():
|
|
1108
1176
|
|
1109
1177
|
# Switch workflow based on requested mode
|
1110
1178
|
if args.mode == "approximate":
|
1111
|
-
|
1179
|
+
run_approximate_multiple(structures, valid_files, args.radius, args.output_json)
|
1112
1180
|
return
|
1113
1181
|
else:
|
1114
1182
|
run_exact(structures, valid_files, args)
|
@@ -7,7 +7,7 @@ rnapolis/component_A.csv,sha256=koirS-AwUZwoYGItT8yn3wS6Idvmh2FANfTQcOS_xh8,2897
|
|
7
7
|
rnapolis/component_C.csv,sha256=NtvsAu_YrUgTjzZm3j4poW4IZ99x3dPARB09XVIiMCc,2803
|
8
8
|
rnapolis/component_G.csv,sha256=Z5wl8OnHRyx4XhTyBiWgRZiEvmZXhoxtVRH8bn6Vxf0,2898
|
9
9
|
rnapolis/component_U.csv,sha256=8BUoU1m2YzGmi8_kw1xdpf3pucszHjFEtTex87CuXiE,2645
|
10
|
-
rnapolis/distiller.py,sha256=
|
10
|
+
rnapolis/distiller.py,sha256=ryoTGK9C2WGBlxcfidqyXhuGfjJ0XjNRVjp0d1-cyAk,39545
|
11
11
|
rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
|
12
12
|
rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5744659
|
13
13
|
rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
|
@@ -22,9 +22,9 @@ rnapolis/tertiary_v2.py,sha256=SgijTv0bPqMJwsMqyQk0O8QAnS2Ozk45vk8igxt9hRs,38001
|
|
22
22
|
rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
|
23
23
|
rnapolis/unifier.py,sha256=2ge7IB9FdRgzSAiVD39U_ciwtdDJ2fGzf8mUIudbrqY,5820
|
24
24
|
rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
|
25
|
-
rnapolis-0.10.
|
26
|
-
rnapolis-0.10.
|
27
|
-
rnapolis-0.10.
|
28
|
-
rnapolis-0.10.
|
29
|
-
rnapolis-0.10.
|
30
|
-
rnapolis-0.10.
|
25
|
+
rnapolis-0.10.2.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
|
26
|
+
rnapolis-0.10.2.dist-info/METADATA,sha256=5ZYzqjRDd2WHeVrDrq4QMp4V7nQ2gI1OVgUvgmqt_Es,54611
|
27
|
+
rnapolis-0.10.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
28
|
+
rnapolis-0.10.2.dist-info/entry_points.txt,sha256=MZMWnYBUYnis-zWDmFfuA5yXtU3W5YdQrm5HA5LrkeM,474
|
29
|
+
rnapolis-0.10.2.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
|
30
|
+
rnapolis-0.10.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|