RNApolis 0.10.0__tar.gz → 0.10.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {rnapolis-0.10.0/src/RNApolis.egg-info → rnapolis-0.10.2}/PKG-INFO +1 -1
  2. {rnapolis-0.10.0 → rnapolis-0.10.2}/setup.py +1 -1
  3. {rnapolis-0.10.0 → rnapolis-0.10.2/src/RNApolis.egg-info}/PKG-INFO +1 -1
  4. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/distiller.py +118 -50
  5. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/parser.py +3 -0
  6. {rnapolis-0.10.0 → rnapolis-0.10.2}/LICENSE +0 -0
  7. {rnapolis-0.10.0 → rnapolis-0.10.2}/README.md +0 -0
  8. {rnapolis-0.10.0 → rnapolis-0.10.2}/pyproject.toml +0 -0
  9. {rnapolis-0.10.0 → rnapolis-0.10.2}/setup.cfg +0 -0
  10. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/RNApolis.egg-info/SOURCES.txt +0 -0
  11. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/RNApolis.egg-info/dependency_links.txt +0 -0
  12. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/RNApolis.egg-info/entry_points.txt +0 -0
  13. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/RNApolis.egg-info/requires.txt +0 -0
  14. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/RNApolis.egg-info/top_level.txt +0 -0
  15. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/adapter.py +0 -0
  16. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/aligner.py +0 -0
  17. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/annotator.py +0 -0
  18. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/clashfinder.py +0 -0
  19. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/common.py +0 -0
  20. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/component_A.csv +0 -0
  21. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/component_C.csv +0 -0
  22. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/component_G.csv +0 -0
  23. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/component_U.csv +0 -0
  24. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/metareader.py +0 -0
  25. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/mmcif_pdbx_v50.dic +0 -0
  26. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/molecule_filter.py +0 -0
  27. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/motif_extractor.py +0 -0
  28. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/parser_v2.py +0 -0
  29. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/py.typed +0 -0
  30. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/rfam_folder.py +0 -0
  31. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/splitter.py +0 -0
  32. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/tertiary.py +0 -0
  33. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/tertiary_v2.py +0 -0
  34. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/transformer.py +0 -0
  35. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/unifier.py +0 -0
  36. {rnapolis-0.10.0 → rnapolis-0.10.2}/src/rnapolis/util.py +0 -0
  37. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_adapter.py +0 -0
  38. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_annotator.py +0 -0
  39. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_bugfixes.py +0 -0
  40. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_common.py +0 -0
  41. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_metareader.py +0 -0
  42. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_molecule_filter.py +0 -0
  43. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_parser.py +0 -0
  44. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_quadruplexes.py +0 -0
  45. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_rfam_folder.py +0 -0
  46. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_tertiary.py +0 -0
  47. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_transformer.py +0 -0
  48. {rnapolis-0.10.0 → rnapolis-0.10.2}/tests/test_v2.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RNApolis
3
- Version: 0.10.0
3
+ Version: 0.10.2
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -5,7 +5,7 @@ with open("README.md") as f:
5
5
 
6
6
  setup(
7
7
  name="RNApolis",
8
- version="0.10.0",
8
+ version="0.10.2",
9
9
  packages=["rnapolis"],
10
10
  package_dir={"": "src"},
11
11
  author="Tomasz Zok",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: RNApolis
3
- Version: 0.10.0
3
+ Version: 0.10.2
4
4
  Summary: A Python library containing RNA-related bioinformatics functions and classes
5
5
  Home-page: https://github.com/tzok/rnapolis-py
6
6
  Author: Tomasz Zok
@@ -35,7 +35,10 @@ def parse_arguments():
35
35
  )
36
36
 
37
37
  parser.add_argument(
38
- "files", nargs="+", type=Path, help="Input mmCIF or PDB files to analyze"
38
+ "files",
39
+ nargs="*",
40
+ type=Path,
41
+ help="Input mmCIF or PDB files to analyze (use '-' or omit to read paths from stdin)",
39
42
  )
40
43
 
41
44
  parser.add_argument(
@@ -82,16 +85,21 @@ def parse_arguments():
82
85
  parser.add_argument(
83
86
  "--mode",
84
87
  choices=["exact", "approximate"],
85
- default="exact",
86
- help="Clustering mode switch: --mode exact (default) performs rigorous nRMSD clustering, "
87
- "--mode approximate performs faster feature-based PCA + FAISS clustering",
88
+ default="approximate",
89
+ help="Clustering mode switch: --mode approximate (default) performs faster feature-based PCA + FAISS clustering, "
90
+ "--mode exact performs rigorous nRMSD clustering",
88
91
  )
89
92
 
90
93
  parser.add_argument(
91
94
  "--radius",
92
95
  type=float,
93
- default=10.0,
94
- help="Radius in PCA-reduced space for redundancy detection (approximate mode only, default: 10.0)",
96
+ action="append",
97
+ default=[1.0, 2.0, 4.0, 8.0],
98
+ help=(
99
+ "Radius in PCA-reduced space for redundancy detection "
100
+ "(approximate mode). Can be supplied multiple times; "
101
+ "results will be produced for each value (default: 1, 2, 4, 8)."
102
+ ),
95
103
  )
96
104
 
97
105
  return parser.parse_args()
@@ -245,7 +253,12 @@ def validate_nucleotide_counts(
245
253
  """
246
254
  nucleotide_counts = []
247
255
 
248
- for structure, file_path in zip(structures, file_paths):
256
+ for structure, file_path in tqdm(
257
+ zip(structures, file_paths),
258
+ total=len(structures),
259
+ desc="Validating nucleotide counts",
260
+ unit="structure",
261
+ ):
249
262
  nucleotide_residues = [
250
263
  residue for residue in structure.residues if residue.is_nucleotide
251
264
  ]
@@ -521,13 +534,28 @@ def featurize_structure(structure: Structure) -> np.ndarray:
521
534
  return np.asarray(feats, dtype=np.float32)
522
535
 
523
536
 
524
- def run_approximate(structures: List[Structure], file_paths: List[Path], args) -> None:
537
+ def run_approximate_multiple(
538
+ structures: List[Structure],
539
+ file_paths: List[Path],
540
+ radii: List[float],
541
+ output_json: Optional[str],
542
+ ) -> None:
525
543
  """
526
- Approximate mode: features PCA FAISS radius clustering.
544
+ Approximate mode (multi-radius): compute PCA once, then perform clustering
545
+ for each radius value provided. This avoids redundant dimensionality reduction.
527
546
  """
528
- print("\nRunning approximate mode (feature-based PCA + FAISS)")
547
+ if not radii:
548
+ print("Error: No radius values supplied", file=sys.stderr)
549
+ sys.exit(1)
529
550
 
530
- feature_vectors = [featurize_structure(s) for s in structures]
551
+ # ------------------------------------------------------------------
552
+ # 1. Feature extraction
553
+ # ------------------------------------------------------------------
554
+ print("\nRunning approximate mode (feature-based PCA + FAISS)")
555
+ feature_vectors = [
556
+ featurize_structure(s)
557
+ for s in tqdm(structures, desc="Featurizing", unit="structure")
558
+ ]
531
559
  feature_lengths = {len(v) for v in feature_vectors}
532
560
  if len(feature_lengths) != 1:
533
561
  print("Error: Inconsistent feature lengths among structures", file=sys.stderr)
@@ -536,48 +564,74 @@ def run_approximate(structures: List[Structure], file_paths: List[Path], args) -
536
564
  X = np.stack(feature_vectors).astype(np.float32)
537
565
  print(f"Feature matrix shape: {X.shape}")
538
566
 
567
+ # ------------------------------------------------------------------
568
+ # 2. PCA transformation (fit once)
569
+ # ------------------------------------------------------------------
539
570
  pca = PCA(n_components=0.95, svd_solver="full", random_state=0)
540
571
  X_red = pca.fit_transform(X).astype(np.float32)
541
572
  d = X_red.shape[1]
542
573
  print(f"PCA reduced to {d} dimensions (95 % variance)")
543
574
 
575
+ # ------------------------------------------------------------------
576
+ # 3. Build FAISS index once
577
+ # ------------------------------------------------------------------
544
578
  index = faiss.IndexFlatL2(d)
545
579
  index.add(X_red)
546
- radius_sq = args.radius**2
547
-
548
- visited: set[int] = set()
549
- clusters: List[List[int]] = []
550
-
551
- for idx in range(len(structures)):
552
- if idx in visited:
553
- continue
554
- D, I = index.search(X_red[idx : idx + 1], len(structures))
555
- cluster = [int(i) for dist, i in zip(D[0], I[0]) if dist <= radius_sq]
556
- clusters.append(cluster)
557
- visited.update(cluster)
558
-
559
- print(f"\nIdentified {len(clusters)} representatives with radius {args.radius}")
560
- for cluster in clusters:
561
- rep = cluster[0]
562
- redundants = cluster[1:]
563
- print(f"Representative: {file_paths[rep]}")
564
- for r in redundants:
565
- print(f" Redundant: {file_paths[r]}")
566
580
 
567
- if args.output_json:
568
- out = {
569
- "parameters": {"mode": "approximate", "radius": args.radius},
570
- "clusters": [
581
+ # ------------------------------------------------------------------
582
+ # 4. Cluster for each radius
583
+ # ------------------------------------------------------------------
584
+ results_for_json: List[dict] = []
585
+ for radius in radii:
586
+ radius_sq = radius**2
587
+ visited: set[int] = set()
588
+ clusters: List[List[int]] = []
589
+
590
+ for idx in range(len(structures)):
591
+ if idx in visited:
592
+ continue
593
+ D, I = index.search(X_red[idx : idx + 1], len(structures))
594
+ cluster = [int(i) for dist, i in zip(D[0], I[0]) if dist <= radius_sq]
595
+ clusters.append(cluster)
596
+ visited.update(cluster)
597
+
598
+ print(f"\nIdentified {len(clusters)} representatives with radius {radius}")
599
+ if output_json is None:
600
+ for cluster in clusters:
601
+ rep = cluster[0]
602
+ redundants = cluster[1:]
603
+ print(f"Representative: {file_paths[rep]}")
604
+ for r in redundants:
605
+ print(f" Redundant: {file_paths[r]}")
606
+
607
+ if output_json is not None:
608
+ results_for_json.append(
571
609
  {
572
- "representative": str(file_paths[c[0]]),
573
- "members": [str(file_paths[m]) for m in c[1:]],
610
+ "radius": radius,
611
+ "n_clusters": len(clusters),
612
+ "clusters": [
613
+ {
614
+ "representative": str(file_paths[c[0]]),
615
+ "members": [str(file_paths[m]) for m in c[1:]],
616
+ }
617
+ for c in clusters
618
+ ],
574
619
  }
575
- for c in clusters
576
- ],
620
+ )
621
+
622
+ # Write combined JSON once after processing all radii
623
+ if output_json and results_for_json:
624
+ combined = {
625
+ "parameters": {
626
+ "mode": "approximate",
627
+ "radii": radii,
628
+ "n_structures": len(structures),
629
+ },
630
+ "results": results_for_json,
577
631
  }
578
- with open(args.output_json, "w") as f:
579
- json.dump(out, f, indent=2)
580
- print(f"\nApproximate clustering saved to {args.output_json}")
632
+ with open(output_json, "w") as f:
633
+ json.dump(combined, f, indent=2)
634
+ print(f"\nApproximate clustering for all radii saved to {output_json}")
581
635
 
582
636
  return
583
637
 
@@ -1074,8 +1128,18 @@ def main():
1074
1128
  """Main entry point for the distiller CLI tool."""
1075
1129
  args = parse_arguments()
1076
1130
 
1131
+ # Combine file paths from CLI arguments and/or stdin
1132
+ file_paths: List[Path] = []
1133
+ cli_paths = [p for p in args.files if str(p) != "-"]
1134
+ file_paths.extend(cli_paths)
1135
+
1136
+ # If no CLI paths provided or '-' sentinel present, read from stdin
1137
+ if not args.files or any(str(p) == "-" for p in args.files):
1138
+ stdin_paths = [Path(line.strip()) for line in sys.stdin if line.strip()]
1139
+ file_paths.extend(stdin_paths)
1140
+
1077
1141
  # Validate input files
1078
- valid_files = validate_input_files(args.files)
1142
+ valid_files = validate_input_files(file_paths)
1079
1143
 
1080
1144
  if not valid_files:
1081
1145
  print("Error: No valid input files found", file=sys.stderr)
@@ -1085,22 +1149,26 @@ def main():
1085
1149
 
1086
1150
  # Parse all structure files
1087
1151
  print("Parsing structure files...")
1088
- structures = []
1089
- for file_path in valid_files:
1152
+ structures: List[Structure] = []
1153
+ parsed_files: List[Path] = []
1154
+
1155
+ for file_path in tqdm(valid_files, desc="Parsing", unit="file"):
1090
1156
  try:
1091
1157
  structure = parse_structure_file(file_path)
1092
1158
  structures.append(structure)
1093
- print(f" Parsed {file_path}")
1159
+ parsed_files.append(file_path)
1094
1160
  except Exception:
1161
+ # Keep reporting failures explicitly
1095
1162
  print(f" Failed to parse {file_path}, skipping", file=sys.stderr)
1096
- continue
1163
+
1164
+ # Replace the original list with the successfully parsed ones
1165
+ valid_files = parsed_files
1097
1166
 
1098
1167
  if not structures:
1099
1168
  print("Error: No structures could be parsed", file=sys.stderr)
1100
1169
  sys.exit(1)
1101
1170
 
1102
- # Update valid_files to match successfully parsed structures
1103
- valid_files = valid_files[: len(structures)]
1171
+ # valid_files already filtered to successfully parsed structures above
1104
1172
 
1105
1173
  # Validate nucleotide counts
1106
1174
  print("\nValidating nucleotide counts...")
@@ -1108,7 +1176,7 @@ def main():
1108
1176
 
1109
1177
  # Switch workflow based on requested mode
1110
1178
  if args.mode == "approximate":
1111
- run_approximate(structures, valid_files, args)
1179
+ run_approximate_multiple(structures, valid_files, args.radius, args.output_json)
1112
1180
  return
1113
1181
  else:
1114
1182
  run_exact(structures, valid_files, args)
@@ -18,6 +18,9 @@ def read_3d_structure(
18
18
  atoms, modified, sequence_by_entity, is_nucleic_acid_by_entity = (
19
19
  parse_cif(cif_or_pdb) if is_cif(cif_or_pdb) else parse_pdb(cif_or_pdb)
20
20
  )
21
+ if not atoms:
22
+ logger.warning("No atoms parsed from file, returning empty Structure3D.")
23
+ return Structure3D([])
21
24
  available_models = {atom.model: None for atom in atoms}
22
25
  atoms_by_model = {
23
26
  model: list(filter(lambda atom: atom.model == model, atoms))
File without changes
File without changes
File without changes
File without changes
File without changes