PyPI - RNApolis - Versions diffs - 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

RNApolis 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

rnapolis/distiller.py CHANGED Viewed

@@ -35,7 +35,10 @@ def parse_arguments():
     )
     parser.add_argument(
-        "files", nargs="+", type=Path, help="Input mmCIF or PDB files to analyze"
+        "files",
+        nargs="*",
+        type=Path,
+        help="Input mmCIF or PDB files to analyze (use '-' or omit to read paths from stdin)",
     )
     parser.add_argument(
@@ -82,16 +85,21 @@ def parse_arguments():
     parser.add_argument(
         "--mode",
         choices=["exact", "approximate"],
-        default="exact",
-        help="Clustering mode switch: --mode exact (default) performs rigorous nRMSD clustering, "
-        "--mode approximate performs faster feature-based PCA + FAISS clustering",
+        default="approximate",
+        help="Clustering mode switch: --mode approximate (default) performs faster feature-based PCA + FAISS clustering, "
+        "--mode exact performs rigorous nRMSD clustering",
     )
     parser.add_argument(
         "--radius",
         type=float,
-        default=10.0,
-        help="Radius in PCA-reduced space for redundancy detection (approximate mode only, default: 10.0)",
+        action="append",
+        default=[1.0, 2.0, 4.0, 8.0],
+        help=(
+            "Radius in PCA-reduced space for redundancy detection "
+            "(approximate mode). Can be supplied multiple times; "
+            "results will be produced for each value (default: 1, 2, 4, 8)."
+        ),
     )
     return parser.parse_args()
@@ -245,7 +253,12 @@ def validate_nucleotide_counts(
     """
     nucleotide_counts = []
-    for structure, file_path in zip(structures, file_paths):
+    for structure, file_path in tqdm(
+        zip(structures, file_paths),
+        total=len(structures),
+        desc="Validating nucleotide counts",
+        unit="structure",
+    ):
         nucleotide_residues = [
             residue for residue in structure.residues if residue.is_nucleotide
         ]
@@ -521,13 +534,28 @@ def featurize_structure(structure: Structure) -> np.ndarray:
     return np.asarray(feats, dtype=np.float32)
-def run_approximate(structures: List[Structure], file_paths: List[Path], args) -> None:
+def run_approximate_multiple(
+    structures: List[Structure],
+    file_paths: List[Path],
+    radii: List[float],
+    output_json: Optional[str],
+) -> None:
     """
-    Approximate mode: features → PCA → FAISS radius clustering.
+    Approximate mode (multi-radius): compute PCA once, then perform clustering
+    for each radius value provided. This avoids redundant dimensionality reduction.
     """
-    print("\nRunning approximate mode (feature-based PCA + FAISS)")
+    if not radii:
+        print("Error: No radius values supplied", file=sys.stderr)
+        sys.exit(1)
-    feature_vectors = [featurize_structure(s) for s in structures]
+    # ------------------------------------------------------------------
+    # 1. Feature extraction
+    # ------------------------------------------------------------------
+    print("\nRunning approximate mode (feature-based PCA + FAISS)")
+    feature_vectors = [
+        featurize_structure(s)
+        for s in tqdm(structures, desc="Featurizing", unit="structure")
+    ]
     feature_lengths = {len(v) for v in feature_vectors}
     if len(feature_lengths) != 1:
         print("Error: Inconsistent feature lengths among structures", file=sys.stderr)
@@ -536,48 +564,74 @@ def run_approximate(structures: List[Structure], file_paths: List[Path], args) -
     X = np.stack(feature_vectors).astype(np.float32)
     print(f"Feature matrix shape: {X.shape}")
+    # ------------------------------------------------------------------
+    # 2. PCA transformation (fit once)
+    # ------------------------------------------------------------------
     pca = PCA(n_components=0.95, svd_solver="full", random_state=0)
     X_red = pca.fit_transform(X).astype(np.float32)
     d = X_red.shape[1]
     print(f"PCA reduced to {d} dimensions (95 % variance)")
+    # ------------------------------------------------------------------
+    # 3. Build FAISS index once
+    # ------------------------------------------------------------------
     index = faiss.IndexFlatL2(d)
     index.add(X_red)
-    radius_sq = args.radius**2
-    visited: set[int] = set()
-    clusters: List[List[int]] = []
-    for idx in range(len(structures)):
-        if idx in visited:
-            continue
-        D, I = index.search(X_red[idx : idx + 1], len(structures))
-        cluster = [int(i) for dist, i in zip(D[0], I[0]) if dist <= radius_sq]
-        clusters.append(cluster)
-        visited.update(cluster)
-    print(f"\nIdentified {len(clusters)} representatives with radius {args.radius}")
-    for cluster in clusters:
-        rep = cluster[0]
-        redundants = cluster[1:]
-        print(f"Representative: {file_paths[rep]}")
-        for r in redundants:
-            print(f"  Redundant: {file_paths[r]}")
-    if args.output_json:
-        out = {
-            "parameters": {"mode": "approximate", "radius": args.radius},
-            "clusters": [
+    # ------------------------------------------------------------------
+    # 4. Cluster for each radius
+    # ------------------------------------------------------------------
+    results_for_json: List[dict] = []
+    for radius in radii:
+        radius_sq = radius**2
+        visited: set[int] = set()
+        clusters: List[List[int]] = []
+        for idx in range(len(structures)):
+            if idx in visited:
+                continue
+            D, I = index.search(X_red[idx : idx + 1], len(structures))
+            cluster = [int(i) for dist, i in zip(D[0], I[0]) if dist <= radius_sq]
+            clusters.append(cluster)
+            visited.update(cluster)
+        print(f"\nIdentified {len(clusters)} representatives with radius {radius}")
+        if output_json is None:
+            for cluster in clusters:
+                rep = cluster[0]
+                redundants = cluster[1:]
+                print(f"Representative: {file_paths[rep]}")
+                for r in redundants:
+                    print(f"  Redundant: {file_paths[r]}")
+        if output_json is not None:
+            results_for_json.append(
                 {
-                    "representative": str(file_paths[c[0]]),
-                    "members": [str(file_paths[m]) for m in c[1:]],
+                    "radius": radius,
+                    "n_clusters": len(clusters),
+                    "clusters": [
+                        {
+                            "representative": str(file_paths[c[0]]),
+                            "members": [str(file_paths[m]) for m in c[1:]],
+                        }
+                        for c in clusters
+                    ],
                 }
-                for c in clusters
-            ],
+            )
+    # Write combined JSON once after processing all radii
+    if output_json and results_for_json:
+        combined = {
+            "parameters": {
+                "mode": "approximate",
+                "radii": radii,
+                "n_structures": len(structures),
+            },
+            "results": results_for_json,
         }
-        with open(args.output_json, "w") as f:
-            json.dump(out, f, indent=2)
-        print(f"\nApproximate clustering saved to {args.output_json}")
+        with open(output_json, "w") as f:
+            json.dump(combined, f, indent=2)
+        print(f"\nApproximate clustering for all radii saved to {output_json}")
     return
@@ -1074,8 +1128,18 @@ def main():
     """Main entry point for the distiller CLI tool."""
     args = parse_arguments()
+    # Combine file paths from CLI arguments and/or stdin
+    file_paths: List[Path] = []
+    cli_paths = [p for p in args.files if str(p) != "-"]
+    file_paths.extend(cli_paths)
+    # If no CLI paths provided or '-' sentinel present, read from stdin
+    if not args.files or any(str(p) == "-" for p in args.files):
+        stdin_paths = [Path(line.strip()) for line in sys.stdin if line.strip()]
+        file_paths.extend(stdin_paths)
     # Validate input files
-    valid_files = validate_input_files(args.files)
+    valid_files = validate_input_files(file_paths)
     if not valid_files:
         print("Error: No valid input files found", file=sys.stderr)
@@ -1085,22 +1149,26 @@ def main():
     # Parse all structure files
     print("Parsing structure files...")
-    structures = []
-    for file_path in valid_files:
+    structures: List[Structure] = []
+    parsed_files: List[Path] = []
+    for file_path in tqdm(valid_files, desc="Parsing", unit="file"):
         try:
             structure = parse_structure_file(file_path)
             structures.append(structure)
-            print(f"  Parsed {file_path}")
+            parsed_files.append(file_path)
         except Exception:
+            # Keep reporting failures explicitly
             print(f"  Failed to parse {file_path}, skipping", file=sys.stderr)
-            continue
+    # Replace the original list with the successfully parsed ones
+    valid_files = parsed_files
     if not structures:
         print("Error: No structures could be parsed", file=sys.stderr)
         sys.exit(1)
-    # Update valid_files to match successfully parsed structures
-    valid_files = valid_files[: len(structures)]
+    # valid_files already filtered to successfully parsed structures above
     # Validate nucleotide counts
     print("\nValidating nucleotide counts...")
@@ -1108,7 +1176,7 @@ def main():
     # Switch workflow based on requested mode
     if args.mode == "approximate":
-        run_approximate(structures, valid_files, args)
+        run_approximate_multiple(structures, valid_files, args.radius, args.output_json)
         return
     else:
         run_exact(structures, valid_files, args)

{rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: RNApolis
-Version: 0.10.1
+Version: 0.10.2
 Summary: A Python library containing RNA-related bioinformatics functions and classes
 Home-page: https://github.com/tzok/rnapolis-py
 Author: Tomasz Zok

{rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/RECORD RENAMED Viewed

@@ -7,7 +7,7 @@ rnapolis/component_A.csv,sha256=koirS-AwUZwoYGItT8yn3wS6Idvmh2FANfTQcOS_xh8,2897
 rnapolis/component_C.csv,sha256=NtvsAu_YrUgTjzZm3j4poW4IZ99x3dPARB09XVIiMCc,2803
 rnapolis/component_G.csv,sha256=Z5wl8OnHRyx4XhTyBiWgRZiEvmZXhoxtVRH8bn6Vxf0,2898
 rnapolis/component_U.csv,sha256=8BUoU1m2YzGmi8_kw1xdpf3pucszHjFEtTex87CuXiE,2645
-rnapolis/distiller.py,sha256=QFFSwiCFftrb4tW3YhaECKEePg5pJAqJalrabPPQVJk,36817
+rnapolis/distiller.py,sha256=ryoTGK9C2WGBlxcfidqyXhuGfjJ0XjNRVjp0d1-cyAk,39545
 rnapolis/metareader.py,sha256=I1-cXc2YNBPwa3zihAnMTjEsAo79tEKzSmWu5yvN1Pk,2071
 rnapolis/mmcif_pdbx_v50.dic,sha256=5QFx1ssDaehR4_DQ-tS9VQux262SiLXaqcwmwwejF5c,5744659
 rnapolis/molecule_filter.py,sha256=jgcpJxx_oXEBX0d30v4k_FdwRouRUPUsEtCYWgLGpD4,7310
@@ -22,9 +22,9 @@ rnapolis/tertiary_v2.py,sha256=SgijTv0bPqMJwsMqyQk0O8QAnS2Ozk45vk8igxt9hRs,38001
 rnapolis/transformer.py,sha256=aC0nBmHHJf5TyLvBIV57Jj3tlwpvHbPo347opfAOlQA,3844
 rnapolis/unifier.py,sha256=2ge7IB9FdRgzSAiVD39U_ciwtdDJ2fGzf8mUIudbrqY,5820
 rnapolis/util.py,sha256=IdquFO3PV1_KDqodjupzm0Rqvgy0CeSzxGHaGEHYXVU,543
-rnapolis-0.10.1.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
-rnapolis-0.10.1.dist-info/METADATA,sha256=rjPTfIJ666l8ZmlocJWBO2_5OSem_9r-AyCe-zFR7as,54611
-rnapolis-0.10.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-rnapolis-0.10.1.dist-info/entry_points.txt,sha256=MZMWnYBUYnis-zWDmFfuA5yXtU3W5YdQrm5HA5LrkeM,474
-rnapolis-0.10.1.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
-rnapolis-0.10.1.dist-info/RECORD,,
+rnapolis-0.10.2.dist-info/licenses/LICENSE,sha256=ZGRu12MzCgbYA-Lt8MyBlmjvPZh7xfiD5u5wBx0enq4,1066
+rnapolis-0.10.2.dist-info/METADATA,sha256=5ZYzqjRDd2WHeVrDrq4QMp4V7nQ2gI1OVgUvgmqt_Es,54611
+rnapolis-0.10.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+rnapolis-0.10.2.dist-info/entry_points.txt,sha256=MZMWnYBUYnis-zWDmFfuA5yXtU3W5YdQrm5HA5LrkeM,474
+rnapolis-0.10.2.dist-info/top_level.txt,sha256=LcO18koxZcWoJ21KDRRRo_tyIbmXL5z61dPitZpy8yc,9
+rnapolis-0.10.2.dist-info/RECORD,,

{rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{rnapolis-0.10.1.dist-info → rnapolis-0.10.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

RNApolis 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

RNApolis 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl