tooluniverse 1.0.7__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of tooluniverse might be problematic.
- tooluniverse/__init__.py +37 -14
- tooluniverse/admetai_tool.py +16 -5
- tooluniverse/base_tool.py +36 -0
- tooluniverse/biogrid_tool.py +118 -0
- tooluniverse/build_optimizer.py +87 -0
- tooluniverse/cache/__init__.py +3 -0
- tooluniverse/cache/memory_cache.py +99 -0
- tooluniverse/cache/result_cache_manager.py +235 -0
- tooluniverse/cache/sqlite_backend.py +257 -0
- tooluniverse/clinvar_tool.py +90 -0
- tooluniverse/compose_scripts/output_summarizer.py +87 -33
- tooluniverse/compose_tool.py +2 -2
- tooluniverse/custom_tool.py +28 -0
- tooluniverse/data/adverse_event_tools.json +97 -98
- tooluniverse/data/agentic_tools.json +81 -162
- tooluniverse/data/arxiv_tools.json +1 -4
- tooluniverse/data/compose_tools.json +0 -54
- tooluniverse/data/core_tools.json +1 -4
- tooluniverse/data/dataset_tools.json +7 -7
- tooluniverse/data/doaj_tools.json +1 -3
- tooluniverse/data/drug_discovery_agents.json +282 -0
- tooluniverse/data/europe_pmc_tools.json +1 -2
- tooluniverse/data/genomics_tools.json +174 -0
- tooluniverse/data/geo_tools.json +86 -0
- tooluniverse/data/literature_search_tools.json +15 -35
- tooluniverse/data/markitdown_tools.json +51 -0
- tooluniverse/data/monarch_tools.json +1 -2
- tooluniverse/data/openalex_tools.json +1 -5
- tooluniverse/data/opentarget_tools.json +8 -16
- tooluniverse/data/output_summarization_tools.json +23 -20
- tooluniverse/data/packages/bioinformatics_core_tools.json +2 -2
- tooluniverse/data/packages/cheminformatics_tools.json +1 -1
- tooluniverse/data/packages/genomics_tools.json +1 -1
- tooluniverse/data/packages/single_cell_tools.json +1 -1
- tooluniverse/data/packages/structural_biology_tools.json +1 -1
- tooluniverse/data/pmc_tools.json +1 -4
- tooluniverse/data/ppi_tools.json +139 -0
- tooluniverse/data/pubmed_tools.json +1 -3
- tooluniverse/data/semantic_scholar_tools.json +1 -2
- tooluniverse/data/tool_composition_tools.json +2 -4
- tooluniverse/data/unified_guideline_tools.json +206 -4
- tooluniverse/data/xml_tools.json +15 -15
- tooluniverse/data/zenodo_tools.json +1 -2
- tooluniverse/dbsnp_tool.py +71 -0
- tooluniverse/default_config.py +6 -0
- tooluniverse/ensembl_tool.py +61 -0
- tooluniverse/execute_function.py +235 -76
- tooluniverse/generate_tools.py +303 -20
- tooluniverse/genomics_gene_search_tool.py +56 -0
- tooluniverse/geo_tool.py +116 -0
- tooluniverse/gnomad_tool.py +63 -0
- tooluniverse/logging_config.py +64 -2
- tooluniverse/markitdown_tool.py +159 -0
- tooluniverse/mcp_client_tool.py +10 -5
- tooluniverse/molecule_2d_tool.py +9 -3
- tooluniverse/molecule_3d_tool.py +9 -3
- tooluniverse/output_hook.py +217 -150
- tooluniverse/smcp.py +18 -10
- tooluniverse/smcp_server.py +89 -199
- tooluniverse/string_tool.py +112 -0
- tooluniverse/tools/{MultiAgentLiteratureSearch.py → ADMETAnalyzerAgent.py} +18 -18
- tooluniverse/tools/ArXiv_search_papers.py +3 -3
- tooluniverse/tools/CMA_Guidelines_Search.py +52 -0
- tooluniverse/tools/CORE_search_papers.py +3 -3
- tooluniverse/tools/ClinVar_search_variants.py +52 -0
- tooluniverse/tools/ClinicalTrialDesignAgent.py +63 -0
- tooluniverse/tools/CompoundDiscoveryAgent.py +59 -0
- tooluniverse/tools/DOAJ_search_articles.py +2 -2
- tooluniverse/tools/DiseaseAnalyzerAgent.py +52 -0
- tooluniverse/tools/DrugInteractionAnalyzerAgent.py +52 -0
- tooluniverse/tools/DrugOptimizationAgent.py +63 -0
- tooluniverse/tools/Ensembl_lookup_gene_by_symbol.py +52 -0
- tooluniverse/tools/EuropePMC_search_articles.py +1 -1
- tooluniverse/tools/GIN_Guidelines_Search.py +52 -0
- tooluniverse/tools/GWAS_search_associations_by_gene.py +52 -0
- tooluniverse/tools/LiteratureSynthesisAgent.py +59 -0
- tooluniverse/tools/PMC_search_papers.py +3 -3
- tooluniverse/tools/PubMed_search_articles.py +2 -2
- tooluniverse/tools/SemanticScholar_search_papers.py +1 -1
- tooluniverse/tools/UCSC_get_genes_by_region.py +67 -0
- tooluniverse/tools/Zenodo_search_records.py +1 -1
- tooluniverse/tools/__init__.py +33 -3
- tooluniverse/tools/convert_to_markdown.py +59 -0
- tooluniverse/tools/dbSNP_get_variant_by_rsid.py +46 -0
- tooluniverse/tools/gnomAD_query_variant.py +52 -0
- tooluniverse/tools/openalex_literature_search.py +4 -4
- tooluniverse/ucsc_tool.py +60 -0
- tooluniverse/unified_guideline_tools.py +1175 -57
- tooluniverse/utils.py +51 -4
- tooluniverse/zenodo_tool.py +2 -1
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/METADATA +10 -3
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/RECORD +96 -61
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/entry_points.txt +0 -3
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/WHEEL +0 -0
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/licenses/LICENSE +0 -0
- {tooluniverse-1.0.7.dist-info → tooluniverse-1.0.9.dist-info}/top_level.txt +0 -0
@@ -276,7 +276,7 @@
"pip": "pip install openchem",
"conda": "conda install -c conda-forge openchem"
},
-
"usage_example": "# OpenChem deep learning for drug discovery\n# This demonstrates molecular property prediction concepts\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error, r2_score\nimport tempfile\nimport os\nfrom collections import defaultdict\n\nprint('OpenChem - Deep Learning for Drug Discovery')\nprint('=' * 45)\n\n# Overview of OpenChem capabilities\nprint('OpenChem Features:')\nprint('• Molecular property prediction (ADMET, bioactivity)')\nprint('• Graph neural networks for molecules')\nprint('• Molecular generation and optimization')\nprint('• Multi-task learning for drug discovery')\nprint('• Molecular descriptors and fingerprints')\nprint('• Integration with PyTorch and other ML frameworks')\n\nprint('\\nSupported Molecular Representations:')\nprint('• SMILES strings')\nprint('• Molecular graphs')\nprint('• 3D conformations')\nprint('• Molecular fingerprints (ECFP, MACCS, etc.)')\nprint('• Physicochemical descriptors')\n\n# Simulate molecular dataset\nprint('\\n=== Molecular Dataset Simulation ===')\n\nnp.random.seed(42)\n\n# Generate synthetic molecular data\nn_molecules = 1000\nprint(f'Generating {n_molecules} synthetic molecules...')\n\n# Molecular descriptors (simplified)\ndescriptor_names = [\n 'molecular_weight', 'logP', 'num_donors', 'num_acceptors',\n 'tpsa', 'num_rotatable_bonds', 'num_aromatic_rings',\n 'formal_charge', 'num_heteroatoms', 'fraction_csp3'\n]\n\n# Generate realistic molecular descriptors\nmolecular_data = []\nfor i in range(n_molecules):\n # Molecular weight: 150-600 Da (drug-like range)\n mw = np.random.normal(350, 100)\n mw = np.clip(mw, 150, 600)\n \n # LogP: -2 to 6 (lipophilicity)\n logP = np.random.normal(2.5, 1.5)\n logP = np.clip(logP, -2, 6)\n \n # Hydrogen bond donors: 0-5\n donors = np.random.poisson(2)\n donors = np.clip(donors, 0, 5)\n \n # Hydrogen bond acceptors: 0-10\n acceptors = np.random.poisson(4)\n acceptors = np.clip(acceptors, 0, 10)\n \n # Topological polar surface area: 0-200 Ų\n tpsa = np.random.gamma(2, 30)\n tpsa = np.clip(tpsa, 0, 200)\n \n # Rotatable bonds: 0-15\n rot_bonds = np.random.poisson(5)\n rot_bonds = np.clip(rot_bonds, 0, 15)\n \n # Aromatic rings: 0-4\n aromatic_rings = np.random.poisson(2)\n aromatic_rings = np.clip(aromatic_rings, 0, 4)\n \n # Formal charge: typically 0, sometimes ±1\n formal_charge = np.random.choice([0, 0, 0, 0, 1, -1])\n \n # Heteroatoms: 1-8\n heteroatoms = np.random.poisson(3) + 1\n heteroatoms = np.clip(heteroatoms, 1, 8)\n \n # Fraction sp3 carbons: 0-1\n frac_csp3 = np.random.beta(2, 2)\n \n molecule = {\n 'id': f'MOL_{i:04d}',\n 'molecular_weight': mw,\n 'logP': logP,\n 'num_donors': donors,\n 'num_acceptors': acceptors,\n 'tpsa': tpsa,\n 'num_rotatable_bonds': rot_bonds,\n 'num_aromatic_rings': aromatic_rings,\n 'formal_charge': formal_charge,\n 'num_heteroatoms': heteroatoms,\n 'fraction_csp3': frac_csp3\n }\n \n molecular_data.append(molecule)\n\n# Convert to DataFrame\ndf = pd.DataFrame(molecular_data)\nprint(f'Generated molecular dataset: {df.shape}')\nprint(f'Descriptors: {len(descriptor_names)}')\n\n# Show basic statistics\nprint('\\nDataset statistics:')\nprint(df[descriptor_names].describe().round(2))\n\n# Generate target properties\nprint('\\n=== Target Property Generation ===')\n\n# Simulate bioactivity (IC50 values)\nprint('Generating bioactivity data (IC50 values)...')\n\ndef 
calculate_bioactivity(row):\n \"\"\"Simulate bioactivity based on molecular descriptors\"\"\"\n # Lipinski's rule of five compliance\n lipinski_score = 0\n if row['molecular_weight'] <= 500: lipinski_score += 1\n if row['logP'] <= 5: lipinski_score += 1\n if row['num_donors'] <= 5: lipinski_score += 1\n if row['num_acceptors'] <= 10: lipinski_score += 1\n \n # Base activity influenced by Lipinski compliance\n base_activity = 5.0 + (lipinski_score - 2.0) * 1.5\n \n # Additional molecular factors\n tpsa_factor = -0.01 * max(0, row['tpsa'] - 90) # Penalty for high TPSA\n flexibility_factor = -0.1 * max(0, row['num_rotatable_bonds'] - 7) # Penalty for high flexibility\n aromatic_factor = 0.3 * min(row['num_aromatic_rings'], 3) # Bonus for aromatics (up to 3)\n \n # Combined activity (pIC50)\n activity = base_activity + tpsa_factor + flexibility_factor + aromatic_factor\n \n # Add some noise\n activity += np.random.normal(0, 0.5)\n \n # Ensure reasonable range (4-9 pIC50)\n activity = np.clip(activity, 4.0, 9.0)\n \n return activity\n\n# Calculate bioactivity\ndf['pIC50'] = df.apply(calculate_bioactivity, axis=1)\n\n# Generate additional properties\nprint('Generating ADMET properties...')\n\n# Solubility (LogS)\ndef calculate_solubility(row):\n \"\"\"Simulate aqueous solubility\"\"\"\n # Lipophilicity penalty\n logP_penalty = -0.5 * max(0, row['logP'] - 2)\n \n # Molecular weight penalty\n mw_penalty = -0.005 * max(0, row['molecular_weight'] - 300)\n \n # TPSA bonus (polar surface area helps solubility)\n tpsa_bonus = 0.01 * min(row['tpsa'], 100)\n \n # Base solubility\n base_solubility = -2.0\n \n solubility = base_solubility + logP_penalty + mw_penalty + tpsa_bonus\n solubility += np.random.normal(0, 0.3)\n \n return np.clip(solubility, -6.0, 1.0)\n\ndf['logS'] = df.apply(calculate_solubility, axis=1)\n\n# Permeability (Caco-2)\ndef calculate_permeability(row):\n \"\"\"Simulate cell permeability\"\"\"\n # LogP correlation\n logP_factor = 0.3 * row['logP']\n \n # TPSA penalty\n tpsa_penalty = -0.02 * row['tpsa']\n \n # Molecular weight penalty\n mw_penalty = -0.003 * row['molecular_weight']\n \n # Base permeability (log Papp)\n base_perm = -4.5\n \n permeability = base_perm + logP_factor + tpsa_penalty + mw_penalty\n permeability += np.random.normal(0, 0.4)\n \n return np.clip(permeability, -7.0, -3.0)\n\ndf['log_Papp'] = df.apply(calculate_permeability, axis=1)\n\n# Half-life (t1/2)\ndef calculate_half_life(row):\n \"\"\"Simulate metabolic stability\"\"\"\n # Molecular complexity factor\n complexity = row['num_rotatable_bonds'] + row['num_heteroatoms']\n complexity_factor = -0.05 * complexity\n \n # Aromatic stabilization\n aromatic_factor = 0.1 * row['num_aromatic_rings']\n \n # Base half-life (log hours)\n base_t_half = 1.0\n \n t_half = base_t_half + complexity_factor + aromatic_factor\n t_half += np.random.normal(0, 0.3)\n \n return np.clip(t_half, -1.0, 3.0)\n\ndf['log_t_half'] = df.apply(calculate_half_life, axis=1)\n\nprint(f'Generated properties: pIC50, logS, log_Papp, log_t_half')\nprint(f'Property value ranges:')\nfor prop in ['pIC50', 'logS', 'log_Papp', 'log_t_half']:\n print(f' {prop}: {df[prop].min():.2f} to {df[prop].max():.2f}')\n\n# Apply drug-likeness filters\nprint('\\n=== Drug-likeness Analysis ===')\n\ndef lipinski_filter(row):\n \"\"\"Apply Lipinski's Rule of Five\"\"\"\n violations = 0\n if row['molecular_weight'] > 500: violations += 1\n if row['logP'] > 5: violations += 1\n if row['num_donors'] > 5: violations += 1\n if row['num_acceptors'] > 10: violations += 1\n 
return violations\n\ndef veber_filter(row):\n \"\"\"Apply Veber's rules for oral bioavailability\"\"\"\n violations = 0\n if row['tpsa'] > 140: violations += 1\n if row['num_rotatable_bonds'] > 10: violations += 1\n return violations\n\ndf['lipinski_violations'] = df.apply(lipinski_filter, axis=1)\ndf['veber_violations'] = df.apply(veber_filter, axis=1)\ndf['drug_like'] = (df['lipinski_violations'] == 0) & (df['veber_violations'] == 0)\n\nprint(f'Drug-likeness analysis:')\nprint(f' Lipinski compliant (0 violations): {(df[\"lipinski_violations\"] == 0).sum()}')\nprint(f' Veber compliant (0 violations): {(df[\"veber_violations\"] == 0).sum()}')\nprint(f' Overall drug-like: {df[\"drug_like\"].sum()}')\nprint(f' Drug-like percentage: {df[\"drug_like\"].mean():.1%}')\n\n# Machine learning model training\nprint('\\n=== Machine Learning Model Training ===')\n\n# Prepare features and targets\nfeature_cols = descriptor_names\ntarget_cols = ['pIC50', 'logS', 'log_Papp', 'log_t_half']\n\nX = df[feature_cols].values\ny = df[target_cols].values\n\nprint(f'Feature matrix shape: {X.shape}')\nprint(f'Target matrix shape: {y.shape}')\n\n# Split data\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=42\n)\n\nprint(f'Training set: {X_train.shape[0]} molecules')\nprint(f'Test set: {X_test.shape[0]} molecules')\n\n# Train models for each property\nmodels = {}\nperformance = {}\n\nfor i, target_name in enumerate(target_cols):\n print(f'\\nTraining model for {target_name}...')\n \n # Random Forest model\n model = RandomForestRegressor(n_estimators=100, random_state=42)\n model.fit(X_train, y_train[:, i])\n \n # Predictions\n y_pred_train = model.predict(X_train)\n y_pred_test = model.predict(X_test)\n \n # Performance metrics\n train_r2 = r2_score(y_train[:, i], y_pred_train)\n test_r2 = r2_score(y_test[:, i], y_pred_test)\n train_rmse = np.sqrt(mean_squared_error(y_train[:, i], y_pred_train))\n test_rmse = np.sqrt(mean_squared_error(y_test[:, i], y_pred_test))\n \n models[target_name] = model\n performance[target_name] = {\n 'train_r2': train_r2,\n 'test_r2': test_r2,\n 'train_rmse': train_rmse,\n 'test_rmse': test_rmse,\n 'predictions': y_pred_test,\n 'true_values': y_test[:, i]\n }\n \n print(f' Training R²: {train_r2:.3f}')\n print(f' Test R²: {test_r2:.3f}')\n print(f' Test RMSE: {test_rmse:.3f}')\n\n# Feature importance analysis\nprint('\\n=== Feature Importance Analysis ===')\n\nfeature_importance = pd.DataFrame({\n 'feature': feature_cols,\n **{target: models[target].feature_importances_ for target in target_cols}\n})\n\nprint('Top features for each property:')\nfor target in target_cols:\n top_features = feature_importance.nlargest(3, target)\n print(f'\\n{target}:')\n for _, row in top_features.iterrows():\n print(f' {row[\"feature\"]}: {row[target]:.3f}')\n\n# Virtual screening simulation\nprint('\\n=== Virtual Screening Simulation ===')\n\n# Generate new molecules for screening\nn_screening = 10000\nprint(f'Generating {n_screening} molecules for virtual screening...')\n\nscreening_data = []\nfor i in range(n_screening):\n # Generate random molecular descriptors\n mw = np.random.normal(350, 120)\n mw = np.clip(mw, 100, 700)\n \n logP = np.random.normal(2.5, 2.0)\n logP = np.clip(logP, -3, 7)\n \n donors = np.random.poisson(2)\n donors = np.clip(donors, 0, 8)\n \n acceptors = np.random.poisson(4)\n acceptors = np.clip(acceptors, 0, 15)\n \n tpsa = np.random.gamma(2, 35)\n tpsa = np.clip(tpsa, 0, 250)\n \n rot_bonds = np.random.poisson(6)\n rot_bonds = 
np.clip(rot_bonds, 0, 20)\n \n aromatic_rings = np.random.poisson(2)\n aromatic_rings = np.clip(aromatic_rings, 0, 5)\n \n formal_charge = np.random.choice([0, 0, 0, 0, 1, -1])\n \n heteroatoms = np.random.poisson(4) + 1\n heteroatoms = np.clip(heteroatoms, 1, 12)\n \n frac_csp3 = np.random.beta(2, 2)\n \n screening_data.append([\n mw, logP, donors, acceptors, tpsa, rot_bonds,\n aromatic_rings, formal_charge, heteroatoms, frac_csp3\n ])\n\nX_screening = np.array(screening_data)\n\n# Predict properties for screening library\nprint('Predicting properties for screening library...')\nscreening_predictions = {}\n\nfor target in target_cols:\n predictions = models[target].predict(X_screening)\n screening_predictions[target] = predictions\n\n# Apply filters for hit identification\nprint('\\nApplying screening filters...')\n\n# Create screening DataFrame\nscreening_df = pd.DataFrame(X_screening, columns=feature_cols)\nfor target in target_cols:\n screening_df[f'pred_{target}'] = screening_predictions[target]\n\n# Apply drug-likeness filters\nscreening_df['lipinski_violations'] = screening_df.apply(lipinski_filter, axis=1)\nscreening_df['veber_violations'] = screening_df.apply(veber_filter, axis=1)\nscreening_df['drug_like'] = (\n (screening_df['lipinski_violations'] == 0) & \n (screening_df['veber_violations'] == 0)\n)\n\n# Activity filters\nactivity_threshold = 6.5 # pIC50 > 6.5 (IC50 < 316 nM)\nsolubility_threshold = -4.0 # logS > -4 (> 0.1 mM)\npermeability_threshold = -5.5 # log Papp > -5.5\n\nhits = screening_df[\n (screening_df['pred_pIC50'] > activity_threshold) &\n (screening_df['pred_logS'] > solubility_threshold) &\n (screening_df['pred_log_Papp'] > permeability_threshold) &\n (screening_df['drug_like'] == True)\n]\n\nprint(f'Screening results:')\nprint(f' Total molecules: {len(screening_df):,}')\nprint(f' Drug-like molecules: {screening_df[\"drug_like\"].sum():,}')\nprint(f' Active hits (pIC50 > {activity_threshold}): {(screening_df[\"pred_pIC50\"] > activity_threshold).sum():,}')\nprint(f' Final hits (all criteria): {len(hits):,}')\nprint(f' Hit rate: {len(hits) / len(screening_df):.1%}')\n\nif len(hits) > 0:\n print(f'\\nTop 5 hits:')\n top_hits = hits.nlargest(5, 'pred_pIC50')\n for idx, hit in top_hits.iterrows():\n print(f' Hit {idx}: pIC50={hit[\"pred_pIC50\"]:.2f}, '\n f'logS={hit[\"pred_logS\"]:.2f}, MW={hit[\"molecular_weight\"]:.0f}')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig, axes = plt.subplots(2, 2, figsize=(15, 12))\n\n# 1. Property predictions vs true values\nfor i, target in enumerate(['pIC50', 'logS']):\n ax = axes[0, i]\n perf = performance[target]\n \n ax.scatter(perf['true_values'], perf['predictions'], alpha=0.6, s=20)\n \n # Perfect prediction line\n min_val = min(perf['true_values'].min(), perf['predictions'].min())\n max_val = max(perf['true_values'].max(), perf['predictions'].max())\n ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8)\n \n ax.set_xlabel(f'True {target}')\n ax.set_ylabel(f'Predicted {target}')\n ax.set_title(f'{target} Prediction (R² = {perf[\"test_r2\"]:.3f})')\n ax.grid(True, alpha=0.3)\n\n# 2. Feature importance heatmap\nax = axes[1, 0]\nimportance_matrix = feature_importance[target_cols].values.T\nim = ax.imshow(importance_matrix, cmap='viridis', aspect='auto')\nax.set_xticks(range(len(feature_cols)))\nax.set_xticklabels(feature_cols, rotation=45, ha='right')\nax.set_yticks(range(len(target_cols)))\nax.set_yticklabels(target_cols)\nax.set_title('Feature Importance Heatmap')\nplt.colorbar(im, ax=ax)\n\n# 3. 
Virtual screening results\nax = axes[1, 1]\nax.scatter(screening_df['pred_pIC50'], screening_df['pred_logS'], \n alpha=0.3, s=10, color='lightblue', label='All molecules')\nif len(hits) > 0:\n ax.scatter(hits['pred_pIC50'], hits['pred_logS'], \n alpha=0.8, s=30, color='red', label=f'Hits ({len(hits)})')\n\nax.axvline(x=activity_threshold, color='green', linestyle='--', alpha=0.7, \n label=f'pIC50 > {activity_threshold}')\nax.axhline(y=solubility_threshold, color='orange', linestyle='--', alpha=0.7, \n label=f'logS > {solubility_threshold}')\n\nax.set_xlabel('Predicted pIC50')\nax.set_ylabel('Predicted logS')\nax.set_title('Virtual Screening Results')\nax.legend()\nax.grid(True, alpha=0.3)\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Analysis visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 45)\nprint('OPENCHEM DRUG DISCOVERY SUMMARY')\nprint('=' * 45)\nprint(f'Molecules analyzed: {len(df):,}')\nprint(f'Properties predicted: {len(target_cols)}')\nprint(f'Drug-like molecules: {df[\"drug_like\"].sum():,} ({df[\"drug_like\"].mean():.1%})')\nprint(f'\\nModel performance (test R²):')\nfor target in target_cols:\n print(f' {target}: {performance[target][\"test_r2\"]:.3f}')\nprint(f'\\nVirtual screening:')\nprint(f' Molecules screened: {len(screening_df):,}')\nprint(f' Hits identified: {len(hits):,}')\nprint(f' Hit rate: {len(hits) / len(screening_df):.2%}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nOpenChem provides:')\nprint('• Multi-task molecular property prediction')\nprint('• Graph neural networks for molecules')\nprint('• Molecular generation and optimization')\nprint('• ADMET property prediction')\nprint('• Virtual screening capabilities')\nprint('• Integration with PyTorch')\nprint('• Pre-trained models and datasets')\n\nprint('\\nTypical OpenChem workflow:')\nprint('1. Load molecular dataset (SMILES, SDF)')\nprint('2. Generate molecular representations')\nprint('3. Train/load prediction models')\nprint('4. Predict properties for new molecules')\nprint('5. Apply filters for drug discovery')",
+
"usage_example": "# OpenChem deep learning for drug discovery\n# This demonstrates molecular property prediction concepts\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_squared_error, r2_score\nimport tempfile\nimport os\nfrom collections import defaultdict\n\nprint('OpenChem - Deep Learning for Drug Discovery')\nprint('=' * 45)\n\n# Overview of OpenChem capabilities\nprint('OpenChem Features:')\nprint('• Molecular property prediction (ADMET, bioactivity)')\nprint('• Graph neural networks for molecules')\nprint('• Molecular generation and optimization')\nprint('• Multi-task learning for drug discovery')\nprint('• Molecular descriptors and fingerprints')\nprint('• Integration with PyTorch and other ML frameworks')\n\nprint('\\nSupported Molecular Representations:')\nprint('• SMILES strings')\nprint('• Molecular graphs')\nprint('• 3D conformations')\nprint('• Molecular fingerprints (ECFP, MACCS, etc.)')\nprint('• Physicochemical descriptors')\n\n# Simulate molecular dataset\nprint('\\n=== Molecular Dataset Simulation ===')\n\nnp.random.seed(42)\n\n# Generate synthetic molecular data\nn_molecules = 1000\nprint(f'Generating {n_molecules} synthetic molecules...')\n\n# Molecular descriptors (simplified)\ndescriptor_names = [\n 'molecular_weight', 'logP', 'num_donors', 'num_acceptors',\n 'tpsa', 'num_rotatable_bonds', 'num_aromatic_rings',\n 'formal_charge', 'num_heteroatoms', 'fraction_csp3'\n]\n\n# Generate realistic molecular descriptors\nmolecular_data = []\nfor i in range(n_molecules):\n # Molecular weight: 150-600 Da (drug-like range)\n mw = np.random.normal(350, 100)\n mw = np.clip(mw, 150, 600)\n \n # LogP: -2 to 6 (lipophilicity)\n logP = np.random.normal(2.5, 1.5)\n logP = np.clip(logP, -2, 6)\n \n # Hydrogen bond donors: 0-5\n donors = np.random.poisson(2)\n donors = np.clip(donors, 0, 5)\n \n # Hydrogen bond acceptors: 0-10\n acceptors = np.random.poisson(4)\n acceptors = np.clip(acceptors, 0, 10)\n \n # Topological polar surface area: 0-200 Ų\n tpsa = np.random.gamma(2, 30)\n tpsa = np.clip(tpsa, 0, 200)\n \n # Rotatable bonds: 0-15\n rot_bonds = np.random.poisson(5)\n rot_bonds = np.clip(rot_bonds, 0, 15)\n \n # Aromatic rings: 0-4\n aromatic_rings = np.random.poisson(2)\n aromatic_rings = np.clip(aromatic_rings, 0, 4)\n \n # Formal charge: typically 0, sometimes ±1\n formal_charge = np.random.choice([0, 0, 0, 0, 1, -1])\n \n # Heteroatoms: 1-8\n heteroatoms = np.random.poisson(3) + 1\n heteroatoms = np.clip(heteroatoms, 1, 8)\n \n # Fraction sp3 carbons: 0-1\n frac_csp3 = np.random.beta(2, 2)\n \n molecule = {\n 'id': f'MOL_{i:04d}',\n 'molecular_weight': mw,\n 'logP': logP,\n 'num_donors': donors,\n 'num_acceptors': acceptors,\n 'tpsa': tpsa,\n 'num_rotatable_bonds': rot_bonds,\n 'num_aromatic_rings': aromatic_rings,\n 'formal_charge': formal_charge,\n 'num_heteroatoms': heteroatoms,\n 'fraction_csp3': frac_csp3\n }\n \n molecular_data.append(molecule)\n\n# Convert to DataFrame\ndf = pd.DataFrame(molecular_data)\nprint(f'Generated molecular dataset: {df.shape}')\nprint(f'Descriptors: {len(descriptor_names)}')\n\n# Show basic statistics\nprint('\\nDataset statistics:')\nprint(df[descriptor_names].describe().round(2))\n\n# Generate target properties\nprint('\\n=== Target Property Generation ===')\n\n# Simulate bioactivity (IC50 values)\nprint('Generating bioactivity data (IC50 values)...')\n\ndef 
calculate_bioactivity(row):\n \"\"\"Simulate bioactivity based on molecular descriptors\"\"\"\n # Lipinski's rule of five compliance\n lipinski_score = 0\n if row['molecular_weight'] <= 500: lipinski_score += 1\n if row['logP'] <= 5: lipinski_score += 1\n if row['num_donors'] <= 5: lipinski_score += 1\n if row['num_acceptors'] <= 10: lipinski_score += 1\n \n # Base activity influenced by Lipinski compliance\n base_activity = 5.0 + (lipinski_score - 2.0) * 1.5\n \n # Additional molecular factors\n tpsa_factor = -0.01 * max(0, row['tpsa'] - 90) # Penalty for high TPSA\n flexibility_factor = -0.1 * max(0, row['num_rotatable_bonds'] - 7) # Penalty for high flexibility\n aromatic_factor = 0.3 * min(row['num_aromatic_rings'], 3) # Bonus for aromatics (up to 3)\n \n # Combined activity (pIC50)\n activity = base_activity + tpsa_factor + flexibility_factor + aromatic_factor\n \n # Add some noise\n activity += np.random.normal(0, 0.5)\n \n # Ensure reasonable range (4-9 pIC50)\n activity = np.clip(activity, 4.0, 9.0)\n \n return activity\n\n# Calculate bioactivity\ndf['pIC50'] = df.apply(calculate_bioactivity, axis=1)\n\n# Generate additional properties\nprint('Generating ADMET properties...')\n\n# Solubility (LogS)\ndef calculate_solubility(row):\n \"\"\"Simulate aqueous solubility\"\"\"\n # Lipophilicity penalty\n logP_penalty = -0.5 * max(0, row['logP'] - 2)\n \n # Molecular weight penalty\n mw_penalty = -0.005 * max(0, row['molecular_weight'] - 300)\n \n # TPSA bonus (polar surface area helps solubility)\n tpsa_bonus = 0.01 * min(row['tpsa'], 100)\n \n # Base solubility\n base_solubility = -2.0\n \n solubility = base_solubility + logP_penalty + mw_penalty + tpsa_bonus\n solubility += np.random.normal(0, 0.3)\n \n return np.clip(solubility, -6.0, 1.0)\n\ndf['logS'] = df.apply(calculate_solubility, axis=1)\n\n# Permeability (Caco-2)\ndef calculate_permeability(row):\n \"\"\"Simulate cell permeability\"\"\"\n # LogP correlation\n logP_factor = 0.3 * row['logP']\n \n # TPSA penalty\n tpsa_penalty = -0.02 * row['tpsa']\n \n # Molecular weight penalty\n mw_penalty = -0.003 * row['molecular_weight']\n \n # Base permeability (log Papp)\n base_perm = -4.5\n \n permeability = base_perm + logP_factor + tpsa_penalty + mw_penalty\n permeability += np.random.normal(0, 0.4)\n \n return np.clip(permeability, -7.0, -3.0)\n\ndf['log_Papp'] = df.apply(calculate_permeability, axis=1)\n\n# Half-life (t1/2)\ndef calculate_half_life(row):\n \"\"\"Simulate metabolic stability\"\"\"\n # Molecular complexity factor\n complexity = row['num_rotatable_bonds'] + row['num_heteroatoms']\n complexity_factor = -0.05 * complexity\n \n # Aromatic stabilization\n aromatic_factor = 0.1 * row['num_aromatic_rings']\n \n # Base half-life (log hours)\n base_t_half = 1.0\n \n t_half = base_t_half + complexity_factor + aromatic_factor\n t_half += np.random.normal(0, 0.3)\n \n return np.clip(t_half, -1.0, 3.0)\n\ndf['log_t_half'] = df.apply(calculate_half_life, axis=1)\n\nprint(f'Generated properties: pIC50, logS, log_Papp, log_t_half')\nprint(f'Property value ranges:')\nfor prop in ['pIC50', 'logS', 'log_Papp', 'log_t_half']:\n print(f' {prop}: {df[prop].min():.2f} to {df[prop].max():.2f}')\n\n# Apply drug-likeness filters\nprint('\\n=== Drug-likeness Analysis ===')\n\ndef lipinski_filter(row):\n \"\"\"Apply Lipinski's Rule of Five\"\"\"\n violations = 0\n if row['molecular_weight'] > 500: violations += 1\n if row['logP'] > 5: violations += 1\n if row['num_donors'] > 5: violations += 1\n if row['num_acceptors'] > 10: violations += 1\n 
return violations\n\ndef veber_filter(row):\n \"\"\"Apply Veber's rules for oral bioavailability\"\"\"\n violations = 0\n if row['tpsa'] > 140: violations += 1\n if row['num_rotatable_bonds'] > 10: violations += 1\n return violations\n\ndf['lipinski_violations'] = df.apply(lipinski_filter, axis=1)\ndf['veber_violations'] = df.apply(veber_filter, axis=1)\ndf['drug_like'] = (df['lipinski_violations'] == 0) & (df['veber_violations'] == 0)\n\nprint(f'Drug-likeness analysis:')\nprint(f' Lipinski compliant (0 violations): {(df[\"lipinski_violations\"] == 0).sum()}')\nprint(f' Veber compliant (0 violations): {(df[\"veber_violations\"] == 0).sum()}')\nprint(f' Overall drug-like: {df[\"drug_like\"].sum()}')\nprint(f' Drug-like percentage: {df[\"drug_like\"].mean():.1%}')\n\n# Machine learning model training\nprint('\\n=== Machine Learning Model Training ===')\n\n# Prepare features and targets\nfeature_cols = descriptor_names\ntarget_cols = ['pIC50', 'logS', 'log_Papp', 'log_t_half']\n\nX = df[feature_cols].values\ny = df[target_cols].values\n\nprint(f'Feature matrix shape: {X.shape}')\nprint(f'Target matrix shape: {y.shape}')\n\n# Split data\nX_train, X_test, y_train, y_test = train_test_split(\n X, y, test_size=0.2, random_state=42\n)\n\nprint(f'Training set: {X_train.shape[0]} molecules')\nprint(f'Test set: {X_test.shape[0]} molecules')\n\n# Train models for each property\nmodels = {}\nperformance = {}\n\nfor i, target_name in enumerate(target_cols):\n print(f'\\nTraining model for {target_name}...')\n \n # Random Forest model\n model = RandomForestRegressor(n_estimators=100, random_state=42)\n model.fit(X_train, y_train[:, i])\n \n # Predictions\n y_pred_train = model.predict(X_train)\n y_pred_test = model.predict(X_test)\n \n # Performance metrics\n train_r2 = r2_score(y_train[:, i], y_pred_train)\n test_r2 = r2_score(y_test[:, i], y_pred_test)\n train_rmse = np.sqrt(mean_squared_error(y_train[:, i], y_pred_train))\n test_rmse = np.sqrt(mean_squared_error(y_test[:, i], y_pred_test))\n \n models[target_name] = model\n performance[target_name] = {\n 'train_r2': train_r2,\n 'test_r2': test_r2,\n 'train_rmse': train_rmse,\n 'test_rmse': test_rmse,\n 'predictions': y_pred_test,\n 'true_values': y_test[:, i]\n }\n \n print(f' Training R²: {train_r2:.3f}')\n print(f' Test R²: {test_r2:.3f}')\n print(f' Test RMSE: {test_rmse:.3f}')\n\n# Feature importance analysis\nprint('\\n=== Feature Importance Analysis ===')\n\nfeature_importance = pd.DataFrame({\n 'feature': feature_cols,\n **{target: models[target].feature_importances_ for target in target_cols}\n})\n\nprint('Top features for each property:')\nfor target in target_cols:\n top_features = feature_importance.nlargest(3, target)\n print(f'\\n{target}:')\n for _, row in top_features.iterrows():\n print(f' {row[\"feature\"]}: {row[target]:.3f}')\n\n# Virtual screening simulation\nprint('\\n=== Virtual Screening Simulation ===')\n\n# Generate new molecules for screening\nn_screening = 10000\nprint(f'Generating {n_screening} molecules for virtual screening...')\n\nscreening_data = []\nfor i in range(n_screening):\n # Generate random molecular descriptors\n mw = np.random.normal(350, 120)\n mw = np.clip(mw, 100, 700)\n \n logP = np.random.normal(2.5, 2.0)\n logP = np.clip(logP, -3, 7)\n \n donors = np.random.poisson(2)\n donors = np.clip(donors, 0, 8)\n \n acceptors = np.random.poisson(4)\n acceptors = np.clip(acceptors, 0, 15)\n \n tpsa = np.random.gamma(2, 35)\n tpsa = np.clip(tpsa, 0, 250)\n \n rot_bonds = np.random.poisson(6)\n rot_bonds = 
np.clip(rot_bonds, 0, 20)\n \n aromatic_rings = np.random.poisson(2)\n aromatic_rings = np.clip(aromatic_rings, 0, 5)\n \n formal_charge = np.random.choice([0, 0, 0, 0, 1, -1])\n \n heteroatoms = np.random.poisson(4) + 1\n heteroatoms = np.clip(heteroatoms, 1, 12)\n \n frac_csp3 = np.random.beta(2, 2)\n \n screening_data.append([\n mw, logP, donors, acceptors, tpsa, rot_bonds,\n aromatic_rings, formal_charge, heteroatoms, frac_csp3\n ])\n\nX_screening = np.array(screening_data)\n\n# Predict properties for screening library\nprint('Predicting properties for screening library...')\nscreening_predictions = {}\n\nfor target in target_cols:\n predictions = models[target].predict(X_screening)\n screening_predictions[target] = predictions\n\n# Apply filters for hit identification\nprint('\\nApplying screening filters...')\n\n# Create screening DataFrame\nscreening_df = pd.DataFrame(X_screening, columns=feature_cols)\nfor target in target_cols:\n screening_df[f'pred_{target}'] = screening_predictions[target]\n\n# Apply drug-likeness filters\nscreening_df['lipinski_violations'] = screening_df.apply(lipinski_filter, axis=1)\nscreening_df['veber_violations'] = screening_df.apply(veber_filter, axis=1)\nscreening_df['drug_like'] = (\n (screening_df['lipinski_violations'] == 0) & \n (screening_df['veber_violations'] == 0)\n)\n\n# Activity filters\nactivity_threshold = 6.5 # pIC50 > 6.5 (IC50 < 316 nM)\nsolubility_threshold = -4.0 # logS > -4 (> 0.1 mM)\npermeability_threshold = -5.5 # log Papp > -5.5\n\nhits = screening_df[\n (screening_df['pred_pIC50'] > activity_threshold) &\n (screening_df['pred_logS'] > solubility_threshold) &\n (screening_df['pred_log_Papp'] > permeability_threshold) &\n (screening_df['drug_like'] == True)\n]\n\nprint(f'Screening results:')\nprint(f' Total molecules: {len(screening_df):}')\nprint(f' Drug-like molecules: {screening_df[\"drug_like\"].sum():}')\nprint(f' Active hits (pIC50 > {activity_threshold}): {(screening_df[\"pred_pIC50\"] > activity_threshold).sum():}')\nprint(f' Final hits (all criteria): {len(hits):}')\nprint(f' Hit rate: {len(hits) / len(screening_df):.1%}')\n\nif len(hits) > 0:\n print(f'\\nTop 5 hits:')\n top_hits = hits.nlargest(5, 'pred_pIC50')\n for idx, hit in top_hits.iterrows():\n print(f' Hit {idx}: pIC50={hit[\"pred_pIC50\"]:.2f}, '\n f'logS={hit[\"pred_logS\"]:.2f}, MW={hit[\"molecular_weight\"]:.0f}')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig, axes = plt.subplots(2, 2, figsize=(15, 12))\n\n# 1. Property predictions vs true values\nfor i, target in enumerate(['pIC50', 'logS']):\n ax = axes[0, i]\n perf = performance[target]\n \n ax.scatter(perf['true_values'], perf['predictions'], alpha=0.6, s=20)\n \n # Perfect prediction line\n min_val = min(perf['true_values'].min(), perf['predictions'].min())\n max_val = max(perf['true_values'].max(), perf['predictions'].max())\n ax.plot([min_val, max_val], [min_val, max_val], 'r--', alpha=0.8)\n \n ax.set_xlabel(f'True {target}')\n ax.set_ylabel(f'Predicted {target}')\n ax.set_title(f'{target} Prediction (R² = {perf[\"test_r2\"]:.3f})')\n ax.grid(True, alpha=0.3)\n\n# 2. Feature importance heatmap\nax = axes[1, 0]\nimportance_matrix = feature_importance[target_cols].values.T\nim = ax.imshow(importance_matrix, cmap='viridis', aspect='auto')\nax.set_xticks(range(len(feature_cols)))\nax.set_xticklabels(feature_cols, rotation=45, ha='right')\nax.set_yticks(range(len(target_cols)))\nax.set_yticklabels(target_cols)\nax.set_title('Feature Importance Heatmap')\nplt.colorbar(im, ax=ax)\n\n# 3. 
Virtual screening results\nax = axes[1, 1]\nax.scatter(screening_df['pred_pIC50'], screening_df['pred_logS'], \n alpha=0.3, s=10, color='lightblue', label='All molecules')\nif len(hits) > 0:\n ax.scatter(hits['pred_pIC50'], hits['pred_logS'], \n alpha=0.8, s=30, color='red', label=f'Hits ({len(hits)})')\n\nax.axvline(x=activity_threshold, color='green', linestyle='--', alpha=0.7, \n label=f'pIC50 > {activity_threshold}')\nax.axhline(y=solubility_threshold, color='orange', linestyle='--', alpha=0.7, \n label=f'logS > {solubility_threshold}')\n\nax.set_xlabel('Predicted pIC50')\nax.set_ylabel('Predicted logS')\nax.set_title('Virtual Screening Results')\nax.legend()\nax.grid(True, alpha=0.3)\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Analysis visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 45)\nprint('OPENCHEM DRUG DISCOVERY SUMMARY')\nprint('=' * 45)\nprint(f'Molecules analyzed: {len(df):}')\nprint(f'Properties predicted: {len(target_cols)}')\nprint(f'Drug-like molecules: {df[\"drug_like\"].sum():} ({df[\"drug_like\"].mean():.1%})')\nprint(f'\\nModel performance (test R²):')\nfor target in target_cols:\n print(f' {target}: {performance[target][\"test_r2\"]:.3f}')\nprint(f'\\nVirtual screening:')\nprint(f' Molecules screened: {len(screening_df):}')\nprint(f' Hits identified: {len(hits):}')\nprint(f' Hit rate: {len(hits) / len(screening_df):.2%}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nOpenChem provides:')\nprint('• Multi-task molecular property prediction')\nprint('• Graph neural networks for molecules')\nprint('• Molecular generation and optimization')\nprint('• ADMET property prediction')\nprint('• Virtual screening capabilities')\nprint('• Integration with PyTorch')\nprint('• Pre-trained models and datasets')\n\nprint('\\nTypical OpenChem workflow:')\nprint('1. Load molecular dataset (SMILES, SDF)')\nprint('2. Generate molecular representations')\nprint('3. Train/load prediction models')\nprint('4. Predict properties for new molecules')\nprint('5. Apply filters for drug discovery')",
"quick_start": [
"Install: pip install openchem",
"Import: from openchem.models import build_model",
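The only difference visible between the removed and added usage_example strings in this hunk is in the embedded f-strings: the ',' thousands-separator was dropped from the format specifiers (for example "{len(screening_df):,}" became "{len(screening_df):}"). A minimal snippet showing what that change does to the printed output (the variable name is illustrative only):

```python
# '{n:,}' groups digits with a thousands separator; '{n:}' (an empty format
# spec) prints the bare number, matching the updated usage_example string.
n = 12345
print(f"Total molecules: {n:,}")  # old string -> Total molecules: 12,345
print(f"Total molecules: {n:}")   # new string -> Total molecules: 12345
```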
@@ -582,7 +582,7 @@
"pip": "pip install pydeseq2",
"conda": "conda install -c conda-forge pydeseq2"
},
-
"usage_example": "import pandas as pd\nimport numpy as np\nfrom pydeseq2 import DeseqDataSet\nfrom pydeseq2.dds import DeseqStats\nfrom pydeseq2.default_inference import DefaultInference\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nprint('PyDESeq2 - RNA-seq Differential Expression Analysis')\nprint('=' * 55)\n\n# Create synthetic RNA-seq count data\nprint('Creating synthetic RNA-seq count data...')\n\n# Set random seed for reproducibility\nnp.random.seed(42)\n\n# Parameters\nn_genes = 1000\nn_samples_per_condition = 6\nconditions = ['control', 'treatment']\ntotal_samples = n_samples_per_condition * len(conditions)\n\n# Generate gene names\ngene_names = [f'Gene_{i:04d}' for i in range(1, n_genes + 1)]\n\n# Generate sample names and metadata\nsample_names = []\ncondition_labels = []\nfor condition in conditions:\n for i in range(n_samples_per_condition):\n sample_names.append(f'{condition}_rep{i+1}')\n condition_labels.append(condition)\n\n# Create metadata DataFrame\nmetadata = pd.DataFrame({\n 'sample_id': sample_names,\n 'condition': condition_labels\n})\nmetadata.set_index('sample_id', inplace=True)\n\nprint(f'Created metadata for {len(sample_names)} samples:')\nprint(metadata.groupby('condition').size())\n\n# Generate count matrix\nprint('\\nGenerating count matrix...')\n\n# Base expression levels (log scale)\nbase_expression = np.random.negative_binomial(n=5, p=0.3, size=n_genes)\n\n# Create count matrix\ncounts = np.zeros((n_genes, total_samples))\n\n# Generate counts for each sample\nfor i, condition in enumerate(condition_labels):\n # Add some noise and condition-specific effects\n if condition == 'control':\n # Control condition - use base expression\n sample_counts = np.random.negative_binomial(\n n=base_expression, \n p=0.1, # Dispersion parameter\n size=n_genes\n )\n else:\n # Treatment condition - add differential expression\n # Select ~10% of genes to be differentially expressed\n de_genes = np.random.choice(n_genes, size=int(0.1 * n_genes), replace=False)\n \n modified_expression = base_expression.copy()\n \n # Half upregulated, half downregulated\n up_genes = de_genes[:len(de_genes)//2]\n down_genes = de_genes[len(de_genes)//2:]\n \n # Upregulate (2-4 fold)\n modified_expression[up_genes] *= np.random.uniform(2, 4, len(up_genes))\n \n # Downregulate (0.25-0.5 fold)\n modified_expression[down_genes] *= np.random.uniform(0.25, 0.5, len(down_genes))\n \n sample_counts = np.random.negative_binomial(\n n=modified_expression.astype(int), \n p=0.1,\n size=n_genes\n )\n \n counts[:, i] = sample_counts\n\n# Create count DataFrame\ncount_df = pd.DataFrame(counts, index=gene_names, columns=sample_names)\ncount_df = count_df.astype(int)\n\nprint(f'Count matrix shape: {count_df.shape}')\nprint(f'Total reads per sample:')\nfor sample in count_df.columns:\n total_reads = count_df[sample].sum()\n print(f' {sample}: {total_reads
+
"usage_example": "import pandas as pd\nimport numpy as np\nfrom pydeseq2 import DeseqDataSet\nfrom pydeseq2.dds import DeseqStats\nfrom pydeseq2.default_inference import DefaultInference\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nprint('PyDESeq2 - RNA-seq Differential Expression Analysis')\nprint('=' * 55)\n\n# Create synthetic RNA-seq count data\nprint('Creating synthetic RNA-seq count data...')\n\n# Set random seed for reproducibility\nnp.random.seed(42)\n\n# Parameters\nn_genes = 1000\nn_samples_per_condition = 6\nconditions = ['control', 'treatment']\ntotal_samples = n_samples_per_condition * len(conditions)\n\n# Generate gene names\ngene_names = [f'Gene_{i:04d}' for i in range(1, n_genes + 1)]\n\n# Generate sample names and metadata\nsample_names = []\ncondition_labels = []\nfor condition in conditions:\n for i in range(n_samples_per_condition):\n sample_names.append(f'{condition}_rep{i+1}')\n condition_labels.append(condition)\n\n# Create metadata DataFrame\nmetadata = pd.DataFrame({\n 'sample_id': sample_names,\n 'condition': condition_labels\n})\nmetadata.set_index('sample_id', inplace=True)\n\nprint(f'Created metadata for {len(sample_names)} samples:')\nprint(metadata.groupby('condition').size())\n\n# Generate count matrix\nprint('\\nGenerating count matrix...')\n\n# Base expression levels (log scale)\nbase_expression = np.random.negative_binomial(n=5, p=0.3, size=n_genes)\n\n# Create count matrix\ncounts = np.zeros((n_genes, total_samples))\n\n# Generate counts for each sample\nfor i, condition in enumerate(condition_labels):\n # Add some noise and condition-specific effects\n if condition == 'control':\n # Control condition - use base expression\n sample_counts = np.random.negative_binomial(\n n=base_expression, \n p=0.1, # Dispersion parameter\n size=n_genes\n )\n else:\n # Treatment condition - add differential expression\n # Select ~10% of genes to be differentially expressed\n de_genes = np.random.choice(n_genes, size=int(0.1 * n_genes), replace=False)\n \n modified_expression = base_expression.copy()\n \n # Half upregulated, half downregulated\n up_genes = de_genes[:len(de_genes)//2]\n down_genes = de_genes[len(de_genes)//2:]\n \n # Upregulate (2-4 fold)\n modified_expression[up_genes] *= np.random.uniform(2, 4, len(up_genes))\n \n # Downregulate (0.25-0.5 fold)\n modified_expression[down_genes] *= np.random.uniform(0.25, 0.5, len(down_genes))\n \n sample_counts = np.random.negative_binomial(\n n=modified_expression.astype(int), \n p=0.1,\n size=n_genes\n )\n \n counts[:, i] = sample_counts\n\n# Create count DataFrame\ncount_df = pd.DataFrame(counts, index=gene_names, columns=sample_names)\ncount_df = count_df.astype(int)\n\nprint(f'Count matrix shape: {count_df.shape}')\nprint(f'Total reads per sample:')\nfor sample in count_df.columns:\n total_reads = count_df[sample].sum()\n print(f' {sample}: {total_reads:} reads')\n\nprint(f'\\nCount statistics:')\nprint(f' Mean counts per gene: {count_df.mean(axis=1).mean():.1f}')\nprint(f' Median counts per gene: {count_df.median(axis=1).median():.1f}')\nprint(f' Genes with zero counts: {(count_df.sum(axis=1) == 0).sum()}')\n\n# Filter low-count genes\nprint('\\nFiltering low-count genes...')\nmin_count = 10\nmin_samples = 3\n\n# Keep genes with at least min_count reads in at least min_samples samples\nkeep_genes = (count_df >= min_count).sum(axis=1) >= min_samples\nfiltered_counts = count_df[keep_genes]\n\nprint(f'Genes before filtering: {len(count_df)}')\nprint(f'Genes after filtering: 
{len(filtered_counts)}')\nprint(f'Genes removed: {len(count_df) - len(filtered_counts)}')\n\n# Create DESeq2 dataset\nprint('\\n=== DESeq2 Analysis ===')\nprint('Creating DESeq2 dataset...')\n\n# Prepare data for PyDESeq2\ninference = DefaultInference(n_cpus=1)\n\n# Create DESeq2 dataset\ndds = DeseqDataSet(\n counts=filtered_counts,\n metadata=metadata,\n design_factors=['condition'],\n refit_cooks=True,\n inference=inference\n)\n\nprint(f'DESeq2 dataset created with {dds.n_obs} genes and {dds.n_vars} samples')\n\n# Run DESeq2 analysis\nprint('\\nRunning DESeq2 analysis...')\nprint('1. Estimating size factors...')\ndds.fit_size_factors()\n\nprint('2. Estimating dispersions...')\ndds.fit_genewise_dispersions()\ndds.fit_dispersion_trend()\ndds.fit_dispersion_prior()\ndds.fit_MAP_dispersions()\n\nprint('3. Fitting generalized linear model...')\ndds.fit_LFC()\n\nprint('4. Running statistical tests...')\nstat_res = DeseqStats(dds, inference=inference)\nstat_res.summary()\n\n# Get results\nprint('\\n=== Results Analysis ===')\nresults_df = stat_res.results_df\n\nprint(f'Results shape: {results_df.shape}')\nprint(f'Columns: {list(results_df.columns)}')\n\n# Filter for significant genes\nalpha = 0.05\nlog2fc_threshold = 1.0\n\nsignificant = (\n (results_df['padj'] < alpha) & \n (np.abs(results_df['log2FoldChange']) > log2fc_threshold)\n)\n\nupregulated = (\n (results_df['padj'] < alpha) & \n (results_df['log2FoldChange'] > log2fc_threshold)\n)\n\ndownregulated = (\n (results_df['padj'] < alpha) & \n (results_df['log2FoldChange'] < -log2fc_threshold)\n)\n\nprint(f'\\nDifferential expression results:')\nprint(f' Total genes tested: {len(results_df)}')\nprint(f' Significant genes (padj < {alpha}, |log2FC| > {log2fc_threshold}): {significant.sum()}')\nprint(f' Upregulated genes: {upregulated.sum()}')\nprint(f' Downregulated genes: {downregulated.sum()}')\n\n# Show top differentially expressed genes\nprint('\\nTop 10 upregulated genes:')\ntop_up = results_df[upregulated].nlargest(10, 'log2FoldChange')\nfor gene, row in top_up.iterrows():\n print(f' {gene}: log2FC={row[\"log2FoldChange\"]:.2f}, padj={row[\"padj\"]:.2e}')\n\nprint('\\nTop 10 downregulated genes:')\ntop_down = results_df[downregulated].nsmallest(10, 'log2FoldChange')\nfor gene, row in top_down.iterrows():\n print(f' {gene}: log2FC={row[\"log2FoldChange\"]:.2f}, padj={row[\"padj\"]:.2e}')\n\n# Quality control plots\nprint('\\n=== Quality Control Plots ===')\n\nfig, axes = plt.subplots(2, 2, figsize=(12, 10))\n\n# 1. MA plot\naxes[0, 0].scatter(results_df['baseMean'], results_df['log2FoldChange'], \n alpha=0.5, s=1, color='gray')\naxes[0, 0].scatter(results_df.loc[significant, 'baseMean'], \n results_df.loc[significant, 'log2FoldChange'], \n alpha=0.7, s=2, color='red')\naxes[0, 0].axhline(y=0, color='blue', linestyle='--', alpha=0.7)\naxes[0, 0].axhline(y=log2fc_threshold, color='green', linestyle='--', alpha=0.7)\naxes[0, 0].axhline(y=-log2fc_threshold, color='green', linestyle='--', alpha=0.7)\naxes[0, 0].set_xlabel('Mean Expression')\naxes[0, 0].set_ylabel('Log2 Fold Change')\naxes[0, 0].set_title('MA Plot')\naxes[0, 0].set_xscale('log')\n\n# 2. 
Volcano plot\np_values = -np.log10(results_df['padj'].fillna(1))\naxes[0, 1].scatter(results_df['log2FoldChange'], p_values, \n alpha=0.5, s=1, color='gray')\naxes[0, 1].scatter(results_df.loc[significant, 'log2FoldChange'], \n p_values[significant], \n alpha=0.7, s=2, color='red')\naxes[0, 1].axhline(y=-np.log10(alpha), color='green', linestyle='--', alpha=0.7)\naxes[0, 1].axvline(x=log2fc_threshold, color='green', linestyle='--', alpha=0.7)\naxes[0, 1].axvline(x=-log2fc_threshold, color='green', linestyle='--', alpha=0.7)\naxes[0, 1].set_xlabel('Log2 Fold Change')\naxes[0, 1].set_ylabel('-Log10 Adjusted P-value')\naxes[0, 1].set_title('Volcano Plot')\n\n# 3. P-value histogram\naxes[1, 0].hist(results_df['pvalue'].dropna(), bins=50, alpha=0.7, color='skyblue')\naxes[1, 0].set_xlabel('P-value')\naxes[1, 0].set_ylabel('Frequency')\naxes[1, 0].set_title('P-value Distribution')\n\n# 4. Dispersion plot\naxes[1, 1].scatter(dds.layers['normed_counts'].mean(axis=1), \n dds.varm['dispersions'], \n alpha=0.5, s=1, color='gray')\naxes[1, 1].set_xlabel('Mean Normalized Counts')\naxes[1, 1].set_ylabel('Dispersion')\naxes[1, 1].set_title('Dispersion Estimates')\naxes[1, 1].set_xscale('log')\naxes[1, 1].set_yscale('log')\n\nplt.tight_layout()\n\n# Save plots\nimport tempfile\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n plot_file = tmp.name\n\nplt.close()\nprint(f'QC plots saved to: {plot_file}')\n\n# Summary statistics\nprint('\\n' + '=' * 55)\nprint('DIFFERENTIAL EXPRESSION ANALYSIS SUMMARY')\nprint('=' * 55)\nprint(f'Total genes analyzed: {len(results_df):}')\nprint(f'Significant DE genes: {significant.sum():} ({significant.sum()/len(results_df)*100:.1f}%)')\nprint(f'Upregulated genes: {upregulated.sum():}')\nprint(f'Downregulated genes: {downregulated.sum():}')\nprint(f'Significance threshold: padj < {alpha}')\nprint(f'Fold change threshold: |log2FC| > {log2fc_threshold}')\n\n# Effect size distribution\nif significant.sum() > 0:\n sig_lfc = results_df.loc[significant, 'log2FoldChange']\n print(f'\\nEffect size statistics (significant genes):')\n print(f' Mean |log2FC|: {np.abs(sig_lfc).mean():.2f}')\n print(f' Max upregulation: {sig_lfc.max():.2f} log2FC')\n print(f' Max downregulation: {sig_lfc.min():.2f} log2FC')\n\n# Cleanup\nimport os\nos.unlink(plot_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nPyDESeq2 provides:')\nprint('• Python implementation of DESeq2')\nprint('• Differential expression analysis')\nprint('• Size factor normalization')\nprint('• Dispersion estimation')\nprint('• Statistical testing with multiple correction')\nprint('• Integration with pandas and numpy')\nprint('• Visualization and quality control')",
"quick_start": [
"Install: pip install pydeseq2",
"Create dataset: dds = DeseqDataSet(counts, metadata, design_factors)",
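For context, the quick_start entries in this hunk outline the PyDESeq2 workflow that the diffed usage_example walks through: build a DeseqDataSet from a count matrix and sample metadata, fit the model, then compute statistics with DeseqStats. A minimal sketch of that workflow is below; it assumes the class names referenced in the usage_example, and import paths and the design argument name (design_factors vs. design) differ between PyDESeq2 releases, so treat it as illustrative rather than exact.

```python
# Minimal PyDESeq2 differential-expression sketch (illustrative; import paths
# and the design argument name vary between PyDESeq2 releases).
import numpy as np
import pandas as pd
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

# Toy data: 6 samples x 100 genes (PyDESeq2 expects samples as rows),
# with a control/treatment label per sample.
rng = np.random.default_rng(0)
counts = pd.DataFrame(
    rng.negative_binomial(5, 0.3, size=(6, 100)),
    index=[f"sample_{i}" for i in range(6)],
    columns=[f"gene_{j}" for j in range(100)],
)
metadata = pd.DataFrame(
    {"condition": ["control"] * 3 + ["treatment"] * 3},
    index=counts.index,
)

# Build the dataset, run the DESeq2 pipeline (size factors, dispersions,
# log-fold-change fitting), then compute per-gene Wald statistics.
dds = DeseqDataSet(counts=counts, metadata=metadata, design_factors="condition")
dds.deseq2()
stats = DeseqStats(dds)
stats.summary()
results = stats.results_df  # log2FoldChange, pvalue, padj per gene
print(results.sort_values("padj").head())
```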
@@ -276,7 +276,7 @@
"pip": "pip install souporcell",
"conda": "conda install -c conda-forge souporcell"
},
-
"usage_example": "# souporcell is primarily a command-line tool\n# Here we demonstrate the concepts and analysis workflow\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.cluster import KMeans\nfrom sklearn.decomposition import PCA\nfrom sklearn.metrics import adjusted_rand_score\nimport tempfile\nimport os\n\nprint('souporcell - Single-cell Genotype Clustering')\nprint('=' * 45)\n\n# Overview of souporcell workflow\nprint('souporcell Workflow:')\nprint('1. Variant calling from scRNA-seq reads')\nprint('2. Genotype matrix construction')\nprint('3. Clustering cells by genotype similarity')\nprint('4. Assignment of cells to individuals')\nprint('5. Quality control and validation')\n\nprint('\\nKey Features:')\nprint('• Handles multiplexed scRNA-seq samples')\nprint('• No prior genotype information required')\nprint('• Identifies ambient RNA contamination')\nprint('• Provides cluster assignments and QC metrics')\nprint('• Compatible with 10x Genomics data')\n\n# Simulate multiplexed single-cell data\nprint('\\n=== Simulating Multiplexed scRNA-seq Data ===')\n\nnp.random.seed(42)\n\n# Simulation parameters\nn_individuals = 4\nn_cells_per_individual = 500\nn_variants = 1000\nn_genes = 2000\n\ntotal_cells = n_individuals * n_cells_per_individual\nprint(f'Simulating {total_cells} cells from {n_individuals} individuals')\nprint(f'Using {n_variants} genetic variants and {n_genes} genes')\n\n# Generate individual genotypes\nprint('\\nGenerating individual genotypes...')\nindividual_genotypes = {}\n\nfor ind_id in range(n_individuals):\n # Each individual has different allele frequencies\n genotype = np.random.choice([0, 1, 2], size=n_variants, p=[0.6, 0.3, 0.1])\n individual_genotypes[f'Individual_{ind_id}'] = genotype\n\nprint(f'Generated genotypes for {len(individual_genotypes)} individuals')\n\n# Calculate genotype differences between individuals\ngenotype_matrix = np.array([geno for geno in individual_genotypes.values()])\nprint(f'Genotype matrix shape: {genotype_matrix.shape}')\n\n# Pairwise differences\nprint('\\nPairwise genotype differences:')\nfor i in range(n_individuals):\n for j in range(i+1, n_individuals):\n diff = np.sum(genotype_matrix[i] != genotype_matrix[j])\n similarity = 1 - (diff / n_variants)\n print(f' Individual_{i} vs Individual_{j}: {diff} differences ({similarity:.3f} similarity)')\n\n# Generate cell-level data\nprint('\\nGenerating cell-level genotype data...')\n\ncell_genotypes = []\ncell_labels = []\ncell_ids = []\n\nfor ind_id in range(n_individuals):\n individual_geno = individual_genotypes[f'Individual_{ind_id}']\n \n for cell_id in range(n_cells_per_individual):\n # Add noise to simulate technical variation and allelic dropout\n cell_geno = individual_geno.copy()\n \n # Simulate allelic dropout (some variants not detected)\n dropout_rate = 0.1\n dropout_mask = np.random.random(n_variants) < dropout_rate\n cell_geno[dropout_mask] = 0 # Set to homozygous reference\n \n # Add some random noise (technical errors)\n noise_rate = 0.02\n noise_mask = np.random.random(n_variants) < noise_rate\n cell_geno[noise_mask] = np.random.choice([0, 1, 2], size=np.sum(noise_mask))\n \n cell_genotypes.append(cell_geno)\n cell_labels.append(ind_id)\n cell_ids.append(f'Cell_{ind_id}_{cell_id}')\n\ncell_genotype_matrix = np.array(cell_genotypes)\nprint(f'Cell genotype matrix shape: {cell_genotype_matrix.shape}')\nprint(f'Cells per individual: {[cell_labels.count(i) for i in range(n_individuals)]}')\n\n# Add ambient RNA 
contamination (doublets)\nprint('\\nSimulating ambient RNA contamination (doublets)...')\nn_doublets = 100\n\nfor doublet_id in range(n_doublets):\n # Mix genotypes from two random individuals\n ind1, ind2 = np.random.choice(n_individuals, size=2, replace=False)\n \n geno1 = individual_genotypes[f'Individual_{ind1}']\n geno2 = individual_genotypes[f'Individual_{ind2}']\n \n # Create mixed genotype (roughly 50:50 mix)\n mixed_geno = np.where(np.random.random(n_variants) < 0.5, geno1, geno2)\n \n # Add to cell data\n cell_genotypes.append(mixed_geno)\n cell_labels.append(-1) # Doublet label\n cell_ids.append(f'Doublet_{doublet_id}')\n\n# Update matrices\ncell_genotype_matrix = np.array(cell_genotypes)\ntotal_cells_with_doublets = len(cell_genotypes)\n\nprint(f'Total cells (including doublets): {total_cells_with_doublets}')\nprint(f'Doublets added: {n_doublets}')\nprint(f'Singlets: {total_cells_with_doublets - n_doublets}')\n\n# Dimensionality reduction for visualization\nprint('\\n=== Genotype-based Clustering Analysis ===')\n\n# PCA on genotype data\nprint('Performing PCA on genotype matrix...')\npca = PCA(n_components=10)\npca_result = pca.fit_transform(cell_genotype_matrix)\n\nprint(f'PCA explained variance ratio (first 5 components): {pca.explained_variance_ratio_[:5]}')\nprint(f'Cumulative explained variance (first 5): {np.cumsum(pca.explained_variance_ratio_[:5])}')\n\n# K-means clustering\nprint('\\nPerforming K-means clustering...')\n\n# Try different numbers of clusters\ncluster_range = range(2, 8)\ninertias = []\nari_scores = []\n\nfor k in cluster_range:\n kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)\n cluster_labels = kmeans.fit_predict(pca_result[:, :5]) # Use first 5 PCs\n \n inertias.append(kmeans.inertia_)\n \n # Calculate ARI against true labels (excluding doublets)\n true_labels_clean = [label for label in cell_labels if label != -1]\n cluster_labels_clean = cluster_labels[:len(true_labels_clean)]\n \n if len(set(true_labels_clean)) > 1 and len(set(cluster_labels_clean)) > 1:\n ari = adjusted_rand_score(true_labels_clean, cluster_labels_clean)\n ari_scores.append(ari)\n else:\n ari_scores.append(0)\n\nprint(f'Inertias for k=2 to 7: {[f\"{inertia:.0f}\" for inertia in inertias]}')\nprint(f'ARI scores for k=2 to 7: {[f\"{ari:.3f}\" for ari in ari_scores]}')\n\n# Choose optimal k (highest ARI)\nbest_k = cluster_range[np.argmax(ari_scores)]\nprint(f'\\nOptimal number of clusters: {best_k} (ARI: {max(ari_scores):.3f})')\n\n# Final clustering with optimal k\nfinal_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)\nfinal_clusters = final_kmeans.fit_predict(pca_result[:, :5])\n\n# Analyze cluster assignments\nprint('\\n=== Cluster Assignment Analysis ===')\n\n# Create assignment matrix\ncluster_assignment = pd.DataFrame({\n 'cell_id': cell_ids,\n 'true_individual': cell_labels,\n 'predicted_cluster': final_clusters,\n 'is_doublet': [label == -1 for label in cell_labels]\n})\n\nprint(f'Cluster assignment summary:')\nprint(cluster_assignment.groupby(['true_individual', 'predicted_cluster']).size().unstack(fill_value=0))\n\n# Calculate cluster purity\nprint('\\nCluster purity analysis:')\nfor cluster_id in range(best_k):\n cluster_cells = cluster_assignment[cluster_assignment['predicted_cluster'] == cluster_id]\n \n if len(cluster_cells) > 0:\n # Exclude doublets from purity calculation\n singlets_in_cluster = cluster_cells[~cluster_cells['is_doublet']]\n \n if len(singlets_in_cluster) > 0:\n most_common_individual = 
singlets_in_cluster['true_individual'].mode()\n if len(most_common_individual) > 0:\n purity = (singlets_in_cluster['true_individual'] == most_common_individual.iloc[0]).mean()\n print(f' Cluster {cluster_id}: {len(cluster_cells)} cells, '\n f'purity = {purity:.3f}, '\n f'doublets = {cluster_cells[\"is_doublet\"].sum()}')\n\n# Doublet detection analysis\nprint('\\n=== Doublet Detection Analysis ===')\n\n# Cells in clusters with mixed individuals are potential doublets\ndoublet_scores = []\n\nfor idx, row in cluster_assignment.iterrows():\n cluster_id = row['predicted_cluster']\n cluster_cells = cluster_assignment[cluster_assignment['predicted_cluster'] == cluster_id]\n \n # Calculate heterogeneity score for this cluster\n singlets_in_cluster = cluster_cells[~cluster_cells['is_doublet']]\n \n if len(singlets_in_cluster) > 1:\n individual_counts = singlets_in_cluster['true_individual'].value_counts()\n heterogeneity = 1 - (individual_counts.max() / len(singlets_in_cluster))\n else:\n heterogeneity = 0\n \n doublet_scores.append(heterogeneity)\n\ncluster_assignment['doublet_score'] = doublet_scores\n\n# Set threshold for doublet detection\ndoublet_threshold = 0.3\npredicted_doublets = cluster_assignment['doublet_score'] > doublet_threshold\n\n# Evaluate doublet detection\ntrue_doublets = cluster_assignment['is_doublet']\ndoublet_tp = sum(predicted_doublets & true_doublets)\ndoublet_fp = sum(predicted_doublets & ~true_doublets)\ndoublet_fn = sum(~predicted_doublets & true_doublets)\ndoublet_tn = sum(~predicted_doublets & ~true_doublets)\n\ndoublet_precision = doublet_tp / (doublet_tp + doublet_fp) if (doublet_tp + doublet_fp) > 0 else 0\ndoublet_recall = doublet_tp / (doublet_tp + doublet_fn) if (doublet_tp + doublet_fn) > 0 else 0\n\nprint(f'Doublet detection performance:')\nprint(f' True doublets: {sum(true_doublets)}')\nprint(f' Predicted doublets: {sum(predicted_doublets)}')\nprint(f' Precision: {doublet_precision:.3f}')\nprint(f' Recall: {doublet_recall:.3f}')\n\n# Quality control metrics\nprint('\\n=== Quality Control Metrics ===')\n\n# Calculate per-cell variant detection rate\nvariant_detection_rates = []\nfor cell_geno in cell_genotype_matrix:\n non_zero_variants = np.sum(cell_geno > 0)\n detection_rate = non_zero_variants / n_variants\n variant_detection_rates.append(detection_rate)\n\ncluster_assignment['variant_detection_rate'] = variant_detection_rates\n\nprint(f'Variant detection rates:')\nprint(f' Mean: {np.mean(variant_detection_rates):.3f}')\nprint(f' Median: {np.median(variant_detection_rates):.3f}')\nprint(f' Range: {np.min(variant_detection_rates):.3f} - {np.max(variant_detection_rates):.3f}')\n\n# Per-individual statistics\nprint(f'\\nPer-individual assignment accuracy:')\nfor ind_id in range(n_individuals):\n individual_cells = cluster_assignment[cluster_assignment['true_individual'] == ind_id]\n \n if len(individual_cells) > 0:\n # Most common cluster assignment\n most_common_cluster = individual_cells['predicted_cluster'].mode()\n if len(most_common_cluster) > 0:\n accuracy = (individual_cells['predicted_cluster'] == most_common_cluster.iloc[0]).mean()\n print(f' Individual {ind_id}: {accuracy:.3f} ({len(individual_cells)} cells)')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig, axes = plt.subplots(2, 2, figsize=(15, 12))\n\n# 1. 
PCA plot colored by true individual\nscatter1 = axes[0, 0].scatter(pca_result[:, 0], pca_result[:, 1], \n c=cell_labels, cmap='tab10', alpha=0.6, s=20)\naxes[0, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\naxes[0, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\naxes[0, 0].set_title('PCA - True Individuals')\n\n# 2. PCA plot colored by predicted cluster\nscatter2 = axes[0, 1].scatter(pca_result[:, 0], pca_result[:, 1], \n c=final_clusters, cmap='tab10', alpha=0.6, s=20)\naxes[0, 1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\naxes[0, 1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\naxes[0, 1].set_title('PCA - Predicted Clusters')\n\n# 3. Clustering evaluation metrics\nmetrics = ['Elbow Method', 'ARI Score']\naxes[1, 0].plot(cluster_range, inertias, 'bo-', label='Inertia')\naxes[1, 0].set_xlabel('Number of Clusters')\naxes[1, 0].set_ylabel('Inertia', color='blue')\naxes[1, 0].set_title('Clustering Evaluation')\naxes[1, 0].tick_params(axis='y', labelcolor='blue')\n\n# Secondary y-axis for ARI\nax_twin = axes[1, 0].twinx()\nax_twin.plot(cluster_range, ari_scores, 'ro-', label='ARI')\nax_twin.set_ylabel('Adjusted Rand Index', color='red')\nax_twin.tick_params(axis='y', labelcolor='red')\nax_twin.axvline(x=best_k, color='green', linestyle='--', alpha=0.7, label=f'Optimal k={best_k}')\n\n# 4. Doublet score distribution\naxes[1, 1].hist(cluster_assignment[cluster_assignment['is_doublet']]['doublet_score'], \n alpha=0.7, label='True doublets', bins=20, color='red')\naxes[1, 1].hist(cluster_assignment[~cluster_assignment['is_doublet']]['doublet_score'], \n alpha=0.7, label='Singlets', bins=20, color='blue')\naxes[1, 1].axvline(x=doublet_threshold, color='green', linestyle='--', \n label=f'Threshold ({doublet_threshold})')\naxes[1, 1].set_xlabel('Doublet Score')\naxes[1, 1].set_ylabel('Count')\naxes[1, 1].set_title('Doublet Score Distribution')\naxes[1, 1].legend()\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Analysis visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 45)\nprint('SOUPORCELL ANALYSIS SUMMARY')\nprint('=' * 45)\nprint(f'Total cells analyzed: {total_cells_with_doublets:,}')\nprint(f'True individuals: {n_individuals}')\nprint(f'Predicted clusters: {best_k}')\nprint(f'Clustering accuracy (ARI): {max(ari_scores):.3f}')\nprint(f'\\nDoublet detection:')\nprint(f' True doublets: {sum(true_doublets)}')\nprint(f' Detected doublets: {sum(predicted_doublets)}')\nprint(f' Precision: {doublet_precision:.3f}')\nprint(f' Recall: {doublet_recall:.3f}')\nprint(f'\\nQuality metrics:')\nprint(f' Mean variant detection rate: {np.mean(variant_detection_rates):.3f}')\nprint(f' Genetic variants used: {n_variants:,}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nsouporcell provides:')\nprint('• Genotype-based cell clustering')\nprint('• Multiplexed sample demultiplexing')\nprint('• Doublet detection and removal')\nprint('• Quality control metrics')\nprint('• Integration with standard scRNA-seq pipelines')\nprint('• Support for 10x Genomics data')\nprint('• Ambient RNA contamination detection')\n\nprint('\\nTypical souporcell command:')\nprint('souporcell_pipeline.py -i possorted_genome_bam.bam \\\\')\nprint(' -b filtered_feature_bc_matrix -f reference.fasta 
\\\\')\nprint(' -t 8 -o souporcell_output -k 4')",
+
"usage_example": "# souporcell is primarily a command-line tool\n# Here we demonstrate the concepts and analysis workflow\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.cluster import KMeans\nfrom sklearn.decomposition import PCA\nfrom sklearn.metrics import adjusted_rand_score\nimport tempfile\nimport os\n\nprint('souporcell - Single-cell Genotype Clustering')\nprint('=' * 45)\n\n# Overview of souporcell workflow\nprint('souporcell Workflow:')\nprint('1. Variant calling from scRNA-seq reads')\nprint('2. Genotype matrix construction')\nprint('3. Clustering cells by genotype similarity')\nprint('4. Assignment of cells to individuals')\nprint('5. Quality control and validation')\n\nprint('\\nKey Features:')\nprint('• Handles multiplexed scRNA-seq samples')\nprint('• No prior genotype information required')\nprint('• Identifies ambient RNA contamination')\nprint('• Provides cluster assignments and QC metrics')\nprint('• Compatible with 10x Genomics data')\n\n# Simulate multiplexed single-cell data\nprint('\\n=== Simulating Multiplexed scRNA-seq Data ===')\n\nnp.random.seed(42)\n\n# Simulation parameters\nn_individuals = 4\nn_cells_per_individual = 500\nn_variants = 1000\nn_genes = 2000\n\ntotal_cells = n_individuals * n_cells_per_individual\nprint(f'Simulating {total_cells} cells from {n_individuals} individuals')\nprint(f'Using {n_variants} genetic variants and {n_genes} genes')\n\n# Generate individual genotypes\nprint('\\nGenerating individual genotypes...')\nindividual_genotypes = {}\n\nfor ind_id in range(n_individuals):\n # Each individual has different allele frequencies\n genotype = np.random.choice([0, 1, 2], size=n_variants, p=[0.6, 0.3, 0.1])\n individual_genotypes[f'Individual_{ind_id}'] = genotype\n\nprint(f'Generated genotypes for {len(individual_genotypes)} individuals')\n\n# Calculate genotype differences between individuals\ngenotype_matrix = np.array([geno for geno in individual_genotypes.values()])\nprint(f'Genotype matrix shape: {genotype_matrix.shape}')\n\n# Pairwise differences\nprint('\\nPairwise genotype differences:')\nfor i in range(n_individuals):\n for j in range(i+1, n_individuals):\n diff = np.sum(genotype_matrix[i] != genotype_matrix[j])\n similarity = 1 - (diff / n_variants)\n print(f' Individual_{i} vs Individual_{j}: {diff} differences ({similarity:.3f} similarity)')\n\n# Generate cell-level data\nprint('\\nGenerating cell-level genotype data...')\n\ncell_genotypes = []\ncell_labels = []\ncell_ids = []\n\nfor ind_id in range(n_individuals):\n individual_geno = individual_genotypes[f'Individual_{ind_id}']\n \n for cell_id in range(n_cells_per_individual):\n # Add noise to simulate technical variation and allelic dropout\n cell_geno = individual_geno.copy()\n \n # Simulate allelic dropout (some variants not detected)\n dropout_rate = 0.1\n dropout_mask = np.random.random(n_variants) < dropout_rate\n cell_geno[dropout_mask] = 0 # Set to homozygous reference\n \n # Add some random noise (technical errors)\n noise_rate = 0.02\n noise_mask = np.random.random(n_variants) < noise_rate\n cell_geno[noise_mask] = np.random.choice([0, 1, 2], size=np.sum(noise_mask))\n \n cell_genotypes.append(cell_geno)\n cell_labels.append(ind_id)\n cell_ids.append(f'Cell_{ind_id}_{cell_id}')\n\ncell_genotype_matrix = np.array(cell_genotypes)\nprint(f'Cell genotype matrix shape: {cell_genotype_matrix.shape}')\nprint(f'Cells per individual: {[cell_labels.count(i) for i in range(n_individuals)]}')\n\n# Add ambient RNA 
contamination (doublets)\nprint('\\nSimulating ambient RNA contamination (doublets)...')\nn_doublets = 100\n\nfor doublet_id in range(n_doublets):\n # Mix genotypes from two random individuals\n ind1, ind2 = np.random.choice(n_individuals, size=2, replace=False)\n \n geno1 = individual_genotypes[f'Individual_{ind1}']\n geno2 = individual_genotypes[f'Individual_{ind2}']\n \n # Create mixed genotype (roughly 50:50 mix)\n mixed_geno = np.where(np.random.random(n_variants) < 0.5, geno1, geno2)\n \n # Add to cell data\n cell_genotypes.append(mixed_geno)\n cell_labels.append(-1) # Doublet label\n cell_ids.append(f'Doublet_{doublet_id}')\n\n# Update matrices\ncell_genotype_matrix = np.array(cell_genotypes)\ntotal_cells_with_doublets = len(cell_genotypes)\n\nprint(f'Total cells (including doublets): {total_cells_with_doublets}')\nprint(f'Doublets added: {n_doublets}')\nprint(f'Singlets: {total_cells_with_doublets - n_doublets}')\n\n# Dimensionality reduction for visualization\nprint('\\n=== Genotype-based Clustering Analysis ===')\n\n# PCA on genotype data\nprint('Performing PCA on genotype matrix...')\npca = PCA(n_components=10)\npca_result = pca.fit_transform(cell_genotype_matrix)\n\nprint(f'PCA explained variance ratio (first 5 components): {pca.explained_variance_ratio_[:5]}')\nprint(f'Cumulative explained variance (first 5): {np.cumsum(pca.explained_variance_ratio_[:5])}')\n\n# K-means clustering\nprint('\\nPerforming K-means clustering...')\n\n# Try different numbers of clusters\ncluster_range = range(2, 8)\ninertias = []\nari_scores = []\n\nfor k in cluster_range:\n kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)\n cluster_labels = kmeans.fit_predict(pca_result[:, :5]) # Use first 5 PCs\n \n inertias.append(kmeans.inertia_)\n \n # Calculate ARI against true labels (excluding doublets)\n true_labels_clean = [label for label in cell_labels if label != -1]\n cluster_labels_clean = cluster_labels[:len(true_labels_clean)]\n \n if len(set(true_labels_clean)) > 1 and len(set(cluster_labels_clean)) > 1:\n ari = adjusted_rand_score(true_labels_clean, cluster_labels_clean)\n ari_scores.append(ari)\n else:\n ari_scores.append(0)\n\nprint(f'Inertias for k=2 to 7: {[f\"{inertia:.0f}\" for inertia in inertias]}')\nprint(f'ARI scores for k=2 to 7: {[f\"{ari:.3f}\" for ari in ari_scores]}')\n\n# Choose optimal k (highest ARI)\nbest_k = cluster_range[np.argmax(ari_scores)]\nprint(f'\\nOptimal number of clusters: {best_k} (ARI: {max(ari_scores):.3f})')\n\n# Final clustering with optimal k\nfinal_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)\nfinal_clusters = final_kmeans.fit_predict(pca_result[:, :5])\n\n# Analyze cluster assignments\nprint('\\n=== Cluster Assignment Analysis ===')\n\n# Create assignment matrix\ncluster_assignment = pd.DataFrame({\n 'cell_id': cell_ids,\n 'true_individual': cell_labels,\n 'predicted_cluster': final_clusters,\n 'is_doublet': [label == -1 for label in cell_labels]\n})\n\nprint(f'Cluster assignment summary:')\nprint(cluster_assignment.groupby(['true_individual', 'predicted_cluster']).size().unstack(fill_value=0))\n\n# Calculate cluster purity\nprint('\\nCluster purity analysis:')\nfor cluster_id in range(best_k):\n cluster_cells = cluster_assignment[cluster_assignment['predicted_cluster'] == cluster_id]\n \n if len(cluster_cells) > 0:\n # Exclude doublets from purity calculation\n singlets_in_cluster = cluster_cells[~cluster_cells['is_doublet']]\n \n if len(singlets_in_cluster) > 0:\n most_common_individual = 
singlets_in_cluster['true_individual'].mode()\n if len(most_common_individual) > 0:\n purity = (singlets_in_cluster['true_individual'] == most_common_individual.iloc[0]).mean()\n print(f' Cluster {cluster_id}: {len(cluster_cells)} cells, '\n f'purity = {purity:.3f}, '\n f'doublets = {cluster_cells[\"is_doublet\"].sum()}')\n\n# Doublet detection analysis\nprint('\\n=== Doublet Detection Analysis ===')\n\n# Cells in clusters with mixed individuals are potential doublets\ndoublet_scores = []\n\nfor idx, row in cluster_assignment.iterrows():\n cluster_id = row['predicted_cluster']\n cluster_cells = cluster_assignment[cluster_assignment['predicted_cluster'] == cluster_id]\n \n # Calculate heterogeneity score for this cluster\n singlets_in_cluster = cluster_cells[~cluster_cells['is_doublet']]\n \n if len(singlets_in_cluster) > 1:\n individual_counts = singlets_in_cluster['true_individual'].value_counts()\n heterogeneity = 1 - (individual_counts.max() / len(singlets_in_cluster))\n else:\n heterogeneity = 0\n \n doublet_scores.append(heterogeneity)\n\ncluster_assignment['doublet_score'] = doublet_scores\n\n# Set threshold for doublet detection\ndoublet_threshold = 0.3\npredicted_doublets = cluster_assignment['doublet_score'] > doublet_threshold\n\n# Evaluate doublet detection\ntrue_doublets = cluster_assignment['is_doublet']\ndoublet_tp = sum(predicted_doublets & true_doublets)\ndoublet_fp = sum(predicted_doublets & ~true_doublets)\ndoublet_fn = sum(~predicted_doublets & true_doublets)\ndoublet_tn = sum(~predicted_doublets & ~true_doublets)\n\ndoublet_precision = doublet_tp / (doublet_tp + doublet_fp) if (doublet_tp + doublet_fp) > 0 else 0\ndoublet_recall = doublet_tp / (doublet_tp + doublet_fn) if (doublet_tp + doublet_fn) > 0 else 0\n\nprint(f'Doublet detection performance:')\nprint(f' True doublets: {sum(true_doublets)}')\nprint(f' Predicted doublets: {sum(predicted_doublets)}')\nprint(f' Precision: {doublet_precision:.3f}')\nprint(f' Recall: {doublet_recall:.3f}')\n\n# Quality control metrics\nprint('\\n=== Quality Control Metrics ===')\n\n# Calculate per-cell variant detection rate\nvariant_detection_rates = []\nfor cell_geno in cell_genotype_matrix:\n non_zero_variants = np.sum(cell_geno > 0)\n detection_rate = non_zero_variants / n_variants\n variant_detection_rates.append(detection_rate)\n\ncluster_assignment['variant_detection_rate'] = variant_detection_rates\n\nprint(f'Variant detection rates:')\nprint(f' Mean: {np.mean(variant_detection_rates):.3f}')\nprint(f' Median: {np.median(variant_detection_rates):.3f}')\nprint(f' Range: {np.min(variant_detection_rates):.3f} - {np.max(variant_detection_rates):.3f}')\n\n# Per-individual statistics\nprint(f'\\nPer-individual assignment accuracy:')\nfor ind_id in range(n_individuals):\n individual_cells = cluster_assignment[cluster_assignment['true_individual'] == ind_id]\n \n if len(individual_cells) > 0:\n # Most common cluster assignment\n most_common_cluster = individual_cells['predicted_cluster'].mode()\n if len(most_common_cluster) > 0:\n accuracy = (individual_cells['predicted_cluster'] == most_common_cluster.iloc[0]).mean()\n print(f' Individual {ind_id}: {accuracy:.3f} ({len(individual_cells)} cells)')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig, axes = plt.subplots(2, 2, figsize=(15, 12))\n\n# 1. 
PCA plot colored by true individual\nscatter1 = axes[0, 0].scatter(pca_result[:, 0], pca_result[:, 1], \n c=cell_labels, cmap='tab10', alpha=0.6, s=20)\naxes[0, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\naxes[0, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\naxes[0, 0].set_title('PCA - True Individuals')\n\n# 2. PCA plot colored by predicted cluster\nscatter2 = axes[0, 1].scatter(pca_result[:, 0], pca_result[:, 1], \n c=final_clusters, cmap='tab10', alpha=0.6, s=20)\naxes[0, 1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\naxes[0, 1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\naxes[0, 1].set_title('PCA - Predicted Clusters')\n\n# 3. Clustering evaluation metrics\nmetrics = ['Elbow Method', 'ARI Score']\naxes[1, 0].plot(cluster_range, inertias, 'bo-', label='Inertia')\naxes[1, 0].set_xlabel('Number of Clusters')\naxes[1, 0].set_ylabel('Inertia', color='blue')\naxes[1, 0].set_title('Clustering Evaluation')\naxes[1, 0].tick_params(axis='y', labelcolor='blue')\n\n# Secondary y-axis for ARI\nax_twin = axes[1, 0].twinx()\nax_twin.plot(cluster_range, ari_scores, 'ro-', label='ARI')\nax_twin.set_ylabel('Adjusted Rand Index', color='red')\nax_twin.tick_params(axis='y', labelcolor='red')\nax_twin.axvline(x=best_k, color='green', linestyle='--', alpha=0.7, label=f'Optimal k={best_k}')\n\n# 4. Doublet score distribution\naxes[1, 1].hist(cluster_assignment[cluster_assignment['is_doublet']]['doublet_score'], \n alpha=0.7, label='True doublets', bins=20, color='red')\naxes[1, 1].hist(cluster_assignment[~cluster_assignment['is_doublet']]['doublet_score'], \n alpha=0.7, label='Singlets', bins=20, color='blue')\naxes[1, 1].axvline(x=doublet_threshold, color='green', linestyle='--', \n label=f'Threshold ({doublet_threshold})')\naxes[1, 1].set_xlabel('Doublet Score')\naxes[1, 1].set_ylabel('Count')\naxes[1, 1].set_title('Doublet Score Distribution')\naxes[1, 1].legend()\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Analysis visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 45)\nprint('SOUPORCELL ANALYSIS SUMMARY')\nprint('=' * 45)\nprint(f'Total cells analyzed: {total_cells_with_doublets:}')\nprint(f'True individuals: {n_individuals}')\nprint(f'Predicted clusters: {best_k}')\nprint(f'Clustering accuracy (ARI): {max(ari_scores):.3f}')\nprint(f'\\nDoublet detection:')\nprint(f' True doublets: {sum(true_doublets)}')\nprint(f' Detected doublets: {sum(predicted_doublets)}')\nprint(f' Precision: {doublet_precision:.3f}')\nprint(f' Recall: {doublet_recall:.3f}')\nprint(f'\\nQuality metrics:')\nprint(f' Mean variant detection rate: {np.mean(variant_detection_rates):.3f}')\nprint(f' Genetic variants used: {n_variants:}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nsouporcell provides:')\nprint('• Genotype-based cell clustering')\nprint('• Multiplexed sample demultiplexing')\nprint('• Doublet detection and removal')\nprint('• Quality control metrics')\nprint('• Integration with standard scRNA-seq pipelines')\nprint('• Support for 10x Genomics data')\nprint('• Ambient RNA contamination detection')\n\nprint('\\nTypical souporcell command:')\nprint('souporcell_pipeline.py -i possorted_genome_bam.bam \\\\')\nprint(' -b filtered_feature_bc_matrix -f reference.fasta 
\\\\')\nprint(' -t 8 -o souporcell_output -k 4')",
"quick_start": [
"Install: pip install souporcell",
"Run: souporcell_pipeline.py -i input.bam -b barcodes",
@@ -281,7 +281,7 @@
"pip": "pip install diffdock",
"conda": "conda install -c conda-forge diffdock"
},
-
"usage_example": "# DiffDock molecular docking simulation\n# This demonstrates the concepts and workflow\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import DBSCAN\nimport tempfile\nimport os\n\nprint('DiffDock - Diffusion Model for Molecular Docking')\nprint('=' * 50)\n\n# Overview of DiffDock approach\nprint('DiffDock Innovation:')\nprint('• Uses diffusion models for pose generation')\nprint('• End-to-end learning without hand-crafted features')\nprint('• Handles both rigid and flexible docking')\nprint('• State-of-the-art accuracy on benchmarks')\nprint('• Fast inference with GPU acceleration')\n\nprint('\\nKey Advantages over Traditional Docking:')\nprint('• No need for extensive sampling')\nprint('• Better handling of conformational flexibility')\nprint('• Learned representations from large datasets')\nprint('• Robust to protein structure variations')\n\n# Simulate molecular docking workflow\nprint('\\n=== Molecular Docking Simulation ===')\n\nnp.random.seed(42)\n\n# Define protein binding site\nprint('Defining protein binding site...')\nbinding_site_center = np.array([0.0, 0.0, 0.0])\nbinding_site_radius = 8.0\n\n# Generate protein surface points\nn_surface_points = 200\ntheta = np.random.uniform(0, 2*np.pi, n_surface_points)\nphi = np.random.uniform(0, np.pi, n_surface_points)\nr = np.random.uniform(5.0, 8.0, n_surface_points)\n\nprotein_surface = np.column_stack([\n r * np.sin(phi) * np.cos(theta),\n r * np.sin(phi) * np.sin(theta),\n r * np.cos(phi)\n])\n\nprint(f'Generated {n_surface_points} protein surface points')\nprint(f'Binding site radius: {binding_site_radius:.1f} Å')\n\n# Define ligand structure\nprint('\\nDefining ligand structure...')\nligand_atoms = {\n 'C1': np.array([0.0, 0.0, 0.0]),\n 'C2': np.array([1.5, 0.0, 0.0]),\n 'C3': np.array([2.2, 1.3, 0.0]),\n 'N1': np.array([1.5, 2.6, 0.0]),\n 'O1': np.array([0.0, 2.6, 0.5]),\n 'C4': np.array([-0.7, 1.3, 0.5])\n}\n\nligand_center = np.mean(list(ligand_atoms.values()), axis=0)\nligand_coords = np.array(list(ligand_atoms.values()))\nligand_coords_centered = ligand_coords - ligand_center\n\nprint(f'Ligand atoms: {len(ligand_atoms)}')\nprint(f'Ligand center: ({ligand_center[0]:.1f}, {ligand_center[1]:.1f}, {ligand_center[2]:.1f})')\nprint(f'Ligand size: {np.max(cdist(ligand_coords, ligand_coords)):.1f} Å')\n\n# Simulate diffusion-based pose generation\nprint('\\n=== Diffusion-Based Pose Generation ===')\n\ndef rotation_matrix(axis, angle):\n \"\"\"Generate rotation matrix for given axis and angle\"\"\"\n axis = axis / np.linalg.norm(axis)\n cos_angle = np.cos(angle)\n sin_angle = np.sin(angle)\n \n return (\n cos_angle * np.eye(3) +\n sin_angle * np.array([[0, -axis[2], axis[1]],\n [axis[2], 0, -axis[0]],\n [-axis[1], axis[0], 0]]) +\n (1 - cos_angle) * np.outer(axis, axis)\n )\n\ndef generate_pose(translation, rotation_axis, rotation_angle):\n \"\"\"Generate ligand pose from translation and rotation\"\"\"\n rot_matrix = rotation_matrix(rotation_axis, rotation_angle)\n rotated_ligand = np.dot(ligand_coords_centered, rot_matrix.T)\n posed_ligand = rotated_ligand + translation\n return posed_ligand\n\ndef calculate_docking_score(posed_ligand, protein_surface):\n \"\"\"Calculate simplified docking score\"\"\"\n # Distance-based scoring\n min_distances = np.min(cdist(posed_ligand, protein_surface), axis=1)\n \n # Prefer poses where ligand is close to protein surface\n distance_score = -np.mean(np.maximum(0, min_distances - 2.0))\n \n # 
Penalty for steric clashes (too close)\n clash_penalty = -np.sum(np.maximum(0, 1.5 - min_distances)) * 10\n \n # Bonus for being in binding site\n center_distances = np.linalg.norm(posed_ligand - binding_site_center, axis=1)\n binding_site_bonus = -np.mean(np.maximum(0, center_distances - binding_site_radius))\n \n total_score = distance_score + clash_penalty + binding_site_bonus\n return total_score, distance_score, clash_penalty, binding_site_bonus\n\n# Generate multiple poses using diffusion-like sampling\nprint('Generating poses via diffusion sampling...')\nn_poses = 1000\nposes = []\nscores = []\nscore_components = []\n\nfor i in range(n_poses):\n # Sample translation (biased toward binding site)\n translation = binding_site_center + np.random.normal(0, 3.0, 3)\n \n # Sample rotation\n rotation_axis = np.random.normal(0, 1, 3)\n rotation_axis = rotation_axis / np.linalg.norm(rotation_axis)\n rotation_angle = np.random.uniform(0, 2*np.pi)\n \n # Generate pose\n posed_ligand = generate_pose(translation, rotation_axis, rotation_angle)\n \n # Calculate score\n total_score, dist_score, clash_penalty, binding_bonus = calculate_docking_score(\n posed_ligand, protein_surface)\n \n poses.append({\n 'id': i,\n 'translation': translation,\n 'rotation_axis': rotation_axis,\n 'rotation_angle': rotation_angle,\n 'coordinates': posed_ligand,\n 'score': total_score,\n 'distance_score': dist_score,\n 'clash_penalty': clash_penalty,\n 'binding_bonus': binding_bonus\n })\n \n scores.append(total_score)\n score_components.append([dist_score, clash_penalty, binding_bonus])\n\nscores = np.array(scores)\nscore_components = np.array(score_components)\n\nprint(f'Generated {n_poses} poses')\nprint(f'Score range: {np.min(scores):.2f} to {np.max(scores):.2f}')\nprint(f'Mean score: {np.mean(scores):.2f} ± {np.std(scores):.2f}')\n\n# Select top poses\ntop_indices = np.argsort(scores)[-50:] # Top 50 poses\ntop_poses = [poses[i] for i in top_indices]\ntop_scores = scores[top_indices]\n\nprint(f'\\nTop 10 pose scores: {top_scores[-10:]}')\n\n# Cluster top poses\nprint('\\n=== Pose Clustering ===')\n\n# Extract center positions of top poses\ntop_centers = np.array([np.mean(pose['coordinates'], axis=0) for pose in top_poses])\n\n# Cluster by position\ndbscan = DBSCAN(eps=2.0, min_samples=3)\ncluster_labels = dbscan.fit_predict(top_centers)\n\nn_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)\nn_noise = list(cluster_labels).count(-1)\n\nprint(f'Found {n_clusters} pose clusters')\nprint(f'Noise points: {n_noise}')\n\n# Analyze clusters\ncluster_info = []\nfor cluster_id in range(n_clusters):\n cluster_mask = cluster_labels == cluster_id\n cluster_poses = [top_poses[i] for i in range(len(top_poses)) if cluster_mask[i]]\n cluster_scores = top_scores[cluster_mask]\n \n cluster_center = np.mean(top_centers[cluster_mask], axis=0)\n best_score = np.max(cluster_scores)\n cluster_size = len(cluster_poses)\n \n cluster_info.append({\n 'id': cluster_id,\n 'size': cluster_size,\n 'best_score': best_score,\n 'center': cluster_center,\n 'poses': cluster_poses\n })\n \n print(f' Cluster {cluster_id}: {cluster_size} poses, '\n f'best score: {best_score:.2f}, '\n f'center: ({cluster_center[0]:.1f}, {cluster_center[1]:.1f}, {cluster_center[2]:.1f})')\n\n# Select representative poses\nprint('\\n=== Representative Pose Selection ===')\n\nrepresentative_poses = []\nfor cluster in sorted(cluster_info, key=lambda x: x['best_score'], reverse=True):\n # Select best pose from each cluster\n best_pose_idx = 
np.argmax([pose['score'] for pose in cluster['poses']])\n best_pose = cluster['poses'][best_pose_idx]\n \n representative_poses.append({\n 'cluster_id': cluster['id'],\n 'pose': best_pose,\n 'cluster_size': cluster['size']\n })\n \n print(f'Cluster {cluster[\"id\"]}: Score {best_pose[\"score\"]:.2f}, '\n f'Position ({np.mean(best_pose[\"coordinates\"], axis=0)})')\n\n# Analyze binding mode\nprint('\\n=== Binding Mode Analysis ===')\n\nif representative_poses:\n best_overall = representative_poses[0]['pose']\n \n print(f'Best pose analysis:')\n print(f' Total score: {best_overall[\"score\"]:.2f}')\n print(f' Distance score: {best_overall[\"distance_score\"]:.2f}')\n print(f' Clash penalty: {best_overall[\"clash_penalty\"]:.2f}')\n print(f' Binding site bonus: {best_overall[\"binding_bonus\"]:.2f}')\n \n # Calculate interactions\n best_coords = best_overall['coordinates']\n distances_to_surface = cdist(best_coords, protein_surface)\n min_distances = np.min(distances_to_surface, axis=1)\n \n print(f'\\n Ligand-protein distances:')\n for i, (atom_name, distance) in enumerate(zip(ligand_atoms.keys(), min_distances)):\n print(f' {atom_name}: {distance:.2f} Å')\n \n # Identify close contacts\n close_contacts = min_distances < 3.0\n print(f'\\n Close contacts (< 3.0 Å): {np.sum(close_contacts)} atoms')\n \n # Center of mass analysis\n com_distance = np.linalg.norm(np.mean(best_coords, axis=0) - binding_site_center)\n print(f' Distance from binding site center: {com_distance:.2f} Å')\n\n# Confidence estimation\nprint('\\n=== Confidence Estimation ===')\n\n# Score-based confidence\ntop_10_scores = top_scores[-10:]\nscore_std = np.std(top_10_scores)\nscore_confidence = 1.0 / (1.0 + score_std) # Higher std = lower confidence\n\n# Clustering-based confidence\nif n_clusters > 0:\n largest_cluster_size = max(cluster['size'] for cluster in cluster_info)\n cluster_confidence = largest_cluster_size / len(top_poses)\nelse:\n cluster_confidence = 0.0\n\n# Combined confidence\noverall_confidence = (score_confidence + cluster_confidence) / 2.0\n\nprint(f'Score-based confidence: {score_confidence:.3f}')\nprint(f'Clustering confidence: {cluster_confidence:.3f}')\nprint(f'Overall confidence: {overall_confidence:.3f}')\n\n# Quality metrics\nprint('\\n=== Quality Metrics ===')\n\n# Pose diversity\nall_centers = np.array([np.mean(pose['coordinates'], axis=0) for pose in poses])\nposition_diversity = np.mean(cdist(all_centers, all_centers))\n\n# Energy landscape analysis\nenergy_range = np.max(scores) - np.min(scores)\nfunnel_quality = (np.max(scores) - np.mean(scores)) / np.std(scores)\n\nprint(f'Position diversity: {position_diversity:.2f} Å')\nprint(f'Energy range: {energy_range:.2f}')\nprint(f'Funnel quality: {funnel_quality:.2f}')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig = plt.figure(figsize=(16, 12))\n\n# 3D visualization of binding site and top poses\nax1 = fig.add_subplot(221, projection='3d')\n\n# Plot protein surface\nax1.scatter(protein_surface[:, 0], protein_surface[:, 1], protein_surface[:, 2], \n c='lightblue', alpha=0.3, s=20, label='Protein surface')\n\n# Plot binding site center\nax1.scatter(*binding_site_center, c='red', s=100, marker='*', label='Binding site')\n\n# Plot top poses\ncolors = plt.cm.viridis(np.linspace(0, 1, len(representative_poses)))\nfor i, rep_pose in enumerate(representative_poses[:5]): # Show top 5\n coords = rep_pose['pose']['coordinates']\n ax1.scatter(coords[:, 0], coords[:, 1], coords[:, 2], \n c=[colors[i]], s=50, alpha=0.8, label=f'Pose 
{i+1}')\n\nax1.set_xlabel('X (Å)')\nax1.set_ylabel('Y (Å)')\nax1.set_zlabel('Z (Å)')\nax1.set_title('3D Binding Site and Top Poses')\nax1.legend()\n\n# Score distribution\nax2 = fig.add_subplot(222)\nax2.hist(scores, bins=50, alpha=0.7, color='skyblue', edgecolor='black')\nax2.axvline(np.mean(scores), color='red', linestyle='--', label=f'Mean: {np.mean(scores):.2f}')\nax2.axvline(np.percentile(scores, 95), color='green', linestyle='--', \n label=f'95th percentile: {np.percentile(scores, 95):.2f}')\nax2.set_xlabel('Docking Score')\nax2.set_ylabel('Frequency')\nax2.set_title('Score Distribution')\nax2.legend()\n\n# Score components\nax3 = fig.add_subplot(223)\ncomponent_names = ['Distance', 'Clash Penalty', 'Binding Bonus']\nfor i, name in enumerate(component_names):\n ax3.scatter(range(len(top_poses)), score_components[top_indices, i], \n alpha=0.6, label=name, s=20)\nax3.set_xlabel('Pose Index (sorted by score)')\nax3.set_ylabel('Score Component')\nax3.set_title('Score Components for Top Poses')\nax3.legend()\nax3.grid(True, alpha=0.3)\n\n# Cluster analysis\nax4 = fig.add_subplot(224)\nif n_clusters > 0:\n for cluster_id in range(n_clusters):\n cluster_mask = cluster_labels == cluster_id\n if np.any(cluster_mask):\n cluster_centers = top_centers[cluster_mask]\n cluster_scores = top_scores[cluster_mask]\n ax4.scatter(cluster_centers[:, 0], cluster_centers[:, 1], \n c=cluster_scores, s=50, alpha=0.7, \n label=f'Cluster {cluster_id}', cmap='viridis')\n \n # Noise points\n noise_mask = cluster_labels == -1\n if np.any(noise_mask):\n ax4.scatter(top_centers[noise_mask, 0], top_centers[noise_mask, 1], \n c='gray', s=20, alpha=0.5, label='Noise')\n \n ax4.scatter(*binding_site_center[:2], c='red', s=100, marker='*', \n label='Binding site')\n \n ax4.set_xlabel('X (Å)')\n ax4.set_ylabel('Y (Å)')\n ax4.set_title('Pose Clusters (XY projection)')\n ax4.legend()\n ax4.grid(True, alpha=0.3)\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Docking visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 50)\nprint('DIFFDOCK MOLECULAR DOCKING SUMMARY')\nprint('=' * 50)\nprint(f'Poses generated: {n_poses:,}')\nprint(f'Top poses analyzed: {len(top_poses)}')\nprint(f'Pose clusters found: {n_clusters}')\nprint(f'Best docking score: {np.max(scores):.2f}')\nprint(f'Confidence score: {overall_confidence:.3f}')\nprint(f'\\nBest pose location:')\nif representative_poses:\n best_com = np.mean(representative_poses[0]['pose']['coordinates'], axis=0)\n print(f' Center of mass: ({best_com[0]:.2f}, {best_com[1]:.2f}, {best_com[2]:.2f})')\n print(f' Distance from binding site: {np.linalg.norm(best_com - binding_site_center):.2f} Å')\n\nprint(f'\\nQuality metrics:')\nprint(f' Position diversity: {position_diversity:.2f} Å')\nprint(f' Energy range: {energy_range:.2f}')\nprint(f' Funnel quality: {funnel_quality:.2f}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nDiffDock provides:')\nprint('• State-of-the-art docking accuracy')\nprint('• End-to-end deep learning approach')\nprint('• Fast GPU-accelerated inference')\nprint('• Handling of flexible ligands and proteins')\nprint('• Confidence estimation for poses')\nprint('• Integration with drug discovery pipelines')\nprint('• Pre-trained models on large datasets')\n\nprint('\\nTypical DiffDock usage:')\nprint('python -m 
diffdock.inference --protein protein.pdb \\\\')\nprint(' --ligand ligand.sdf --out_dir results --inference_steps 20')",
+
"usage_example": "# DiffDock molecular docking simulation\n# This demonstrates the concepts and workflow\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom scipy.spatial.distance import cdist\nfrom sklearn.cluster import DBSCAN\nimport tempfile\nimport os\n\nprint('DiffDock - Diffusion Model for Molecular Docking')\nprint('=' * 50)\n\n# Overview of DiffDock approach\nprint('DiffDock Innovation:')\nprint('• Uses diffusion models for pose generation')\nprint('• End-to-end learning without hand-crafted features')\nprint('• Handles both rigid and flexible docking')\nprint('• State-of-the-art accuracy on benchmarks')\nprint('• Fast inference with GPU acceleration')\n\nprint('\\nKey Advantages over Traditional Docking:')\nprint('• No need for extensive sampling')\nprint('• Better handling of conformational flexibility')\nprint('• Learned representations from large datasets')\nprint('• Robust to protein structure variations')\n\n# Simulate molecular docking workflow\nprint('\\n=== Molecular Docking Simulation ===')\n\nnp.random.seed(42)\n\n# Define protein binding site\nprint('Defining protein binding site...')\nbinding_site_center = np.array([0.0, 0.0, 0.0])\nbinding_site_radius = 8.0\n\n# Generate protein surface points\nn_surface_points = 200\ntheta = np.random.uniform(0, 2*np.pi, n_surface_points)\nphi = np.random.uniform(0, np.pi, n_surface_points)\nr = np.random.uniform(5.0, 8.0, n_surface_points)\n\nprotein_surface = np.column_stack([\n r * np.sin(phi) * np.cos(theta),\n r * np.sin(phi) * np.sin(theta),\n r * np.cos(phi)\n])\n\nprint(f'Generated {n_surface_points} protein surface points')\nprint(f'Binding site radius: {binding_site_radius:.1f} Å')\n\n# Define ligand structure\nprint('\\nDefining ligand structure...')\nligand_atoms = {\n 'C1': np.array([0.0, 0.0, 0.0]),\n 'C2': np.array([1.5, 0.0, 0.0]),\n 'C3': np.array([2.2, 1.3, 0.0]),\n 'N1': np.array([1.5, 2.6, 0.0]),\n 'O1': np.array([0.0, 2.6, 0.5]),\n 'C4': np.array([-0.7, 1.3, 0.5])\n}\n\nligand_center = np.mean(list(ligand_atoms.values()), axis=0)\nligand_coords = np.array(list(ligand_atoms.values()))\nligand_coords_centered = ligand_coords - ligand_center\n\nprint(f'Ligand atoms: {len(ligand_atoms)}')\nprint(f'Ligand center: ({ligand_center[0]:.1f}, {ligand_center[1]:.1f}, {ligand_center[2]:.1f})')\nprint(f'Ligand size: {np.max(cdist(ligand_coords, ligand_coords)):.1f} Å')\n\n# Simulate diffusion-based pose generation\nprint('\\n=== Diffusion-Based Pose Generation ===')\n\ndef rotation_matrix(axis, angle):\n \"\"\"Generate rotation matrix for given axis and angle\"\"\"\n axis = axis / np.linalg.norm(axis)\n cos_angle = np.cos(angle)\n sin_angle = np.sin(angle)\n \n return (\n cos_angle * np.eye(3) +\n sin_angle * np.array([[0, -axis[2], axis[1]],\n [axis[2], 0, -axis[0]],\n [-axis[1], axis[0], 0]]) +\n (1 - cos_angle) * np.outer(axis, axis)\n )\n\ndef generate_pose(translation, rotation_axis, rotation_angle):\n \"\"\"Generate ligand pose from translation and rotation\"\"\"\n rot_matrix = rotation_matrix(rotation_axis, rotation_angle)\n rotated_ligand = np.dot(ligand_coords_centered, rot_matrix.T)\n posed_ligand = rotated_ligand + translation\n return posed_ligand\n\ndef calculate_docking_score(posed_ligand, protein_surface):\n \"\"\"Calculate simplified docking score\"\"\"\n # Distance-based scoring\n min_distances = np.min(cdist(posed_ligand, protein_surface), axis=1)\n \n # Prefer poses where ligand is close to protein surface\n distance_score = -np.mean(np.maximum(0, min_distances - 2.0))\n \n # 
Penalty for steric clashes (too close)\n clash_penalty = -np.sum(np.maximum(0, 1.5 - min_distances)) * 10\n \n # Bonus for being in binding site\n center_distances = np.linalg.norm(posed_ligand - binding_site_center, axis=1)\n binding_site_bonus = -np.mean(np.maximum(0, center_distances - binding_site_radius))\n \n total_score = distance_score + clash_penalty + binding_site_bonus\n return total_score, distance_score, clash_penalty, binding_site_bonus\n\n# Generate multiple poses using diffusion-like sampling\nprint('Generating poses via diffusion sampling...')\nn_poses = 1000\nposes = []\nscores = []\nscore_components = []\n\nfor i in range(n_poses):\n # Sample translation (biased toward binding site)\n translation = binding_site_center + np.random.normal(0, 3.0, 3)\n \n # Sample rotation\n rotation_axis = np.random.normal(0, 1, 3)\n rotation_axis = rotation_axis / np.linalg.norm(rotation_axis)\n rotation_angle = np.random.uniform(0, 2*np.pi)\n \n # Generate pose\n posed_ligand = generate_pose(translation, rotation_axis, rotation_angle)\n \n # Calculate score\n total_score, dist_score, clash_penalty, binding_bonus = calculate_docking_score(\n posed_ligand, protein_surface)\n \n poses.append({\n 'id': i,\n 'translation': translation,\n 'rotation_axis': rotation_axis,\n 'rotation_angle': rotation_angle,\n 'coordinates': posed_ligand,\n 'score': total_score,\n 'distance_score': dist_score,\n 'clash_penalty': clash_penalty,\n 'binding_bonus': binding_bonus\n })\n \n scores.append(total_score)\n score_components.append([dist_score, clash_penalty, binding_bonus])\n\nscores = np.array(scores)\nscore_components = np.array(score_components)\n\nprint(f'Generated {n_poses} poses')\nprint(f'Score range: {np.min(scores):.2f} to {np.max(scores):.2f}')\nprint(f'Mean score: {np.mean(scores):.2f} ± {np.std(scores):.2f}')\n\n# Select top poses\ntop_indices = np.argsort(scores)[-50:] # Top 50 poses\ntop_poses = [poses[i] for i in top_indices]\ntop_scores = scores[top_indices]\n\nprint(f'\\nTop 10 pose scores: {top_scores[-10:]}')\n\n# Cluster top poses\nprint('\\n=== Pose Clustering ===')\n\n# Extract center positions of top poses\ntop_centers = np.array([np.mean(pose['coordinates'], axis=0) for pose in top_poses])\n\n# Cluster by position\ndbscan = DBSCAN(eps=2.0, min_samples=3)\ncluster_labels = dbscan.fit_predict(top_centers)\n\nn_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)\nn_noise = list(cluster_labels).count(-1)\n\nprint(f'Found {n_clusters} pose clusters')\nprint(f'Noise points: {n_noise}')\n\n# Analyze clusters\ncluster_info = []\nfor cluster_id in range(n_clusters):\n cluster_mask = cluster_labels == cluster_id\n cluster_poses = [top_poses[i] for i in range(len(top_poses)) if cluster_mask[i]]\n cluster_scores = top_scores[cluster_mask]\n \n cluster_center = np.mean(top_centers[cluster_mask], axis=0)\n best_score = np.max(cluster_scores)\n cluster_size = len(cluster_poses)\n \n cluster_info.append({\n 'id': cluster_id,\n 'size': cluster_size,\n 'best_score': best_score,\n 'center': cluster_center,\n 'poses': cluster_poses\n })\n \n print(f' Cluster {cluster_id}: {cluster_size} poses, '\n f'best score: {best_score:.2f}, '\n f'center: ({cluster_center[0]:.1f}, {cluster_center[1]:.1f}, {cluster_center[2]:.1f})')\n\n# Select representative poses\nprint('\\n=== Representative Pose Selection ===')\n\nrepresentative_poses = []\nfor cluster in sorted(cluster_info, key=lambda x: x['best_score'], reverse=True):\n # Select best pose from each cluster\n best_pose_idx = 
np.argmax([pose['score'] for pose in cluster['poses']])\n best_pose = cluster['poses'][best_pose_idx]\n \n representative_poses.append({\n 'cluster_id': cluster['id'],\n 'pose': best_pose,\n 'cluster_size': cluster['size']\n })\n \n print(f'Cluster {cluster[\"id\"]}: Score {best_pose[\"score\"]:.2f}, '\n f'Position ({np.mean(best_pose[\"coordinates\"], axis=0)})')\n\n# Analyze binding mode\nprint('\\n=== Binding Mode Analysis ===')\n\nif representative_poses:\n best_overall = representative_poses[0]['pose']\n \n print(f'Best pose analysis:')\n print(f' Total score: {best_overall[\"score\"]:.2f}')\n print(f' Distance score: {best_overall[\"distance_score\"]:.2f}')\n print(f' Clash penalty: {best_overall[\"clash_penalty\"]:.2f}')\n print(f' Binding site bonus: {best_overall[\"binding_bonus\"]:.2f}')\n \n # Calculate interactions\n best_coords = best_overall['coordinates']\n distances_to_surface = cdist(best_coords, protein_surface)\n min_distances = np.min(distances_to_surface, axis=1)\n \n print(f'\\n Ligand-protein distances:')\n for i, (atom_name, distance) in enumerate(zip(ligand_atoms.keys(), min_distances)):\n print(f' {atom_name}: {distance:.2f} Å')\n \n # Identify close contacts\n close_contacts = min_distances < 3.0\n print(f'\\n Close contacts (< 3.0 Å): {np.sum(close_contacts)} atoms')\n \n # Center of mass analysis\n com_distance = np.linalg.norm(np.mean(best_coords, axis=0) - binding_site_center)\n print(f' Distance from binding site center: {com_distance:.2f} Å')\n\n# Confidence estimation\nprint('\\n=== Confidence Estimation ===')\n\n# Score-based confidence\ntop_10_scores = top_scores[-10:]\nscore_std = np.std(top_10_scores)\nscore_confidence = 1.0 / (1.0 + score_std) # Higher std = lower confidence\n\n# Clustering-based confidence\nif n_clusters > 0:\n largest_cluster_size = max(cluster['size'] for cluster in cluster_info)\n cluster_confidence = largest_cluster_size / len(top_poses)\nelse:\n cluster_confidence = 0.0\n\n# Combined confidence\noverall_confidence = (score_confidence + cluster_confidence) / 2.0\n\nprint(f'Score-based confidence: {score_confidence:.3f}')\nprint(f'Clustering confidence: {cluster_confidence:.3f}')\nprint(f'Overall confidence: {overall_confidence:.3f}')\n\n# Quality metrics\nprint('\\n=== Quality Metrics ===')\n\n# Pose diversity\nall_centers = np.array([np.mean(pose['coordinates'], axis=0) for pose in poses])\nposition_diversity = np.mean(cdist(all_centers, all_centers))\n\n# Energy landscape analysis\nenergy_range = np.max(scores) - np.min(scores)\nfunnel_quality = (np.max(scores) - np.mean(scores)) / np.std(scores)\n\nprint(f'Position diversity: {position_diversity:.2f} Å')\nprint(f'Energy range: {energy_range:.2f}')\nprint(f'Funnel quality: {funnel_quality:.2f}')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig = plt.figure(figsize=(16, 12))\n\n# 3D visualization of binding site and top poses\nax1 = fig.add_subplot(221, projection='3d')\n\n# Plot protein surface\nax1.scatter(protein_surface[:, 0], protein_surface[:, 1], protein_surface[:, 2], \n c='lightblue', alpha=0.3, s=20, label='Protein surface')\n\n# Plot binding site center\nax1.scatter(*binding_site_center, c='red', s=100, marker='*', label='Binding site')\n\n# Plot top poses\ncolors = plt.cm.viridis(np.linspace(0, 1, len(representative_poses)))\nfor i, rep_pose in enumerate(representative_poses[:5]): # Show top 5\n coords = rep_pose['pose']['coordinates']\n ax1.scatter(coords[:, 0], coords[:, 1], coords[:, 2], \n c=[colors[i]], s=50, alpha=0.8, label=f'Pose 
{i+1}')\n\nax1.set_xlabel('X (Å)')\nax1.set_ylabel('Y (Å)')\nax1.set_zlabel('Z (Å)')\nax1.set_title('3D Binding Site and Top Poses')\nax1.legend()\n\n# Score distribution\nax2 = fig.add_subplot(222)\nax2.hist(scores, bins=50, alpha=0.7, color='skyblue', edgecolor='black')\nax2.axvline(np.mean(scores), color='red', linestyle='--', label=f'Mean: {np.mean(scores):.2f}')\nax2.axvline(np.percentile(scores, 95), color='green', linestyle='--', \n label=f'95th percentile: {np.percentile(scores, 95):.2f}')\nax2.set_xlabel('Docking Score')\nax2.set_ylabel('Frequency')\nax2.set_title('Score Distribution')\nax2.legend()\n\n# Score components\nax3 = fig.add_subplot(223)\ncomponent_names = ['Distance', 'Clash Penalty', 'Binding Bonus']\nfor i, name in enumerate(component_names):\n ax3.scatter(range(len(top_poses)), score_components[top_indices, i], \n alpha=0.6, label=name, s=20)\nax3.set_xlabel('Pose Index (sorted by score)')\nax3.set_ylabel('Score Component')\nax3.set_title('Score Components for Top Poses')\nax3.legend()\nax3.grid(True, alpha=0.3)\n\n# Cluster analysis\nax4 = fig.add_subplot(224)\nif n_clusters > 0:\n for cluster_id in range(n_clusters):\n cluster_mask = cluster_labels == cluster_id\n if np.any(cluster_mask):\n cluster_centers = top_centers[cluster_mask]\n cluster_scores = top_scores[cluster_mask]\n ax4.scatter(cluster_centers[:, 0], cluster_centers[:, 1], \n c=cluster_scores, s=50, alpha=0.7, \n label=f'Cluster {cluster_id}', cmap='viridis')\n \n # Noise points\n noise_mask = cluster_labels == -1\n if np.any(noise_mask):\n ax4.scatter(top_centers[noise_mask, 0], top_centers[noise_mask, 1], \n c='gray', s=20, alpha=0.5, label='Noise')\n \n ax4.scatter(*binding_site_center[:2], c='red', s=100, marker='*', \n label='Binding site')\n \n ax4.set_xlabel('X (Å)')\n ax4.set_ylabel('Y (Å)')\n ax4.set_title('Pose Clusters (XY projection)')\n ax4.legend()\n ax4.grid(True, alpha=0.3)\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Docking visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 50)\nprint('DIFFDOCK MOLECULAR DOCKING SUMMARY')\nprint('=' * 50)\nprint(f'Poses generated: {n_poses:}')\nprint(f'Top poses analyzed: {len(top_poses)}')\nprint(f'Pose clusters found: {n_clusters}')\nprint(f'Best docking score: {np.max(scores):.2f}')\nprint(f'Confidence score: {overall_confidence:.3f}')\nprint(f'\\nBest pose location:')\nif representative_poses:\n best_com = np.mean(representative_poses[0]['pose']['coordinates'], axis=0)\n print(f' Center of mass: ({best_com[0]:.2f}, {best_com[1]:.2f}, {best_com[2]:.2f})')\n print(f' Distance from binding site: {np.linalg.norm(best_com - binding_site_center):.2f} Å')\n\nprint(f'\\nQuality metrics:')\nprint(f' Position diversity: {position_diversity:.2f} Å')\nprint(f' Energy range: {energy_range:.2f}')\nprint(f' Funnel quality: {funnel_quality:.2f}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nDiffDock provides:')\nprint('• State-of-the-art docking accuracy')\nprint('• End-to-end deep learning approach')\nprint('• Fast GPU-accelerated inference')\nprint('• Handling of flexible ligands and proteins')\nprint('• Confidence estimation for poses')\nprint('• Integration with drug discovery pipelines')\nprint('• Pre-trained models on large datasets')\n\nprint('\\nTypical DiffDock usage:')\nprint('python -m 
diffdock.inference --protein protein.pdb \\\\')\nprint(' --ligand ligand.sdf --out_dir results --inference_steps 20')",
"quick_start": [
"Install: pip install diffdock",
"Prepare: protein.pdb and ligand.sdf files",
tooluniverse/data/ppi_tools.json
CHANGED
@@ -0,0 +1,139 @@
+[
+  {
+    "type": "STRINGRESTTool",
+    "name": "STRING_get_protein_interactions",
+    "description": "Query protein-protein interactions from the STRING database. STRING is a comprehensive database of known and predicted protein-protein interactions with confidence scores and functional annotations.",
+    "parameter": {
+      "type": "object",
+      "properties": {
+        "protein_ids": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "List of protein identifiers (UniProt IDs, gene names, etc.)",
+          "minItems": 1
+        },
+        "species": {
+          "type": "integer",
+          "description": "NCBI taxonomy ID (default: 9606 for human)",
+          "default": 9606
+        },
+        "confidence_score": {
+          "type": "number",
+          "description": "Minimum confidence score (0-1, default: 0.4)",
+          "minimum": 0,
+          "maximum": 1,
+          "default": 0.4
+        },
+        "limit": {
+          "type": "integer",
+          "description": "Maximum number of interactions to return (default: 50)",
+          "minimum": 1,
+          "maximum": 1000,
+          "default": 50
+        },
+        "network_type": {
+          "type": "string",
+          "description": "Type of network ('full', 'physical', 'functional')",
+          "enum": ["full", "physical", "functional"],
+          "default": "full"
+        }
+      },
+      "required": ["protein_ids"]
+    },
+    "fields": {
+      "endpoint": "/tsv/network",
+      "return_format": "TSV"
+    },
+    "return_schema": {
+      "type": "object",
+      "properties": {
+        "success": {"type": "boolean"},
+        "interactions": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "protein1": {"type": "string"},
+              "protein2": {"type": "string"},
+              "protein1_name": {"type": "string"},
+              "protein2_name": {"type": "string"},
+              "combined_score": {"type": "number"},
+              "confidence_score": {"type": "number"},
+              "interaction_sources": {"type": "array", "items": {"type": "string"}},
+              "interaction_type": {"type": "string"}
+            }
+          }
+        },
+        "mapping_data": {"type": "array"},
+        "functional_enrichment": {"type": "array"},
+        "summary": {"type": "object"},
+        "error": {"type": "string"}
+      }
+    },
+    "implementation": {
+      "language": "python",
+      "dependencies": ["requests"],
+      "source_file": "STRING_get_protein_interactions.py"
+    }
+  },
+  {
+    "type": "BioGRIDRESTTool",
+    "name": "BioGRID_get_interactions",
+    "description": "Query protein and genetic interactions from the BioGRID database. BioGRID is a comprehensive database of physical and genetic interactions with detailed experimental evidence.",
+    "parameter": {
+      "type": "object",
+      "properties": {
+        "gene_names": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "List of gene names or protein identifiers",
+          "minItems": 1
+        },
+        "organism": {
+          "type": "string",
+          "description": "Organism name (e.g., 'Homo sapiens', 'Mus musculus')",
+          "default": "Homo sapiens"
+        },
+        "interaction_type": {
+          "type": "string",
+          "description": "Type of interaction ('physical', 'genetic', 'both')",
+          "enum": ["physical", "genetic", "both"],
+          "default": "both"
+        },
+        "evidence_types": {
+          "type": "array",
+          "items": {"type": "string"},
+          "description": "List of evidence types to include"
+        },
+        "limit": {
+          "type": "integer",
+          "description": "Maximum number of interactions to return (default: 100)",
+          "minimum": 1,
+          "maximum": 1000,
+          "default": 100
+        },
+        "format": {
+          "type": "string",
+          "description": "Output format ('json' or 'tab', default: 'json')",
+          "enum": ["json", "tab"],
+          "default": "json"
+        }
+      },
+      "required": ["gene_names"]
+    },
+    "return_schema": {
+      "type": "object",
+      "properties": {
+        "success": {"type": "boolean"},
+        "interactions": {"type": "array"},
+        "summary": {"type": "object"},
+        "statistics": {"type": "object"},
+        "error": {"type": "string"}
+      }
+    },
+    "fields": {
+      "endpoint": "/interactions/",
+      "return_format": "JSON"
+    }
+  }
+]
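The STRINGRESTTool spec above points at STRING's /tsv/network endpoint. A minimal sketch of the underlying HTTP call, independent of the generated string_tool.py wrapper; the base URL and the 0-1 to 0-1000 score rescaling follow the public STRING API and are assumptions, not code from this package:

import requests

def string_network(protein_ids, species=9606, confidence_score=0.4):
    """Fetch a STRING interaction network as a list of dicts parsed from TSV."""
    url = "https://string-db.org/api/tsv/network"         # assumed public base URL
    params = {
        "identifiers": "\r".join(protein_ids),            # STRING separates IDs with %0d
        "species": species,                               # NCBI taxonomy ID
        "required_score": int(confidence_score * 1000),   # STRING scores run 0-1000
    }
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    lines = resp.text.strip().splitlines()
    header = lines[0].split("\t")
    return [dict(zip(header, row.split("\t"))) for row in lines[1:]]

# Example: interactions = string_network(["TP53", "MDM2"], confidence_score=0.7)
# The tool's limit and network_type parameters would map onto further query
# parameters or post-filtering in the real wrapper.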
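Similarly, the BioGRIDRESTTool spec targets BioGRID's /interactions/ endpoint, which requires a free access key. A minimal sketch, assuming BioGRID's documented webservice parameters (accessKey, geneList, taxId, max); translating the tool's organism name into a taxonomy ID is left out here:

import os
import requests

def biogrid_interactions(gene_names, tax_id=9606, limit=100):
    """Query BioGRID for physical/genetic interactions involving the given genes."""
    url = "https://webservice.thebiogrid.org/interactions/"   # assumed base URL
    params = {
        "accessKey": os.environ["BIOGRID_ACCESS_KEY"],  # free key from BioGRID registration
        "format": "json",
        "geneList": "|".join(gene_names),               # pipe-separated gene symbols
        "searchNames": "true",
        "taxId": tax_id,
        "max": limit,
    }
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    return list(resp.json().values())                   # response is keyed by interaction ID

# Example: hits = biogrid_interactions(["TP53", "BRCA1"], limit=25)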
@@ -14,13 +14,11 @@
"properties": {
"source_tool": {
"type": "string",
-
"description": "The source tool specification (JSON string with name, description, parameter schema, and example outputs)"
-
"required": true
+
"description": "The source tool specification (JSON string with name, description, parameter schema, and example outputs)"
},
"target_tool": {
"type": "string",
-
"description": "The target tool specification (JSON string with name, description, parameter schema)"
-
"required": true
+
"description": "The target tool specification (JSON string with name, description, parameter schema)"
},
"analysis_depth": {
"type": "string",
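The hunk above removes the boolean "required": true flags that sat inside individual property definitions; since JSON Schema draft-04, required-ness belongs in a "required" array on the parent object. A small illustrative sketch (whether the surrounding spec actually lists these names in a top-level "required" array is not visible in this hunk, so that part is assumed):

from jsonschema import validate  # pip install jsonschema

parameter_schema = {
    "type": "object",
    "properties": {
        "source_tool": {"type": "string"},   # per-property "required": true is a draft-03 idiom
        "target_tool": {"type": "string"},
    },
    "required": ["source_tool", "target_tool"],  # draft-04+ style, assumed here
}

validate({"source_tool": "{}", "target_tool": "{}"}, parameter_schema)  # passes
# validate({"source_tool": "{}"}, parameter_schema)  # would raise ValidationError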