tooluniverse 0.1.4__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of tooluniverse might be problematic.

Files changed (187)
  1. tooluniverse/__init__.py +340 -4
  2. tooluniverse/admetai_tool.py +84 -0
  3. tooluniverse/agentic_tool.py +563 -0
  4. tooluniverse/alphafold_tool.py +96 -0
  5. tooluniverse/base_tool.py +129 -6
  6. tooluniverse/boltz_tool.py +207 -0
  7. tooluniverse/chem_tool.py +192 -0
  8. tooluniverse/compose_scripts/__init__.py +1 -0
  9. tooluniverse/compose_scripts/biomarker_discovery.py +293 -0
  10. tooluniverse/compose_scripts/comprehensive_drug_discovery.py +186 -0
  11. tooluniverse/compose_scripts/drug_safety_analyzer.py +89 -0
  12. tooluniverse/compose_scripts/literature_tool.py +34 -0
  13. tooluniverse/compose_scripts/output_summarizer.py +279 -0
  14. tooluniverse/compose_scripts/tool_description_optimizer.py +681 -0
  15. tooluniverse/compose_scripts/tool_discover.py +705 -0
  16. tooluniverse/compose_scripts/tool_graph_composer.py +448 -0
  17. tooluniverse/compose_tool.py +371 -0
  18. tooluniverse/ctg_tool.py +1002 -0
  19. tooluniverse/custom_tool.py +81 -0
  20. tooluniverse/dailymed_tool.py +108 -0
  21. tooluniverse/data/admetai_tools.json +155 -0
  22. tooluniverse/data/agentic_tools.json +1156 -0
  23. tooluniverse/data/alphafold_tools.json +87 -0
  24. tooluniverse/data/boltz_tools.json +9 -0
  25. tooluniverse/data/chembl_tools.json +16 -0
  26. tooluniverse/data/clait_tools.json +108 -0
  27. tooluniverse/data/clinicaltrials_gov_tools.json +326 -0
  28. tooluniverse/data/compose_tools.json +202 -0
  29. tooluniverse/data/dailymed_tools.json +70 -0
  30. tooluniverse/data/dataset_tools.json +646 -0
  31. tooluniverse/data/disease_target_score_tools.json +712 -0
  32. tooluniverse/data/efo_tools.json +17 -0
  33. tooluniverse/data/embedding_tools.json +319 -0
  34. tooluniverse/data/enrichr_tools.json +31 -0
  35. tooluniverse/data/europe_pmc_tools.json +22 -0
  36. tooluniverse/data/expert_feedback_tools.json +10 -0
  37. tooluniverse/data/fda_drug_adverse_event_tools.json +491 -0
  38. tooluniverse/data/fda_drug_labeling_tools.json +544 -168
  39. tooluniverse/data/fda_drugs_with_brand_generic_names_for_tool.py +76929 -148860
  40. tooluniverse/data/finder_tools.json +209 -0
  41. tooluniverse/data/gene_ontology_tools.json +113 -0
  42. tooluniverse/data/gwas_tools.json +1082 -0
  43. tooluniverse/data/hpa_tools.json +333 -0
  44. tooluniverse/data/humanbase_tools.json +47 -0
  45. tooluniverse/data/idmap_tools.json +74 -0
  46. tooluniverse/data/mcp_client_tools_example.json +113 -0
  47. tooluniverse/data/mcpautoloadertool_defaults.json +28 -0
  48. tooluniverse/data/medlineplus_tools.json +141 -0
  49. tooluniverse/data/monarch_tools.json +1 -1
  50. tooluniverse/data/openalex_tools.json +36 -0
  51. tooluniverse/data/opentarget_tools.json +82 -58
  52. tooluniverse/data/output_summarization_tools.json +101 -0
  53. tooluniverse/data/packages/bioinformatics_core_tools.json +1756 -0
  54. tooluniverse/data/packages/categorized_tools.txt +206 -0
  55. tooluniverse/data/packages/cheminformatics_tools.json +347 -0
  56. tooluniverse/data/packages/earth_sciences_tools.json +74 -0
  57. tooluniverse/data/packages/genomics_tools.json +776 -0
  58. tooluniverse/data/packages/image_processing_tools.json +38 -0
  59. tooluniverse/data/packages/machine_learning_tools.json +789 -0
  60. tooluniverse/data/packages/neuroscience_tools.json +62 -0
  61. tooluniverse/data/packages/original_tools.txt +0 -0
  62. tooluniverse/data/packages/physics_astronomy_tools.json +62 -0
  63. tooluniverse/data/packages/scientific_computing_tools.json +560 -0
  64. tooluniverse/data/packages/single_cell_tools.json +453 -0
  65. tooluniverse/data/packages/software_tools.json +4954 -0
  66. tooluniverse/data/packages/structural_biology_tools.json +396 -0
  67. tooluniverse/data/packages/visualization_tools.json +399 -0
  68. tooluniverse/data/pubchem_tools.json +215 -0
  69. tooluniverse/data/pubtator_tools.json +68 -0
  70. tooluniverse/data/rcsb_pdb_tools.json +1332 -0
  71. tooluniverse/data/reactome_tools.json +19 -0
  72. tooluniverse/data/semantic_scholar_tools.json +26 -0
  73. tooluniverse/data/special_tools.json +2 -25
  74. tooluniverse/data/tool_composition_tools.json +88 -0
  75. tooluniverse/data/toolfinderkeyword_defaults.json +34 -0
  76. tooluniverse/data/txagent_client_tools.json +9 -0
  77. tooluniverse/data/uniprot_tools.json +211 -0
  78. tooluniverse/data/url_fetch_tools.json +94 -0
  79. tooluniverse/data/uspto_downloader_tools.json +9 -0
  80. tooluniverse/data/uspto_tools.json +811 -0
  81. tooluniverse/data/xml_tools.json +3275 -0
  82. tooluniverse/dataset_tool.py +296 -0
  83. tooluniverse/default_config.py +165 -0
  84. tooluniverse/efo_tool.py +42 -0
  85. tooluniverse/embedding_database.py +630 -0
  86. tooluniverse/embedding_sync.py +396 -0
  87. tooluniverse/enrichr_tool.py +266 -0
  88. tooluniverse/europe_pmc_tool.py +52 -0
  89. tooluniverse/execute_function.py +1775 -95
  90. tooluniverse/extended_hooks.py +444 -0
  91. tooluniverse/gene_ontology_tool.py +194 -0
  92. tooluniverse/graphql_tool.py +158 -36
  93. tooluniverse/gwas_tool.py +358 -0
  94. tooluniverse/hpa_tool.py +1645 -0
  95. tooluniverse/humanbase_tool.py +389 -0
  96. tooluniverse/logging_config.py +254 -0
  97. tooluniverse/mcp_client_tool.py +764 -0
  98. tooluniverse/mcp_integration.py +413 -0
  99. tooluniverse/mcp_tool_registry.py +925 -0
  100. tooluniverse/medlineplus_tool.py +337 -0
  101. tooluniverse/openalex_tool.py +228 -0
  102. tooluniverse/openfda_adv_tool.py +283 -0
  103. tooluniverse/openfda_tool.py +393 -160
  104. tooluniverse/output_hook.py +1122 -0
  105. tooluniverse/package_tool.py +195 -0
  106. tooluniverse/pubchem_tool.py +158 -0
  107. tooluniverse/pubtator_tool.py +168 -0
  108. tooluniverse/rcsb_pdb_tool.py +38 -0
  109. tooluniverse/reactome_tool.py +108 -0
  110. tooluniverse/remote/boltz/boltz_mcp_server.py +50 -0
  111. tooluniverse/remote/depmap_24q2/depmap_24q2_mcp_tool.py +442 -0
  112. tooluniverse/remote/expert_feedback/human_expert_mcp_tools.py +2013 -0
  113. tooluniverse/remote/expert_feedback/simple_test.py +23 -0
  114. tooluniverse/remote/expert_feedback/start_web_interface.py +188 -0
  115. tooluniverse/remote/expert_feedback/web_only_interface.py +0 -0
  116. tooluniverse/remote/expert_feedback_mcp/human_expert_mcp_server.py +1611 -0
  117. tooluniverse/remote/expert_feedback_mcp/simple_test.py +34 -0
  118. tooluniverse/remote/expert_feedback_mcp/start_web_interface.py +91 -0
  119. tooluniverse/remote/immune_compass/compass_tool.py +327 -0
  120. tooluniverse/remote/pinnacle/pinnacle_tool.py +328 -0
  121. tooluniverse/remote/transcriptformer/transcriptformer_tool.py +586 -0
  122. tooluniverse/remote/uspto_downloader/uspto_downloader_mcp_server.py +61 -0
  123. tooluniverse/remote/uspto_downloader/uspto_downloader_tool.py +120 -0
  124. tooluniverse/remote_tool.py +99 -0
  125. tooluniverse/restful_tool.py +53 -30
  126. tooluniverse/scripts/generate_tool_graph.py +408 -0
  127. tooluniverse/scripts/visualize_tool_graph.py +829 -0
  128. tooluniverse/semantic_scholar_tool.py +62 -0
  129. tooluniverse/smcp.py +2452 -0
  130. tooluniverse/smcp_server.py +975 -0
  131. tooluniverse/test/mcp_server_test.py +0 -0
  132. tooluniverse/test/test_admetai_tool.py +370 -0
  133. tooluniverse/test/test_agentic_tool.py +129 -0
  134. tooluniverse/test/test_alphafold_tool.py +71 -0
  135. tooluniverse/test/test_chem_tool.py +37 -0
  136. tooluniverse/test/test_compose_lieraturereview.py +63 -0
  137. tooluniverse/test/test_compose_tool.py +448 -0
  138. tooluniverse/test/test_dailymed.py +69 -0
  139. tooluniverse/test/test_dataset_tool.py +200 -0
  140. tooluniverse/test/test_disease_target_score.py +56 -0
  141. tooluniverse/test/test_drugbank_filter_examples.py +179 -0
  142. tooluniverse/test/test_efo.py +31 -0
  143. tooluniverse/test/test_enrichr_tool.py +21 -0
  144. tooluniverse/test/test_europe_pmc_tool.py +20 -0
  145. tooluniverse/test/test_fda_adv.py +95 -0
  146. tooluniverse/test/test_fda_drug_labeling.py +91 -0
  147. tooluniverse/test/test_gene_ontology_tools.py +66 -0
  148. tooluniverse/test/test_gwas_tool.py +139 -0
  149. tooluniverse/test/test_hpa.py +625 -0
  150. tooluniverse/test/test_humanbase_tool.py +20 -0
  151. tooluniverse/test/test_idmap_tools.py +61 -0
  152. tooluniverse/test/test_mcp_server.py +211 -0
  153. tooluniverse/test/test_mcp_tool.py +247 -0
  154. tooluniverse/test/test_medlineplus.py +220 -0
  155. tooluniverse/test/test_openalex_tool.py +32 -0
  156. tooluniverse/test/test_opentargets.py +28 -0
  157. tooluniverse/test/test_pubchem_tool.py +116 -0
  158. tooluniverse/test/test_pubtator_tool.py +37 -0
  159. tooluniverse/test/test_rcsb_pdb_tool.py +86 -0
  160. tooluniverse/test/test_reactome.py +54 -0
  161. tooluniverse/test/test_semantic_scholar_tool.py +24 -0
  162. tooluniverse/test/test_software_tools.py +147 -0
  163. tooluniverse/test/test_tool_description_optimizer.py +49 -0
  164. tooluniverse/test/test_tool_finder.py +26 -0
  165. tooluniverse/test/test_tool_finder_llm.py +252 -0
  166. tooluniverse/test/test_tools_find.py +195 -0
  167. tooluniverse/test/test_uniprot_tools.py +74 -0
  168. tooluniverse/test/test_uspto_tool.py +72 -0
  169. tooluniverse/test/test_xml_tool.py +113 -0
  170. tooluniverse/tool_finder_embedding.py +267 -0
  171. tooluniverse/tool_finder_keyword.py +693 -0
  172. tooluniverse/tool_finder_llm.py +699 -0
  173. tooluniverse/tool_graph_web_ui.py +955 -0
  174. tooluniverse/tool_registry.py +416 -0
  175. tooluniverse/uniprot_tool.py +155 -0
  176. tooluniverse/url_tool.py +253 -0
  177. tooluniverse/uspto_tool.py +240 -0
  178. tooluniverse/utils.py +369 -41
  179. tooluniverse/xml_tool.py +369 -0
  180. tooluniverse-1.0.0.dist-info/METADATA +377 -0
  181. tooluniverse-1.0.0.dist-info/RECORD +186 -0
  182. {tooluniverse-0.1.4.dist-info → tooluniverse-1.0.0.dist-info}/WHEEL +1 -1
  183. tooluniverse-1.0.0.dist-info/entry_points.txt +9 -0
  184. tooluniverse-0.1.4.dist-info/METADATA +0 -141
  185. tooluniverse-0.1.4.dist-info/RECORD +0 -18
  186. {tooluniverse-0.1.4.dist-info → tooluniverse-1.0.0.dist-info}/licenses/LICENSE +0 -0
  187. {tooluniverse-0.1.4.dist-info → tooluniverse-1.0.0.dist-info}/top_level.txt +0 -0
tooluniverse/data/packages/single_cell_tools.json
@@ -0,0 +1,453 @@
+ [
+   {
+     "type": "PackageTool",
+     "name": "get_scanpy_info",
+     "description": "Get comprehensive information about Scanpy – scalable single-cell analysis in Python",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "include_examples": {
+           "type": "boolean",
+           "description": "Whether to include usage examples and quick start guide",
+           "default": true
+         }
+       }
+     },
+     "package_name": "scanpy",
+     "local_info": {
+       "name": "Scanpy",
+       "description": "Scalable toolkit for analyzing single-cell gene expression data. Includes preprocessing, visualization, clustering, trajectory inference and differential expression testing for datasets of more than one million cells.",
+       "category": "Single-Cell Genomics",
+       "import_name": "scanpy",
+       "popularity": 90,
+       "keywords": [
+         "single-cell",
+         "RNA-seq",
+         "clustering",
+         "trajectory inference",
+         "differential expression"
+       ],
+       "documentation": "https://scanpy.readthedocs.io/",
+       "repository": "https://github.com/scverse/scanpy",
+       "installation": {
+         "pip": "pip install scanpy",
+         "conda": "conda install -c conda-forge scanpy"
+       },
+       "usage_example": "import scanpy as sc\nimport pandas as pd\n\n# Read 10X data\nadata = sc.read_10x_mtx('filtered_feature_bc_matrix/')\nadata.var_names_make_unique()\n\n# Basic preprocessing\nsc.pp.filter_cells(adata, min_genes=200)\nsc.pp.filter_genes(adata, min_cells=3)\nsc.pp.normalize_total(adata, target_sum=1e4)\nsc.pp.log1p(adata)\n\n# Find highly variable genes and cluster\nsc.pp.highly_variable_genes(adata)\nsc.pp.pca(adata)\nsc.pp.neighbors(adata)\nsc.tl.umap(adata)\nsc.tl.leiden(adata)",
+       "quick_start": [
+         "Install: pip install scanpy",
+         "Read data: sc.read_10x_mtx() or sc.read_h5ad()",
+         "Preprocess: filter cells/genes, normalize, log transform",
+         "Analyze: PCA, neighbors, UMAP, clustering",
+         "Visualize: sc.pl.umap(), sc.pl.violin()"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_anndata_info",
+     "description": "Get comprehensive information about AnnData – annotated data for computational biology",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "include_examples": {
+           "type": "boolean",
+           "description": "Whether to include usage examples and quick start guide",
+           "default": true
+         }
+       }
+     },
+     "package_name": "anndata",
+     "local_info": {
+       "name": "AnnData",
+       "description": "Annotated data matrix for handling large-scale biological data. Positioned between pandas and xarray, offers sparse data support, lazy operations, and PyTorch interface for computational biology applications.",
+       "category": "Data Structures",
+       "import_name": "anndata",
+       "popularity": 88,
+       "keywords": [
+         "annotated data",
+         "single-cell",
+         "sparse matrices",
+         "data structure",
+         "computational biology"
+       ],
+       "documentation": "https://anndata.readthedocs.io/",
+       "repository": "https://github.com/scverse/anndata",
+       "installation": {
+         "pip": "pip install anndata",
+         "conda": "conda install -c conda-forge anndata"
+       },
+       "usage_example": "import anndata as ad\nimport numpy as np\nimport pandas as pd\n\n# Create AnnData object\nX = np.random.randn(100, 50) # 100 cells, 50 genes\nobs = pd.DataFrame({'cell_type': ['A']*50 + ['B']*50})\nvar = pd.DataFrame({'gene_name': [f'Gene_{i}' for i in range(50)]})\n\nadata = ad.AnnData(X=X, obs=obs, var=var)\nprint(adata)\n\n# Access data\nprint(adata.X.shape)\nprint(adata.obs.head())\nprint(adata.var.head())",
+       "quick_start": [
+         "Install: pip install anndata",
+         "Create: ad.AnnData(X=matrix, obs=cell_meta, var=gene_meta)",
+         "Access: adata.X (data), adata.obs (cells), adata.var (genes)",
+         "Save/load: adata.write('data.h5ad'), ad.read_h5ad('data.h5ad')",
+         "Subset: adata[cells, genes] for slicing"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_mudata_info",
+     "description": "Get comprehensive information about MuData – multimodal annotated data for computational biology",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "include_examples": {
+           "type": "boolean",
+           "description": "Whether to include usage examples and quick start guide",
+           "default": true
+         }
+       }
+     },
+     "package_name": "mudata",
+     "local_info": {
+       "name": "MuData",
+       "description": "Multimodal data structure that can store multiple AnnData objects representing different data modalities (e.g., RNA-seq, ATAC-seq, protein) from the same samples in a unified container.",
+       "category": "Multimodal Data",
+       "import_name": "mudata",
+       "popularity": 75,
+       "keywords": [
+         "multimodal data",
+         "single-cell",
+         "ATAC-seq",
+         "RNA-seq",
+         "protein data"
+       ],
+       "documentation": "https://mudata.readthedocs.io/",
+       "repository": "https://github.com/scverse/mudata",
+       "installation": {
+         "pip": "pip install mudata",
+         "conda": "conda install -c conda-forge mudata"
+       },
+       "usage_example": "import mudata as mu\nimport anndata as ad\nimport numpy as np\n\n# Create separate AnnData objects for different modalities\nrna_data = ad.AnnData(np.random.randn(100, 2000))\natac_data = ad.AnnData(np.random.randn(100, 10000))\n\n# Combine into MuData object\nmdata = mu.MuData({'rna': rna_data, 'atac': atac_data})\nprint(mdata)\n\n# Access modalities\nprint(mdata['rna'].shape)\nprint(mdata['atac'].shape)\n\n# Save multimodal data\nmdata.write('multimodal_data.h5mu')",
+       "quick_start": [
+         "Install: pip install mudata",
+         "Create: mu.MuData({'rna': rna_adata, 'atac': atac_adata})",
+         "Access: mdata['rna'], mdata['atac']",
+         "Save/load: mdata.write('data.h5mu'), mu.read('data.h5mu')",
+         "Integrate: Use with scanpy/muon for analysis"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_scvelo_info",
+     "description": "Get comprehensive information about scVelo – RNA velocity analysis in single cells",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "include_examples": {
+           "type": "boolean",
+           "description": "Whether to include usage examples and quick start guide",
+           "default": true
+         }
+       }
+     },
+     "package_name": "scvelo",
+     "local_info": {
+       "name": "scVelo",
+       "description": "Python library for RNA velocity analysis to study cellular dynamics and trajectory inference in single-cell RNA sequencing data. Estimates future cell states based on spliced/unspliced mRNA ratios.",
+       "category": "Single-Cell Dynamics",
+       "import_name": "scvelo",
+       "popularity": 82,
+       "keywords": [
+         "RNA velocity",
+         "single-cell dynamics",
+         "trajectory inference",
+         "cellular dynamics",
+         "gene expression"
+       ],
+       "documentation": "https://scvelo.readthedocs.io/",
+       "repository": "https://github.com/theislab/scvelo",
+       "installation": {
+         "pip": "pip install scvelo",
+         "conda": "conda install -c conda-forge scvelo"
+       },
+       "usage_example": "import scvelo as scv\nimport scanpy as sc\n\n# Load data with spliced/unspliced information\nadata = scv.datasets.pancreatic_endocrinogenesis()\n\n# Preprocess data\nscv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=2000)\nscv.pp.moments(adata, n_pcs=30, n_neighbors=30)\n\n# Compute velocity and velocity graph\nscv.tl.velocity(adata)\nscv.tl.velocity_graph(adata)\n\n# Plot velocity field\nscv.pl.velocity_embedding_stream(adata, basis='umap')",
+       "quick_start": [
+         "Install: pip install scvelo",
+         "Import: import scvelo as scv",
+         "Load data: adata = scv.datasets.pancreatic_endocrinogenesis()",
+         "Preprocess: scv.pp.filter_and_normalize(adata)",
+         "Analyze: scv.tl.velocity(); scv.pl.velocity_embedding()"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_scrublet_info",
+     "description": "Get comprehensive information about Scrublet – single-cell doublet detection",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "include_examples": {
+           "type": "boolean",
+           "description": "Whether to include usage examples and quick start guide",
+           "default": true
+         }
+       }
+     },
+     "package_name": "scrublet",
+     "local_info": {
+       "name": "Scrublet",
+       "description": "Python library for detecting doublets in single-cell RNA-seq data. Uses simulated doublets and a k-nearest neighbor classifier to identify likely doublets in experimental data.",
+       "category": "Single-Cell Quality Control",
+       "import_name": "scrublet",
+       "popularity": 72,
+       "keywords": [
+         "doublet detection",
+         "single-cell quality control",
+         "RNA-seq",
+         "cell filtering",
+         "data quality"
+       ],
+       "documentation": "https://github.com/AllonKleinLab/scrublet",
+       "repository": "https://github.com/AllonKleinLab/scrublet",
+       "installation": {
+         "pip": "pip install scrublet",
+         "conda": "conda install -c bioconda scrublet"
+       },
+       "usage_example": "import scrublet as scr\nimport scipy.sparse\nimport matplotlib.pyplot as plt\n\n# Load count matrix (cells x genes)\ncounts_matrix = scipy.sparse.load_npz('counts.npz')\n\n# Initialize Scrublet object\nscrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)\n\n# Run doublet detection\ndoublet_scores, predicted_doublets = scrub.scrub_doublets()\n\n# Plot histogram of doublet scores\nscrub.plot_histogram()\nplt.show()\n\nprint(f'Detected {predicted_doublets.sum()} doublets')",
+       "quick_start": [
+         "Install: pip install scrublet",
+         "Initialize: scrub = scr.Scrublet(counts_matrix)",
+         "Detect: scores, doublets = scrub.scrub_doublets()",
+         "Plot: scrub.plot_histogram()",
+         "Filter: Remove predicted doublets from dataset"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_souporcell_info",
+     "description": "Get comprehensive information about souporcell – scRNA-seq genotype clustering",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "info_type": {
+           "type": "string",
+           "enum": [
+             "overview",
+             "installation",
+             "usage",
+             "documentation"
+           ],
+           "description": "Type of information to retrieve about souporcell"
+         }
+       },
+       "required": [
+         "info_type"
+       ]
+     },
+     "package_name": "souporcell",
+     "local_info": {
+       "name": "souporcell",
+       "description": "Clustering single-cell RNA-seq data by genotype. Deconvolutes multiplexed scRNA-seq samples by identifying genetic variants and assigning cells to individuals based on their genotype profiles.",
+       "category": "Single-Cell Genomics / Demultiplexing",
+       "import_name": "souporcell",
+       "popularity": 70,
+       "keywords": [
+         "single-cell",
+         "genotype clustering",
+         "demultiplexing",
+         "scRNA-seq",
+         "variant calling"
+       ],
+       "documentation": "https://github.com/wheaton5/souporcell",
+       "repository": "https://github.com/wheaton5/souporcell",
+       "installation": {
+         "pip": "pip install souporcell",
+         "conda": "conda install -c conda-forge souporcell"
+       },
+       "usage_example": "# souporcell is primarily a command-line tool\n# Here we demonstrate the concepts and analysis workflow\n\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.cluster import KMeans\nfrom sklearn.decomposition import PCA\nfrom sklearn.metrics import adjusted_rand_score\nimport tempfile\nimport os\n\nprint('souporcell - Single-cell Genotype Clustering')\nprint('=' * 45)\n\n# Overview of souporcell workflow\nprint('souporcell Workflow:')\nprint('1. Variant calling from scRNA-seq reads')\nprint('2. Genotype matrix construction')\nprint('3. Clustering cells by genotype similarity')\nprint('4. Assignment of cells to individuals')\nprint('5. Quality control and validation')\n\nprint('\\nKey Features:')\nprint('• Handles multiplexed scRNA-seq samples')\nprint('• No prior genotype information required')\nprint('• Identifies ambient RNA contamination')\nprint('• Provides cluster assignments and QC metrics')\nprint('• Compatible with 10x Genomics data')\n\n# Simulate multiplexed single-cell data\nprint('\\n=== Simulating Multiplexed scRNA-seq Data ===')\n\nnp.random.seed(42)\n\n# Simulation parameters\nn_individuals = 4\nn_cells_per_individual = 500\nn_variants = 1000\nn_genes = 2000\n\ntotal_cells = n_individuals * n_cells_per_individual\nprint(f'Simulating {total_cells} cells from {n_individuals} individuals')\nprint(f'Using {n_variants} genetic variants and {n_genes} genes')\n\n# Generate individual genotypes\nprint('\\nGenerating individual genotypes...')\nindividual_genotypes = {}\n\nfor ind_id in range(n_individuals):\n # Each individual has different allele frequencies\n genotype = np.random.choice([0, 1, 2], size=n_variants, p=[0.6, 0.3, 0.1])\n individual_genotypes[f'Individual_{ind_id}'] = genotype\n\nprint(f'Generated genotypes for {len(individual_genotypes)} individuals')\n\n# Calculate genotype differences between individuals\ngenotype_matrix = np.array([geno for geno in individual_genotypes.values()])\nprint(f'Genotype matrix shape: {genotype_matrix.shape}')\n\n# Pairwise differences\nprint('\\nPairwise genotype differences:')\nfor i in range(n_individuals):\n for j in range(i+1, n_individuals):\n diff = np.sum(genotype_matrix[i] != genotype_matrix[j])\n similarity = 1 - (diff / n_variants)\n print(f' Individual_{i} vs Individual_{j}: {diff} differences ({similarity:.3f} similarity)')\n\n# Generate cell-level data\nprint('\\nGenerating cell-level genotype data...')\n\ncell_genotypes = []\ncell_labels = []\ncell_ids = []\n\nfor ind_id in range(n_individuals):\n individual_geno = individual_genotypes[f'Individual_{ind_id}']\n \n for cell_id in range(n_cells_per_individual):\n # Add noise to simulate technical variation and allelic dropout\n cell_geno = individual_geno.copy()\n \n # Simulate allelic dropout (some variants not detected)\n dropout_rate = 0.1\n dropout_mask = np.random.random(n_variants) < dropout_rate\n cell_geno[dropout_mask] = 0 # Set to homozygous reference\n \n # Add some random noise (technical errors)\n noise_rate = 0.02\n noise_mask = np.random.random(n_variants) < noise_rate\n cell_geno[noise_mask] = np.random.choice([0, 1, 2], size=np.sum(noise_mask))\n \n cell_genotypes.append(cell_geno)\n cell_labels.append(ind_id)\n cell_ids.append(f'Cell_{ind_id}_{cell_id}')\n\ncell_genotype_matrix = np.array(cell_genotypes)\nprint(f'Cell genotype matrix shape: {cell_genotype_matrix.shape}')\nprint(f'Cells per individual: {[cell_labels.count(i) for i in range(n_individuals)]}')\n\n# Add ambient RNA contamination (doublets)\nprint('\\nSimulating ambient RNA contamination (doublets)...')\nn_doublets = 100\n\nfor doublet_id in range(n_doublets):\n # Mix genotypes from two random individuals\n ind1, ind2 = np.random.choice(n_individuals, size=2, replace=False)\n \n geno1 = individual_genotypes[f'Individual_{ind1}']\n geno2 = individual_genotypes[f'Individual_{ind2}']\n \n # Create mixed genotype (roughly 50:50 mix)\n mixed_geno = np.where(np.random.random(n_variants) < 0.5, geno1, geno2)\n \n # Add to cell data\n cell_genotypes.append(mixed_geno)\n cell_labels.append(-1) # Doublet label\n cell_ids.append(f'Doublet_{doublet_id}')\n\n# Update matrices\ncell_genotype_matrix = np.array(cell_genotypes)\ntotal_cells_with_doublets = len(cell_genotypes)\n\nprint(f'Total cells (including doublets): {total_cells_with_doublets}')\nprint(f'Doublets added: {n_doublets}')\nprint(f'Singlets: {total_cells_with_doublets - n_doublets}')\n\n# Dimensionality reduction for visualization\nprint('\\n=== Genotype-based Clustering Analysis ===')\n\n# PCA on genotype data\nprint('Performing PCA on genotype matrix...')\npca = PCA(n_components=10)\npca_result = pca.fit_transform(cell_genotype_matrix)\n\nprint(f'PCA explained variance ratio (first 5 components): {pca.explained_variance_ratio_[:5]}')\nprint(f'Cumulative explained variance (first 5): {np.cumsum(pca.explained_variance_ratio_[:5])}')\n\n# K-means clustering\nprint('\\nPerforming K-means clustering...')\n\n# Try different numbers of clusters\ncluster_range = range(2, 8)\ninertias = []\nari_scores = []\n\nfor k in cluster_range:\n kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)\n cluster_labels = kmeans.fit_predict(pca_result[:, :5]) # Use first 5 PCs\n \n inertias.append(kmeans.inertia_)\n \n # Calculate ARI against true labels (excluding doublets)\n true_labels_clean = [label for label in cell_labels if label != -1]\n cluster_labels_clean = cluster_labels[:len(true_labels_clean)]\n \n if len(set(true_labels_clean)) > 1 and len(set(cluster_labels_clean)) > 1:\n ari = adjusted_rand_score(true_labels_clean, cluster_labels_clean)\n ari_scores.append(ari)\n else:\n ari_scores.append(0)\n\nprint(f'Inertias for k=2 to 7: {[f\"{inertia:.0f}\" for inertia in inertias]}')\nprint(f'ARI scores for k=2 to 7: {[f\"{ari:.3f}\" for ari in ari_scores]}')\n\n# Choose optimal k (highest ARI)\nbest_k = cluster_range[np.argmax(ari_scores)]\nprint(f'\\nOptimal number of clusters: {best_k} (ARI: {max(ari_scores):.3f})')\n\n# Final clustering with optimal k\nfinal_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)\nfinal_clusters = final_kmeans.fit_predict(pca_result[:, :5])\n\n# Analyze cluster assignments\nprint('\\n=== Cluster Assignment Analysis ===')\n\n# Create assignment matrix\ncluster_assignment = pd.DataFrame({\n 'cell_id': cell_ids,\n 'true_individual': cell_labels,\n 'predicted_cluster': final_clusters,\n 'is_doublet': [label == -1 for label in cell_labels]\n})\n\nprint(f'Cluster assignment summary:')\nprint(cluster_assignment.groupby(['true_individual', 'predicted_cluster']).size().unstack(fill_value=0))\n\n# Calculate cluster purity\nprint('\\nCluster purity analysis:')\nfor cluster_id in range(best_k):\n cluster_cells = cluster_assignment[cluster_assignment['predicted_cluster'] == cluster_id]\n \n if len(cluster_cells) > 0:\n # Exclude doublets from purity calculation\n singlets_in_cluster = cluster_cells[~cluster_cells['is_doublet']]\n \n if len(singlets_in_cluster) > 0:\n most_common_individual = singlets_in_cluster['true_individual'].mode()\n if len(most_common_individual) > 0:\n purity = (singlets_in_cluster['true_individual'] == most_common_individual.iloc[0]).mean()\n print(f' Cluster {cluster_id}: {len(cluster_cells)} cells, '\n f'purity = {purity:.3f}, '\n f'doublets = {cluster_cells[\"is_doublet\"].sum()}')\n\n# Doublet detection analysis\nprint('\\n=== Doublet Detection Analysis ===')\n\n# Cells in clusters with mixed individuals are potential doublets\ndoublet_scores = []\n\nfor idx, row in cluster_assignment.iterrows():\n cluster_id = row['predicted_cluster']\n cluster_cells = cluster_assignment[cluster_assignment['predicted_cluster'] == cluster_id]\n \n # Calculate heterogeneity score for this cluster\n singlets_in_cluster = cluster_cells[~cluster_cells['is_doublet']]\n \n if len(singlets_in_cluster) > 1:\n individual_counts = singlets_in_cluster['true_individual'].value_counts()\n heterogeneity = 1 - (individual_counts.max() / len(singlets_in_cluster))\n else:\n heterogeneity = 0\n \n doublet_scores.append(heterogeneity)\n\ncluster_assignment['doublet_score'] = doublet_scores\n\n# Set threshold for doublet detection\ndoublet_threshold = 0.3\npredicted_doublets = cluster_assignment['doublet_score'] > doublet_threshold\n\n# Evaluate doublet detection\ntrue_doublets = cluster_assignment['is_doublet']\ndoublet_tp = sum(predicted_doublets & true_doublets)\ndoublet_fp = sum(predicted_doublets & ~true_doublets)\ndoublet_fn = sum(~predicted_doublets & true_doublets)\ndoublet_tn = sum(~predicted_doublets & ~true_doublets)\n\ndoublet_precision = doublet_tp / (doublet_tp + doublet_fp) if (doublet_tp + doublet_fp) > 0 else 0\ndoublet_recall = doublet_tp / (doublet_tp + doublet_fn) if (doublet_tp + doublet_fn) > 0 else 0\n\nprint(f'Doublet detection performance:')\nprint(f' True doublets: {sum(true_doublets)}')\nprint(f' Predicted doublets: {sum(predicted_doublets)}')\nprint(f' Precision: {doublet_precision:.3f}')\nprint(f' Recall: {doublet_recall:.3f}')\n\n# Quality control metrics\nprint('\\n=== Quality Control Metrics ===')\n\n# Calculate per-cell variant detection rate\nvariant_detection_rates = []\nfor cell_geno in cell_genotype_matrix:\n non_zero_variants = np.sum(cell_geno > 0)\n detection_rate = non_zero_variants / n_variants\n variant_detection_rates.append(detection_rate)\n\ncluster_assignment['variant_detection_rate'] = variant_detection_rates\n\nprint(f'Variant detection rates:')\nprint(f' Mean: {np.mean(variant_detection_rates):.3f}')\nprint(f' Median: {np.median(variant_detection_rates):.3f}')\nprint(f' Range: {np.min(variant_detection_rates):.3f} - {np.max(variant_detection_rates):.3f}')\n\n# Per-individual statistics\nprint(f'\\nPer-individual assignment accuracy:')\nfor ind_id in range(n_individuals):\n individual_cells = cluster_assignment[cluster_assignment['true_individual'] == ind_id]\n \n if len(individual_cells) > 0:\n # Most common cluster assignment\n most_common_cluster = individual_cells['predicted_cluster'].mode()\n if len(most_common_cluster) > 0:\n accuracy = (individual_cells['predicted_cluster'] == most_common_cluster.iloc[0]).mean()\n print(f' Individual {ind_id}: {accuracy:.3f} ({len(individual_cells)} cells)')\n\n# Visualization\nprint('\\n=== Visualization ===')\n\nfig, axes = plt.subplots(2, 2, figsize=(15, 12))\n\n# 1. PCA plot colored by true individual\nscatter1 = axes[0, 0].scatter(pca_result[:, 0], pca_result[:, 1], \n c=cell_labels, cmap='tab10', alpha=0.6, s=20)\naxes[0, 0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\naxes[0, 0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\naxes[0, 0].set_title('PCA - True Individuals')\n\n# 2. PCA plot colored by predicted cluster\nscatter2 = axes[0, 1].scatter(pca_result[:, 0], pca_result[:, 1], \n c=final_clusters, cmap='tab10', alpha=0.6, s=20)\naxes[0, 1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\naxes[0, 1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\naxes[0, 1].set_title('PCA - Predicted Clusters')\n\n# 3. Clustering evaluation metrics\nmetrics = ['Elbow Method', 'ARI Score']\naxes[1, 0].plot(cluster_range, inertias, 'bo-', label='Inertia')\naxes[1, 0].set_xlabel('Number of Clusters')\naxes[1, 0].set_ylabel('Inertia', color='blue')\naxes[1, 0].set_title('Clustering Evaluation')\naxes[1, 0].tick_params(axis='y', labelcolor='blue')\n\n# Secondary y-axis for ARI\nax_twin = axes[1, 0].twinx()\nax_twin.plot(cluster_range, ari_scores, 'ro-', label='ARI')\nax_twin.set_ylabel('Adjusted Rand Index', color='red')\nax_twin.tick_params(axis='y', labelcolor='red')\nax_twin.axvline(x=best_k, color='green', linestyle='--', alpha=0.7, label=f'Optimal k={best_k}')\n\n# 4. Doublet score distribution\naxes[1, 1].hist(cluster_assignment[cluster_assignment['is_doublet']]['doublet_score'], \n alpha=0.7, label='True doublets', bins=20, color='red')\naxes[1, 1].hist(cluster_assignment[~cluster_assignment['is_doublet']]['doublet_score'], \n alpha=0.7, label='Singlets', bins=20, color='blue')\naxes[1, 1].axvline(x=doublet_threshold, color='green', linestyle='--', \n label=f'Threshold ({doublet_threshold})')\naxes[1, 1].set_xlabel('Doublet Score')\naxes[1, 1].set_ylabel('Count')\naxes[1, 1].set_title('Doublet Score Distribution')\naxes[1, 1].legend()\n\nplt.tight_layout()\n\n# Save visualization\nwith tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:\n plt.savefig(tmp.name, dpi=150, bbox_inches='tight')\n viz_file = tmp.name\n\nplt.close()\nprint(f'Analysis visualization saved to: {viz_file}')\n\n# Summary report\nprint('\\n' + '=' * 45)\nprint('SOUPORCELL ANALYSIS SUMMARY')\nprint('=' * 45)\nprint(f'Total cells analyzed: {total_cells_with_doublets:,}')\nprint(f'True individuals: {n_individuals}')\nprint(f'Predicted clusters: {best_k}')\nprint(f'Clustering accuracy (ARI): {max(ari_scores):.3f}')\nprint(f'\\nDoublet detection:')\nprint(f' True doublets: {sum(true_doublets)}')\nprint(f' Detected doublets: {sum(predicted_doublets)}')\nprint(f' Precision: {doublet_precision:.3f}')\nprint(f' Recall: {doublet_recall:.3f}')\nprint(f'\\nQuality metrics:')\nprint(f' Mean variant detection rate: {np.mean(variant_detection_rates):.3f}')\nprint(f' Genetic variants used: {n_variants:,}')\n\n# Cleanup\nos.unlink(viz_file)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nsouporcell provides:')\nprint('• Genotype-based cell clustering')\nprint('• Multiplexed sample demultiplexing')\nprint('• Doublet detection and removal')\nprint('• Quality control metrics')\nprint('• Integration with standard scRNA-seq pipelines')\nprint('• Support for 10x Genomics data')\nprint('• Ambient RNA contamination detection')\n\nprint('\\nTypical souporcell command:')\nprint('souporcell_pipeline.py -i possorted_genome_bam.bam \\\\')\nprint(' -b filtered_feature_bc_matrix -f reference.fasta \\\\')\nprint(' -t 8 -o souporcell_output -k 4')",
+       "quick_start": [
+         "Install: pip install souporcell",
+         "Run: souporcell_pipeline.py -i input.bam -b barcodes",
+         "Reference: -f reference.fasta",
+         "Clusters: -k number_of_individuals",
+         "Output: -o output_directory",
+         "Check cluster assignments in output files"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_pyscenic_info",
+     "description": "Get comprehensive information about pySCENIC – single-cell regulatory network inference",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "info_type": {
+           "type": "string",
+           "enum": [
+             "overview",
+             "installation",
+             "usage",
+             "documentation"
+           ],
+           "description": "Type of information to retrieve about pySCENIC"
+         }
+       },
+       "required": [
+         "info_type"
+       ]
+     },
+     "package_name": "pyscenic",
+     "local_info": {
+       "name": "pySCENIC",
+       "description": "Python implementation of SCENIC (Single-CEll regulatory Network Inference and Clustering) for inferring gene regulatory networks from single-cell RNA-seq data and identifying cell states.",
+       "category": "Single-Cell Regulatory Networks",
+       "import_name": "pyscenic",
+       "popularity": 75,
+       "keywords": [
+         "regulatory networks",
+         "transcription factors",
+         "single-cell",
+         "gene regulation",
+         "SCENIC"
+       ],
+       "documentation": "https://pyscenic.readthedocs.io/",
+       "repository": "https://github.com/aertslab/pySCENIC",
+       "installation": {
+         "pip": "pip install pyscenic",
+         "conda": "conda install -c bioconda pyscenic"
+       },
+       "usage_example": "import pandas as pd\nfrom pyscenic.utils import modules_from_adjacencies\nfrom pyscenic.prune import prune2df, df2regulons\nfrom pyscenic.aucell import aucell\n\n# Load expression data (genes x cells)\nex_matrix = pd.read_csv('expression_matrix.csv', index_col=0)\n\n# Step 1: Infer co-expression modules\n# This step requires GRNBoost2 or GENIE3\n# adjacencies = grnboost2(ex_matrix, tf_names=tf_names)\n\n# Step 2: Prune modules for targets with cis-regulatory footprints\n# Requires motif databases and cis-regulatory regions\n# df = prune2df(dbs, modules, motif_annotations)\n# regulons = df2regulons(df)\n\n# Step 3: Calculate cellular enrichment (AUCell)\n# auc_matrix = aucell(ex_matrix, regulons)\n\nprint('pySCENIC workflow:')\nprint('1. Infer co-expression modules with GRNBoost2')\nprint('2. Prune modules using motif enrichment')\nprint('3. Score regulon activity with AUCell')\nprint('4. Identify cell states and types')\n\n# Note: Full workflow requires additional data files:\n# - Transcription factor list\n# - Motif databases (e.g., JASPAR)\n# - Cis-regulatory regions (e.g., from ENCODE)",
+       "quick_start": [
+         "Install: pip install pyscenic",
+         "Prepare expression matrix (genes x cells)",
+         "Download motif databases and TF annotations",
+         "Run GRNBoost2: infer co-expression modules",
+         "Prune modules: prune2df() with motif data",
+         "Score activity: aucell() for regulon enrichment"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_tiledbsoma_info",
+     "description": "Get comprehensive information about TileDB-SOMA – single-cell data storage with TileDB",
+     "parameter": {
+       "type": "object",
+       "properties": {
+         "info_type": {
+           "type": "string",
+           "enum": [
+             "overview",
+             "installation",
+             "usage",
+             "documentation"
+           ],
+           "description": "Type of information to retrieve about TileDB-SOMA"
+         }
+       },
+       "required": [
+         "info_type"
+       ]
+     },
+     "package_name": "tiledbsoma",
+     "local_info": {
+       "name": "TileDB-SOMA",
+       "description": "Single-cell Observation Matrix API (SOMA) implementation using TileDB for efficient storage and retrieval of single-cell genomics data. Provides scalable access to large single-cell datasets.",
+       "category": "Single-Cell Data Storage",
+       "import_name": "tiledbsoma",
+       "popularity": 60,
+       "keywords": [
+         "single-cell",
+         "SOMA",
+         "TileDB",
+         "genomics storage",
+         "scalable data access"
+       ],
+       "documentation": "https://github.com/single-cell-data/TileDB-SOMA",
+       "repository": "https://github.com/single-cell-data/TileDB-SOMA",
+       "installation": {
+         "pip": "pip install tiledbsoma",
+         "conda": "conda install -c conda-forge tiledbsoma"
+       },
+       "usage_example": "import tiledbsoma as soma\nimport numpy as np\nimport pandas as pd\nimport tempfile\nimport os\n\n# Create temporary directory for demo\ntemp_dir = tempfile.mkdtemp()\nsoma_uri = os.path.join(temp_dir, 'demo_soma')\n\nprint(f'Creating SOMA experiment at: {soma_uri}')\n\n# Create sample single-cell data\nn_obs = 1000 # cells\nn_var = 2000 # genes\n\n# Create observation (cell) metadata\nobs_data = pd.DataFrame({\n 'cell_id': [f'cell_{i}' for i in range(n_obs)],\n 'cell_type': np.random.choice(['TypeA', 'TypeB', 'TypeC'], n_obs),\n 'batch': np.random.choice(['batch1', 'batch2'], n_obs),\n 'n_genes': np.random.poisson(1500, n_obs)\n})\nobs_data.set_index('cell_id', inplace=True)\n\n# Create variable (gene) metadata\nvar_data = pd.DataFrame({\n 'gene_id': [f'gene_{i}' for i in range(n_var)],\n 'gene_name': [f'Gene_{i}' for i in range(n_var)],\n 'highly_variable': np.random.choice([True, False], n_var, p=[0.2, 0.8])\n})\nvar_data.set_index('gene_id', inplace=True)\n\nprint(f'Created metadata: {n_obs} cells, {n_var} genes')\n\n# Create sparse expression matrix\nfrom scipy.sparse import random as sparse_random\nX_sparse = sparse_random(n_obs, n_var, density=0.1, format='csr', \n random_state=42) * 100\nX_sparse = X_sparse.astype(np.float32)\n\nprint(f'Created sparse matrix: {X_sparse.shape}, density: {X_sparse.nnz / X_sparse.size:.3f}')\n\n# Create SOMA experiment\nwith soma.Experiment.create(soma_uri) as exp:\n # Add observation metadata\n exp.obs = soma.DataFrame.create(\n os.path.join(soma_uri, 'obs'),\n schema=soma.DataFrame._build_schema_from_pandas(obs_data)\n )\n exp.obs.write(obs_data)\n \n # Add variable metadata\n exp.var = soma.DataFrame.create(\n os.path.join(soma_uri, 'var'),\n schema=soma.DataFrame._build_schema_from_pandas(var_data)\n )\n exp.var.write(var_data)\n \n print('Written metadata to SOMA experiment')\n\n# Read data back\nprint('\\nReading data from SOMA experiment:')\nwith soma.Experiment.open(soma_uri) as exp:\n # Read observation metadata\n obs_df = exp.obs.read().concat().to_pandas()\n print(f'Observations: {obs_df.shape}')\n print(f'Cell types: {obs_df[\"cell_type\"].value_counts().to_dict()}')\n \n # Read variable metadata\n var_df = exp.var.read().concat().to_pandas()\n print(f'Variables: {var_df.shape}')\n print(f'Highly variable genes: {var_df[\"highly_variable\"].sum()}')\n\n# Cleanup\nimport shutil\nshutil.rmtree(temp_dir)\nprint('\\nDemo complete - temporary files cleaned up')\n\nprint('\\nTileDB-SOMA provides:')\nprint('- Efficient storage for large single-cell datasets')\nprint('- SOMA API compatibility')\nprint('- Integration with scanpy and other tools')\nprint('- Cloud-native storage capabilities')",
+       "quick_start": [
+         "Install: pip install tiledbsoma",
+         "Create experiment: soma.Experiment.create(uri)",
+         "Add metadata: exp.obs/var = soma.DataFrame.create()",
+         "Write data: dataframe.write(pandas_df)",
+         "Read data: dataframe.read().concat().to_pandas()",
+         "Integrate with single-cell analysis workflows"
+       ]
+     }
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_scvi_tools_info",
+     "description": "Get information about the scvi-tools package. Deep probabilistic analysis of single-cell omics data",
+     "package_name": "scvi-tools",
+     "parameter": {
+       "type": "object",
+       "properties": {},
+       "required": []
+     },
+     "required": []
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_cellrank_info",
+     "description": "Get information about the cellrank package. Trajectory inference and cell fate mapping in single-cell data",
+     "package_name": "cellrank",
+     "parameter": {
+       "type": "object",
+       "properties": {},
+       "required": []
+     },
+     "required": []
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_velocyto_info",
+     "description": "Get information about the velocyto package. RNA velocity analysis for single cell RNA-seq data",
+     "package_name": "velocyto",
+     "parameter": {
+       "type": "object",
+       "properties": {},
+       "required": []
+     },
+     "required": []
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_scanorama_info",
+     "description": "Get information about the scanorama package. Batch correction and integration of single-cell data",
+     "package_name": "scanorama",
+     "parameter": {
+       "type": "object",
+       "properties": {},
+       "required": []
+     },
+     "required": []
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_palantir_info",
+     "description": "Get information about the palantir package. Algorithm for modeling continuous cell state transitions",
+     "package_name": "palantir",
+     "parameter": {
+       "type": "object",
+       "properties": {},
+       "required": []
+     },
+     "required": []
+   },
+   {
+     "type": "PackageTool",
+     "name": "get_episcanpy_info",
+     "description": "Get information about the episcanpy package. Epigenomics single cell analysis in Python",
+     "package_name": "episcanpy",
+     "parameter": {
+       "type": "object",
+       "properties": {},
+       "required": []
+     },
+     "required": []
+   }
+ ]