@synsci/cli-darwin-x64-baseline 1.1.77 → 1.1.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/skills/adaptyv/SKILL.md +114 -0
- package/bin/skills/adaptyv/reference/api_reference.md +308 -0
- package/bin/skills/adaptyv/reference/examples.md +913 -0
- package/bin/skills/adaptyv/reference/experiments.md +360 -0
- package/bin/skills/adaptyv/reference/protein_optimization.md +637 -0
- package/bin/skills/aeon/SKILL.md +374 -0
- package/bin/skills/aeon/references/anomaly_detection.md +154 -0
- package/bin/skills/aeon/references/classification.md +144 -0
- package/bin/skills/aeon/references/clustering.md +123 -0
- package/bin/skills/aeon/references/datasets_benchmarking.md +387 -0
- package/bin/skills/aeon/references/distances.md +256 -0
- package/bin/skills/aeon/references/forecasting.md +140 -0
- package/bin/skills/aeon/references/networks.md +289 -0
- package/bin/skills/aeon/references/regression.md +118 -0
- package/bin/skills/aeon/references/segmentation.md +163 -0
- package/bin/skills/aeon/references/similarity_search.md +187 -0
- package/bin/skills/aeon/references/transformations.md +246 -0
- package/bin/skills/alphafold-database/SKILL.md +513 -0
- package/bin/skills/alphafold-database/references/api_reference.md +423 -0
- package/bin/skills/anndata/SKILL.md +400 -0
- package/bin/skills/anndata/references/best_practices.md +525 -0
- package/bin/skills/anndata/references/concatenation.md +396 -0
- package/bin/skills/anndata/references/data_structure.md +314 -0
- package/bin/skills/anndata/references/io_operations.md +404 -0
- package/bin/skills/anndata/references/manipulation.md +516 -0
- package/bin/skills/arboreto/SKILL.md +243 -0
- package/bin/skills/arboreto/references/algorithms.md +138 -0
- package/bin/skills/arboreto/references/basic_inference.md +151 -0
- package/bin/skills/arboreto/references/distributed_computing.md +242 -0
- package/bin/skills/arboreto/scripts/basic_grn_inference.py +97 -0
- package/bin/skills/astropy/SKILL.md +331 -0
- package/bin/skills/astropy/references/coordinates.md +273 -0
- package/bin/skills/astropy/references/cosmology.md +307 -0
- package/bin/skills/astropy/references/fits.md +396 -0
- package/bin/skills/astropy/references/tables.md +489 -0
- package/bin/skills/astropy/references/time.md +404 -0
- package/bin/skills/astropy/references/units.md +178 -0
- package/bin/skills/astropy/references/wcs_and_other_modules.md +373 -0
- package/bin/skills/benchling-integration/SKILL.md +480 -0
- package/bin/skills/benchling-integration/references/api_endpoints.md +883 -0
- package/bin/skills/benchling-integration/references/authentication.md +379 -0
- package/bin/skills/benchling-integration/references/sdk_reference.md +774 -0
- package/bin/skills/biopython/SKILL.md +443 -0
- package/bin/skills/biopython/references/advanced.md +577 -0
- package/bin/skills/biopython/references/alignment.md +362 -0
- package/bin/skills/biopython/references/blast.md +455 -0
- package/bin/skills/biopython/references/databases.md +484 -0
- package/bin/skills/biopython/references/phylogenetics.md +566 -0
- package/bin/skills/biopython/references/sequence_io.md +285 -0
- package/bin/skills/biopython/references/structure.md +564 -0
- package/bin/skills/biorxiv-database/SKILL.md +483 -0
- package/bin/skills/biorxiv-database/references/api_reference.md +280 -0
- package/bin/skills/biorxiv-database/scripts/biorxiv_search.py +445 -0
- package/bin/skills/bioservices/SKILL.md +361 -0
- package/bin/skills/bioservices/references/identifier_mapping.md +685 -0
- package/bin/skills/bioservices/references/services_reference.md +636 -0
- package/bin/skills/bioservices/references/workflow_patterns.md +811 -0
- package/bin/skills/bioservices/scripts/batch_id_converter.py +347 -0
- package/bin/skills/bioservices/scripts/compound_cross_reference.py +378 -0
- package/bin/skills/bioservices/scripts/pathway_analysis.py +309 -0
- package/bin/skills/bioservices/scripts/protein_analysis_workflow.py +408 -0
- package/bin/skills/brenda-database/SKILL.md +719 -0
- package/bin/skills/brenda-database/references/api_reference.md +537 -0
- package/bin/skills/brenda-database/scripts/brenda_queries.py +844 -0
- package/bin/skills/brenda-database/scripts/brenda_visualization.py +772 -0
- package/bin/skills/brenda-database/scripts/enzyme_pathway_builder.py +1053 -0
- package/bin/skills/cellxgene-census/SKILL.md +511 -0
- package/bin/skills/cellxgene-census/references/census_schema.md +182 -0
- package/bin/skills/cellxgene-census/references/common_patterns.md +351 -0
- package/bin/skills/chembl-database/SKILL.md +389 -0
- package/bin/skills/chembl-database/references/api_reference.md +272 -0
- package/bin/skills/chembl-database/scripts/example_queries.py +278 -0
- package/bin/skills/cirq/SKILL.md +346 -0
- package/bin/skills/cirq/references/building.md +307 -0
- package/bin/skills/cirq/references/experiments.md +572 -0
- package/bin/skills/cirq/references/hardware.md +515 -0
- package/bin/skills/cirq/references/noise.md +515 -0
- package/bin/skills/cirq/references/simulation.md +350 -0
- package/bin/skills/cirq/references/transformation.md +416 -0
- package/bin/skills/clinicaltrials-database/SKILL.md +507 -0
- package/bin/skills/clinicaltrials-database/references/api_reference.md +358 -0
- package/bin/skills/clinicaltrials-database/scripts/query_clinicaltrials.py +215 -0
- package/bin/skills/clinpgx-database/SKILL.md +638 -0
- package/bin/skills/clinpgx-database/references/api_reference.md +757 -0
- package/bin/skills/clinpgx-database/scripts/query_clinpgx.py +518 -0
- package/bin/skills/clinvar-database/SKILL.md +362 -0
- package/bin/skills/clinvar-database/references/api_reference.md +227 -0
- package/bin/skills/clinvar-database/references/clinical_significance.md +218 -0
- package/bin/skills/clinvar-database/references/data_formats.md +358 -0
- package/bin/skills/cobrapy/SKILL.md +463 -0
- package/bin/skills/cobrapy/references/api_quick_reference.md +655 -0
- package/bin/skills/cobrapy/references/workflows.md +593 -0
- package/bin/skills/cosmic-database/SKILL.md +336 -0
- package/bin/skills/cosmic-database/references/cosmic_data_reference.md +220 -0
- package/bin/skills/cosmic-database/scripts/download_cosmic.py +231 -0
- package/bin/skills/dask/SKILL.md +456 -0
- package/bin/skills/dask/references/arrays.md +497 -0
- package/bin/skills/dask/references/bags.md +468 -0
- package/bin/skills/dask/references/best-practices.md +277 -0
- package/bin/skills/dask/references/dataframes.md +368 -0
- package/bin/skills/dask/references/futures.md +541 -0
- package/bin/skills/dask/references/schedulers.md +504 -0
- package/bin/skills/datacommons-client/SKILL.md +255 -0
- package/bin/skills/datacommons-client/references/getting_started.md +417 -0
- package/bin/skills/datacommons-client/references/node.md +250 -0
- package/bin/skills/datacommons-client/references/observation.md +185 -0
- package/bin/skills/datacommons-client/references/resolve.md +246 -0
- package/bin/skills/datamol/SKILL.md +706 -0
- package/bin/skills/datamol/references/conformers_module.md +131 -0
- package/bin/skills/datamol/references/core_api.md +130 -0
- package/bin/skills/datamol/references/descriptors_viz.md +195 -0
- package/bin/skills/datamol/references/fragments_scaffolds.md +174 -0
- package/bin/skills/datamol/references/io_module.md +109 -0
- package/bin/skills/datamol/references/reactions_data.md +218 -0
- package/bin/skills/deepchem/SKILL.md +597 -0
- package/bin/skills/deepchem/references/api_reference.md +303 -0
- package/bin/skills/deepchem/references/workflows.md +491 -0
- package/bin/skills/deepchem/scripts/graph_neural_network.py +338 -0
- package/bin/skills/deepchem/scripts/predict_solubility.py +224 -0
- package/bin/skills/deepchem/scripts/transfer_learning.py +375 -0
- package/bin/skills/deeptools/SKILL.md +531 -0
- package/bin/skills/deeptools/assets/quick_reference.md +58 -0
- package/bin/skills/deeptools/references/effective_genome_sizes.md +116 -0
- package/bin/skills/deeptools/references/normalization_methods.md +410 -0
- package/bin/skills/deeptools/references/tools_reference.md +533 -0
- package/bin/skills/deeptools/references/workflows.md +474 -0
- package/bin/skills/deeptools/scripts/validate_files.py +195 -0
- package/bin/skills/deeptools/scripts/workflow_generator.py +454 -0
- package/bin/skills/denario/SKILL.md +215 -0
- package/bin/skills/denario/references/examples.md +494 -0
- package/bin/skills/denario/references/installation.md +213 -0
- package/bin/skills/denario/references/llm_configuration.md +265 -0
- package/bin/skills/denario/references/research_pipeline.md +471 -0
- package/bin/skills/diffdock/SKILL.md +483 -0
- package/bin/skills/diffdock/assets/batch_template.csv +4 -0
- package/bin/skills/diffdock/assets/custom_inference_config.yaml +90 -0
- package/bin/skills/diffdock/references/confidence_and_limitations.md +182 -0
- package/bin/skills/diffdock/references/parameters_reference.md +163 -0
- package/bin/skills/diffdock/references/workflows_examples.md +392 -0
- package/bin/skills/diffdock/scripts/analyze_results.py +334 -0
- package/bin/skills/diffdock/scripts/prepare_batch_csv.py +254 -0
- package/bin/skills/diffdock/scripts/setup_check.py +278 -0
- package/bin/skills/dnanexus-integration/SKILL.md +383 -0
- package/bin/skills/dnanexus-integration/references/app-development.md +247 -0
- package/bin/skills/dnanexus-integration/references/configuration.md +646 -0
- package/bin/skills/dnanexus-integration/references/data-operations.md +400 -0
- package/bin/skills/dnanexus-integration/references/job-execution.md +412 -0
- package/bin/skills/dnanexus-integration/references/python-sdk.md +523 -0
- package/bin/skills/document-skills/docx/LICENSE.txt +30 -0
- package/bin/skills/document-skills/docx/SKILL.md +233 -0
- package/bin/skills/document-skills/docx/docx-js.md +350 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/mce/mc.xsd +75 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bin/skills/document-skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/pack.py +159 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/unpack.py +29 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/validate.py +69 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/validation/__init__.py +15 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/validation/base.py +951 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/validation/docx.py +274 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/validation/pptx.py +315 -0
- package/bin/skills/document-skills/docx/ooxml/scripts/validation/redlining.py +279 -0
- package/bin/skills/document-skills/docx/ooxml.md +610 -0
- package/bin/skills/document-skills/docx/scripts/__init__.py +1 -0
- package/bin/skills/document-skills/docx/scripts/document.py +1276 -0
- package/bin/skills/document-skills/docx/scripts/templates/comments.xml +3 -0
- package/bin/skills/document-skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/bin/skills/document-skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/bin/skills/document-skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/bin/skills/document-skills/docx/scripts/templates/people.xml +3 -0
- package/bin/skills/document-skills/docx/scripts/utilities.py +374 -0
- package/bin/skills/document-skills/pdf/LICENSE.txt +30 -0
- package/bin/skills/document-skills/pdf/SKILL.md +330 -0
- package/bin/skills/document-skills/pdf/forms.md +205 -0
- package/bin/skills/document-skills/pdf/reference.md +612 -0
- package/bin/skills/document-skills/pdf/scripts/check_bounding_boxes.py +70 -0
- package/bin/skills/document-skills/pdf/scripts/check_bounding_boxes_test.py +226 -0
- package/bin/skills/document-skills/pdf/scripts/check_fillable_fields.py +12 -0
- package/bin/skills/document-skills/pdf/scripts/convert_pdf_to_images.py +35 -0
- package/bin/skills/document-skills/pdf/scripts/create_validation_image.py +41 -0
- package/bin/skills/document-skills/pdf/scripts/extract_form_field_info.py +152 -0
- package/bin/skills/document-skills/pdf/scripts/fill_fillable_fields.py +114 -0
- package/bin/skills/document-skills/pdf/scripts/fill_pdf_form_with_annotations.py +108 -0
- package/bin/skills/document-skills/pptx/LICENSE.txt +30 -0
- package/bin/skills/document-skills/pptx/SKILL.md +520 -0
- package/bin/skills/document-skills/pptx/html2pptx.md +625 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/mce/mc.xsd +75 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bin/skills/document-skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/pack.py +159 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/unpack.py +29 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/validate.py +69 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/validation/__init__.py +15 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/validation/base.py +951 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/validation/docx.py +274 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/validation/pptx.py +315 -0
- package/bin/skills/document-skills/pptx/ooxml/scripts/validation/redlining.py +279 -0
- package/bin/skills/document-skills/pptx/ooxml.md +427 -0
- package/bin/skills/document-skills/pptx/scripts/html2pptx.js +979 -0
- package/bin/skills/document-skills/pptx/scripts/inventory.py +1020 -0
- package/bin/skills/document-skills/pptx/scripts/rearrange.py +231 -0
- package/bin/skills/document-skills/pptx/scripts/replace.py +385 -0
- package/bin/skills/document-skills/pptx/scripts/thumbnail.py +450 -0
- package/bin/skills/document-skills/xlsx/LICENSE.txt +30 -0
- package/bin/skills/document-skills/xlsx/SKILL.md +325 -0
- package/bin/skills/document-skills/xlsx/recalc.py +178 -0
- package/bin/skills/drugbank-database/SKILL.md +190 -0
- package/bin/skills/drugbank-database/references/chemical-analysis.md +590 -0
- package/bin/skills/drugbank-database/references/data-access.md +242 -0
- package/bin/skills/drugbank-database/references/drug-queries.md +386 -0
- package/bin/skills/drugbank-database/references/interactions.md +425 -0
- package/bin/skills/drugbank-database/references/targets-pathways.md +518 -0
- package/bin/skills/drugbank-database/scripts/drugbank_helper.py +350 -0
- package/bin/skills/ena-database/SKILL.md +204 -0
- package/bin/skills/ena-database/references/api_reference.md +490 -0
- package/bin/skills/ensembl-database/SKILL.md +311 -0
- package/bin/skills/ensembl-database/references/api_endpoints.md +346 -0
- package/bin/skills/ensembl-database/scripts/ensembl_query.py +427 -0
- package/bin/skills/esm/SKILL.md +306 -0
- package/bin/skills/esm/references/esm-c-api.md +583 -0
- package/bin/skills/esm/references/esm3-api.md +452 -0
- package/bin/skills/esm/references/forge-api.md +657 -0
- package/bin/skills/esm/references/workflows.md +685 -0
- package/bin/skills/etetoolkit/SKILL.md +623 -0
- package/bin/skills/etetoolkit/references/api_reference.md +583 -0
- package/bin/skills/etetoolkit/references/visualization.md +783 -0
- package/bin/skills/etetoolkit/references/workflows.md +774 -0
- package/bin/skills/etetoolkit/scripts/quick_visualize.py +214 -0
- package/bin/skills/etetoolkit/scripts/tree_operations.py +229 -0
- package/bin/skills/exploratory-data-analysis/SKILL.md +446 -0
- package/bin/skills/exploratory-data-analysis/assets/report_template.md +196 -0
- package/bin/skills/exploratory-data-analysis/references/bioinformatics_genomics_formats.md +664 -0
- package/bin/skills/exploratory-data-analysis/references/chemistry_molecular_formats.md +664 -0
- package/bin/skills/exploratory-data-analysis/references/general_scientific_formats.md +518 -0
- package/bin/skills/exploratory-data-analysis/references/microscopy_imaging_formats.md +620 -0
- package/bin/skills/exploratory-data-analysis/references/proteomics_metabolomics_formats.md +517 -0
- package/bin/skills/exploratory-data-analysis/references/spectroscopy_analytical_formats.md +633 -0
- package/bin/skills/exploratory-data-analysis/scripts/eda_analyzer.py +547 -0
- package/bin/skills/fda-database/SKILL.md +518 -0
- package/bin/skills/fda-database/references/animal_veterinary.md +377 -0
- package/bin/skills/fda-database/references/api_basics.md +687 -0
- package/bin/skills/fda-database/references/devices.md +632 -0
- package/bin/skills/fda-database/references/drugs.md +468 -0
- package/bin/skills/fda-database/references/foods.md +374 -0
- package/bin/skills/fda-database/references/other.md +472 -0
- package/bin/skills/fda-database/scripts/fda_examples.py +335 -0
- package/bin/skills/fda-database/scripts/fda_query.py +440 -0
- package/bin/skills/flowio/SKILL.md +608 -0
- package/bin/skills/flowio/references/api_reference.md +372 -0
- package/bin/skills/fluidsim/SKILL.md +349 -0
- package/bin/skills/fluidsim/references/advanced_features.md +398 -0
- package/bin/skills/fluidsim/references/installation.md +68 -0
- package/bin/skills/fluidsim/references/output_analysis.md +283 -0
- package/bin/skills/fluidsim/references/parameters.md +198 -0
- package/bin/skills/fluidsim/references/simulation_workflow.md +172 -0
- package/bin/skills/fluidsim/references/solvers.md +94 -0
- package/bin/skills/fred-economic-data/SKILL.md +433 -0
- package/bin/skills/fred-economic-data/references/api_basics.md +212 -0
- package/bin/skills/fred-economic-data/references/categories.md +442 -0
- package/bin/skills/fred-economic-data/references/geofred.md +588 -0
- package/bin/skills/fred-economic-data/references/releases.md +642 -0
- package/bin/skills/fred-economic-data/references/series.md +584 -0
- package/bin/skills/fred-economic-data/references/sources.md +423 -0
- package/bin/skills/fred-economic-data/references/tags.md +485 -0
- package/bin/skills/fred-economic-data/scripts/fred_examples.py +354 -0
- package/bin/skills/fred-economic-data/scripts/fred_query.py +590 -0
- package/bin/skills/gene-database/SKILL.md +179 -0
- package/bin/skills/gene-database/references/api_reference.md +404 -0
- package/bin/skills/gene-database/references/common_workflows.md +428 -0
- package/bin/skills/gene-database/scripts/batch_gene_lookup.py +298 -0
- package/bin/skills/gene-database/scripts/fetch_gene_data.py +277 -0
- package/bin/skills/gene-database/scripts/query_gene.py +251 -0
- package/bin/skills/geniml/SKILL.md +318 -0
- package/bin/skills/geniml/references/bedspace.md +127 -0
- package/bin/skills/geniml/references/consensus_peaks.md +238 -0
- package/bin/skills/geniml/references/region2vec.md +90 -0
- package/bin/skills/geniml/references/scembed.md +197 -0
- package/bin/skills/geniml/references/utilities.md +385 -0
- package/bin/skills/geo-database/SKILL.md +815 -0
- package/bin/skills/geo-database/references/geo_reference.md +829 -0
- package/bin/skills/geopandas/SKILL.md +251 -0
- package/bin/skills/geopandas/references/crs-management.md +243 -0
- package/bin/skills/geopandas/references/data-io.md +165 -0
- package/bin/skills/geopandas/references/data-structures.md +70 -0
- package/bin/skills/geopandas/references/geometric-operations.md +221 -0
- package/bin/skills/geopandas/references/spatial-analysis.md +184 -0
- package/bin/skills/geopandas/references/visualization.md +243 -0
- package/bin/skills/get-available-resources/SKILL.md +277 -0
- package/bin/skills/get-available-resources/scripts/detect_resources.py +401 -0
- package/bin/skills/gget/SKILL.md +871 -0
- package/bin/skills/gget/references/database_info.md +300 -0
- package/bin/skills/gget/references/module_reference.md +467 -0
- package/bin/skills/gget/references/workflows.md +814 -0
- package/bin/skills/gget/scripts/batch_sequence_analysis.py +191 -0
- package/bin/skills/gget/scripts/enrichment_pipeline.py +235 -0
- package/bin/skills/gget/scripts/gene_analysis.py +161 -0
- package/bin/skills/gtars/SKILL.md +285 -0
- package/bin/skills/gtars/references/cli.md +222 -0
- package/bin/skills/gtars/references/coverage.md +172 -0
- package/bin/skills/gtars/references/overlap.md +156 -0
- package/bin/skills/gtars/references/python-api.md +211 -0
- package/bin/skills/gtars/references/refget.md +147 -0
- package/bin/skills/gtars/references/tokenizers.md +103 -0
- package/bin/skills/gwas-database/SKILL.md +608 -0
- package/bin/skills/gwas-database/references/api_reference.md +793 -0
- package/bin/skills/histolab/SKILL.md +678 -0
- package/bin/skills/histolab/references/filters_preprocessing.md +514 -0
- package/bin/skills/histolab/references/slide_management.md +172 -0
- package/bin/skills/histolab/references/tile_extraction.md +421 -0
- package/bin/skills/histolab/references/tissue_masks.md +251 -0
- package/bin/skills/histolab/references/visualization.md +547 -0
- package/bin/skills/hmdb-database/SKILL.md +196 -0
- package/bin/skills/hmdb-database/references/hmdb_data_fields.md +267 -0
- package/bin/skills/hypogenic/SKILL.md +655 -0
- package/bin/skills/hypogenic/references/config_template.yaml +150 -0
- package/bin/skills/imaging-data-commons/SKILL.md +1182 -0
- package/bin/skills/imaging-data-commons/references/bigquery_guide.md +556 -0
- package/bin/skills/imaging-data-commons/references/cli_guide.md +272 -0
- package/bin/skills/imaging-data-commons/references/cloud_storage_guide.md +333 -0
- package/bin/skills/imaging-data-commons/references/dicomweb_guide.md +399 -0
- package/bin/skills/infographics/SKILL.md +563 -0
- package/bin/skills/infographics/references/color_palettes.md +496 -0
- package/bin/skills/infographics/references/design_principles.md +636 -0
- package/bin/skills/infographics/references/infographic_types.md +907 -0
- package/bin/skills/infographics/scripts/generate_infographic.py +234 -0
- package/bin/skills/infographics/scripts/generate_infographic_ai.py +1290 -0
- package/bin/skills/iso-13485-certification/SKILL.md +680 -0
- package/bin/skills/iso-13485-certification/assets/templates/procedures/CAPA-procedure-template.md +453 -0
- package/bin/skills/iso-13485-certification/assets/templates/procedures/document-control-procedure-template.md +567 -0
- package/bin/skills/iso-13485-certification/assets/templates/quality-manual-template.md +521 -0
- package/bin/skills/iso-13485-certification/references/gap-analysis-checklist.md +568 -0
- package/bin/skills/iso-13485-certification/references/iso-13485-requirements.md +610 -0
- package/bin/skills/iso-13485-certification/references/mandatory-documents.md +606 -0
- package/bin/skills/iso-13485-certification/references/quality-manual-guide.md +688 -0
- package/bin/skills/iso-13485-certification/scripts/gap_analyzer.py +440 -0
- package/bin/skills/kegg-database/SKILL.md +377 -0
- package/bin/skills/kegg-database/references/kegg_reference.md +326 -0
- package/bin/skills/kegg-database/scripts/kegg_api.py +251 -0
- package/bin/skills/labarchive-integration/SKILL.md +268 -0
- package/bin/skills/labarchive-integration/references/api_reference.md +342 -0
- package/bin/skills/labarchive-integration/references/authentication_guide.md +357 -0
- package/bin/skills/labarchive-integration/references/integrations.md +425 -0
- package/bin/skills/labarchive-integration/scripts/entry_operations.py +334 -0
- package/bin/skills/labarchive-integration/scripts/notebook_operations.py +269 -0
- package/bin/skills/labarchive-integration/scripts/setup_config.py +205 -0
- package/bin/skills/lamindb/SKILL.md +390 -0
- package/bin/skills/lamindb/references/annotation-validation.md +513 -0
- package/bin/skills/lamindb/references/core-concepts.md +380 -0
- package/bin/skills/lamindb/references/data-management.md +433 -0
- package/bin/skills/lamindb/references/integrations.md +642 -0
- package/bin/skills/lamindb/references/ontologies.md +497 -0
- package/bin/skills/lamindb/references/setup-deployment.md +733 -0
- package/bin/skills/latchbio-integration/SKILL.md +353 -0
- package/bin/skills/latchbio-integration/references/data-management.md +427 -0
- package/bin/skills/latchbio-integration/references/resource-configuration.md +429 -0
- package/bin/skills/latchbio-integration/references/verified-workflows.md +487 -0
- package/bin/skills/latchbio-integration/references/workflow-creation.md +254 -0
- package/bin/skills/matchms/SKILL.md +203 -0
- package/bin/skills/matchms/references/filtering.md +288 -0
- package/bin/skills/matchms/references/importing_exporting.md +416 -0
- package/bin/skills/matchms/references/similarity.md +380 -0
- package/bin/skills/matchms/references/workflows.md +647 -0
- package/bin/skills/matlab/SKILL.md +376 -0
- package/bin/skills/matlab/references/data-import-export.md +479 -0
- package/bin/skills/matlab/references/executing-scripts.md +444 -0
- package/bin/skills/matlab/references/graphics-visualization.md +579 -0
- package/bin/skills/matlab/references/mathematics.md +553 -0
- package/bin/skills/matlab/references/matrices-arrays.md +349 -0
- package/bin/skills/matlab/references/octave-compatibility.md +544 -0
- package/bin/skills/matlab/references/programming.md +672 -0
- package/bin/skills/matlab/references/python-integration.md +433 -0
- package/bin/skills/matplotlib/SKILL.md +361 -0
- package/bin/skills/matplotlib/references/api_reference.md +412 -0
- package/bin/skills/matplotlib/references/common_issues.md +563 -0
- package/bin/skills/matplotlib/references/plot_types.md +476 -0
- package/bin/skills/matplotlib/references/styling_guide.md +589 -0
- package/bin/skills/matplotlib/scripts/plot_template.py +401 -0
- package/bin/skills/matplotlib/scripts/style_configurator.py +409 -0
- package/bin/skills/medchem/SKILL.md +406 -0
- package/bin/skills/medchem/references/api_guide.md +600 -0
- package/bin/skills/medchem/references/rules_catalog.md +604 -0
- package/bin/skills/medchem/scripts/filter_molecules.py +418 -0
- package/bin/skills/metabolomics-workbench-database/SKILL.md +259 -0
- package/bin/skills/metabolomics-workbench-database/references/api_reference.md +494 -0
- package/bin/skills/modal-research-gpu/SKILL.md +238 -0
- package/bin/skills/molfeat/SKILL.md +511 -0
- package/bin/skills/molfeat/references/api_reference.md +428 -0
- package/bin/skills/molfeat/references/available_featurizers.md +333 -0
- package/bin/skills/molfeat/references/examples.md +723 -0
- package/bin/skills/networkx/SKILL.md +437 -0
- package/bin/skills/networkx/references/algorithms.md +383 -0
- package/bin/skills/networkx/references/generators.md +378 -0
- package/bin/skills/networkx/references/graph-basics.md +283 -0
- package/bin/skills/networkx/references/io.md +441 -0
- package/bin/skills/networkx/references/visualization.md +529 -0
- package/bin/skills/neurokit2/SKILL.md +356 -0
- package/bin/skills/neurokit2/references/bio_module.md +417 -0
- package/bin/skills/neurokit2/references/complexity.md +715 -0
- package/bin/skills/neurokit2/references/ecg_cardiac.md +355 -0
- package/bin/skills/neurokit2/references/eda.md +497 -0
- package/bin/skills/neurokit2/references/eeg.md +506 -0
- package/bin/skills/neurokit2/references/emg.md +408 -0
- package/bin/skills/neurokit2/references/eog.md +407 -0
- package/bin/skills/neurokit2/references/epochs_events.md +471 -0
- package/bin/skills/neurokit2/references/hrv.md +480 -0
- package/bin/skills/neurokit2/references/ppg.md +413 -0
- package/bin/skills/neurokit2/references/rsp.md +510 -0
- package/bin/skills/neurokit2/references/signal_processing.md +648 -0
- package/bin/skills/neuropixels-analysis/SKILL.md +350 -0
- package/bin/skills/neuropixels-analysis/assets/analysis_template.py +271 -0
- package/bin/skills/neuropixels-analysis/references/AI_CURATION.md +345 -0
- package/bin/skills/neuropixels-analysis/references/ANALYSIS.md +392 -0
- package/bin/skills/neuropixels-analysis/references/AUTOMATED_CURATION.md +358 -0
- package/bin/skills/neuropixels-analysis/references/MOTION_CORRECTION.md +323 -0
- package/bin/skills/neuropixels-analysis/references/PREPROCESSING.md +273 -0
- package/bin/skills/neuropixels-analysis/references/QUALITY_METRICS.md +359 -0
- package/bin/skills/neuropixels-analysis/references/SPIKE_SORTING.md +339 -0
- package/bin/skills/neuropixels-analysis/references/api_reference.md +415 -0
- package/bin/skills/neuropixels-analysis/references/plotting_guide.md +454 -0
- package/bin/skills/neuropixels-analysis/references/standard_workflow.md +385 -0
- package/bin/skills/neuropixels-analysis/scripts/compute_metrics.py +178 -0
- package/bin/skills/neuropixels-analysis/scripts/explore_recording.py +168 -0
- package/bin/skills/neuropixels-analysis/scripts/export_to_phy.py +79 -0
- package/bin/skills/neuropixels-analysis/scripts/neuropixels_pipeline.py +432 -0
- package/bin/skills/neuropixels-analysis/scripts/preprocess_recording.py +122 -0
- package/bin/skills/neuropixels-analysis/scripts/run_sorting.py +98 -0
- package/bin/skills/offer-k-dense-web/SKILL.md +21 -0
- package/bin/skills/omero-integration/SKILL.md +251 -0
- package/bin/skills/omero-integration/references/advanced.md +631 -0
- package/bin/skills/omero-integration/references/connection.md +369 -0
- package/bin/skills/omero-integration/references/data_access.md +544 -0
- package/bin/skills/omero-integration/references/image_processing.md +665 -0
- package/bin/skills/omero-integration/references/metadata.md +688 -0
- package/bin/skills/omero-integration/references/rois.md +648 -0
- package/bin/skills/omero-integration/references/scripts.md +637 -0
- package/bin/skills/omero-integration/references/tables.md +532 -0
- package/bin/skills/openalex-database/SKILL.md +494 -0
- package/bin/skills/openalex-database/references/api_guide.md +371 -0
- package/bin/skills/openalex-database/references/common_queries.md +381 -0
- package/bin/skills/openalex-database/scripts/openalex_client.py +337 -0
- package/bin/skills/openalex-database/scripts/query_helpers.py +306 -0
- package/bin/skills/opentargets-database/SKILL.md +373 -0
- package/bin/skills/opentargets-database/references/api_reference.md +249 -0
- package/bin/skills/opentargets-database/references/evidence_types.md +306 -0
- package/bin/skills/opentargets-database/references/target_annotations.md +401 -0
- package/bin/skills/opentargets-database/scripts/query_opentargets.py +403 -0
- package/bin/skills/opentrons-integration/SKILL.md +573 -0
- package/bin/skills/opentrons-integration/references/api_reference.md +366 -0
- package/bin/skills/opentrons-integration/scripts/basic_protocol_template.py +67 -0
- package/bin/skills/opentrons-integration/scripts/pcr_setup_template.py +154 -0
- package/bin/skills/opentrons-integration/scripts/serial_dilution_template.py +96 -0
- package/bin/skills/pathml/SKILL.md +166 -0
- package/bin/skills/pathml/references/data_management.md +742 -0
- package/bin/skills/pathml/references/graphs.md +653 -0
- package/bin/skills/pathml/references/image_loading.md +448 -0
- package/bin/skills/pathml/references/machine_learning.md +725 -0
- package/bin/skills/pathml/references/multiparametric.md +686 -0
- package/bin/skills/pathml/references/preprocessing.md +722 -0
- package/bin/skills/pdb-database/SKILL.md +309 -0
- package/bin/skills/pdb-database/references/api_reference.md +617 -0
- package/bin/skills/pennylane/SKILL.md +226 -0
- package/bin/skills/pennylane/references/advanced_features.md +667 -0
- package/bin/skills/pennylane/references/devices_backends.md +596 -0
- package/bin/skills/pennylane/references/getting_started.md +227 -0
- package/bin/skills/pennylane/references/optimization.md +671 -0
- package/bin/skills/pennylane/references/quantum_chemistry.md +567 -0
- package/bin/skills/pennylane/references/quantum_circuits.md +437 -0
- package/bin/skills/pennylane/references/quantum_ml.md +571 -0
- package/bin/skills/perplexity-search/SKILL.md +448 -0
- package/bin/skills/perplexity-search/assets/.env.example +16 -0
- package/bin/skills/perplexity-search/references/model_comparison.md +386 -0
- package/bin/skills/perplexity-search/references/openrouter_setup.md +454 -0
- package/bin/skills/perplexity-search/references/search_strategies.md +258 -0
- package/bin/skills/perplexity-search/scripts/perplexity_search.py +277 -0
- package/bin/skills/perplexity-search/scripts/setup_env.py +171 -0
- package/bin/skills/plotly/SKILL.md +267 -0
- package/bin/skills/plotly/references/chart-types.md +488 -0
- package/bin/skills/plotly/references/export-interactivity.md +453 -0
- package/bin/skills/plotly/references/graph-objects.md +302 -0
- package/bin/skills/plotly/references/layouts-styling.md +457 -0
- package/bin/skills/plotly/references/plotly-express.md +213 -0
- package/bin/skills/polars/SKILL.md +387 -0
- package/bin/skills/polars/references/best_practices.md +649 -0
- package/bin/skills/polars/references/core_concepts.md +378 -0
- package/bin/skills/polars/references/io_guide.md +557 -0
- package/bin/skills/polars/references/operations.md +602 -0
- package/bin/skills/polars/references/pandas_migration.md +417 -0
- package/bin/skills/polars/references/transformations.md +549 -0
- package/bin/skills/protocolsio-integration/SKILL.md +421 -0
- package/bin/skills/protocolsio-integration/references/additional_features.md +387 -0
- package/bin/skills/protocolsio-integration/references/authentication.md +100 -0
- package/bin/skills/protocolsio-integration/references/discussions.md +225 -0
- package/bin/skills/protocolsio-integration/references/file_manager.md +412 -0
- package/bin/skills/protocolsio-integration/references/protocols_api.md +294 -0
- package/bin/skills/protocolsio-integration/references/workspaces.md +293 -0
- package/bin/skills/pubchem-database/SKILL.md +574 -0
- package/bin/skills/pubchem-database/references/api_reference.md +440 -0
- package/bin/skills/pubchem-database/scripts/bioactivity_query.py +367 -0
- package/bin/skills/pubchem-database/scripts/compound_search.py +297 -0
- package/bin/skills/pubmed-database/SKILL.md +460 -0
- package/bin/skills/pubmed-database/references/api_reference.md +298 -0
- package/bin/skills/pubmed-database/references/common_queries.md +453 -0
- package/bin/skills/pubmed-database/references/search_syntax.md +436 -0
- package/bin/skills/pufferlib/SKILL.md +436 -0
- package/bin/skills/pufferlib/references/environments.md +508 -0
- package/bin/skills/pufferlib/references/integration.md +621 -0
- package/bin/skills/pufferlib/references/policies.md +653 -0
- package/bin/skills/pufferlib/references/training.md +360 -0
- package/bin/skills/pufferlib/references/vectorization.md +557 -0
- package/bin/skills/pufferlib/scripts/env_template.py +340 -0
- package/bin/skills/pufferlib/scripts/train_template.py +239 -0
- package/bin/skills/pydeseq2/SKILL.md +559 -0
- package/bin/skills/pydeseq2/references/api_reference.md +228 -0
- package/bin/skills/pydeseq2/references/workflow_guide.md +582 -0
- package/bin/skills/pydeseq2/scripts/run_deseq2_analysis.py +353 -0
- package/bin/skills/pydicom/SKILL.md +434 -0
- package/bin/skills/pydicom/references/common_tags.md +228 -0
- package/bin/skills/pydicom/references/transfer_syntaxes.md +352 -0
- package/bin/skills/pydicom/scripts/anonymize_dicom.py +137 -0
- package/bin/skills/pydicom/scripts/dicom_to_image.py +172 -0
- package/bin/skills/pydicom/scripts/extract_metadata.py +173 -0
- package/bin/skills/pyhealth/SKILL.md +491 -0
- package/bin/skills/pyhealth/references/datasets.md +178 -0
- package/bin/skills/pyhealth/references/medical_coding.md +284 -0
- package/bin/skills/pyhealth/references/models.md +594 -0
- package/bin/skills/pyhealth/references/preprocessing.md +638 -0
- package/bin/skills/pyhealth/references/tasks.md +379 -0
- package/bin/skills/pyhealth/references/training_evaluation.md +648 -0
- package/bin/skills/pylabrobot/SKILL.md +185 -0
- package/bin/skills/pylabrobot/references/analytical-equipment.md +464 -0
- package/bin/skills/pylabrobot/references/hardware-backends.md +480 -0
- package/bin/skills/pylabrobot/references/liquid-handling.md +403 -0
- package/bin/skills/pylabrobot/references/material-handling.md +620 -0
- package/bin/skills/pylabrobot/references/resources.md +489 -0
- package/bin/skills/pylabrobot/references/visualization.md +532 -0
- package/bin/skills/pymatgen/SKILL.md +691 -0
- package/bin/skills/pymatgen/references/analysis_modules.md +530 -0
- package/bin/skills/pymatgen/references/core_classes.md +318 -0
- package/bin/skills/pymatgen/references/io_formats.md +469 -0
- package/bin/skills/pymatgen/references/materials_project_api.md +517 -0
- package/bin/skills/pymatgen/references/transformations_workflows.md +591 -0
- package/bin/skills/pymatgen/scripts/phase_diagram_generator.py +233 -0
- package/bin/skills/pymatgen/scripts/structure_analyzer.py +266 -0
- package/bin/skills/pymatgen/scripts/structure_converter.py +169 -0
- package/bin/skills/pymc/SKILL.md +572 -0
- package/bin/skills/pymc/assets/hierarchical_model_template.py +333 -0
- package/bin/skills/pymc/assets/linear_regression_template.py +241 -0
- package/bin/skills/pymc/references/distributions.md +320 -0
- package/bin/skills/pymc/references/sampling_inference.md +424 -0
- package/bin/skills/pymc/references/workflows.md +526 -0
- package/bin/skills/pymc/scripts/model_comparison.py +387 -0
- package/bin/skills/pymc/scripts/model_diagnostics.py +350 -0
- package/bin/skills/pymoo/SKILL.md +571 -0
- package/bin/skills/pymoo/references/algorithms.md +180 -0
- package/bin/skills/pymoo/references/constraints_mcdm.md +417 -0
- package/bin/skills/pymoo/references/operators.md +345 -0
- package/bin/skills/pymoo/references/problems.md +265 -0
- package/bin/skills/pymoo/references/visualization.md +353 -0
- package/bin/skills/pymoo/scripts/custom_problem_example.py +181 -0
- package/bin/skills/pymoo/scripts/decision_making_example.py +161 -0
- package/bin/skills/pymoo/scripts/many_objective_example.py +72 -0
- package/bin/skills/pymoo/scripts/multi_objective_example.py +63 -0
- package/bin/skills/pymoo/scripts/single_objective_example.py +59 -0
- package/bin/skills/pyopenms/SKILL.md +217 -0
- package/bin/skills/pyopenms/references/data_structures.md +497 -0
- package/bin/skills/pyopenms/references/feature_detection.md +410 -0
- package/bin/skills/pyopenms/references/file_io.md +349 -0
- package/bin/skills/pyopenms/references/identification.md +422 -0
- package/bin/skills/pyopenms/references/metabolomics.md +482 -0
- package/bin/skills/pyopenms/references/signal_processing.md +433 -0
- package/bin/skills/pysam/SKILL.md +265 -0
- package/bin/skills/pysam/references/alignment_files.md +280 -0
- package/bin/skills/pysam/references/common_workflows.md +520 -0
- package/bin/skills/pysam/references/sequence_files.md +407 -0
- package/bin/skills/pysam/references/variant_files.md +365 -0
- package/bin/skills/pytdc/SKILL.md +460 -0
- package/bin/skills/pytdc/references/datasets.md +246 -0
- package/bin/skills/pytdc/references/oracles.md +400 -0
- package/bin/skills/pytdc/references/utilities.md +684 -0
- package/bin/skills/pytdc/scripts/benchmark_evaluation.py +327 -0
- package/bin/skills/pytdc/scripts/load_and_split_data.py +214 -0
- package/bin/skills/pytdc/scripts/molecular_generation.py +404 -0
- package/bin/skills/qiskit/SKILL.md +275 -0
- package/bin/skills/qiskit/references/algorithms.md +607 -0
- package/bin/skills/qiskit/references/backends.md +433 -0
- package/bin/skills/qiskit/references/circuits.md +197 -0
- package/bin/skills/qiskit/references/patterns.md +533 -0
- package/bin/skills/qiskit/references/primitives.md +277 -0
- package/bin/skills/qiskit/references/setup.md +99 -0
- package/bin/skills/qiskit/references/transpilation.md +286 -0
- package/bin/skills/qiskit/references/visualization.md +415 -0
- package/bin/skills/qutip/SKILL.md +318 -0
- package/bin/skills/qutip/references/advanced.md +555 -0
- package/bin/skills/qutip/references/analysis.md +523 -0
- package/bin/skills/qutip/references/core_concepts.md +293 -0
- package/bin/skills/qutip/references/time_evolution.md +348 -0
- package/bin/skills/qutip/references/visualization.md +431 -0
- package/bin/skills/rdkit/SKILL.md +780 -0
- package/bin/skills/rdkit/references/api_reference.md +432 -0
- package/bin/skills/rdkit/references/descriptors_reference.md +595 -0
- package/bin/skills/rdkit/references/smarts_patterns.md +668 -0
- package/bin/skills/rdkit/scripts/molecular_properties.py +243 -0
- package/bin/skills/rdkit/scripts/similarity_search.py +297 -0
- package/bin/skills/rdkit/scripts/substructure_filter.py +386 -0
- package/bin/skills/reactome-database/SKILL.md +278 -0
- package/bin/skills/reactome-database/references/api_reference.md +465 -0
- package/bin/skills/reactome-database/scripts/reactome_query.py +286 -0
- package/bin/skills/rowan/SKILL.md +427 -0
- package/bin/skills/rowan/references/api_reference.md +413 -0
- package/bin/skills/rowan/references/molecule_handling.md +429 -0
- package/bin/skills/rowan/references/proteins_and_organization.md +499 -0
- package/bin/skills/rowan/references/rdkit_native.md +438 -0
- package/bin/skills/rowan/references/results_interpretation.md +481 -0
- package/bin/skills/rowan/references/workflow_types.md +591 -0
- package/bin/skills/scanpy/SKILL.md +386 -0
- package/bin/skills/scanpy/assets/analysis_template.py +295 -0
- package/bin/skills/scanpy/references/api_reference.md +251 -0
- package/bin/skills/scanpy/references/plotting_guide.md +352 -0
- package/bin/skills/scanpy/references/standard_workflow.md +206 -0
- package/bin/skills/scanpy/scripts/qc_analysis.py +200 -0
- package/bin/skills/scientific-brainstorming/SKILL.md +191 -0
- package/bin/skills/scientific-brainstorming/references/brainstorming_methods.md +326 -0
- package/bin/skills/scientific-visualization/SKILL.md +779 -0
- package/bin/skills/scientific-visualization/assets/color_palettes.py +197 -0
- package/bin/skills/scientific-visualization/assets/nature.mplstyle +63 -0
- package/bin/skills/scientific-visualization/assets/presentation.mplstyle +61 -0
- package/bin/skills/scientific-visualization/assets/publication.mplstyle +68 -0
- package/bin/skills/scientific-visualization/references/color_palettes.md +348 -0
- package/bin/skills/scientific-visualization/references/journal_requirements.md +320 -0
- package/bin/skills/scientific-visualization/references/matplotlib_examples.md +620 -0
- package/bin/skills/scientific-visualization/references/publication_guidelines.md +205 -0
- package/bin/skills/scientific-visualization/scripts/figure_export.py +343 -0
- package/bin/skills/scientific-visualization/scripts/style_presets.py +416 -0
- package/bin/skills/scikit-bio/SKILL.md +437 -0
- package/bin/skills/scikit-bio/references/api_reference.md +749 -0
- package/bin/skills/scikit-learn/SKILL.md +521 -0
- package/bin/skills/scikit-learn/references/model_evaluation.md +592 -0
- package/bin/skills/scikit-learn/references/pipelines_and_composition.md +612 -0
- package/bin/skills/scikit-learn/references/preprocessing.md +606 -0
- package/bin/skills/scikit-learn/references/quick_reference.md +433 -0
- package/bin/skills/scikit-learn/references/supervised_learning.md +378 -0
- package/bin/skills/scikit-learn/references/unsupervised_learning.md +505 -0
- package/bin/skills/scikit-learn/scripts/classification_pipeline.py +257 -0
- package/bin/skills/scikit-learn/scripts/clustering_analysis.py +386 -0
- package/bin/skills/scikit-survival/SKILL.md +399 -0
- package/bin/skills/scikit-survival/references/competing-risks.md +397 -0
- package/bin/skills/scikit-survival/references/cox-models.md +182 -0
- package/bin/skills/scikit-survival/references/data-handling.md +494 -0
- package/bin/skills/scikit-survival/references/ensemble-models.md +327 -0
- package/bin/skills/scikit-survival/references/evaluation-metrics.md +378 -0
- package/bin/skills/scikit-survival/references/svm-models.md +411 -0
- package/bin/skills/scvi-tools/SKILL.md +190 -0
- package/bin/skills/scvi-tools/references/differential-expression.md +581 -0
- package/bin/skills/scvi-tools/references/models-atac-seq.md +321 -0
- package/bin/skills/scvi-tools/references/models-multimodal.md +367 -0
- package/bin/skills/scvi-tools/references/models-scrna-seq.md +330 -0
- package/bin/skills/scvi-tools/references/models-spatial.md +438 -0
- package/bin/skills/scvi-tools/references/models-specialized.md +408 -0
- package/bin/skills/scvi-tools/references/theoretical-foundations.md +438 -0
- package/bin/skills/scvi-tools/references/workflows.md +546 -0
- package/bin/skills/seaborn/SKILL.md +673 -0
- package/bin/skills/seaborn/references/examples.md +822 -0
- package/bin/skills/seaborn/references/function_reference.md +770 -0
- package/bin/skills/seaborn/references/objects_interface.md +964 -0
- package/bin/skills/shap/SKILL.md +566 -0
- package/bin/skills/shap/references/explainers.md +339 -0
- package/bin/skills/shap/references/plots.md +507 -0
- package/bin/skills/shap/references/theory.md +449 -0
- package/bin/skills/shap/references/workflows.md +605 -0
- package/bin/skills/simpy/SKILL.md +429 -0
- package/bin/skills/simpy/references/events.md +374 -0
- package/bin/skills/simpy/references/monitoring.md +475 -0
- package/bin/skills/simpy/references/process-interaction.md +424 -0
- package/bin/skills/simpy/references/real-time.md +395 -0
- package/bin/skills/simpy/references/resources.md +275 -0
- package/bin/skills/simpy/scripts/basic_simulation_template.py +193 -0
- package/bin/skills/simpy/scripts/resource_monitor.py +345 -0
- package/bin/skills/stable-baselines3/SKILL.md +299 -0
- package/bin/skills/stable-baselines3/references/algorithms.md +333 -0
- package/bin/skills/stable-baselines3/references/callbacks.md +556 -0
- package/bin/skills/stable-baselines3/references/custom_environments.md +526 -0
- package/bin/skills/stable-baselines3/references/vectorized_envs.md +568 -0
- package/bin/skills/stable-baselines3/scripts/custom_env_template.py +314 -0
- package/bin/skills/stable-baselines3/scripts/evaluate_agent.py +245 -0
- package/bin/skills/stable-baselines3/scripts/train_rl_agent.py +165 -0
- package/bin/skills/statistical-analysis/SKILL.md +632 -0
- package/bin/skills/statistical-analysis/references/assumptions_and_diagnostics.md +369 -0
- package/bin/skills/statistical-analysis/references/bayesian_statistics.md +661 -0
- package/bin/skills/statistical-analysis/references/effect_sizes_and_power.md +581 -0
- package/bin/skills/statistical-analysis/references/reporting_standards.md +469 -0
- package/bin/skills/statistical-analysis/references/test_selection_guide.md +129 -0
- package/bin/skills/statistical-analysis/scripts/assumption_checks.py +539 -0
- package/bin/skills/statsmodels/SKILL.md +614 -0
- package/bin/skills/statsmodels/references/discrete_choice.md +669 -0
- package/bin/skills/statsmodels/references/glm.md +619 -0
- package/bin/skills/statsmodels/references/linear_models.md +447 -0
- package/bin/skills/statsmodels/references/stats_diagnostics.md +859 -0
- package/bin/skills/statsmodels/references/time_series.md +716 -0
- package/bin/skills/string-database/SKILL.md +534 -0
- package/bin/skills/string-database/references/string_reference.md +455 -0
- package/bin/skills/string-database/scripts/string_api.py +369 -0
- package/bin/skills/sympy/SKILL.md +500 -0
- package/bin/skills/sympy/references/advanced-topics.md +635 -0
- package/bin/skills/sympy/references/code-generation-printing.md +599 -0
- package/bin/skills/sympy/references/core-capabilities.md +348 -0
- package/bin/skills/sympy/references/matrices-linear-algebra.md +526 -0
- package/bin/skills/sympy/references/physics-mechanics.md +592 -0
- package/bin/skills/torch_geometric/SKILL.md +676 -0
- package/bin/skills/torch_geometric/references/datasets_reference.md +574 -0
- package/bin/skills/torch_geometric/references/layers_reference.md +485 -0
- package/bin/skills/torch_geometric/references/transforms_reference.md +679 -0
- package/bin/skills/torch_geometric/scripts/benchmark_model.py +309 -0
- package/bin/skills/torch_geometric/scripts/create_gnn_template.py +529 -0
- package/bin/skills/torch_geometric/scripts/visualize_graph.py +313 -0
- package/bin/skills/torchdrug/SKILL.md +450 -0
- package/bin/skills/torchdrug/references/core_concepts.md +565 -0
- package/bin/skills/torchdrug/references/datasets.md +380 -0
- package/bin/skills/torchdrug/references/knowledge_graphs.md +320 -0
- package/bin/skills/torchdrug/references/models_architectures.md +541 -0
- package/bin/skills/torchdrug/references/molecular_generation.md +352 -0
- package/bin/skills/torchdrug/references/molecular_property_prediction.md +169 -0
- package/bin/skills/torchdrug/references/protein_modeling.md +272 -0
- package/bin/skills/torchdrug/references/retrosynthesis.md +436 -0
- package/bin/skills/transformers/SKILL.md +164 -0
- package/bin/skills/transformers/references/generation.md +467 -0
- package/bin/skills/transformers/references/models.md +361 -0
- package/bin/skills/transformers/references/pipelines.md +335 -0
- package/bin/skills/transformers/references/tokenizers.md +447 -0
- package/bin/skills/transformers/references/training.md +500 -0
- package/bin/skills/umap-learn/SKILL.md +479 -0
- package/bin/skills/umap-learn/references/api_reference.md +532 -0
- package/bin/skills/uniprot-database/SKILL.md +195 -0
- package/bin/skills/uniprot-database/references/api_examples.md +413 -0
- package/bin/skills/uniprot-database/references/api_fields.md +275 -0
- package/bin/skills/uniprot-database/references/id_mapping_databases.md +285 -0
- package/bin/skills/uniprot-database/references/query_syntax.md +256 -0
- package/bin/skills/uniprot-database/scripts/uniprot_client.py +341 -0
- package/bin/skills/uspto-database/SKILL.md +607 -0
- package/bin/skills/uspto-database/references/additional_apis.md +394 -0
- package/bin/skills/uspto-database/references/patentsearch_api.md +266 -0
- package/bin/skills/uspto-database/references/peds_api.md +212 -0
- package/bin/skills/uspto-database/references/trademark_api.md +358 -0
- package/bin/skills/uspto-database/scripts/patent_search.py +290 -0
- package/bin/skills/uspto-database/scripts/peds_client.py +285 -0
- package/bin/skills/uspto-database/scripts/trademark_client.py +311 -0
- package/bin/skills/vaex/SKILL.md +182 -0
- package/bin/skills/vaex/references/core_dataframes.md +367 -0
- package/bin/skills/vaex/references/data_processing.md +555 -0
- package/bin/skills/vaex/references/io_operations.md +703 -0
- package/bin/skills/vaex/references/machine_learning.md +728 -0
- package/bin/skills/vaex/references/performance.md +571 -0
- package/bin/skills/vaex/references/visualization.md +613 -0
- package/bin/skills/zarr-python/SKILL.md +779 -0
- package/bin/skills/zarr-python/references/api_reference.md +515 -0
- package/bin/skills/zinc-database/SKILL.md +404 -0
- package/bin/skills/zinc-database/references/api_reference.md +692 -0
- package/bin/synsc +0 -0
- package/package.json +1 -1
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
# Dask Best Practices
|
|
2
|
+
|
|
3
|
+
## Performance Optimization Principles
|
|
4
|
+
|
|
5
|
+
### Start with Simpler Solutions First
|
|
6
|
+
|
|
7
|
+
Before implementing parallel computing with Dask, explore these alternatives:
|
|
8
|
+
- Better algorithms for the specific problem
|
|
9
|
+
- Efficient file formats (Parquet, HDF5, Zarr instead of CSV)
|
|
10
|
+
- Compiled code via Numba or Cython
|
|
11
|
+
- Data sampling for development and testing
|
|
12
|
+
|
|
13
|
+
These alternatives often provide better returns than distributed systems and should be exhausted before scaling to parallel computing.
|
|
14
|
+
|
|
15
|
+
### Chunk Size Strategy
|
|
16
|
+
|
|
17
|
+
**Critical Rule**: Chunks should be small enough that many fit in a worker's available memory at once.
|
|
18
|
+
|
|
19
|
+
**Recommended Target**: Size chunks so workers can hold 10 chunks per core without exceeding available memory.
|
|
20
|
+
|
|
21
|
+
**Why It Matters**:
|
|
22
|
+
- Too large chunks: Memory overflow and inefficient parallelization
|
|
23
|
+
- Too small chunks: Excessive scheduling overhead
|
|
24
|
+
|
|
25
|
+
**Example Calculation**:
|
|
26
|
+
- 8 cores with 32 GB RAM
|
|
27
|
+
- Target: ~400 MB per chunk (32 GB / 8 cores / 10 chunks)
|
|
28
|
+
|
|
29
|
+
### Monitor with the Dashboard
|
|
30
|
+
|
|
31
|
+
The Dask dashboard provides essential visibility into:
|
|
32
|
+
- Worker states and resource utilization
|
|
33
|
+
- Task progress and bottlenecks
|
|
34
|
+
- Memory usage patterns
|
|
35
|
+
- Performance characteristics
|
|
36
|
+
|
|
37
|
+
Access the dashboard to understand what's actually slow in parallel workloads rather than guessing at optimizations.
|
|
38
|
+
|
|
39
|
+
## Critical Pitfalls to Avoid
|
|
40
|
+
|
|
41
|
+
### 1. Don't Create Large Objects Locally Before Dask
|
|
42
|
+
|
|
43
|
+
**Wrong Approach**:
|
|
44
|
+
```python
|
|
45
|
+
import pandas as pd
|
|
46
|
+
import dask.dataframe as dd
|
|
47
|
+
|
|
48
|
+
# Loads entire dataset into memory first
|
|
49
|
+
df = pd.read_csv('large_file.csv')
|
|
50
|
+
ddf = dd.from_pandas(df, npartitions=10)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
**Correct Approach**:
|
|
54
|
+
```python
|
|
55
|
+
import dask.dataframe as dd
|
|
56
|
+
|
|
57
|
+
# Let Dask handle the loading
|
|
58
|
+
ddf = dd.read_csv('large_file.csv')
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Why**: Loading data with pandas or NumPy first forces the scheduler to serialize and embed those objects in task graphs, defeating the purpose of parallel computing.
|
|
62
|
+
|
|
63
|
+
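**Key Principle**: Load data with Dask's own readers so it is never materialized in one local process, and keep results as Dask collections until they are small enough to bring back locally.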
|
|
64
|
+
|
|
65
|
+
### 2. Avoid Repeated compute() Calls
|
|
66
|
+
|
|
67
|
+
**Wrong Approach**:
|
|
68
|
+
```python
|
|
69
|
+
results = []
|
|
70
|
+
for item in items:
|
|
71
|
+
    result = dask_computation(item).compute() # Each compute is separate
|
|
72
|
+
    results.append(result)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
**Correct Approach**:
|
|
76
|
+
```python
|
|
77
|
+
computations = [dask_computation(item) for item in items]
|
|
78
|
+
results = dask.compute(*computations) # Single compute for all
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Why**: Calling compute in loops prevents Dask from:
|
|
82
|
+
- Parallelizing different computations
|
|
83
|
+
- Sharing intermediate results
|
|
84
|
+
- Optimizing the overall task graph
|
|
85
|
+
|
|
86
|
+
### 3. Don't Build Excessively Large Task Graphs
|
|
87
|
+
|
|
88
|
+
**Symptoms**:
|
|
89
|
+
- Millions of tasks in a single computation
|
|
90
|
+
- Severe scheduling overhead
|
|
91
|
+
- Long delays before computation starts
|
|
92
|
+
|
|
93
|
+
**Solutions**:
|
|
94
|
+
- Increase chunk sizes to reduce number of tasks
|
|
95
|
+
- Use `map_partitions` or `map_blocks` to fuse operations
|
|
96
|
+
- Break computations into smaller pieces with intermediate persists
|
|
97
|
+
- Consider whether the problem truly requires distributed computing
|
|
98
|
+
|
|
99
|
+
**Example Using map_partitions**:
|
|
100
|
+
```python
|
|
101
|
+
# Instead of applying function to each row
|
|
102
|
+
ddf['result'] = ddf.apply(complex_function, axis=1) # Many tasks
|
|
103
|
+
|
|
104
|
+
# Apply to entire partitions at once
|
|
105
|
+
ddf = ddf.map_partitions(lambda df: df.assign(result=complex_function(df)))
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Infrastructure Considerations
|
|
109
|
+
|
|
110
|
+
### Scheduler Selection
|
|
111
|
+
|
|
112
|
+
**Use Threads For**:
|
|
113
|
+
- Numeric work with GIL-releasing libraries (NumPy, pandas, scikit-learn)
|
|
114
|
+
- Operations that benefit from shared memory
|
|
115
|
+
- Single-machine workloads with array/dataframe operations
|
|
116
|
+
|
|
117
|
+
**Use Processes For**:
|
|
118
|
+
- Text processing and Python collection operations
|
|
119
|
+
- Pure Python code that's GIL-bound
|
|
120
|
+
- Operations that need process isolation
|
|
121
|
+
|
|
122
|
+
**Use Distributed Scheduler For**:
|
|
123
|
+
- Multi-machine clusters
|
|
124
|
+
- Workloads that need the diagnostic dashboard
|
|
125
|
+
- Asynchronous APIs
|
|
126
|
+
- Better data locality handling
|
|
127
|
+
|
|
128
|
+
### Thread Configuration
|
|
129
|
+
|
|
130
|
+
**Recommendation**: Aim for roughly 4 threads per process on numeric workloads.
|
|
131
|
+
|
|
132
|
+
**Rationale**:
|
|
133
|
+
- Balance between parallelism and overhead
|
|
134
|
+
- Allows efficient use of CPU cores
|
|
135
|
+
- Reduces context switching costs
|
|
136
|
+
|
|
137
|
+
### Memory Management
|
|
138
|
+
|
|
139
|
+
**Persist Strategically**:
|
|
140
|
+
```python
|
|
141
|
+
# Persist intermediate results that are reused
|
|
142
|
+
intermediate = expensive_computation(data).persist()
|
|
143
|
+
result1 = intermediate.operation1().compute()
|
|
144
|
+
result2 = intermediate.operation2().compute()
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
**Clear Memory When Done**:
|
|
148
|
+
```python
|
|
149
|
+
# Explicitly delete large objects
|
|
150
|
+
del intermediate
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Data Loading Best Practices
|
|
154
|
+
|
|
155
|
+
### Use Appropriate File Formats
|
|
156
|
+
|
|
157
|
+
**For Tabular Data**:
|
|
158
|
+
- Parquet: Columnar, compressed, fast filtering
|
|
159
|
+
- CSV: Only for small data or initial ingestion
|
|
160
|
+
|
|
161
|
+
**For Array Data**:
|
|
162
|
+
- HDF5: Good for numeric arrays
|
|
163
|
+
- Zarr: Cloud-native, parallel-friendly
|
|
164
|
+
- NetCDF: Scientific data with metadata
|
|
165
|
+
|
|
166
|
+
### Optimize Data Ingestion
|
|
167
|
+
|
|
168
|
+
**Read Multiple Files Efficiently**:
|
|
169
|
+
```python
|
|
170
|
+
# Use glob patterns to read multiple files in parallel
|
|
171
|
+
ddf = dd.read_parquet('data/year=2024/month=*/day=*.parquet')
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
**Specify Useful Columns Early**:
|
|
175
|
+
```python
|
|
176
|
+
# Only read needed columns
|
|
177
|
+
ddf = dd.read_parquet('data.parquet', columns=['col1', 'col2', 'col3'])
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Common Patterns and Solutions
|
|
181
|
+
|
|
182
|
+
### Pattern: Embarrassingly Parallel Problems
|
|
183
|
+
|
|
184
|
+
For independent computations, use Futures:
|
|
185
|
+
```python
|
|
186
|
+
from dask.distributed import Client
|
|
187
|
+
|
|
188
|
+
client = Client()
|
|
189
|
+
futures = [client.submit(func, arg) for arg in args]
|
|
190
|
+
results = client.gather(futures)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Pattern: Data Preprocessing Pipeline
|
|
194
|
+
|
|
195
|
+
Use Bags for initial ETL, then convert to structured formats:
|
|
196
|
+
```python
|
|
197
|
+
import dask.bag as db
|
|
198
|
+
|
|
199
|
+
# Process raw JSON
|
|
200
|
+
bag = db.read_text('logs/*.json').map(json.loads)
|
|
201
|
+
bag = bag.filter(lambda x: x['status'] == 'success')
|
|
202
|
+
|
|
203
|
+
# Convert to DataFrame for analysis
|
|
204
|
+
ddf = bag.to_dataframe()
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Pattern: Iterative Algorithms
|
|
208
|
+
|
|
209
|
+
Persist data between iterations:
|
|
210
|
+
```python
|
|
211
|
+
data = dd.read_parquet('data.parquet')
|
|
212
|
+
data = data.persist() # Keep in memory across iterations
|
|
213
|
+
|
|
214
|
+
for iteration in range(num_iterations):
|
|
215
|
+
data = update_function(data)
|
|
216
|
+
data = data.persist() # Persist updated version
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
## Debugging Tips
|
|
220
|
+
|
|
221
|
+
### Use Single-Threaded Scheduler
|
|
222
|
+
|
|
223
|
+
For debugging with pdb or detailed error inspection:
|
|
224
|
+
```python
|
|
225
|
+
import dask
|
|
226
|
+
|
|
227
|
+
dask.config.set(scheduler='synchronous')
|
|
228
|
+
result = computation.compute() # Runs in single thread for debugging
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Check Task Graph Size
|
|
232
|
+
|
|
233
|
+
Before computing, check the number of tasks:
|
|
234
|
+
```python
|
|
235
|
+
print(len(ddf.__dask_graph__())) # Should be reasonable, not millions
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### Validate on Small Data First
|
|
239
|
+
|
|
240
|
+
Test logic on a small subset before scaling:
|
|
241
|
+
```python
|
|
242
|
+
# Test on first partition
|
|
243
|
+
sample = ddf.head(1000)
|
|
244
|
+
# Validate results
|
|
245
|
+
# Then scale to full dataset
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Performance Troubleshooting
|
|
249
|
+
|
|
250
|
+
### Symptom: Slow Computation Start
|
|
251
|
+
|
|
252
|
+
**Likely Cause**: Task graph is too large
|
|
253
|
+
**Solution**: Increase chunk sizes or use map_partitions
|
|
254
|
+
|
|
255
|
+
### Symptom: Memory Errors
|
|
256
|
+
|
|
257
|
+
**Likely Causes**:
|
|
258
|
+
- Chunks too large
|
|
259
|
+
- Too many intermediate results
|
|
260
|
+
- Memory leaks in user functions
|
|
261
|
+
|
|
262
|
+
**Solutions**:
|
|
263
|
+
- Decrease chunk sizes
|
|
264
|
+
- Use persist() strategically and delete when done
|
|
265
|
+
- Profile user functions for memory issues
|
|
266
|
+
|
|
267
|
+
### Symptom: Poor Parallelization
|
|
268
|
+
|
|
269
|
+
**Likely Causes**:
|
|
270
|
+
- Data dependencies preventing parallelism
|
|
271
|
+
- Chunks too large (not enough tasks)
|
|
272
|
+
- GIL contention with threads on Python code
|
|
273
|
+
|
|
274
|
+
**Solutions**:
|
|
275
|
+
- Restructure computation to reduce dependencies
|
|
276
|
+
- Increase number of partitions
|
|
277
|
+
- Switch to multiprocessing scheduler for Python code
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
# Dask DataFrames
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Dask DataFrames enable parallel processing of large tabular data by distributing work across multiple pandas DataFrames. As described in the documentation, "Dask DataFrames are a collection of many pandas DataFrames" that expose a largely pandas-compatible API, making the transition from pandas straightforward.
|
|
6
|
+
|
|
7
|
+
## Core Concept
|
|
8
|
+
|
|
9
|
+
A Dask DataFrame is divided into multiple pandas DataFrames (partitions) along the index:
|
|
10
|
+
- Each partition is a regular pandas DataFrame
|
|
11
|
+
- Operations are applied to each partition in parallel
|
|
12
|
+
- Results are combined automatically
|
|
13
|
+
|
|
14
|
+
## Key Capabilities
|
|
15
|
+
|
|
16
|
+
### Scale
|
|
17
|
+
- Process 100 GiB on a laptop
|
|
18
|
+
- Process 100 TiB on a cluster
|
|
19
|
+
- Handle datasets exceeding available RAM
|
|
20
|
+
|
|
21
|
+
### Compatibility
|
|
22
|
+
- Implements most of the pandas API
|
|
23
|
+
- Easy transition from pandas code
|
|
24
|
+
- Works with familiar operations
|
|
25
|
+
|
|
26
|
+
## When to Use Dask DataFrames
|
|
27
|
+
|
|
28
|
+
**Use Dask When**:
|
|
29
|
+
- Dataset exceeds available RAM
|
|
30
|
+
- Computations require significant time and pandas optimization hasn't helped
|
|
31
|
+
- Need to scale from prototype (pandas) to production (larger data)
|
|
32
|
+
- Working with multiple files that should be processed together
|
|
33
|
+
|
|
34
|
+
**Stick with Pandas When**:
|
|
35
|
+
- Data fits comfortably in memory
|
|
36
|
+
- Computations complete in under a second
|
|
37
|
+
- Simple operations without custom `.apply()` functions
|
|
38
|
+
- Iterative development and exploration
|
|
39
|
+
|
|
40
|
+
## Reading Data
|
|
41
|
+
|
|
42
|
+
Dask mirrors pandas reading syntax with added support for multiple files:
|
|
43
|
+
|
|
44
|
+
### Single File
|
|
45
|
+
```python
|
|
46
|
+
import dask.dataframe as dd
|
|
47
|
+
|
|
48
|
+
# Read single file
|
|
49
|
+
ddf = dd.read_csv('data.csv')
|
|
50
|
+
ddf = dd.read_parquet('data.parquet')
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Multiple Files
|
|
54
|
+
```python
|
|
55
|
+
# Read multiple files using glob patterns
|
|
56
|
+
ddf = dd.read_csv('data/*.csv')
|
|
57
|
+
ddf = dd.read_parquet('s3://mybucket/data/*.parquet')
|
|
58
|
+
|
|
59
|
+
# Read with path structure
|
|
60
|
+
ddf = dd.read_parquet('data/year=*/month=*/day=*.parquet')
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Optimizations
|
|
64
|
+
```python
|
|
65
|
+
# Specify columns to read (reduces memory)
|
|
66
|
+
ddf = dd.read_parquet('data.parquet', columns=['col1', 'col2'])
|
|
67
|
+
|
|
68
|
+
# Control partitioning
|
|
69
|
+
ddf = dd.read_csv('data.csv', blocksize='64MB') # Creates 64MB partitions
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Common Operations
|
|
73
|
+
|
|
74
|
+
All operations are lazy until `.compute()` is called.
|
|
75
|
+
|
|
76
|
+
### Filtering
|
|
77
|
+
```python
|
|
78
|
+
# Same as pandas
|
|
79
|
+
filtered = ddf[ddf['column'] > 100]
|
|
80
|
+
filtered = ddf.query('column > 100')
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Column Operations
|
|
84
|
+
```python
|
|
85
|
+
# Add columns
|
|
86
|
+
ddf['new_column'] = ddf['col1'] + ddf['col2']
|
|
87
|
+
|
|
88
|
+
# Select columns
|
|
89
|
+
subset = ddf[['col1', 'col2', 'col3']]
|
|
90
|
+
|
|
91
|
+
# Drop columns
|
|
92
|
+
ddf = ddf.drop(columns=['unnecessary_col'])
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Aggregations
|
|
96
|
+
```python
|
|
97
|
+
# Standard aggregations work as expected
|
|
98
|
+
mean = ddf['column'].mean().compute()
|
|
99
|
+
sum_total = ddf['column'].sum().compute()
|
|
100
|
+
counts = ddf['category'].value_counts().compute()
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### GroupBy
|
|
104
|
+
```python
|
|
105
|
+
# GroupBy operations (may require shuffle)
|
|
106
|
+
grouped = ddf.groupby('category')['value'].mean().compute()
|
|
107
|
+
|
|
108
|
+
# Multiple aggregations
|
|
109
|
+
agg_result = ddf.groupby('category').agg({
|
|
110
|
+
'value': ['mean', 'sum', 'count'],
|
|
111
|
+
'amount': 'sum'
|
|
112
|
+
}).compute()
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Joins and Merges
|
|
116
|
+
```python
|
|
117
|
+
# Merge DataFrames
|
|
118
|
+
merged = dd.merge(ddf1, ddf2, on='key', how='left')
|
|
119
|
+
|
|
120
|
+
# Join ddf1's 'key' column against ddf2's index
|
|
121
|
+
joined = ddf1.join(ddf2, on='key')
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Sorting
|
|
125
|
+
```python
|
|
126
|
+
# Sorting (expensive operation, requires data movement)
|
|
127
|
+
sorted_ddf = ddf.sort_values('column')
|
|
128
|
+
result = sorted_ddf.compute()
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Custom Operations
|
|
132
|
+
|
|
133
|
+
### Apply Functions
|
|
134
|
+
|
|
135
|
+
**To Partitions (Efficient)**:
|
|
136
|
+
```python
|
|
137
|
+
# Apply function to entire partitions
|
|
138
|
+
def custom_partition_function(partition_df):
|
|
139
|
+
# partition_df is a pandas DataFrame
|
|
140
|
+
return partition_df.assign(new_col=partition_df['col1'] * 2)
|
|
141
|
+
|
|
142
|
+
ddf = ddf.map_partitions(custom_partition_function)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**To Rows (Less Efficient)**:
|
|
146
|
+
```python
|
|
147
|
+
# Apply to each row (runs a Python function per row within each partition; slow)
|
|
148
|
+
ddf['result'] = ddf.apply(lambda row: custom_function(row), axis=1, meta=('result', 'float'))
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
**Note**: Always prefer `map_partitions` over row-wise `apply` for better performance.
|
|
152
|
+
|
|
153
|
+
### Meta Parameter
|
|
154
|
+
|
|
155
|
+
When Dask can't infer output structure, specify the `meta` parameter:
|
|
156
|
+
```python
|
|
157
|
+
# For apply operations
|
|
158
|
+
ddf['new'] = ddf.apply(func, axis=1, meta=('new', 'float64'))
|
|
159
|
+
|
|
160
|
+
# For map_partitions
|
|
161
|
+
ddf = ddf.map_partitions(func, meta=pd.DataFrame({
|
|
162
|
+
'col1': pd.Series(dtype='float64'),
|
|
163
|
+
'col2': pd.Series(dtype='int64')
|
|
164
|
+
}))
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Lazy Evaluation and Computation
|
|
168
|
+
|
|
169
|
+
### Lazy Operations
|
|
170
|
+
```python
|
|
171
|
+
# These operations are lazy (instant, no computation)
|
|
172
|
+
filtered = ddf[ddf['value'] > 100]
|
|
173
|
+
aggregated = filtered.groupby('category').mean()
|
|
174
|
+
final = aggregated[aggregated['value'] < 500]
|
|
175
|
+
|
|
176
|
+
# Nothing has computed yet
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### Triggering Computation
|
|
180
|
+
```python
|
|
181
|
+
# Compute single result
|
|
182
|
+
result = final.compute()
|
|
183
|
+
|
|
184
|
+
# Compute multiple results efficiently
|
|
185
|
+
result1, result2, result3 = dask.compute(
|
|
186
|
+
operation1,
|
|
187
|
+
operation2,
|
|
188
|
+
operation3
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Persist in Memory
|
|
193
|
+
```python
|
|
194
|
+
# Keep results in distributed memory for reuse
|
|
195
|
+
ddf_cached = ddf.persist()
|
|
196
|
+
|
|
197
|
+
# Now multiple operations on ddf_cached won't recompute
|
|
198
|
+
result1 = ddf_cached.mean().compute()
|
|
199
|
+
result2 = ddf_cached.sum().compute()
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## Index Management
|
|
203
|
+
|
|
204
|
+
### Setting Index
|
|
205
|
+
```python
|
|
206
|
+
# Set index (enables efficient joins and index-based operations); sorted=True assumes 'timestamp' is already sorted and avoids a shuffle
|
|
207
|
+
ddf = ddf.set_index('timestamp', sorted=True)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### Index Properties
|
|
211
|
+
- Sorted index enables efficient filtering and joins
|
|
212
|
+
- Index determines partitioning
|
|
213
|
+
- Some operations perform better with an appropriate index
|
|
214
|
+
|
|
215
|
+
## Writing Results
|
|
216
|
+
|
|
217
|
+
### To Files
|
|
218
|
+
```python
|
|
219
|
+
# Write to multiple files (one per partition)
|
|
220
|
+
ddf.to_parquet('output/data.parquet')
|
|
221
|
+
ddf.to_csv('output/data-*.csv')
|
|
222
|
+
|
|
223
|
+
# Write to single file (forces computation and concatenation)
|
|
224
|
+
ddf.compute().to_csv('output/single_file.csv')
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### To Memory (Pandas)
|
|
228
|
+
```python
|
|
229
|
+
# Convert to pandas (loads all data in memory)
|
|
230
|
+
pdf = ddf.compute()
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Performance Considerations
|
|
234
|
+
|
|
235
|
+
### Efficient Operations
|
|
236
|
+
- Column selection and filtering: Very efficient
|
|
237
|
+
- Simple aggregations (sum, mean, count): Efficient
|
|
238
|
+
- Row-wise operations on partitions: Efficient with `map_partitions`
|
|
239
|
+
|
|
240
|
+
### Expensive Operations
|
|
241
|
+
- Sorting: Requires data shuffle across workers
|
|
242
|
+
- GroupBy with many groups: May require shuffle
|
|
243
|
+
- Complex joins: Depends on data distribution
|
|
244
|
+
- Row-wise apply: Runs a Python function per row, bypassing vectorized pandas operations
|
|
245
|
+
|
|
246
|
+
### Optimization Tips
|
|
247
|
+
|
|
248
|
+
**1. Select Columns Early**
|
|
249
|
+
```python
|
|
250
|
+
# Better: Read only needed columns
|
|
251
|
+
ddf = dd.read_parquet('data.parquet', columns=['col1', 'col2'])
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
**2. Filter Before GroupBy**
|
|
255
|
+
```python
|
|
256
|
+
# Better: Reduce data before expensive operations
|
|
257
|
+
result = ddf[ddf['year'] == 2024].groupby('category').sum().compute()
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
**3. Use Efficient File Formats**
|
|
261
|
+
```python
|
|
262
|
+
# Use Parquet instead of CSV for better performance
|
|
263
|
+
ddf.to_parquet('data.parquet') # Faster, smaller, columnar
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
**4. Repartition Appropriately**
|
|
267
|
+
```python
|
|
268
|
+
# If partitions are too small
|
|
269
|
+
ddf = ddf.repartition(npartitions=10)
|
|
270
|
+
|
|
271
|
+
# If partitions are too large
|
|
272
|
+
ddf = ddf.repartition(partition_size='100MB')
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
## Common Patterns
|
|
276
|
+
|
|
277
|
+
### ETL Pipeline
|
|
278
|
+
```python
|
|
279
|
+
import dask.dataframe as dd
|
|
280
|
+
|
|
281
|
+
# Read data
|
|
282
|
+
ddf = dd.read_csv('raw_data/*.csv')
|
|
283
|
+
|
|
284
|
+
# Transform
|
|
285
|
+
ddf = ddf[ddf['status'] == 'valid']
|
|
286
|
+
ddf['amount'] = ddf['amount'].astype('float64')
|
|
287
|
+
ddf = ddf.dropna(subset=['important_col'])
|
|
288
|
+
|
|
289
|
+
# Aggregate
|
|
290
|
+
summary = ddf.groupby('category').agg({
|
|
291
|
+
'amount': ['sum', 'mean'],
|
|
292
|
+
'quantity': 'count'
|
|
293
|
+
})
|
|
294
|
+
|
|
295
|
+
# Write results
|
|
296
|
+
summary.to_parquet('output/summary.parquet')
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
### Time Series Analysis
|
|
300
|
+
```python
|
|
301
|
+
# Read time series data
|
|
302
|
+
ddf = dd.read_parquet('timeseries/*.parquet')
|
|
303
|
+
|
|
304
|
+
# Set timestamp index
|
|
305
|
+
ddf = ddf.set_index('timestamp', sorted=True)
|
|
306
|
+
|
|
307
|
+
# Resample (if available in Dask version)
|
|
308
|
+
hourly = ddf.resample('1H').mean()
|
|
309
|
+
|
|
310
|
+
# Compute statistics
|
|
311
|
+
result = hourly.compute()
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Combining Multiple Files
|
|
315
|
+
```python
|
|
316
|
+
# Read multiple files as single DataFrame
|
|
317
|
+
ddf = dd.read_csv('data/2024-*.csv')
|
|
318
|
+
|
|
319
|
+
# Process combined data
|
|
320
|
+
result = ddf.groupby('category')['value'].sum().compute()
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
## Limitations and Differences from Pandas
|
|
324
|
+
|
|
325
|
+
### Not All Pandas Features Available
|
|
326
|
+
Some pandas operations are not implemented in Dask:
|
|
327
|
+
- Some string methods
|
|
328
|
+
- Certain window functions
|
|
329
|
+
- Some specialized statistical functions
|
|
330
|
+
|
|
331
|
+
### Partitioning Matters
|
|
332
|
+
- Operations within partitions are efficient
|
|
333
|
+
- Cross-partition operations may be expensive
|
|
334
|
+
- Index-based operations benefit from sorted index
|
|
335
|
+
|
|
336
|
+
### Lazy Evaluation
|
|
337
|
+
- Operations don't execute until `.compute()`
|
|
338
|
+
- Need to be aware of computation triggers
|
|
339
|
+
- Can't inspect intermediate results without computing
|
|
340
|
+
|
|
341
|
+
## Debugging Tips
|
|
342
|
+
|
|
343
|
+
### Inspect Partitions
|
|
344
|
+
```python
|
|
345
|
+
# Get number of partitions
|
|
346
|
+
print(ddf.npartitions)
|
|
347
|
+
|
|
348
|
+
# Compute single partition
|
|
349
|
+
first_partition = ddf.get_partition(0).compute()
|
|
350
|
+
|
|
351
|
+
# View first few rows (computes first partition)
|
|
352
|
+
print(ddf.head())
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
### Validate Operations on Small Data
|
|
356
|
+
```python
|
|
357
|
+
# Test on small sample first
|
|
358
|
+
sample = ddf.head(1000)
|
|
359
|
+
# Validate logic works
|
|
360
|
+
# Then scale to full dataset
|
|
361
|
+
result = ddf.compute()
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
### Check Dtypes
|
|
365
|
+
```python
|
|
366
|
+
# Verify data types are correct
|
|
367
|
+
print(ddf.dtypes)
|
|
368
|
+
```
|