nimare-0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/__init__.py +0 -0
- benchmarks/bench_cbma.py +57 -0
- nimare/__init__.py +45 -0
- nimare/_version.py +21 -0
- nimare/annotate/__init__.py +21 -0
- nimare/annotate/cogat.py +213 -0
- nimare/annotate/gclda.py +924 -0
- nimare/annotate/lda.py +147 -0
- nimare/annotate/text.py +75 -0
- nimare/annotate/utils.py +87 -0
- nimare/base.py +217 -0
- nimare/cli.py +124 -0
- nimare/correct.py +462 -0
- nimare/dataset.py +685 -0
- nimare/decode/__init__.py +33 -0
- nimare/decode/base.py +115 -0
- nimare/decode/continuous.py +462 -0
- nimare/decode/discrete.py +753 -0
- nimare/decode/encode.py +110 -0
- nimare/decode/utils.py +44 -0
- nimare/diagnostics.py +510 -0
- nimare/estimator.py +139 -0
- nimare/extract/__init__.py +19 -0
- nimare/extract/extract.py +466 -0
- nimare/extract/utils.py +295 -0
- nimare/generate.py +331 -0
- nimare/io.py +667 -0
- nimare/meta/__init__.py +39 -0
- nimare/meta/cbma/__init__.py +6 -0
- nimare/meta/cbma/ale.py +951 -0
- nimare/meta/cbma/base.py +947 -0
- nimare/meta/cbma/mkda.py +1361 -0
- nimare/meta/cbmr.py +970 -0
- nimare/meta/ibma.py +1683 -0
- nimare/meta/kernel.py +501 -0
- nimare/meta/models.py +1199 -0
- nimare/meta/utils.py +494 -0
- nimare/nimads.py +492 -0
- nimare/reports/__init__.py +24 -0
- nimare/reports/base.py +664 -0
- nimare/reports/default.yml +123 -0
- nimare/reports/figures.py +651 -0
- nimare/reports/report.tpl +160 -0
- nimare/resources/__init__.py +1 -0
- nimare/resources/atlases/Harvard-Oxford-LICENSE +93 -0
- nimare/resources/atlases/HarvardOxford-cort-maxprob-thr25-2mm.nii.gz +0 -0
- nimare/resources/database_file_manifest.json +142 -0
- nimare/resources/english_spellings.csv +1738 -0
- nimare/resources/filenames.json +32 -0
- nimare/resources/neurosynth_laird_studies.json +58773 -0
- nimare/resources/neurosynth_stoplist.txt +396 -0
- nimare/resources/nidm_pain_dset.json +1349 -0
- nimare/resources/references.bib +541 -0
- nimare/resources/semantic_knowledge_children.txt +325 -0
- nimare/resources/semantic_relatedness_children.txt +249 -0
- nimare/resources/templates/MNI152_2x2x2_brainmask.nii.gz +0 -0
- nimare/resources/templates/tpl-MNI152NLin6Asym_res-01_T1w.nii.gz +0 -0
- nimare/resources/templates/tpl-MNI152NLin6Asym_res-01_desc-brain_mask.nii.gz +0 -0
- nimare/resources/templates/tpl-MNI152NLin6Asym_res-02_T1w.nii.gz +0 -0
- nimare/resources/templates/tpl-MNI152NLin6Asym_res-02_desc-brain_mask.nii.gz +0 -0
- nimare/results.py +225 -0
- nimare/stats.py +276 -0
- nimare/tests/__init__.py +1 -0
- nimare/tests/conftest.py +229 -0
- nimare/tests/data/amygdala_roi.nii.gz +0 -0
- nimare/tests/data/data-neurosynth_version-7_coordinates.tsv.gz +0 -0
- nimare/tests/data/data-neurosynth_version-7_metadata.tsv.gz +0 -0
- nimare/tests/data/data-neurosynth_version-7_vocab-terms_source-abstract_type-tfidf_features.npz +0 -0
- nimare/tests/data/data-neurosynth_version-7_vocab-terms_vocabulary.txt +100 -0
- nimare/tests/data/neurosynth_dset.json +2868 -0
- nimare/tests/data/neurosynth_laird_studies.json +58773 -0
- nimare/tests/data/nidm_pain_dset.json +1349 -0
- nimare/tests/data/nimads_annotation.json +1 -0
- nimare/tests/data/nimads_studyset.json +1 -0
- nimare/tests/data/test_baseline.txt +2 -0
- nimare/tests/data/test_pain_dataset.json +1278 -0
- nimare/tests/data/test_pain_dataset_multiple_contrasts.json +1242 -0
- nimare/tests/data/test_sleuth_file.txt +18 -0
- nimare/tests/data/test_sleuth_file2.txt +10 -0
- nimare/tests/data/test_sleuth_file3.txt +5 -0
- nimare/tests/data/test_sleuth_file4.txt +5 -0
- nimare/tests/data/test_sleuth_file5.txt +5 -0
- nimare/tests/test_annotate_cogat.py +32 -0
- nimare/tests/test_annotate_gclda.py +86 -0
- nimare/tests/test_annotate_lda.py +27 -0
- nimare/tests/test_dataset.py +99 -0
- nimare/tests/test_decode_continuous.py +132 -0
- nimare/tests/test_decode_discrete.py +92 -0
- nimare/tests/test_diagnostics.py +168 -0
- nimare/tests/test_estimator_performance.py +385 -0
- nimare/tests/test_extract.py +46 -0
- nimare/tests/test_generate.py +247 -0
- nimare/tests/test_io.py +294 -0
- nimare/tests/test_meta_ale.py +298 -0
- nimare/tests/test_meta_cbmr.py +295 -0
- nimare/tests/test_meta_ibma.py +240 -0
- nimare/tests/test_meta_kernel.py +209 -0
- nimare/tests/test_meta_mkda.py +234 -0
- nimare/tests/test_nimads.py +21 -0
- nimare/tests/test_reports.py +110 -0
- nimare/tests/test_stats.py +101 -0
- nimare/tests/test_transforms.py +272 -0
- nimare/tests/test_utils.py +200 -0
- nimare/tests/test_workflows.py +221 -0
- nimare/tests/utils.py +126 -0
- nimare/transforms.py +907 -0
- nimare/utils.py +1367 -0
- nimare/workflows/__init__.py +14 -0
- nimare/workflows/base.py +189 -0
- nimare/workflows/cbma.py +165 -0
- nimare/workflows/ibma.py +108 -0
- nimare/workflows/macm.py +77 -0
- nimare/workflows/misc.py +65 -0
- nimare-0.4.2.dist-info/LICENSE +21 -0
- nimare-0.4.2.dist-info/METADATA +124 -0
- nimare-0.4.2.dist-info/RECORD +119 -0
- nimare-0.4.2.dist-info/WHEEL +5 -0
- nimare-0.4.2.dist-info/entry_points.txt +2 -0
- nimare-0.4.2.dist-info/top_level.txt +2 -0
nimare/estimator.py
ADDED
@@ -0,0 +1,139 @@
"""Base class for estimators."""

from abc import abstractmethod

from joblib import Memory

from nimare.base import NiMAREBase
from nimare.results import MetaResult


class Estimator(NiMAREBase):
    """Estimators take in Datasets and return MetaResults.

    All Estimators must have a ``_fit`` method implemented, which applies algorithm-specific
    methods to a Dataset and returns a dictionary of arrays to be converted into a MetaResult.

    Users will interact with the ``_fit`` method by calling the user-facing ``fit`` method.
    ``fit`` takes in a ``Dataset``, calls ``_collect_inputs``, then ``_preprocess_input``,
    then ``_fit``, and finally converts the dictionary returned by ``_fit`` into a ``MetaResult``.
    """

    # Inputs that must be available in input Dataset. Keys are names of
    # attributes to set; values are strings indicating location in Dataset.
    _required_inputs = {}

    def __init__(self, memory=Memory(location=None, verbose=0), memory_level=0):
        self.memory = memory
        self.memory_level = memory_level

    def _collect_inputs(self, dataset, drop_invalid=True):
        """Search for, and validate, required inputs as necessary.

        This method populates the ``inputs_`` attribute.

        .. versionchanged:: 0.0.12

            Renamed from ``_validate_input``.

        Parameters
        ----------
        dataset : :obj:`~nimare.dataset.Dataset`
        drop_invalid : :obj:`bool`, default=True
            Whether to automatically drop any studies in the Dataset without valid data or not.
            Default is True.

        Attributes
        ----------
        inputs_ : :obj:`dict`
            A dictionary of required inputs for the Estimator, extracted from the Dataset.
            The actual inputs collected in this attribute are determined by the
            ``_required_inputs`` variable that should be specified in each child class.
        """
        if not hasattr(dataset, "slice"):
            raise ValueError(
                f"Argument 'dataset' must be a valid Dataset object, not a {type(dataset)}."
            )

        if self._required_inputs:
            data = dataset.get(self._required_inputs, drop_invalid=drop_invalid)
            # Do not overwrite existing inputs_ attribute.
            # This is necessary for PairwiseCBMAEstimator, which validates two sets of coordinates
            # in the same object.
            # It makes the *strong* assumption that required inputs will not change within an
            # Estimator across fit calls, so all fields of inputs_ will be overwritten instead of
            # retaining outdated fields from previous fit calls.
            if not hasattr(self, "inputs_"):
                self.inputs_ = {}

            for k, v in data.items():
                if v is None:
                    raise ValueError(
                        f"Estimator {self.__class__.__name__} requires input dataset to contain "
                        f"{k}, but no matching data were found."
                    )
                self.inputs_[k] = v

    @abstractmethod
    def _generate_description(self):
        """Generate a text description of the Estimator."""
        pass

    @abstractmethod
    def _preprocess_input(self, dataset):
        """Perform any additional preprocessing steps on data in self.inputs_.

        Parameters
        ----------
        dataset : :obj:`~nimare.dataset.Dataset`
            The Dataset
        """
        pass

    @abstractmethod
    def _fit(self, dataset):
        """Apply estimation to dataset and output results.

        Must return a dictionary of results, where keys are names of images
        and values are ndarrays.
        """
        pass

    def fit(self, dataset, drop_invalid=True):
        """Fit Estimator to Dataset.

        Parameters
        ----------
        dataset : :obj:`~nimare.dataset.Dataset`
            Dataset object to analyze.
        drop_invalid : :obj:`bool`, optional
            Whether to automatically ignore any studies without the required data or not.
            Default is True.

        Returns
        -------
        :obj:`~nimare.results.MetaResult`
            Results of Estimator fitting.

        Attributes
        ----------
        inputs_ : :obj:`dict`
            Inputs used in _fit.

        Notes
        -----
        The `fit` method is a light wrapper that runs input validation and
        preprocessing before fitting the actual model. Estimators' individual
        "fitting" methods are implemented as `_fit`, although users should
        call `fit`.
        """
        self._collect_inputs(dataset, drop_invalid=drop_invalid)
        self._preprocess_input(dataset)
        maps, tables, description = self._cache(self._fit, func_memory_level=1)(dataset)

        if hasattr(self, "masker") and self.masker is not None:
            masker = self.masker
        else:
            masker = dataset.masker

        return MetaResult(self, mask=masker, maps=maps, tables=tables, description=description)
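As a usage sketch (not part of the diff): the ``fit`` lifecycle above can be exercised with any concrete subclass, for example ``ALE`` from ``nimare.meta.cbma.ale``, using the ``nidm_pain_dset.json`` resource listed in the manifest. The exact map names produced depend on the estimator.

# Minimal sketch of the Estimator.fit lifecycle, assuming the ALE subclass and
# the bundled pain dataset resource.
import os.path as op

from nimare.dataset import Dataset
from nimare.meta.cbma.ale import ALE
from nimare.utils import get_resource_path

# Load the 21-study pain dataset shipped in nimare/resources/ (see manifest above).
dset = Dataset(op.join(get_resource_path(), "nidm_pain_dset.json"))

ale = ALE()
# fit() runs _collect_inputs -> _preprocess_input -> _fit, then wraps the
# returned maps/tables/description in a MetaResult.
results = ale.fit(dset)
print(sorted(results.maps.keys()))  # e.g., the "stat" and "z" maps for ALE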
nimare/extract/__init__.py
ADDED
@@ -0,0 +1,19 @@
"""Dataset and trained model downloading functions."""

from . import utils
from .extract import (
    download_abstracts,
    download_cognitive_atlas,
    download_nidm_pain,
    fetch_neuroquery,
    fetch_neurosynth,
)

__all__ = [
    "download_nidm_pain",
    "download_cognitive_atlas",
    "download_abstracts",
    "fetch_neuroquery",
    "fetch_neurosynth",
    "utils",
]
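The re-exports above form the package's public download surface; a small usage sketch, based on the ``download_nidm_pain`` docstring further below:

from nimare.extract import download_nidm_pain

# Downloads and unpacks the 21 NIDM-Results packs (by default under the home
# directory) and returns the path to the extracted data.
nidm_dir = download_nidm_pain()
print(nidm_dir)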
nimare/extract/extract.py
ADDED
@@ -0,0 +1,466 @@
"""Tools for downloading datasets."""

import itertools
import json
import logging
import os
import os.path as op
import shutil
import time
import zipfile
from glob import glob
from urllib.request import urlopen

import numpy as np
import pandas as pd

from nimare.dataset import Dataset
from nimare.extract.utils import (
    _download_zipped_file,
    _expand_df,
    _get_concept_reltype,
    _get_dataset_dir,
    _longify,
)
from nimare.utils import get_resource_path

LGR = logging.getLogger(__name__)

VALID_ENTITIES = {
    "coordinates.tsv.gz": ["data", "version"],
    "metadata.tsv.gz": ["data", "version"],
    "features.npz": ["data", "version", "vocab", "source", "type"],
    "vocabulary.txt": ["data", "version", "vocab"],
    "metadata.json": ["data", "version", "vocab"],
    "keys.tsv": ["data", "version", "vocab"],
}


def _find_entities(filename, search_pairs, log=False):
    """Search file for any matching patterns of entities."""
    # Convert all string-based kwargs to lists
    search_pairs = {k: [v] if isinstance(v, str) else v for k, v in search_pairs.items()}
    search_pairs = [[f"{k}-{v_i}" for v_i in v] for k, v in search_pairs.items()]
    searches = list(itertools.product(*search_pairs))

    if log:
        LGR.info(f"Searching for any feature files matching the following criteria: {searches}")

    file_parts = filename.split("_")
    suffix = file_parts[-1]
    valid_entities_for_suffix = VALID_ENTITIES[suffix]
    for search in searches:
        temp_search = [term for term in search if term.split("-")[0] in valid_entities_for_suffix]
        if all(term in file_parts for term in temp_search):
            return True

    return False


def _fetch_database(search_pairs, database_url, out_dir, overwrite=False):
    """Fetch generic database."""
    res_dir = get_resource_path()
    with open(op.join(res_dir, "database_file_manifest.json"), "r") as fo:
        database_file_manifest = json.load(fo)

    out_dir = op.abspath(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    found_databases = []
    found_files = []
    log = True
    for database in database_file_manifest:
        coordinates_file = database["coordinates"]
        metadata_file = database["metadata"]
        if not _find_entities(coordinates_file, search_pairs, log=log):
            log = False
            continue

        log = False

        feature_dicts = database["features"]
        for feature_dict in feature_dicts:
            features_file = feature_dict["features"]
            # Other files associated with features have a subset of entities,
            # so it is unnecessary to search them if we assume that the hard-coded manifest
            # is valid.
            if not _find_entities(features_file, search_pairs):
                continue
            else:
                out_coordinates_file = op.join(out_dir, coordinates_file)
                out_metadata_file = op.join(out_dir, metadata_file)
                out_feature_dict = {k: op.join(out_dir, v) for k, v in feature_dict.items()}

                db_found = [
                    i_db
                    for i_db, db_dct in enumerate(found_databases)
                    if db_dct["coordinates"] == out_coordinates_file
                ]
                if len(db_found):
                    assert len(db_found) == 1

                    found_databases[db_found[0]]["features"].append(out_feature_dict)
                else:
                    found_databases.append(
                        {
                            "coordinates": out_coordinates_file,
                            "metadata": out_metadata_file,
                            "features": [out_feature_dict],
                        }
                    )
                found_files += [coordinates_file, metadata_file, *feature_dict.values()]

    found_files = sorted(list(set(found_files)))
    for found_file in found_files:
        print(f"Downloading {found_file}", flush=True)

        url = op.join(database_url, found_file + "?raw=true")
        out_file = op.join(out_dir, found_file)

        if op.isfile(out_file) and not overwrite:
            print("File exists and overwrite is False. Skipping.")
            continue

        with open(out_file, "wb") as fo:
            u = urlopen(url)

            block_size = 8192
            while True:
                buffer = u.read(block_size)
                if not buffer:
                    break
                fo.write(buffer)

    return found_databases


def fetch_neurosynth(data_dir=None, version="7", overwrite=False, **kwargs):
    """Download the latest data files from Neurosynth.

    .. versionchanged:: 0.0.10

        * Use new format for Neurosynth and NeuroQuery files.
        * Change "path" parameter to "data_dir".

    .. versionadded:: 0.0.4

    Parameters
    ----------
    data_dir : :obj:`pathlib.Path` or :obj:`str`, optional
        Path where data should be downloaded. By default, files are downloaded in home directory.
        A subfolder, named ``neurosynth``, will be created in ``data_dir``, which is where the
        files will be located.
    version : str or list, optional
        The version to fetch. The default is "7" (Neurosynth's latest version).
    overwrite : bool, optional
        Whether to overwrite existing files or not. Default is False.
    kwargs : dict, optional
        Keyword arguments to select relevant feature files.
        Valid kwargs include: source, vocab, type.
        Each kwarg may be a string or a list of strings.
        If no kwargs are provided, all feature files for the specified database version will be
        downloaded.

    Returns
    -------
    found_databases : :obj:`list` of :obj:`dict`
        List of dictionaries indicating datasets downloaded.
        Each list entry is a different database, containing a dictionary with three keys:
        "coordinates", "metadata", and "features". "coordinates" and "metadata" will be filenames.
        "features" will be a list of dictionaries, each containing "id", "vocab", and "features"
        keys with associated files.

    Notes
    -----
    This function was adapted from neurosynth.base.dataset.download().

    Warnings
    --------
    Starting in version 0.0.10, this function operates on the new Neurosynth/NeuroQuery file
    format. Old code using this function **will not work** with the new version.
    """
    URL = (
        "https://github.com/neurosynth/neurosynth-data/blob/"
        "209c33cd009d0b069398a802198b41b9c488b9b7/"
    )
    dataset_name = "neurosynth"

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir)

    kwargs["data"] = dataset_name
    kwargs["version"] = version

    found_databases = _fetch_database(kwargs, URL, data_dir, overwrite=overwrite)

    return found_databases


def fetch_neuroquery(data_dir=None, version="1", overwrite=False, **kwargs):
    """Download the latest data files from NeuroQuery.

    .. versionadded:: 0.0.10

    Parameters
    ----------
    data_dir : :obj:`pathlib.Path` or :obj:`str`, optional
        Path where data should be downloaded. By default, files are downloaded in home directory.
    version : str or list, optional
        The version to fetch. The default is "1" (NeuroQuery's latest version).
    overwrite : bool, optional
        Whether to overwrite existing files or not. Default is False.
    kwargs : dict, optional
        Keyword arguments to select relevant feature files.
        Valid kwargs include: source, vocab, type.
        Each kwarg may be a string or a list of strings.
        If no kwargs are provided, all feature files for the specified database version will be
        downloaded.

    Returns
    -------
    found_databases : :obj:`list` of :obj:`dict`
        List of dictionaries indicating datasets downloaded.
        Each list entry is a different database, containing a dictionary with three keys:
        "coordinates", "metadata", and "features". "coordinates" and "metadata" will be filenames.
        "features" will be a list of dictionaries, each containing "id", "vocab", and "features"
        keys with associated files.

    Notes
    -----
    This function was adapted from neurosynth.base.dataset.download().
    """
    URL = (
        "https://github.com/neuroquery/neuroquery_data/blob/"
        "4580f86267fb7c14ac1f601e298cbed898d79f2d/data/"
    )
    dataset_name = "neuroquery"

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir)

    kwargs["data"] = dataset_name
    kwargs["version"] = version

    found_databases = _fetch_database(kwargs, URL, data_dir, overwrite=overwrite)

    return found_databases


def download_nidm_pain(data_dir=None, overwrite=False):
    """Download NIDM Results for 21 pain studies from NeuroVault for tests.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    data_dir : :obj:`pathlib.Path` or :obj:`str`, optional
        Path where data should be downloaded. By default, files are downloaded in home directory.
        A subfolder, named ``nidm_21pain``, will be created in ``data_dir``, which is where the
        files will be located.
    overwrite : :obj:`bool`, optional
        Whether to overwrite existing files or not. Default is False.

    Returns
    -------
    data_dir : :obj:`str`
        Updated data directory pointing to dataset files.
    """
    url = "https://neurovault.org/collections/1425/download"

    dataset_name = "nidm_21pain"

    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir)
    desc_file = op.join(data_dir, "description.txt")
    if op.isfile(desc_file) and overwrite is False:
        return data_dir

    # Download
    fname = op.join(data_dir, url.split("/")[-1])
    _download_zipped_file(url, filename=fname)

    # Unzip
    with zipfile.ZipFile(fname, "r") as zip_ref:
        zip_ref.extractall(data_dir)

    collection_folders = [f for f in glob(op.join(data_dir, "*")) if ".nidm" not in f]
    collection_folders = [f for f in collection_folders if op.isdir(f)]
    if len(collection_folders) > 1:
        raise Exception(f"More than one folder found: {', '.join(collection_folders)}")
    else:
        folder = collection_folders[0]
    zip_files = glob(op.join(folder, "*.zip"))
    for zf in zip_files:
        fn = op.splitext(op.basename(zf))[0]
        with zipfile.ZipFile(zf, "r") as zip_ref:
            zip_ref.extractall(op.join(data_dir, fn))

    os.remove(fname)
    shutil.rmtree(folder)

    with open(desc_file, "w") as fo:
        fo.write("21 pain studies in NIDM-results packs.")
    return data_dir


def download_cognitive_atlas(data_dir=None, overwrite=False):
    """Download Cognitive Atlas ontology and extract IDs and relationships.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    data_dir : :obj:`pathlib.Path` or :obj:`str`, optional
        Path where data should be downloaded. By default, files are downloaded in home directory.
    overwrite : :obj:`bool`, optional
        Whether to overwrite existing files or not. Default is False.

    Returns
    -------
    out_dict : :obj:`dict`
        Dictionary with two keys: 'ids' and 'relationships'. Each points to a
        csv file. The 'ids' file contains CogAt identifiers, canonical names,
        and aliases, sorted by alias length (number of characters).
        The 'relationships' file contains associations between CogAt items,
        with three columns: input, output, and rel_type (relationship type).
    """
    from cognitiveatlas.api import get_concept, get_disorder, get_task

    dataset_name = "cognitive_atlas"
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir)

    ids_file = op.join(data_dir, "cogat_aliases.csv")
    rels_file = op.join(data_dir, "cogat_relationships.csv")
    if overwrite or not all([op.isfile(f) for f in [ids_file, rels_file]]):
        concepts = get_concept(silent=True).pandas
        tasks = get_task(silent=True).pandas
        disorders = get_disorder(silent=True).pandas

        # Identifiers and aliases
        long_concepts = _longify(concepts)
        long_tasks = _longify(tasks)

        # Disorders currently lack aliases
        disorders["name"] = disorders["name"].str.lower()
        disorders = disorders.assign(alias=disorders["name"])
        disorders = disorders[["id", "name", "alias"]]

        # Combine into aliases DataFrame
        aliases = pd.concat((long_concepts, long_tasks, disorders), axis=0)
        aliases = _expand_df(aliases)
        aliases = aliases.replace("", np.nan)
        aliases = aliases.dropna(axis=0)
        aliases = aliases.reset_index(drop=True)

        # Relationships
        relationship_list = []
        for i, id_ in enumerate(concepts["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            concept = get_concept(id=id_, silent=True).json
            for rel in concept["relationships"]:
                reltype = _get_concept_reltype(rel["relationship"], rel["direction"])
                if reltype is not None:
                    row = [id_, rel["id"], reltype]
                    relationship_list.append(row)

        for i, id_ in enumerate(tasks["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            task = get_task(id=id_, silent=True).json
            for rel in task["concepts"]:
                row = [id_, rel["concept_id"], "measures"]
                relationship_list.append(row)
                row = [rel["concept_id"], id_, "measuredBy"]
                relationship_list.append(row)

        for i, id_ in enumerate(disorders["id"].unique()):
            if i % 100 == 0:
                time.sleep(5)
            row = [id_, id_, "isSelf"]
            relationship_list.append(row)
            disorder = get_disorder(id=id_, silent=True).json
            for rel in disorder["disorders"]:
                if rel["relationship"] == "ISA":
                    rel_type = "isA"
                else:
                    rel_type = rel["relationship"]
                row = [id_, rel["id"], rel_type]
                relationship_list.append(row)

        relationships = pd.DataFrame(
            columns=["input", "output", "rel_type"], data=relationship_list
        )
        ctp_df = concepts[["id", "id_concept_class"]]
        ctp_df = ctp_df.assign(rel_type="inCategory")
        ctp_df.columns = ["input", "output", "rel_type"]
        ctp_df["output"].replace("", np.nan, inplace=True)
        ctp_df.dropna(axis=0, inplace=True)
        relationships = pd.concat((ctp_df, relationships))
        relationships = relationships.reset_index(drop=True)
        aliases.to_csv(ids_file, index=False)
        relationships.to_csv(rels_file, index=False)
    out_dict = {"ids": ids_file, "relationships": rels_file}

    return out_dict


def download_abstracts(dataset, email):
    """Download the abstracts for a list of PubMed IDs.

    Uses the BioPython package.

    .. versionadded:: 0.0.2

    Parameters
    ----------
    dataset : :obj:`~nimare.dataset.Dataset`
        A Dataset object where IDs are in the form PMID-EXPID
    email : :obj:`str`
        Email address to use to call the PubMed API

    Returns
    -------
    dataset : :obj:`~nimare.dataset.Dataset`

    Warnings
    --------
    This function assumes that the dataset uses identifiers in the format
    [PMID-EXPID]. Thus, the ``study_id`` column of the
    :py:attr:`~nimare.dataset.Dataset.texts` DataFrame should correspond to PMID.
    """
    try:
        from Bio import Entrez, Medline
    except ImportError:
        raise Exception("Module biopython is required for downloading abstracts from PubMed.")

    Entrez.email = email

    if isinstance(dataset, Dataset):
        pmids = dataset.texts["study_id"].astype(str).tolist()
        pmids = sorted(list(set(pmids)))
    elif isinstance(dataset, list):
        pmids = [str(pmid) for pmid in dataset]
    else:
        raise Exception(f"Dataset type not recognized: {type(dataset)}")

    records = []
    # PubMed only allows you to search ~1000 at a time. I chose 900 to be safe.
    chunks = [pmids[x : x + 900] for x in range(0, len(pmids), 900)]
    for i, chunk in enumerate(chunks):
        LGR.info(f"Downloading chunk {i + 1} of {len(chunks)}")
        h = Entrez.efetch(db="pubmed", id=chunk, rettype="medline", retmode="text")
        records += list(Medline.parse(h))

    # Pull data for studies with abstracts
    data = [[study["PMID"], study["AB"]] for study in records if study.get("AB", None)]
    df = pd.DataFrame(columns=["study_id", "abstract"], data=data)
    if not isinstance(dataset, Dataset):
        return df

    dataset.texts = pd.merge(
        dataset.texts, df, left_on="study_id", right_on="study_id", how="left"
    )
    return dataset
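To tie the fetchers together, a hedged usage sketch of ``fetch_neurosynth``: the entity filters (``source``, ``vocab``, ``type``) are taken from the version-7 feature filename visible in the test data above, and the returned structure follows the docstring's description.

# Sketch: fetch the Neurosynth version-7 files with abstract-derived TF-IDF features.
from pprint import pprint

from nimare.extract import fetch_neurosynth

# Entity filters mirror the feature filename in the manifest:
# data-neurosynth_version-7_vocab-terms_source-abstract_type-tfidf_features.npz
found_databases = fetch_neurosynth(
    data_dir=None,  # default: a "neurosynth" subfolder in the home directory
    version="7",
    source="abstract",
    vocab="terms",
    type="tfidf",
)

# One entry per matching database; each has "coordinates", "metadata", and "features" keys.
pprint(found_databases[0])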