genelastic 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +2 -2
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -20
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +8 -3
- genelastic/api/specification.yml +343 -181
- genelastic/common/__init__.py +0 -44
- genelastic/common/cli.py +48 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +9 -1
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +10 -8
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +4 -2
- genelastic/ui/routes.py +289 -42
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +96 -9
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +114 -9
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +205 -2
- genelastic/ui/templates/layout.html +148 -29
- genelastic/ui/templates/version.html +19 -7
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +114 -9
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.8.0.dist-info/METADATA +0 -109
- genelastic-0.8.0.dist-info/RECORD +0 -52
- genelastic-0.8.0.dist-info/WHEEL +0 -5
- genelastic-0.8.0.dist-info/entry_points.txt +0 -8
- genelastic-0.8.0.dist-info/top_level.txt +0 -1

genelastic/import_data/analysis.py (removed)
@@ -1,205 +0,0 @@
-import copy
-import logging
-import re
-import typing
-from pathlib import Path
-
-from genelastic.common import AnalysisMetaData
-
-from .constants import ALLOWED_CATEGORIES
-from .data_file import DataFile
-from .filename_pattern import FilenamePattern
-from .tags import Tags
-
-logger = logging.getLogger("genelastic")
-
-
-class Analysis:
-    """Class Analysis that represents an analysis."""
-
-    def __init__(  # noqa: PLR0913
-        self,
-        tags: Tags,
-        root_dir: str = ".",
-        bundle_file: str | None = None,
-        file_prefix: str | None = None,
-        files: typing.Sequence[str] | None = None,
-        data_path: str | None = None,
-        **metadata: str | int,
-    ) -> None:
-        self._bundle_file = Path(bundle_file) if bundle_file else None
-        self._file_prefix = file_prefix
-        self._files = files
-        self._data_path = Analysis._resolve_data_path(
-            Path(root_dir), Path(data_path) if data_path else None
-        )
-        self._tags = tags
-        self._metadata: AnalysisMetaData = metadata
-        self._categories: set[str] = set()
-
-    @property
-    def metadata(self) -> AnalysisMetaData:
-        """Get metadata."""
-        return copy.deepcopy(self._metadata)
-
-    @property
-    def bundle_file(self) -> Path | None:
-        """Get the bundle file."""
-        return self._bundle_file
-
-    @property
-    def filename_regex(self) -> str:
-        """Resolve placeholders in a file prefix using metadata
-        and unresolved placeholders are converted to regex groups
-        """
-        x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"
-
-        # Use existing generic prefix
-        if self._file_prefix:
-            x = self._file_prefix
-            # Replace %* tags
-            for tag_name, tag_attrs in self._tags.items:
-                field = tag_attrs["field"]
-                regex = tag_attrs["regex"]
-
-                # Build field regex
-                field_regex = (
-                    f"(?P<{field}>{self._metadata.get(field)})"
-                    if field in self._metadata
-                    else f"(?P<{field}>{regex})"
-                )
-                # Replace tag with field regex
-                x = x.replace(tag_name, field_regex)
-
-            # Check for tags that were not replaced.
-            groups = re.findall(self._tags.search_regex, x)
-            for match in groups:
-                logger.warning(
-                    "String '%s' in key 'file_prefix' looks like an undefined tag. "
-                    "If this string is not a tag, you can ignore this warning.",
-                    match,
-                )
-
-            # Add missing start and end markers
-            if not x.startswith("^"):
-                x = "^" + x
-            if not x.endswith("$"):
-                x += r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
-        logger.debug("File regex for %s: %s", self._bundle_file, x)
-
-        return x
-
-    def get_nb_files(self, cat: str | None = None) -> int:
-        """Returns the total number of files."""
-        return len(self.get_file_paths(cat=cat))
-
-    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
-        """Returns the list of matched files as DataFile objects."""
-        files = self.get_file_paths(cat=cat)
-        filename_pattern = FilenamePattern(self.filename_regex)
-
-        data_files: list[DataFile] = []
-
-        for f in files:
-            try:
-                data_files.append(
-                    DataFile.make_from_bundle(
-                        path=f,
-                        bundle_path=self._bundle_file,
-                        pattern=filename_pattern,
-                    )
-                )
-            except (OSError, ValueError) as e:
-                logger.error("Error processing file %s: %s", f, str(e))
-
-        return data_files
-
-    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[Path]:
-        """Returns the list of matched files."""
-        files, _, _ = self._do_get_file_paths(cat=cat)
-        return files
-
-    def get_unmatched_file_paths(
-        self, cat: str | None = None
-    ) -> typing.Sequence[Path]:
-        """Returns the list of unmatched files."""
-        _, files, _ = self._do_get_file_paths(cat=cat)
-        return files
-
-    def get_all_categories(self) -> set[str]:
-        """Returns all categories of the analysis."""
-        _, _, categories = self._do_get_file_paths()
-        return categories
-
-    @staticmethod
-    def _resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
-        resolved_data_path = Path() if data_path is None else data_path
-
-        if not resolved_data_path.is_absolute():
-            resolved_data_path = (root_dir / resolved_data_path).absolute()
-
-        return resolved_data_path
-
-    def _get_files_with_allowed_categories(self) -> dict[Path, str]:
-        # Create a dict to store allowed files. Keys are the filepaths,
-        # and values are their corresponding category.
-        allowed_files: dict[Path, str] = {}
-        # If files are listed explicitly in the YAML in the 'files' attribute, process them.
-        if self._files is not None:
-            abs_filepaths = [Path(self._data_path) / f for f in self._files]
-            # Try to retrieve files matching allowed categories by checking their first suffix.
-            for file in abs_filepaths:
-                cat = file.suffixes[0][1:]
-                # Add each matching file and its category to the dict.
-                if cat in ALLOWED_CATEGORIES:
-                    allowed_files[file] = cat
-        # Else, look for files on disk using the YAML 'data_path' attribute.
-        else:
-            # Try to retrieve files matching allowed categories using glob.
-            for cat in ALLOWED_CATEGORIES:
-                glob_res: list[Path] = []
-                glob_res.extend(self._data_path.glob(f"*.{cat}"))
-                glob_res.extend(self._data_path.glob(f"*.{cat}.gz"))
-
-                # Add each globed file and its category to the dict.
-                for g_file in glob_res:
-                    allowed_files[g_file] = cat
-
-        return allowed_files
-
-    def _do_get_file_paths(
-        self, cat: str | None = None
-    ) -> tuple[typing.Sequence[Path], typing.Sequence[Path], set[str]]:
-        # Raise an error if the category given as a parameter is not part of the allowed categories.
-        if cat is not None and cat not in ALLOWED_CATEGORIES:
-            msg = f"Unknown category {cat}."
-            raise ValueError(msg)
-
-        # Obtain a dict of all files matching the allowed categories.
-        allowed_files = self._get_files_with_allowed_categories()
-
-        if cat is None:
-            # No category was given as a parameter, so we match all categories.
-            files_to_match = allowed_files
-        else:
-            # A category was given as a parameter, so we match only this specific category.
-            files_to_match = {
-                k: v for k, v in allowed_files.items() if v == cat
-            }
-
-        filename_pattern = FilenamePattern(self.filename_regex)
-        matching_files: list[Path] = []
-        non_matching_files: list[Path] = []
-        categories = set()
-
-        # We filter files by ensuring that they match the filename pattern defined in the analysis.
-        for file, category in sorted(files_to_match.items()):
-            if filename_pattern.matches_pattern(file.name):
-                matching_files.append(file)
-                logger.info("MATCHED file %s.", file)
-                # Add the file category to the categories set.
-                categories.add(category)
-            else:
-                logger.warning("UNMATCHED file %s.", file)
-                non_matching_files.append(file)
-        return matching_files, non_matching_files, categories
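
For readers unfamiliar with the tag mechanism that the removed Analysis.filename_regex property implemented, the following minimal sketch reproduces the same substitution logic in isolation. The %sample tag, its field/regex attributes, and the file name are made-up illustration values, not data shipped with the package.

import re

# Hypothetical tag definition: '%sample' maps to a named regex group called 'sample'.
file_prefix = "run42_%sample"
tags = {"%sample": {"field": "sample", "regex": r"[A-Za-z0-9]+"}}
metadata = {}  # no known value for 'sample', so the tag's raw regex is used

pattern = file_prefix
for tag_name, tag_attrs in tags.items():
    field = tag_attrs["field"]
    field_regex = (
        f"(?P<{field}>{metadata[field]})"
        if field in metadata
        else f"(?P<{field}>{tag_attrs['regex']})"
    )
    # Replace the tag with its named group, as the removed code does.
    pattern = pattern.replace(tag_name, field_regex)

# Anchor the pattern and append the extension group (vcf/cov, optionally gzipped).
pattern = "^" + pattern + r"\.(?P<ext>vcf|cov)(\.gz)?$"

match = re.match(pattern, "run42_HG002.vcf.gz")
print(match.groupdict() if match else None)  # {'sample': 'HG002', 'ext': 'vcf'}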

genelastic/import_data/bi_process.py (removed)
@@ -1,27 +0,0 @@
-import copy
-
-from genelastic.common import BioInfoProcessData
-
-
-class BioInfoProcess:
-    """Class representing a bio process."""
-
-    def __init__(
-        self,
-        proc_id: str,
-        bundle_file: str | None = None,
-        **data: str | list[str],
-    ) -> None:
-        self._proc_id = proc_id
-        self._bundle_file = bundle_file
-        self._data: BioInfoProcessData = data
-
-    @property
-    def id(self) -> str:
-        """Get the bio process ID."""
-        return self._proc_id
-
-    @property
-    def data(self) -> BioInfoProcessData:
-        """Get data associated to the bio process."""
-        return copy.deepcopy(self._data)

genelastic/import_data/bi_processes.py (removed)
@@ -1,49 +0,0 @@
-import logging
-import typing
-
-from genelastic.common import BundleDict
-
-from .bi_process import BioInfoProcess
-
-logger = logging.getLogger("genelastic")
-
-
-class BioInfoProcesses:
-    """Class BioInfoProcesses is a container of BioInfoProcess objects."""
-
-    def __init__(self) -> None:
-        self._dict: dict[str, BioInfoProcess] = {}
-
-    def __len__(self) -> int:
-        return len(self._dict)
-
-    def __getitem__(self, key: str) -> BioInfoProcess:
-        return self._dict[key]
-
-    def add(self, process: BioInfoProcess) -> None:
-        """Add one BioInfoProcess object.
-        If a BioInfoProcess object with the same ID already exists in the container,
-        the program exits.
-        """
-        if process.id in self._dict:
-            msg = f"A bi process with the id '{process.id}' is already present."
-            raise ValueError(msg)
-
-        # Add one WetProcess object.
-        self._dict[process.id] = process
-
-    def get_process_ids(self) -> set[str]:
-        """Get a list of the bio processes IDs."""
-        return set(self._dict.keys())
-
-    @classmethod
-    def from_array_of_dicts(
-        cls, arr: typing.Sequence[BundleDict]
-    ) -> typing.Self:
-        """Build a BioInfoProcesses instance."""
-        bi_processes = cls()
-
-        for d in arr:
-            bi_processes.add(BioInfoProcess(**d))
-
-        return bi_processes
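
As context for the removal, this is roughly how the 0.8.0 container API above was used. The import paths follow the old flat module layout deleted in this release, and the process ID and fields are invented for the example; it will not run against 0.9.0.

# Usage sketch against genelastic 0.8.0 only; both modules are removed in 0.9.0.
from genelastic.import_data.bi_process import BioInfoProcess
from genelastic.import_data.bi_processes import BioInfoProcesses

procs = BioInfoProcesses.from_array_of_dicts(
    [{"proc_id": "align-v1", "aligner": "bwa-mem2"}]
)
print(len(procs), procs.get_process_ids())  # 1 {'align-v1'}

# Duplicate IDs are rejected rather than silently overwritten.
try:
    procs.add(BioInfoProcess(proc_id="align-v1"))
except ValueError as err:
    print(err)  # A bi process with the id 'align-v1' is already present.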

genelastic/import_data/cli_import.py (removed)
@@ -1,379 +0,0 @@
-# vi: se tw=80
-
-# Elasticsearch Python API:
-# https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
-# https://elasticsearch-py.readthedocs.io/en/latest/api.html
-
-import argparse
-import csv
-import datetime
-import hashlib
-import logging
-import sys
-import time
-from pathlib import Path
-
-import vcf
-
-from genelastic.common import (
-    AnalysisDocument,
-    BulkItems,
-    ElasticImportConn,
-    MetadataDocument,
-    ProcessDocument,
-    add_es_connection_args,
-    add_verbose_control_args,
-)
-
-from .bi_processes import BioInfoProcesses
-from .data_file import DataFile
-from .import_bundle_factory import make_import_bundle_from_files
-from .logger import configure_logging
-from .wet_processes import WetProcesses
-
-logger = logging.getLogger("genelastic")
-logging.getLogger("elastic_transport").setLevel(
-    logging.WARNING
-)  # Disable excessive logging
-logging.getLogger("urllib3").setLevel(
-    logging.WARNING
-)  # Disable excessive logging
-
-
-def read_args() -> argparse.Namespace:
-    """Read arguments from command line."""
-    parser = argparse.ArgumentParser(
-        description="Genetics data importer.",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        allow_abbrev=False,
-    )
-    add_verbose_control_args(parser)
-    add_es_connection_args(parser)
-    parser.add_argument(
-        "-D",
-        "--dry-run",
-        dest="dryrun",
-        action="count",
-        default=0,
-        help=(
-            "Dry-run level. -D for data files loading (VCF, coverage, etc) "
-            "without connecting or importing to database. "
-            "-DD for metadata YAML files loading only (no loading of data files)."
-        ),
-    )
-    parser.add_argument(
-        "--log-file", dest="log_file", help="Path to a log file."
-    )
-    parser.add_argument(
-        "--no-list",
-        dest="no_list",
-        action="store_true",
-        help="Do not print list of files to be imported.",
-    )
-    parser.add_argument(
-        "--no-confirm",
-        dest="no_confirm",
-        action="store_true",
-        help="Do not ask confirmation before importing.",
-    )
-    parser.add_argument(
-        "files",
-        type=Path,
-        nargs="+",
-        default=None,
-        help="Data files that describe what to import.",
-    )
-    return parser.parse_args()
-
-
-def import_cov_file(
-    es_import_conn: ElasticImportConn | None,
-    file_index: str,
-    file: Path,
-    dryrun: int = 0,
-) -> None:
-    """Import a coverage file to the Elasticsearch database."""
-    # Set field types
-    if dryrun == 0 and es_import_conn:
-        es_import_conn.client.indices.put_mapping(
-            index=file_index,
-            body={
-                "properties": {
-                    "pos": {"type": "integer"},
-                    "depth": {"type": "byte"},
-                }
-            },
-        )
-
-    # Open file
-    if dryrun > 1:
-        logger.info(
-            "Would load and import Coverage file %s " "into index %s.",
-            file,
-            file_index,
-        )
-    else:
-        logger.info("Load Coverage file %s.", file)
-        if dryrun == 1:
-            logger.info(
-                "Would import Coverage file %s into index %s.", file, file_index
-            )
-        else:
-            logger.info(
-                "Import Coverage file %s into index %s.", file, file_index
-            )
-        with file.open(newline="", encoding="utf-8") as f:
-            # Read file as CSV
-            reader = csv.reader(f, delimiter="\t", quotechar='"')
-
-            # Loop on al lines
-            for row in reader:
-                # Build document
-                # Position starts at 0 inside coverage file
-                doc: MetadataDocument = {
-                    "type": "coverage",
-                    "chr": row[0],
-                    "pos": int(row[1]) + 1,
-                    "depth": int(row[2]),
-                }
-
-                # Insert document
-                if dryrun == 0 and es_import_conn:
-                    es_import_conn.client.index(index=file_index, document=doc)
-
-
-def import_analysis_metadata(  # noqa: PLR0913
-    es_import_conn: ElasticImportConn | None,
-    index_prefix: str,
-    file_index: str,
-    file: DataFile,
-    analysis_type: str,
-    dryrun: int = 0,
-) -> None:
-    """Import analysis metadata into a dedicated index."""
-    doc: AnalysisDocument = {
-        "path": str(file.path.resolve()),
-        "bundle_path": str(file.bundle_path.resolve())
-        if file.bundle_path
-        else None,
-        "metadata": file.metadata,
-        "file_index": file_index,
-        "type": analysis_type,
-    }
-
-    bulk_items: BulkItems = [
-        {"_index": f"{index_prefix}-analyses", "_source": doc}
-    ]
-
-    if dryrun == 0 and es_import_conn:
-        es_import_conn.import_items(
-            bulk_items,
-            start_time=time.perf_counter(),
-            total_items=len(bulk_items),
-        )
-
-
-def import_vcf_file(
-    es_import_conn: ElasticImportConn | None,
-    file_index: str,
-    file: DataFile,
-    dryrun: int = 0,
-) -> None:
-    """Import a VCF file to the Elasticsearch database."""
-    logger.info('Import VCF file "%s".', file)
-
-    if dryrun > 1:
-        logger.info(
-            "Would load and import VCF file %s " "into index %s.",
-            file.path,
-            file_index,
-        )
-    else:
-        logger.info("Load VCF file %s.", file.path)
-        if dryrun == 1:
-            logger.info(
-                "Would import VCF file %s into index %s.", file.path, file_index
-            )
-        else:
-            logger.info(
-                "Importing VCF file %s into index %s...", file.path, file_index
-            )
-
-        try:
-            vcf_reader = vcf.Reader(filename=str(file.path))
-            n = 0
-            start = time.perf_counter()
-            bulk_sz = 256  # Bulk size
-            bulk_items: BulkItems = []
-            for record in vcf_reader:
-                # Correct values
-                if not record.CHROM.startswith("chr"):
-                    if record.CHROM.lower().startswith("chr"):
-                        record.CHROM = "chr" + record.CHROM[3:]
-                    else:
-                        record.CHROM = "chr" + record.CHROM
-
-                # Build document
-                alt = [x if x is None else x.type for x in record.ALT]
-                doc: MetadataDocument = {
-                    "type": "vcf",
-                    "chr": record.CHROM,
-                    "pos": record.POS,
-                    "alt": alt,
-                    "info": record.INFO,
-                }
-
-                if dryrun == 0:
-                    # Append item to bulk
-                    bulk_items.append({"_index": file_index, "_source": doc})
-                    n += 1
-
-                    # Insert bulk of items
-                    if len(bulk_items) >= bulk_sz and es_import_conn:
-                        es_import_conn.import_items(
-                            bulk_items, start_time=start, total_items=n
-                        )
-                        bulk_items = []
-
-            # Insert remaining items
-            if dryrun == 0 and es_import_conn:
-                es_import_conn.import_items(
-                    bulk_items, start_time=start, total_items=n
-                )
-
-        except StopIteration:
-            logger.error("Skipping empty file : %s.", file.path)
-
-
-def import_processes(
-    es_import_conn: ElasticImportConn | None,
-    index: str,
-    processes: WetProcesses | BioInfoProcesses,
-    dryrun: int = 0,
-) -> None:
-    """Import processes into their own index."""
-    bulk_items: BulkItems = []
-
-    for proc_id in processes.get_process_ids():
-        process = processes[proc_id]
-        process_type = process.__class__.__name__
-        doc: ProcessDocument = process.data | {
-            "proc_id": proc_id,
-            "type": process_type,
-        }
-        bulk_items.append({"_index": index, "_source": doc})
-
-    if dryrun == 0 and es_import_conn:
-        es_import_conn.import_items(
-            bulk_items,
-            start_time=time.perf_counter(),
-            total_items=len(bulk_items),
-        )
-
-
-def generate_unique_index(index_prefix: str, filepath: Path) -> str:
-    """Generate a unique index with the following format:
-    <index_prefix>_<current_date>_<md5_hashed_filepath>
-    """
-    current_date = datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d")
-    hashed_filepath = hashlib.md5(
-        str(filepath).encode("utf-8"), usedforsecurity=False
-    ).hexdigest()
-    return f"{index_prefix}-file-{current_date}-{hashed_filepath}"
-
-
-def main() -> None:  # noqa: C901
-    """Entry point of the import script."""
-    # Read command line arguments
-    args = read_args()
-
-    # Configure logging
-    configure_logging(args.verbose, log_file=args.log_file)
-    logger.debug("Arguments: %s", args)
-    logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)
-
-    # Open connection to ES
-    if args.dryrun == 0:
-        addr = f"https://{args.es_host}:{args.es_port}"
-        logger.info("Trying to connect to Elasticsearch at %s...", addr)
-        es_import_conn = ElasticImportConn(
-            addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
-        )
-    else:
-        es_import_conn = None
-
-    # Load YAML import bundle
-    import_bundle = make_import_bundle_from_files(args.files, check=True)
-    all_bundled_files = import_bundle.get_files()
-
-    # CHECK
-    for f in all_bundled_files:
-        if not f.exists():
-            msg = f"Path {f.path} does not point to a valid file."
-            raise RuntimeError(msg)
-
-    # LIST
-    if not args.no_list:
-        for f in all_bundled_files:
-            logger.info("Will import %s.", f.path)
-
-    # Ask confirmation for importing
-    if not args.no_confirm:
-        answer: str = "maybe"
-        while answer not in ["", "n", "y"]:
-            answer = input("Import (y/N)? ").lower()
-        if answer != "y":
-            logger.info("Import canceled.")
-            sys.exit(0)
-
-    # IMPORT
-    # Loop on file categories
-    for cat in import_bundle.analyses.get_all_categories():
-        # Import all files in this category.
-        for f in import_bundle.get_files(cat):
-            logger.info("Import %s files from %s.", cat, f.path)
-            # First, generate a unique index name for each file.
-            file_index = generate_unique_index(args.es_index_prefix, f.path)
-            # Then, import the analysis metadata into a dedicated index.
-            import_analysis_metadata(
-                es_import_conn,
-                args.es_index_prefix,
-                file_index,
-                f,
-                cat,
-                args.dryrun,
-            )
-            # Finally, import the file in its own index.
-            globals()[f"import_{cat}_file"](
-                es_import_conn=es_import_conn,
-                file_index=file_index,
-                file=f,
-                dryrun=args.dryrun,
-            )
-
-    # Import processes
-    logger.info("Importing wet processes.")
-    logger.info(
-        "Wet processes IDs = %s",
-        str(import_bundle.wet_processes.get_process_ids()),
-    )
-    import_processes(
-        es_import_conn,
-        f"{args.es_index_prefix}-wet_processes",
-        import_bundle.wet_processes,
-    )
-
-    logger.info("Importing bio info processes.")
-    logger.info(
-        "Bio info processes IDs = %s",
-        str(import_bundle.bi_processes.get_process_ids()),
-    )
-    import_processes(
-        es_import_conn,
-        f"{args.es_index_prefix}-bi_processes",
-        import_bundle.bi_processes,
-    )
-
-
-if __name__ == "__main__":
-    main()
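
The per-file index naming used by the removed generate_unique_index() can be reproduced standalone as below; the index prefix and file path are placeholder values chosen for the example.

import datetime
import hashlib
from pathlib import Path

index_prefix = "genelastic"                  # hypothetical prefix
filepath = Path("/data/run42/HG002.vcf.gz")  # hypothetical data file

# <prefix>-file-<UTC date>-<md5 of the file path string>
current_date = datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d")
hashed_filepath = hashlib.md5(
    str(filepath).encode("utf-8"), usedforsecurity=False
).hexdigest()
print(f"{index_prefix}-file-{current_date}-{hashed_filepath}")
# e.g. genelastic-file-20250101-<32 hex characters>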