genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/extends/example.py +2 -3
- genelastic/api/routes.py +160 -23
- genelastic/api/server.py +30 -22
- genelastic/api/settings.py +3 -2
- genelastic/common/__init__.py +36 -9
- genelastic/common/cli.py +51 -23
- genelastic/common/elastic.py +80 -49
- genelastic/common/exceptions.py +0 -2
- genelastic/common/types.py +20 -15
- genelastic/import_data/__init__.py +23 -5
- genelastic/import_data/analyses.py +17 -20
- genelastic/import_data/analysis.py +69 -65
- genelastic/import_data/bi_process.py +7 -5
- genelastic/import_data/bi_processes.py +8 -8
- genelastic/import_data/cli_gen_data.py +116 -0
- genelastic/import_data/cli_import.py +379 -0
- genelastic/import_data/{info.py → cli_info.py} +104 -75
- genelastic/import_data/cli_integrity.py +384 -0
- genelastic/import_data/cli_validate.py +54 -0
- genelastic/import_data/constants.py +11 -32
- genelastic/import_data/data_file.py +23 -20
- genelastic/import_data/filename_pattern.py +26 -32
- genelastic/import_data/import_bundle.py +56 -47
- genelastic/import_data/import_bundle_factory.py +166 -158
- genelastic/import_data/logger.py +22 -18
- genelastic/import_data/random_bundle.py +402 -0
- genelastic/import_data/tags.py +46 -26
- genelastic/import_data/wet_process.py +8 -4
- genelastic/import_data/wet_processes.py +13 -8
- genelastic/ui/__init__.py +0 -0
- genelastic/ui/server.py +87 -0
- genelastic/ui/settings.py +11 -0
- genelastic-0.7.0.dist-info/METADATA +105 -0
- genelastic-0.7.0.dist-info/RECORD +40 -0
- {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
- genelastic-0.7.0.dist-info/entry_points.txt +6 -0
- genelastic/import_data/gen_data.py +0 -194
- genelastic/import_data/import_data.py +0 -292
- genelastic/import_data/integrity.py +0 -290
- genelastic/import_data/validate_data.py +0 -43
- genelastic-0.6.1.dist-info/METADATA +0 -41
- genelastic-0.6.1.dist-info/RECORD +0 -36
- genelastic-0.6.1.dist-info/entry_points.txt +0 -6
- {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from schema import SchemaError
|
|
6
|
+
|
|
7
|
+
from genelastic.common import add_verbose_control_args
|
|
8
|
+
|
|
9
|
+
from .import_bundle_factory import make_import_bundle_from_files
|
|
10
|
+
from .logger import configure_logging
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger("genelastic")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def read_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the validate script.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads them from ``sys.argv[1:]``, exactly as a call without
            arguments did before this parameter existed.

    Returns:
        The parsed namespace. It holds ``files`` (one or more ``Path``
        objects), ``check`` (bool, False unless ``-c``/``--check`` is given)
        and whatever options ``add_verbose_control_args`` registers on the
        parser.
    """
    parser = argparse.ArgumentParser(
        description="Ensure that YAML files "
        "follow the genelastic YAML bundle schema.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        allow_abbrev=False,
    )
    add_verbose_control_args(parser)
    # nargs="+" makes this positional mandatory (argparse errors out when no
    # value is supplied), so no default is declared: it could never apply.
    parser.add_argument(
        "files",
        type=Path,
        nargs="+",
        help="YAML files to validate.",
    )
    parser.add_argument(
        "-c",
        "--check",
        action="store_true",
        help="In addition to validating the schema, "
        "check for undefined referenced processes.",
    )
    return parser.parse_args(argv)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def main() -> int:
    """Run the validate command and report the outcome as an exit status.

    Returns:
        0 when ``make_import_bundle_from_files`` completes without raising,
        1 when it raises ValueError, RuntimeError, TypeError or SchemaError
        (the error is logged before returning).
    """
    cli_args = read_args()
    configure_logging(cli_args.verbose)

    try:
        make_import_bundle_from_files(cli_args.files, check=cli_args.check)
    except (ValueError, RuntimeError, TypeError, SchemaError) as err:
        # Catch any exception that can be raised by 'make_import_bundle_from_files'.
        logger.error(err)
        return 1
    else:
        # Only reached when the bundle was built without any error.
        logger.info("All YAML files respect the genelastic YAML bundle format.")
        return 0
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Module: constants
|
|
1
|
+
"""Module: constants
|
|
3
2
|
|
|
4
3
|
This module contains genelastic constants.
|
|
5
4
|
"""
|
|
5
|
+
|
|
6
6
|
import typing
|
|
7
7
|
|
|
8
|
-
ALLOWED_CATEGORIES: typing.Final[
|
|
8
|
+
ALLOWED_CATEGORIES: typing.Final[list[str]] = ["vcf", "cov"]
|
|
9
9
|
|
|
10
10
|
BUNDLE_CURRENT_VERSION = 3
|
|
11
11
|
|
|
@@ -13,33 +13,12 @@ DEFAULT_TAG_REGEX = "[^_-]+"
|
|
|
13
13
|
DEFAULT_TAG_PREFIX = "%"
|
|
14
14
|
DEFAULT_TAG_SUFFIX = ""
|
|
15
15
|
|
|
16
|
-
DEFAULT_TAG2FIELD: typing.Final[
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
},
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
},
|
|
25
|
-
'%W': {
|
|
26
|
-
"field": 'wet_process',
|
|
27
|
-
"regex": DEFAULT_TAG_REGEX
|
|
28
|
-
},
|
|
29
|
-
'%B': {
|
|
30
|
-
"field": 'bi_process',
|
|
31
|
-
"regex": DEFAULT_TAG_REGEX
|
|
32
|
-
},
|
|
33
|
-
'%D': {
|
|
34
|
-
"field": 'cov_depth',
|
|
35
|
-
"regex": DEFAULT_TAG_REGEX
|
|
36
|
-
},
|
|
37
|
-
'%A': {
|
|
38
|
-
"field": 'barcode',
|
|
39
|
-
"regex": DEFAULT_TAG_REGEX
|
|
40
|
-
},
|
|
41
|
-
'%R': {
|
|
42
|
-
"field": 'reference_genome',
|
|
43
|
-
"regex": DEFAULT_TAG_REGEX
|
|
44
|
-
}
|
|
16
|
+
DEFAULT_TAG2FIELD: typing.Final[dict[str, dict[str, str]]] = {
|
|
17
|
+
"%S": {"field": "sample_name", "regex": DEFAULT_TAG_REGEX},
|
|
18
|
+
"%F": {"field": "source", "regex": DEFAULT_TAG_REGEX},
|
|
19
|
+
"%W": {"field": "wet_process", "regex": DEFAULT_TAG_REGEX},
|
|
20
|
+
"%B": {"field": "bi_process", "regex": DEFAULT_TAG_REGEX},
|
|
21
|
+
"%D": {"field": "cov_depth", "regex": DEFAULT_TAG_REGEX},
|
|
22
|
+
"%A": {"field": "barcode", "regex": DEFAULT_TAG_REGEX},
|
|
23
|
+
"%R": {"field": "reference_genome", "regex": DEFAULT_TAG_REGEX},
|
|
45
24
|
}
|
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This module defines the DataFile class, which handles the representation,
|
|
1
|
+
"""This module defines the DataFile class, which handles the representation,
|
|
3
2
|
management, and extraction of metadata for a data file within a data bundle.
|
|
4
3
|
|
|
5
4
|
It includes functionality to construct DataFile instances from paths and
|
|
@@ -8,45 +7,47 @@ for extracting metadata from filenames using specified patterns.
|
|
|
8
7
|
"""
|
|
9
8
|
|
|
10
9
|
import logging
|
|
11
|
-
import os
|
|
12
10
|
import pathlib
|
|
13
|
-
import
|
|
11
|
+
from pathlib import Path
|
|
14
12
|
|
|
15
13
|
from genelastic.common import AnalysisMetaData
|
|
16
14
|
|
|
17
15
|
from .filename_pattern import FilenamePattern
|
|
18
16
|
|
|
19
|
-
logger = logging.getLogger(
|
|
17
|
+
logger = logging.getLogger("genelastic")
|
|
20
18
|
|
|
21
19
|
|
|
22
20
|
class DataFile:
|
|
23
21
|
"""Class for handling a data file and its metadata."""
|
|
24
22
|
|
|
25
23
|
# Initializer
|
|
26
|
-
def __init__(
|
|
27
|
-
|
|
24
|
+
def __init__(
|
|
25
|
+
self,
|
|
26
|
+
path: Path,
|
|
27
|
+
bundle_path: Path | None = None,
|
|
28
|
+
metadata: AnalysisMetaData | None = None,
|
|
29
|
+
) -> None:
|
|
28
30
|
self._path = path
|
|
29
31
|
self._bundle_path = bundle_path # The bundle YAML file in which this
|
|
30
32
|
# file was listed.
|
|
31
33
|
self._metadata = {} if metadata is None else metadata
|
|
32
34
|
|
|
33
35
|
def __repr__(self) -> str:
|
|
34
|
-
return
|
|
35
|
-
+ f", with metadata {self._metadata}")
|
|
36
|
+
return f"File {self._path}, from bundle {self._bundle_path}, with metadata {self._metadata}"
|
|
36
37
|
|
|
37
38
|
# Get path
|
|
38
39
|
@property
|
|
39
|
-
def path(self) ->
|
|
40
|
+
def path(self) -> Path:
|
|
40
41
|
"""Retrieve the data file path."""
|
|
41
42
|
return self._path
|
|
42
43
|
|
|
43
44
|
def exists(self) -> bool:
|
|
44
45
|
"""Tests if the associated file exists on disk."""
|
|
45
|
-
return
|
|
46
|
+
return self._path.is_file()
|
|
46
47
|
|
|
47
48
|
# Get bundle path
|
|
48
49
|
@property
|
|
49
|
-
def bundle_path(self) ->
|
|
50
|
+
def bundle_path(self) -> Path | None:
|
|
50
51
|
"""Retrieve the path to the associated data bundle file."""
|
|
51
52
|
return self._bundle_path
|
|
52
53
|
|
|
@@ -59,20 +60,22 @@ class DataFile:
|
|
|
59
60
|
# Factory
|
|
60
61
|
@classmethod
|
|
61
62
|
def make_from_bundle(
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
63
|
+
cls,
|
|
64
|
+
path: Path,
|
|
65
|
+
bundle_path: Path | None,
|
|
66
|
+
pattern: FilenamePattern | None = None,
|
|
67
|
+
) -> "DataFile":
|
|
66
68
|
"""Construct a DataFile instance from a bundle path, file path,
|
|
67
|
-
and optional filename pattern.
|
|
69
|
+
and optional filename pattern.
|
|
70
|
+
"""
|
|
68
71
|
# Make absolute path
|
|
69
|
-
if not
|
|
70
|
-
path =
|
|
72
|
+
if not path.is_absolute() and bundle_path is not None:
|
|
73
|
+
path = bundle_path.parent / path
|
|
71
74
|
|
|
72
75
|
# Extract filename metadata
|
|
73
76
|
metadata = None
|
|
74
77
|
if pattern is not None:
|
|
75
|
-
metadata = pattern.extract_metadata(
|
|
78
|
+
metadata = pattern.extract_metadata(path.name)
|
|
76
79
|
|
|
77
80
|
if metadata:
|
|
78
81
|
if "ext" not in metadata:
|
|
@@ -1,14 +1,11 @@
|
|
|
1
|
-
"""
|
|
2
|
-
This module defines the FilenamePattern class, used to define a filename pattern
|
|
1
|
+
"""This module defines the FilenamePattern class, used to define a filename pattern
|
|
3
2
|
and extract metadata from file names using this pattern.
|
|
4
3
|
"""
|
|
5
4
|
|
|
6
|
-
import logging
|
|
7
5
|
import re
|
|
8
6
|
|
|
9
7
|
from genelastic.common import AnalysisMetaData
|
|
10
8
|
|
|
11
|
-
logger = logging.getLogger('genelastic')
|
|
12
9
|
|
|
13
10
|
class FilenamePattern:
|
|
14
11
|
"""Class for defining a filename pattern.
|
|
@@ -18,46 +15,43 @@ class FilenamePattern:
|
|
|
18
15
|
|
|
19
16
|
# Initializer
|
|
20
17
|
def __init__(self, pattern: str) -> None:
|
|
21
|
-
"""
|
|
22
|
-
Initializes a FilenamePattern instance.
|
|
18
|
+
"""Initializes a FilenamePattern instance.
|
|
23
19
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
20
|
+
Args:
|
|
21
|
+
pattern (str): The pattern string used for defining
|
|
22
|
+
the filename pattern.
|
|
23
|
+
"""
|
|
28
24
|
self._re = re.compile(pattern)
|
|
29
25
|
|
|
30
26
|
def extract_metadata(self, filename: str) -> AnalysisMetaData:
|
|
31
|
-
"""
|
|
32
|
-
|
|
33
|
-
on the defined pattern.
|
|
27
|
+
"""Extracts metadata from the given filename based
|
|
28
|
+
on the defined pattern.
|
|
34
29
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
30
|
+
Args:
|
|
31
|
+
filename (str): The filename from which metadata
|
|
32
|
+
needs to be extracted.
|
|
38
33
|
|
|
39
|
-
|
|
40
|
-
|
|
34
|
+
Returns:
|
|
35
|
+
dict: A dictionary containing the extracted metadata.
|
|
41
36
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
37
|
+
Raises:
|
|
38
|
+
RuntimeError: If parsing of filename fails
|
|
39
|
+
with the defined pattern.
|
|
40
|
+
"""
|
|
46
41
|
m = self._re.search(filename)
|
|
47
42
|
if not m:
|
|
48
|
-
|
|
49
|
-
|
|
43
|
+
msg = f'Failed parsing filename "{filename}" with pattern "{self._re.pattern}".'
|
|
44
|
+
raise RuntimeError(msg)
|
|
50
45
|
return m.groupdict()
|
|
51
46
|
|
|
52
47
|
def matches_pattern(self, filename: str) -> bool:
|
|
53
|
-
"""
|
|
54
|
-
Checks if the given filename matches the defined pattern.
|
|
48
|
+
"""Checks if the given filename matches the defined pattern.
|
|
55
49
|
|
|
56
|
-
|
|
57
|
-
|
|
50
|
+
Args:
|
|
51
|
+
filename (str): The filename to be checked.
|
|
58
52
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
53
|
+
Returns:
|
|
54
|
+
bool: True if the filename matches the pattern,
|
|
55
|
+
False otherwise.
|
|
56
|
+
"""
|
|
63
57
|
return bool(self._re.match(filename))
|
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Module: import_bundle
|
|
1
|
+
"""Module: import_bundle
|
|
3
2
|
|
|
4
3
|
This module provides functionality for importing data bundles.
|
|
5
4
|
"""
|
|
@@ -10,63 +9,68 @@ import typing
|
|
|
10
9
|
|
|
11
10
|
from genelastic.common import BundleDict
|
|
12
11
|
|
|
12
|
+
from .analyses import Analyses
|
|
13
13
|
from .bi_processes import BioInfoProcesses
|
|
14
|
-
from .data_file import DataFile
|
|
15
14
|
from .constants import BUNDLE_CURRENT_VERSION
|
|
16
|
-
from .
|
|
15
|
+
from .data_file import DataFile
|
|
17
16
|
from .tags import Tags
|
|
18
17
|
from .wet_processes import WetProcesses
|
|
19
18
|
|
|
20
|
-
logger = logging.getLogger(
|
|
19
|
+
logger = logging.getLogger("genelastic")
|
|
21
20
|
|
|
22
21
|
|
|
23
22
|
class ImportBundle:
|
|
24
23
|
"""Class for handling an import bundle description."""
|
|
25
24
|
|
|
26
|
-
def __init__(
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
analyses:
|
|
30
|
-
wet_processes:
|
|
31
|
-
bi_processes:
|
|
25
|
+
def __init__( # noqa: C901
|
|
26
|
+
self, x: typing.Sequence[BundleDict], *, check: bool = False
|
|
27
|
+
) -> None:
|
|
28
|
+
analyses: list[BundleDict] = []
|
|
29
|
+
wet_processes: list[BundleDict] = []
|
|
30
|
+
bi_processes: list[BundleDict] = []
|
|
32
31
|
tags = Tags(x)
|
|
33
32
|
|
|
34
33
|
# Loop on dicts
|
|
35
34
|
for d in x:
|
|
36
35
|
# Check version
|
|
37
|
-
if
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
36
|
+
if "version" not in d:
|
|
37
|
+
msg = "No version inside YAML document."
|
|
38
|
+
raise RuntimeError(msg)
|
|
39
|
+
if int(d["version"]) != BUNDLE_CURRENT_VERSION:
|
|
40
|
+
raise RuntimeError
|
|
41
41
|
|
|
42
42
|
# Gather all analyses
|
|
43
|
-
if
|
|
43
|
+
if "analyses" in d and d["analyses"] is not None:
|
|
44
44
|
# Copy some bundle properties into each analysis
|
|
45
|
-
for analysis in d[
|
|
46
|
-
for key in [
|
|
45
|
+
for analysis in d["analyses"]:
|
|
46
|
+
for key in ["bundle_file", "root_dir"]:
|
|
47
47
|
if key in d:
|
|
48
48
|
analysis[key] = d[key]
|
|
49
49
|
|
|
50
50
|
# Add the tags to use.
|
|
51
|
-
analysis[
|
|
51
|
+
analysis["tags"] = tags
|
|
52
52
|
|
|
53
|
-
analyses.extend(d[
|
|
53
|
+
analyses.extend(d["analyses"])
|
|
54
54
|
|
|
55
55
|
# If some wet processes are defined, copy the bundle file path into each of them.
|
|
56
|
-
if
|
|
57
|
-
for wet_process in d[
|
|
58
|
-
wet_process[
|
|
59
|
-
wet_processes.extend(d[
|
|
56
|
+
if "wet_processes" in d and d["wet_processes"] is not None:
|
|
57
|
+
for wet_process in d["wet_processes"]:
|
|
58
|
+
wet_process["bundle_file"] = d["bundle_file"]
|
|
59
|
+
wet_processes.extend(d["wet_processes"])
|
|
60
60
|
|
|
61
61
|
# If some bio processes are defined, copy the bundle file path into each of them.
|
|
62
|
-
if
|
|
63
|
-
for bi_process in d[
|
|
64
|
-
bi_process[
|
|
65
|
-
bi_processes.extend(d[
|
|
62
|
+
if "bi_processes" in d and d["bi_processes"] is not None:
|
|
63
|
+
for bi_process in d["bi_processes"]:
|
|
64
|
+
bi_process["bundle_file"] = d["bundle_file"]
|
|
65
|
+
bi_processes.extend(d["bi_processes"])
|
|
66
66
|
|
|
67
67
|
# Instantiate all objects
|
|
68
|
-
self._wet_processes: WetProcesses = WetProcesses.from_array_of_dicts(
|
|
69
|
-
|
|
68
|
+
self._wet_processes: WetProcesses = WetProcesses.from_array_of_dicts(
|
|
69
|
+
wet_processes
|
|
70
|
+
)
|
|
71
|
+
self._bi_processes: BioInfoProcesses = (
|
|
72
|
+
BioInfoProcesses.from_array_of_dicts(bi_processes)
|
|
73
|
+
)
|
|
70
74
|
self._analyses: Analyses = Analyses.from_array_of_dicts(analyses)
|
|
71
75
|
|
|
72
76
|
if check:
|
|
@@ -79,17 +83,27 @@ class ImportBundle:
|
|
|
79
83
|
for index, analysis in enumerate(self._analyses):
|
|
80
84
|
analysis_wet_process = analysis.metadata.get("wet_process")
|
|
81
85
|
|
|
82
|
-
if (
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
+
if (
|
|
87
|
+
analysis_wet_process
|
|
88
|
+
and analysis_wet_process
|
|
89
|
+
not in self._wet_processes.get_process_ids()
|
|
90
|
+
):
|
|
91
|
+
sys.exit(
|
|
92
|
+
f"Analysis at index {index} in file {analysis.bundle_file} "
|
|
93
|
+
f"is referencing an undefined wet process: {analysis_wet_process}"
|
|
94
|
+
)
|
|
86
95
|
|
|
87
96
|
analysis_bi_process = analysis.metadata.get("bi_process")
|
|
88
97
|
|
|
89
|
-
if (
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
98
|
+
if (
|
|
99
|
+
analysis_bi_process
|
|
100
|
+
and analysis_bi_process
|
|
101
|
+
not in self._bi_processes.get_process_ids()
|
|
102
|
+
):
|
|
103
|
+
sys.exit(
|
|
104
|
+
f"Analysis at index {index} in file {analysis.bundle_file} "
|
|
105
|
+
f"is referencing an undefined bi process: {analysis_bi_process}"
|
|
106
|
+
)
|
|
93
107
|
|
|
94
108
|
@property
|
|
95
109
|
def analyses(self) -> Analyses:
|
|
@@ -111,10 +125,9 @@ class ImportBundle:
|
|
|
111
125
|
files = self.get_files(cat)
|
|
112
126
|
return len(files)
|
|
113
127
|
|
|
114
|
-
def get_files(self, cat: str | None = None) ->
|
|
128
|
+
def get_files(self, cat: str | None = None) -> list[DataFile]:
|
|
115
129
|
"""Returns all files of a category."""
|
|
116
|
-
|
|
117
|
-
files: typing.List[DataFile] = []
|
|
130
|
+
files: list[DataFile] = []
|
|
118
131
|
|
|
119
132
|
# Loop on all analyses
|
|
120
133
|
for analysis in self.analyses:
|
|
@@ -124,12 +137,8 @@ class ImportBundle:
|
|
|
124
137
|
|
|
125
138
|
def get_nb_matched_files(self) -> int:
|
|
126
139
|
"""Get the number of files that match the pattern."""
|
|
127
|
-
|
|
128
|
-
return sum(a.get_nb_files()
|
|
129
|
-
for a in self.analyses)
|
|
140
|
+
return sum(a.get_nb_files() for a in self.analyses)
|
|
130
141
|
|
|
131
142
|
def get_nb_unmatched_files(self) -> int:
|
|
132
143
|
"""Get the number of files that do not match."""
|
|
133
|
-
|
|
134
|
-
return sum(len(a.get_unmatched_file_paths())
|
|
135
|
-
for a in self.analyses)
|
|
144
|
+
return sum(len(a.get_unmatched_file_paths()) for a in self.analyses)
|