genelastic 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +2 -2
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -20
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +8 -3
- genelastic/api/specification.yml +343 -181
- genelastic/common/__init__.py +0 -44
- genelastic/common/cli.py +48 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +9 -1
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +10 -8
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +4 -2
- genelastic/ui/routes.py +289 -42
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +96 -9
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +114 -9
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +205 -2
- genelastic/ui/templates/layout.html +148 -29
- genelastic/ui/templates/version.html +19 -7
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +114 -9
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.8.0.dist-info/METADATA +0 -109
- genelastic-0.8.0.dist-info/RECORD +0 -52
- genelastic-0.8.0.dist-info/WHEEL +0 -5
- genelastic-0.8.0.dist-info/entry_points.txt +0 -8
- genelastic-0.8.0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import copy
|
|
3
|
+
import logging
|
|
4
|
+
from collections import defaultdict
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from types import NotImplementedType
|
|
7
|
+
|
|
8
|
+
from genelastic.common.types import Metadata
|
|
9
|
+
from genelastic.import_data.collect import (
|
|
10
|
+
DataFileCollector,
|
|
11
|
+
)
|
|
12
|
+
from genelastic.import_data.constants import (
|
|
13
|
+
ALLOWED_EXTENSIONS,
|
|
14
|
+
)
|
|
15
|
+
from genelastic.import_data.models.data_file import DataFile
|
|
16
|
+
from genelastic.import_data.patterns import FilenamePattern
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("genelastic")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Analysis:
    """Class Analysis that represents an analysis.

    An analysis is identified by its ID and groups the data files collected
    under a data path according to a filename pattern, indexed by extension.
    """

    # Metadata keys used internally during collection; they must not leak
    # into the analysis metadata exposed to callers.
    METADATA_INTERNAL_KEYS = frozenset(
        ["tags", "multi_match", "ext", "file_prefix"]
    )

    def __init__(
        self,
        analysis_id: str,
        bundle_file: Path,
        data_path: Path,
        filename_pattern: FilenamePattern,
        **metadata: str | int,
    ) -> None:
        """Collect data files for this analysis and index them by extension.

        :param analysis_id: Unique identifier of the analysis.
        :param bundle_file: Path to the bundle file describing the analysis.
        :param data_path: Path under which data files are collected.
        :param filename_pattern: Pattern used to match filenames.
        :param metadata: Extra metadata; internal keys are stripped.
        """
        self._analysis_id = analysis_id
        self._bundle_file = bundle_file
        self._data_path = data_path
        self._metadata = self._remove_internal_keys(metadata)
        self._data_files_by_ext: dict[str, set[DataFile]] = defaultdict(set)

        logger.info("")
        logger.info("[ Analysis ID %s ]", self._analysis_id)

        self._collected_files = DataFileCollector(
            analysis_id,
            bundle_file,
            data_path,
            filename_pattern,
        ).run()

        for data_file in self._collected_files.data_files:
            self._data_files_by_ext[data_file.ext].add(data_file)

        logger.info(
            " -> Extracted %s file extension(s): %s.",
            len(self._data_files_by_ext),
            ", ".join(ext.upper() for ext in self._data_files_by_ext),
        )

    def __eq__(self, other: object) -> bool | NotImplementedType:
        """Defines equality comparison for Analysis instances based on their
        ID.
        """
        if isinstance(other, Analysis):
            return self._analysis_id == other._analysis_id
        return NotImplemented

    def __hash__(self) -> int:
        """Hash on the analysis ID, consistently with ``__eq__``.

        Defining ``__eq__`` without ``__hash__`` would implicitly set
        ``__hash__`` to ``None`` and make instances unusable in sets or as
        dict keys (unlike ``DataFile``, which defines both).
        """
        return hash(self._analysis_id)

    def __lt__(self, other: object) -> bool | NotImplementedType:
        """Defines sort order for Analysis instances based on their ID."""
        if isinstance(other, Analysis):
            return self._analysis_id < other._analysis_id
        return NotImplemented

    def __str__(self) -> str:
        return (
            f"Analysis(id='{self._analysis_id}', "
            f"bundle_file='{self._bundle_file}', "
            f"data_path='{self._data_path}', "
            f"metadata={self._metadata})"
        )

    @staticmethod
    def _remove_internal_keys(
        metadata: Metadata,
    ) -> Metadata:
        """Return a copy of ``metadata`` without internal keys."""
        updated_metadata = metadata.copy()

        for key in Analysis.METADATA_INTERNAL_KEYS:
            with contextlib.suppress(KeyError):
                del updated_metadata[key]

        return updated_metadata

    @property
    def metadata(self) -> Metadata:
        """Get metadata."""
        return copy.deepcopy(self._metadata)

    @property
    def bundle_file(self) -> Path:
        """Get the bundle file."""
        return self._bundle_file

    @property
    def data_path(self) -> Path:
        """Get the data path specified in the bundle file."""
        return self._data_path

    @property
    def id(self) -> str:
        """Get the analysis ID."""
        return self._analysis_id

    @property
    def matched_files(self) -> set[Path]:
        """Returns the list of files that matched the filename pattern."""
        return self._collected_files.matched_files

    @property
    def unmatched_files(self) -> set[Path]:
        """Returns the list of files that did not match the filename pattern."""
        return self._collected_files.unmatched_files

    @property
    def extensions(self) -> set[str]:
        """Returns all the matched files extensions."""
        return set(self._data_files_by_ext.keys())

    def get_data_files(self, ext: str | None = None) -> set[DataFile]:
        """Returns the list of matched files as DataFile objects.

        :param ext: Filter the list of matched files by their extension
            (case-sensitive).
        :raises ValueError: If ``ext`` is not in ``ALLOWED_EXTENSIONS``.
        """
        if ext:
            if ext not in ALLOWED_EXTENSIONS:
                msg = f"Unsupported extension {ext}."
                raise ValueError(msg)
            # Use .get() so the defaultdict does not grow an empty entry
            # for extensions that were never collected.
            return self._data_files_by_ext.get(ext, set())
        return {f for value in self._data_files_by_ext.values() for f in value}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""This module defines the DataFile class, which handles the representation,
|
|
2
|
+
management, and extraction of metadata for a data file within a data bundle.
|
|
3
|
+
|
|
4
|
+
It includes functionality to construct DataFile instances from paths and
|
|
5
|
+
optional filename patterns, retrieve file paths and metadata, and support
|
|
6
|
+
for extracting metadata from filenames using specified patterns.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from types import NotImplementedType
|
|
12
|
+
|
|
13
|
+
from genelastic.common.types import Metadata
|
|
14
|
+
from genelastic.import_data.patterns import MetricsPattern
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("genelastic")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DataFile:
    """Class for handling a data file and its metadata."""

    def __init__(
        self,
        analysis_id: str,
        path: Path,
        bundle_file: Path,
        metadata: Metadata,
    ) -> None:
        """Build a DataFile and extract its metrics from the filename.

        :param analysis_id: ID of the analysis this file belongs to.
        :param path: Path of the data file.
        :param bundle_file: Path of the bundle file describing the data.
        :param metadata: Metadata extracted for this file; must contain the
            key 'ext', plus 'type' when the file is a metrics file.
        :raises RuntimeError: If a required metadata key is missing.
        """
        self._analysis_id = analysis_id
        self._path = path
        self._bundle_file = bundle_file
        self._metadata = metadata
        self._metrics = MetricsPattern.extract_metadata(path)
        self._validate_params()

        self._ext = str(self._metadata["ext"]).lower()

        # A metrics file takes its logical type from the 'type' metadata key;
        # any other file's type is simply its extension.
        key = "type" if self._metrics is not None else "ext"
        self._type = str(self._metadata[key]).lower()

    def __eq__(self, other: object) -> bool | NotImplementedType:
        """Defines equality comparison for DataFile instances based on their
        file path.
        """
        if isinstance(other, DataFile):
            return self._path == other._path
        return NotImplemented

    def __hash__(self) -> int:
        """Defines hash behavior for DataFile to allow use in sets and as dict keys."""
        return hash(self._path)

    def _validate_params(self) -> None:
        """Validate values of some ``DataFile`` constructor parameters.

        :raises RuntimeError: One of the parameters value is invalid.
        """
        if "ext" not in self._metadata:
            msg = (
                f"Data file '{self._path}' "
                f"is missing the required metadata key 'ext'."
            )
            raise RuntimeError(msg)

        if self._metrics is not None and "type" not in self._metadata:
            msg = (
                f"Metrics data file '{self._path}' "
                f"is missing the required metadata key 'type'."
            )
            raise RuntimeError(msg)

    @property
    def analysis_id(self) -> str:
        """Get the analysis ID."""
        return self._analysis_id

    @property
    def path(self) -> Path:
        """Retrieve the data file path."""
        return self._path

    @property
    def ext(self) -> str:
        """Retrieve the data file extension."""
        return self._ext

    @property
    def type(self) -> str:
        """Retrieve the data file type.

        Normally, the type is the file's extension.
        If the file is a metrics file, its type is taken from the metadata key
        'type'.
        """
        return self._type

    @property
    def bundle_file(self) -> Path:
        """Retrieve the path to the associated data bundle file."""
        return self._bundle_file

    @property
    def metadata(self) -> Metadata:
        """Retrieve a copy of the metadata associated with the data file."""
        return self._metadata.copy()

    @property
    def metrics(self) -> list[dict[str, str]] | None:
        """Retrieve a copy of the metrics associated with the data file."""
        if self._metrics is None:
            return None
        # Return per-record copies so callers cannot mutate the internal
        # state (the previous implementation returned the internal list
        # directly, contradicting this docstring).
        return [record.copy() for record in self._metrics]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import copy
|
|
2
|
+
from abc import ABC
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Process(ABC): # noqa: B024
|
|
7
|
+
"""Abstract base class for a Process.
|
|
8
|
+
|
|
9
|
+
It is not intended to be instantiated directly. Instead, use one of its
|
|
10
|
+
subclasses, ``WetProcess`` or ``BioInfoProcess``.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
def __init__(
|
|
14
|
+
self,
|
|
15
|
+
proc_id: str,
|
|
16
|
+
bundle_file: str | None = None,
|
|
17
|
+
**data: Any, # noqa: ANN401
|
|
18
|
+
) -> None:
|
|
19
|
+
self._proc_id = proc_id
|
|
20
|
+
self._bundle_file = bundle_file
|
|
21
|
+
self._data = data
|
|
22
|
+
self._type = self.__class__.__name__
|
|
23
|
+
|
|
24
|
+
@property
|
|
25
|
+
def id(self) -> str:
|
|
26
|
+
"""Unique identifier of the process."""
|
|
27
|
+
return self._proc_id
|
|
28
|
+
|
|
29
|
+
@property
|
|
30
|
+
def data(self) -> dict[str, Any]:
|
|
31
|
+
"""Return a copy of the associated data."""
|
|
32
|
+
return copy.deepcopy(self._data)
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def type(self) -> str:
|
|
36
|
+
"""Type of the process."""
|
|
37
|
+
return self._type
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class WetProcess(Process):
    """Concrete wet lab process.

    Behaves exactly like ``Process``; the subclass exists so that
    ``Process.type`` reports ``'WetProcess'``.
    """
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class BioInfoProcess(Process):
    """Concrete bioinformatics process.

    Behaves exactly like ``Process``; the subclass exists so that
    ``Process.type`` reports ``'BioInfoProcess'``.
    """
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import typing
|
|
3
|
+
from collections import UserDict
|
|
4
|
+
from typing import Self
|
|
5
|
+
|
|
6
|
+
from genelastic.common.types import BundleDict
|
|
7
|
+
from genelastic.import_data.models.process import (
|
|
8
|
+
Process,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger("genelastic")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Processes(UserDict[str, Process]):
    """Container for homogeneous Process objects.

    Unlike a standard dict:
    - Only subclasses of ``Process`` are allowed as values.
    - All items must be of the same concrete subclass of ``Process``.
    - Duplicate keys are not allowed and will raise an exception.

    :ivar _item_type: Internal attribute storing the concrete subclass
        of ``Process`` enforced in this container.
    """

    _item_type: type | None = None

    def __setitem__(self, key: str, value: Process) -> None:
        # Guard 1: values must be Process instances.
        if not isinstance(value, Process):
            raise TypeError(
                "Object type not supported. "
                "Container only supports 'Process' subclasses as items."
            )

        # Guard 2: the first inserted value fixes the concrete type that
        # every later value must match.
        enforced = self._item_type
        if enforced is None:
            self._item_type = type(value)
        elif not isinstance(value, enforced):
            raise TypeError(
                f"Cannot mix types. Container already holds "
                f"{enforced.__name__} items."
            )

        # Guard 3: keys must be unique.
        if key in self.data:
            raise ValueError(
                f"Duplicate key. "
                f"Container already holds an item with key '{key}'."
            )

        super().__setitem__(key, value)

    def add(self, item: Process) -> None:
        """Add one process item to the container, keyed by ``item.id``.

        :raises TypeError: If ``item`` is not a subclass of ``Process``,
            or if it does not match the subclass type of items already in the
            container.
        :raises ValueError: If an item with the same key (``item.id``) already
            exists in the container.
        """
        self[item.id] = item

    @classmethod
    def from_dicts(
        cls, arr: typing.Sequence[BundleDict], process_cls: type[Process]
    ) -> Self:
        """Build a Processes container instance from a sequence of dictionaries.

        :param arr: Sequence of dictionaries representing process data.
        :param process_cls: The subclass of ``Process`` to instantiate for each
            dict.
        :raises TypeError: If instantiating ``process_cls`` fails due to invalid
            dictionary arguments, or if the resulting object type does not
            match the container's enforced type.
        :raises ValueError: If two or more dictionaries yield items with the
            same key (``id``), leading to duplicate entries in the container.
        :return: A Processes container instance populated with process objects.
        """
        container = cls()
        for entry in arr:
            container.add(process_cls(**entry))
        return container
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
import typing
|
|
4
|
+
from collections import UserDict
|
|
5
|
+
|
|
6
|
+
from genelastic.common.exceptions import TagsDefinitionError
|
|
7
|
+
from genelastic.common.types import BundleDict
|
|
8
|
+
from genelastic.import_data.constants import (
|
|
9
|
+
DEFAULT_TAG2FIELD,
|
|
10
|
+
DEFAULT_TAG_DELIMITER_END,
|
|
11
|
+
DEFAULT_TAG_DELIMITER_START,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("genelastic")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Tags(UserDict[str, dict[str, str]]):
    """Represents a set of tags used to extract metadata from filenames.

    Each tag maps a name to a metadata field and a regex pattern, supporting
    custom delimiters. This class combines default tags (``DEFAULT_TAG2FIELD``)
    with optional user-defined tags, and provides utilities for searching,
    accessing, and resolving tags in filename patterns.
    """

    def __init__(
        self,
        delimiter_start: str | None = None,
        delimiter_end: str | None = None,
        match: dict[str, dict[str, str]] | None = None,
    ) -> None:
        """Initialize a Tags instance.

        :param delimiter_start: Optional character prepended to all tag names.
            Defaults to ``DEFAULT_TAG_DELIMITER_START``.
        :param delimiter_end: Optional character appended to all tag names.
            Defaults to ``DEFAULT_TAG_DELIMITER_END``.
        :param match: Optional dictionary of user-defined tags. Overrides
            ``DEFAULT_TAG2FIELD`` if keys overlap.
        :raises TagsDefinitionError: If a delimiter or a tag definition is
            invalid.
        """
        super().__init__()

        self._delimiter_start = self._resolve_delimiter(
            delimiter_start, DEFAULT_TAG_DELIMITER_START, "start"
        )
        self._delimiter_end = self._resolve_delimiter(
            delimiter_end, DEFAULT_TAG_DELIMITER_END, "end"
        )

        # Combine default tags with user-provided tags. User-defined ones take
        # precedence.
        effective_match = DEFAULT_TAG2FIELD | (match or {})

        # Store each tag in the dictionary using the full name
        # (delimiter start + tag name + delimiter end).
        for tag_name, tag_attrs in effective_match.items():
            if not self.validate_tag_name(tag_name):
                msg = (
                    f"Invalid tag '{tag_name}': its name should contain at "
                    f"least one alphanumeric character: a-z, A-Z and 0-9."
                )
                raise TagsDefinitionError(msg)

            for mandatory_key in ("field", "regex"):
                if mandatory_key not in tag_attrs:
                    msg = (
                        f"Invalid tag '{tag_name}': mandatory key "
                        f"'{mandatory_key}' missing."
                    )
                    raise TagsDefinitionError(msg)

            tag = f"{self._delimiter_start}{tag_name}{self._delimiter_end}"
            self[tag] = tag_attrs

        logger.info(
            "The following tags will be used "
            "to extract metadata from filenames : %s",
            self,
        )

    @classmethod
    def _resolve_delimiter(
        cls, delimiter: str | None, default: str, position: str
    ) -> str:
        """Validate a user-supplied delimiter or fall back to the default.

        Factors out the validation previously duplicated verbatim for the
        start and end delimiters.

        :param delimiter: User-supplied delimiter, or None to use ``default``.
        :param default: Value returned when ``delimiter`` is None.
        :param position: Either 'start' or 'end'; used only in the error
            message.
        :raises TagsDefinitionError: If ``delimiter`` is invalid.
        :return: The validated delimiter.
        """
        if delimiter is None:
            return default
        if not cls.validate_tag_delimiter(delimiter):
            msg = (
                f"A tag delimiter {position} should contain only one special "
                f"character, excluding the following: (, ), ?, <, >."
            )
            raise TagsDefinitionError(msg)
        return delimiter

    @property
    def delimiter_start(self) -> str:
        """Return the tag delimiter start. Defaults to
        ``DEFAULT_TAG_DELIMITER_START``.
        """
        return self._delimiter_start

    @property
    def delimiter_end(self) -> str:
        """Return the tag delimiter end. Defaults to
        ``DEFAULT_TAG_DELIMITER_END``.
        """
        return self._delimiter_end

    @property
    def search_regex(self) -> str:
        """Return a regex pattern to search for tags inside a string.

        This regex matches any tag using the current start and end delimiters.
        Used for filename prefixes validation or resolving tags into regex
        patterns.
        """
        return (
            re.escape(self._delimiter_start)
            + r"[a-zA-Z0-9]+"
            + re.escape(self._delimiter_end)
        )

    @classmethod
    def from_dict(cls, bundle: BundleDict) -> typing.Self:
        """Create tags from a bundle dict.

        :raises TagsDefinitionError: If the bundle has no root key 'tags'.
        """
        delimiter_start, delimiter_end = None, None

        if "tags" not in bundle:
            msg = (
                "Could not create a Tags object: bundle does not define tags "
                "(root key 'tags' missing)."
            )
            raise TagsDefinitionError(msg)

        tags = bundle["tags"]
        match = tags.get("match")
        tag_delimiter = tags.get("delimiter")

        if tag_delimiter:
            delimiter_start = tag_delimiter.get("start")
            delimiter_end = tag_delimiter.get("end")

        return cls(
            delimiter_start=delimiter_start,
            delimiter_end=delimiter_end,
            match=match,
        )

    @staticmethod
    def validate_tag_delimiter(s: str) -> bool:
        """A tag delimiter should only contain one special character,
        excluding the following: (, ), ?, <, >.
        """
        if len(s) != 1:
            return False

        # A single character that is neither a word character nor one of
        # the excluded specials.
        return not re.match(r"^[\w()<>?]$", s)

    @staticmethod
    def validate_tag_name(s: str) -> bool:
        """A tag name should contain at least one alphanumeric character:
        ``a-z``, ``A-Z`` and ``0-9``.

        :return: True if the tag name is valid, False otherwise.
        """
        if len(s) < 1:
            return False

        # \w minus underscore: alphanumerics only, over the whole string.
        return bool(re.match(r"^[^_\W]+$", s))
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
from collections import UserList
|
|
3
|
+
from typing import SupportsIndex
|
|
4
|
+
|
|
5
|
+
from genelastic.common.exceptions import UniqueListDuplicateError
|
|
6
|
+
|
|
7
|
+
T = typing.TypeVar("T")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class UniqueList(UserList[T]):
|
|
11
|
+
"""A list that only allows unique elements.
|
|
12
|
+
|
|
13
|
+
:param init_list: Optional iterable to initialize the list.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
def __init__(self, init_list: typing.Iterable[T] | None = None) -> None:
|
|
17
|
+
super().__init__()
|
|
18
|
+
|
|
19
|
+
if init_list:
|
|
20
|
+
for item in init_list:
|
|
21
|
+
self._ensure_unique(item)
|
|
22
|
+
super().append(item)
|
|
23
|
+
|
|
24
|
+
def __setitem__(
|
|
25
|
+
self, i: SupportsIndex | slice, item: T | typing.Iterable[T]
|
|
26
|
+
) -> None:
|
|
27
|
+
if isinstance(i, slice):
|
|
28
|
+
if not isinstance(item, typing.Iterable):
|
|
29
|
+
msg = "Expected iterable for slice assignment."
|
|
30
|
+
raise TypeError(msg)
|
|
31
|
+
|
|
32
|
+
slice_dupes = self._find_dupes(item)
|
|
33
|
+
if slice_dupes:
|
|
34
|
+
formatted_dupes = [str(dupe) for dupe in slice_dupes]
|
|
35
|
+
msg = (
|
|
36
|
+
f"Duplicate item(s) in slice assignment: "
|
|
37
|
+
f"{', '.join(formatted_dupes)}."
|
|
38
|
+
)
|
|
39
|
+
raise UniqueListDuplicateError(msg)
|
|
40
|
+
for x in item:
|
|
41
|
+
if x in self and x not in self[i]:
|
|
42
|
+
msg = f"Duplicate item: {x}."
|
|
43
|
+
raise UniqueListDuplicateError(msg)
|
|
44
|
+
super().__setitem__(i, item)
|
|
45
|
+
else:
|
|
46
|
+
self._ensure_unique(typing.cast(T, item))
|
|
47
|
+
super().__setitem__(i, typing.cast(T, item))
|
|
48
|
+
|
|
49
|
+
def __add__(self, other: typing.Iterable[T]) -> "UniqueList[T]":
|
|
50
|
+
for item in other:
|
|
51
|
+
self._ensure_unique(item)
|
|
52
|
+
return UniqueList(super().__add__(other))
|
|
53
|
+
|
|
54
|
+
def __iadd__(self, other: typing.Iterable[T]) -> typing.Self:
|
|
55
|
+
for item in other:
|
|
56
|
+
self._ensure_unique(item)
|
|
57
|
+
return super().__iadd__(other)
|
|
58
|
+
|
|
59
|
+
def __mul__(self, n: int) -> typing.Self:
|
|
60
|
+
raise NotImplementedError
|
|
61
|
+
|
|
62
|
+
def __imul__(self, n: int) -> typing.Self:
|
|
63
|
+
raise NotImplementedError
|
|
64
|
+
|
|
65
|
+
@staticmethod
|
|
66
|
+
def _find_dupes(a: typing.Iterable[T]) -> list[T]:
|
|
67
|
+
seen = set()
|
|
68
|
+
dupes = []
|
|
69
|
+
for x in a:
|
|
70
|
+
if x in seen:
|
|
71
|
+
dupes.append(x)
|
|
72
|
+
else:
|
|
73
|
+
seen.add(x)
|
|
74
|
+
return dupes
|
|
75
|
+
|
|
76
|
+
def _ensure_unique(self, item: T) -> None:
|
|
77
|
+
if item in self:
|
|
78
|
+
msg = f"Duplicate item: {item}."
|
|
79
|
+
raise UniqueListDuplicateError(msg)
|
|
80
|
+
|
|
81
|
+
def append(self, item: T) -> None:
|
|
82
|
+
"""Appends a unique item to the end of the list.
|
|
83
|
+
|
|
84
|
+
:param item: Element to append.
|
|
85
|
+
:raises UniqueListError: If the item already exists in the list.
|
|
86
|
+
"""
|
|
87
|
+
self._ensure_unique(item)
|
|
88
|
+
super().append(item)
|
|
89
|
+
|
|
90
|
+
def insert(self, i: int, item: T) -> None:
|
|
91
|
+
"""Inserts a unique item at a specified position.
|
|
92
|
+
|
|
93
|
+
:param i: Index where the item should be inserted.
|
|
94
|
+
:param item: Element to insert.
|
|
95
|
+
:raises UniqueListError: If the item already exists in the list.
|
|
96
|
+
"""
|
|
97
|
+
self._ensure_unique(item)
|
|
98
|
+
super().insert(i, item)
|
|
99
|
+
|
|
100
|
+
def extend(self, other: typing.Iterable[T]) -> None:
|
|
101
|
+
"""Extends the list with unique elements from another iterable.
|
|
102
|
+
|
|
103
|
+
:param other: Iterable of elements to add.
|
|
104
|
+
:raises UniqueListError: If any element in the iterable already exists in
|
|
105
|
+
the list.
|
|
106
|
+
"""
|
|
107
|
+
for item in other:
|
|
108
|
+
self._ensure_unique(item)
|
|
109
|
+
super().extend(other)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
|
|
6
|
+
class ValidationIssue:
|
|
7
|
+
"""Contains context about a bundle validation issue."""
|
|
8
|
+
|
|
9
|
+
exc_type: str
|
|
10
|
+
file_path: Path
|
|
11
|
+
file_index: int
|
|
12
|
+
file_count: int
|
|
13
|
+
doc_index: int | None = None
|
|
14
|
+
doc_count: int | None = None
|
|
15
|
+
|
|
16
|
+
def __str__(self) -> str:
|
|
17
|
+
if not self.doc_index:
|
|
18
|
+
return (
|
|
19
|
+
f"[{self.exc_type}] "
|
|
20
|
+
f"File {self.file_index}/{self.file_count}: {self.file_path}"
|
|
21
|
+
)
|
|
22
|
+
return (
|
|
23
|
+
f"[{self.exc_type}] "
|
|
24
|
+
f"File {self.file_index}/{self.file_count}: {self.file_path} "
|
|
25
|
+
f"(in doc #{self.doc_index}/{self.doc_count})"
|
|
26
|
+
)
|