genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +18 -0
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -0
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +34 -26
  8. genelastic/api/settings.py +5 -9
  9. genelastic/api/specification.yml +512 -0
  10. genelastic/common/__init__.py +0 -39
  11. genelastic/common/cli.py +100 -0
  12. genelastic/common/elastic.py +374 -46
  13. genelastic/common/exceptions.py +34 -2
  14. genelastic/common/server.py +59 -0
  15. genelastic/common/types.py +1 -14
  16. genelastic/import_data/__init__.py +0 -27
  17. genelastic/import_data/checker.py +99 -0
  18. genelastic/import_data/checker_observer.py +13 -0
  19. genelastic/import_data/cli/__init__.py +0 -0
  20. genelastic/import_data/cli/cli_check.py +136 -0
  21. genelastic/import_data/cli/gen_data.py +143 -0
  22. genelastic/import_data/cli/import_data.py +346 -0
  23. genelastic/import_data/cli/info.py +247 -0
  24. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  25. genelastic/import_data/cli/validate.py +146 -0
  26. genelastic/import_data/collect.py +185 -0
  27. genelastic/import_data/constants.py +136 -11
  28. genelastic/import_data/import_bundle.py +102 -59
  29. genelastic/import_data/import_bundle_factory.py +70 -149
  30. genelastic/import_data/importers/__init__.py +0 -0
  31. genelastic/import_data/importers/importer_base.py +131 -0
  32. genelastic/import_data/importers/importer_factory.py +85 -0
  33. genelastic/import_data/importers/importer_types.py +223 -0
  34. genelastic/import_data/logger.py +2 -1
  35. genelastic/import_data/models/__init__.py +0 -0
  36. genelastic/import_data/models/analyses.py +178 -0
  37. genelastic/import_data/models/analysis.py +144 -0
  38. genelastic/import_data/models/data_file.py +110 -0
  39. genelastic/import_data/models/process.py +45 -0
  40. genelastic/import_data/models/processes.py +84 -0
  41. genelastic/import_data/models/tags.py +170 -0
  42. genelastic/import_data/models/unique_list.py +109 -0
  43. genelastic/import_data/models/validate.py +26 -0
  44. genelastic/import_data/patterns.py +90 -0
  45. genelastic/import_data/random_bundle.py +79 -54
  46. genelastic/import_data/resolve.py +157 -0
  47. genelastic/ui/.env +1 -0
  48. genelastic/ui/cli_start_ui.py +20 -0
  49. genelastic/ui/routes.py +333 -0
  50. genelastic/ui/server.py +9 -82
  51. genelastic/ui/settings.py +2 -6
  52. genelastic/ui/static/cea-cnrgh.ico +0 -0
  53. genelastic/ui/static/cea.ico +0 -0
  54. genelastic/ui/static/layout.ico +0 -0
  55. genelastic/ui/static/novaseq6000.png +0 -0
  56. genelastic/ui/static/style.css +430 -0
  57. genelastic/ui/static/ui.js +458 -0
  58. genelastic/ui/templates/analyses.html +98 -0
  59. genelastic/ui/templates/analysis_detail.html +44 -0
  60. genelastic/ui/templates/bi_process_detail.html +129 -0
  61. genelastic/ui/templates/bi_processes.html +116 -0
  62. genelastic/ui/templates/explorer.html +356 -0
  63. genelastic/ui/templates/home.html +207 -0
  64. genelastic/ui/templates/layout.html +153 -0
  65. genelastic/ui/templates/version.html +21 -0
  66. genelastic/ui/templates/wet_process_detail.html +131 -0
  67. genelastic/ui/templates/wet_processes.html +116 -0
  68. genelastic-0.9.0.dist-info/METADATA +686 -0
  69. genelastic-0.9.0.dist-info/RECORD +76 -0
  70. genelastic-0.9.0.dist-info/WHEEL +4 -0
  71. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  72. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  73. genelastic/import_data/analyses.py +0 -69
  74. genelastic/import_data/analysis.py +0 -205
  75. genelastic/import_data/bi_process.py +0 -27
  76. genelastic/import_data/bi_processes.py +0 -49
  77. genelastic/import_data/cli_gen_data.py +0 -116
  78. genelastic/import_data/cli_import.py +0 -379
  79. genelastic/import_data/cli_info.py +0 -256
  80. genelastic/import_data/cli_validate.py +0 -54
  81. genelastic/import_data/data_file.py +0 -87
  82. genelastic/import_data/filename_pattern.py +0 -57
  83. genelastic/import_data/tags.py +0 -123
  84. genelastic/import_data/wet_process.py +0 -28
  85. genelastic/import_data/wet_processes.py +0 -53
  86. genelastic-0.7.0.dist-info/METADATA +0 -105
  87. genelastic-0.7.0.dist-info/RECORD +0 -40
  88. genelastic-0.7.0.dist-info/WHEEL +0 -5
  89. genelastic-0.7.0.dist-info/entry_points.txt +0 -6
  90. genelastic-0.7.0.dist-info/top_level.txt +0 -1
genelastic/import_data/models/data_file.py
@@ -0,0 +1,110 @@
+ """This module defines the DataFile class, which handles the representation,
+ management, and extraction of metadata for a data file within a data bundle.
+
+ It includes functionality to construct DataFile instances from paths and
+ optional filename patterns, retrieve file paths and metadata, and support
+ for extracting metadata from filenames using specified patterns.
+ """
+
+ import logging
+ from pathlib import Path
+ from types import NotImplementedType
+
+ from genelastic.common.types import Metadata
+ from genelastic.import_data.patterns import MetricsPattern
+
+ logger = logging.getLogger("genelastic")
+
+
+ class DataFile:
+     """Class for handling a data file and its metadata."""
+
+     def __init__(
+         self,
+         analysis_id: str,
+         path: Path,
+         bundle_file: Path,
+         metadata: Metadata,
+     ) -> None:
+         self._analysis_id = analysis_id
+         self._path = path
+         self._bundle_file = bundle_file
+         self._metadata = metadata
+         self._metrics = MetricsPattern.extract_metadata(path)
+         self._validate_params()
+
+         self._ext = str(self._metadata["ext"]).lower()
+
+         key = "type" if self._metrics is not None else "ext"
+         self._type = str(self._metadata[key]).lower()
+
+     def __eq__(self, other: object) -> bool | NotImplementedType:
+         """Defines equality comparison for DataFile instances based on their
+         file path.
+         """
+         if isinstance(other, DataFile):
+             return self._path == other._path
+         return NotImplemented
+
+     def __hash__(self) -> int:
+         """Defines hash behavior for DataFile to allow use in sets and as dict keys."""
+         return hash(self._path)
+
+     def _validate_params(self) -> None:
+         """Validate values of some ``DataFile`` constructor parameters.
+
+         :raises RuntimeError: One of the parameter values is invalid.
+         """
+         if "ext" not in self._metadata:
+             msg = (
+                 f"Data file '{self._path}' "
+                 f"is missing the required metadata key 'ext'."
+             )
+             raise RuntimeError(msg)
+
+         if self._metrics is not None and "type" not in self._metadata:
+             msg = (
+                 f"Metrics data file '{self._path}' "
+                 f"is missing the required metadata key 'type'."
+             )
+             raise RuntimeError(msg)
+
+     @property
+     def analysis_id(self) -> str:
+         """Get the analysis ID."""
+         return self._analysis_id
+
+     @property
+     def path(self) -> Path:
+         """Retrieve the data file path."""
+         return self._path
+
+     @property
+     def ext(self) -> str:
+         """Retrieve the data file extension."""
+         return self._ext
+
+     @property
+     def type(self) -> str:
+         """Retrieve the data file type.
+
+         Normally, the type is the file's extension.
+         If the file is a metrics file, its type is taken from the metadata key
+         'type'.
+         """
+         return self._type
+
+     @property
+     def bundle_file(self) -> Path:
+         """Retrieve the path to the associated data bundle file."""
+         return self._bundle_file
+
+     @property
+     def metadata(self) -> Metadata:
+         """Retrieve a copy of the metadata associated with the data file."""
+         return self._metadata.copy()
+
+     @property
+     def metrics(self) -> list[dict[str, str]] | None:
+         """Retrieve the metrics associated with the data file."""
+         return self._metrics
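A minimal usage sketch based on the constructor and validation code above. The file paths and metadata values are hypothetical, and ``Metadata`` is assumed to behave like a plain dict:

from pathlib import Path
from genelastic.import_data.models.data_file import DataFile

# Hypothetical inputs; 'ext' is the only metadata key DataFile requires
# for a non-metrics file.
df = DataFile(
    analysis_id="A001",
    path=Path("/data/A001/sample.vcf"),
    bundle_file=Path("/data/bundles/A001.yml"),
    metadata={"ext": "VCF"},
)
print(df.type)  # "vcf" -- lower-cased extension, since this is not a metrics file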
genelastic/import_data/models/process.py
@@ -0,0 +1,45 @@
+ import copy
+ from abc import ABC
+ from typing import Any
+
+
+ class Process(ABC):  # noqa: B024
+     """Abstract base class for a Process.
+
+     It is not intended to be instantiated directly. Instead, use one of its
+     subclasses, ``WetProcess`` or ``BioInfoProcess``.
+     """
+
+     def __init__(
+         self,
+         proc_id: str,
+         bundle_file: str | None = None,
+         **data: Any,  # noqa: ANN401
+     ) -> None:
+         self._proc_id = proc_id
+         self._bundle_file = bundle_file
+         self._data = data
+         self._type = self.__class__.__name__
+
+     @property
+     def id(self) -> str:
+         """Unique identifier of the process."""
+         return self._proc_id
+
+     @property
+     def data(self) -> dict[str, Any]:
+         """Return a copy of the associated data."""
+         return copy.deepcopy(self._data)
+
+     @property
+     def type(self) -> str:
+         """Type of the process."""
+         return self._type
+
+
+ class WetProcess(Process):
+     """Concrete wet lab process."""
+
+
+ class BioInfoProcess(Process):
+     """Concrete bioinformatics process."""
genelastic/import_data/models/processes.py
@@ -0,0 +1,84 @@
+ import logging
+ import typing
+ from collections import UserDict
+ from typing import Self
+
+ from genelastic.common.types import BundleDict
+ from genelastic.import_data.models.process import (
+     Process,
+ )
+
+ logger = logging.getLogger("genelastic")
+
+
+ class Processes(UserDict[str, Process]):
+     """Container for homogeneous Process objects.
+
+     Unlike a standard dict:
+     - Only subclasses of ``Process`` are allowed as values.
+     - All items must be of the same concrete subclass of ``Process``.
+     - Duplicate keys are not allowed and will raise an exception.
+
+     :ivar _item_type: Internal attribute storing the concrete subclass
+         of ``Process`` enforced in this container.
+     """
+
+     _item_type: type | None = None
+
+     def __setitem__(self, key: str, value: Process) -> None:
+         if not isinstance(value, Process):
+             msg = (
+                 "Object type not supported. "
+                 "Container only supports 'Process' subclasses as items."
+             )
+             raise TypeError(msg)
+
+         if self._item_type is None:
+             self._item_type = type(value)
+         elif not isinstance(value, self._item_type):
+             msg = (
+                 f"Cannot mix types. Container already holds "
+                 f"{self._item_type.__name__} items."
+             )
+             raise TypeError(msg)
+
+         if key in self:
+             msg = (
+                 f"Duplicate key. "
+                 f"Container already holds an item with key '{key}'."
+             )
+             raise ValueError(msg)
+
+         super().__setitem__(key, value)
+
+     def add(self, item: Process) -> None:
+         """Add one process item to the container.
+
+         :raises TypeError: If ``item`` is not a subclass of ``Process``,
+             or if it does not match the subclass type of items already in the
+             container.
+         :raises ValueError: If an item with the same key (``item.id``) already
+             exists in the container.
+         """
+         self[item.id] = item
+
+     @classmethod
+     def from_dicts(
+         cls, arr: typing.Sequence[BundleDict], process_cls: type[Process]
+     ) -> Self:
+         """Build a Processes container instance from a sequence of dictionaries.
+
+         :param arr: Sequence of dictionaries representing process data.
+         :param process_cls: The subclass of ``Process`` to instantiate for each
+             dict.
+         :raises TypeError: If instantiating ``process_cls`` fails due to invalid
+             dictionary arguments, or if the resulting object type does not
+             match the container's enforced type.
+         :raises ValueError: If two or more dictionaries yield items with the
+             same key (``id``), leading to duplicate entries in the container.
+         :return: A Processes container instance populated with process objects.
+         """
+         instance = cls()
+         for d in arr:
+             instance.add(process_cls(**d))
+         return instance
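A sketch of how this container might be populated. ``BundleDict`` entries are assumed to be plain dicts whose keys match the ``Process`` constructor; the field names below ("kit", the ids) are illustrative, not taken from the package:

from genelastic.import_data.models.process import WetProcess
from genelastic.import_data.models.processes import Processes

# Hypothetical process dicts; extra keys are captured by **data.
rows = [
    {"proc_id": "wet-01", "bundle_file": "bundle_a.yml", "kit": "K1"},
    {"proc_id": "wet-02", "bundle_file": "bundle_a.yml", "kit": "K2"},
]
procs = Processes.from_dicts(rows, WetProcess)

print(procs["wet-01"].data)  # {'kit': 'K1'} -- a deep copy of the extra keys
# procs.add(WetProcess("wet-01")) would raise ValueError (duplicate key), and
# adding a BioInfoProcess would raise TypeError (heterogeneous container).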
genelastic/import_data/models/tags.py
@@ -0,0 +1,170 @@
+ import logging
+ import re
+ import typing
+ from collections import UserDict
+
+ from genelastic.common.exceptions import TagsDefinitionError
+ from genelastic.common.types import BundleDict
+ from genelastic.import_data.constants import (
+     DEFAULT_TAG2FIELD,
+     DEFAULT_TAG_DELIMITER_END,
+     DEFAULT_TAG_DELIMITER_START,
+ )
+
+ logger = logging.getLogger("genelastic")
+
+
+ class Tags(UserDict[str, dict[str, str]]):
+     """Represents a set of tags used to extract metadata from filenames.
+
+     Each tag maps a name to a metadata field and a regex pattern, supporting
+     custom delimiters. This class combines default tags (``DEFAULT_TAG2FIELD``)
+     with optional user-defined tags, and provides utilities for searching,
+     accessing, and resolving tags in filename patterns.
+     """
+
+     def __init__(
+         self,
+         delimiter_start: str | None = None,
+         delimiter_end: str | None = None,
+         match: dict[str, dict[str, str]] | None = None,
+     ) -> None:
+         """Initialize a Tags instance.
+
+         :param delimiter_start: Optional character prepended to all tag names.
+             Defaults to ``DEFAULT_TAG_DELIMITER_START``.
+         :param delimiter_end: Optional character appended to all tag names.
+             Defaults to ``DEFAULT_TAG_DELIMITER_END``.
+         :param match: Optional dictionary of user-defined tags. Overrides
+             ``DEFAULT_TAG2FIELD`` if keys overlap.
+         """
+         super().__init__()
+
+         if delimiter_start is None:
+             self._delimiter_start = DEFAULT_TAG_DELIMITER_START
+         else:
+             if not self.validate_tag_delimiter(delimiter_start):
+                 msg = (
+                     "A tag delimiter start should contain only one special "
+                     "character, excluding the following: (, ), ?, <, >."
+                 )
+                 raise TagsDefinitionError(msg)
+             self._delimiter_start = delimiter_start
+
+         if delimiter_end is None:
+             self._delimiter_end = DEFAULT_TAG_DELIMITER_END
+         else:
+             if not self.validate_tag_delimiter(delimiter_end):
+                 msg = (
+                     "A tag delimiter end should contain only one special "
+                     "character, excluding the following: (, ), ?, <, >."
+                 )
+                 raise TagsDefinitionError(msg)
+             self._delimiter_end = delimiter_end
+
+         # Combine default tags with user-provided tags. User-defined ones take
+         # precedence.
+         effective_match = DEFAULT_TAG2FIELD | (match or {})
+
+         # Store each tag in the dictionary using the full name
+         # (delimiter start + tag name + delimiter end).
+         for tag_name, tag_attrs in effective_match.items():
+             if not self.validate_tag_name(tag_name):
+                 msg = (
+                     f"Invalid tag '{tag_name}': its name should contain at "
+                     f"least one alphanumeric character: a-z, A-Z and 0-9."
+                 )
+                 raise TagsDefinitionError(msg)
+
+             for mandatory_key in ("field", "regex"):
+                 if mandatory_key not in tag_attrs:
+                     msg = (
+                         f"Invalid tag '{tag_name}': mandatory key "
+                         f"'{mandatory_key}' missing."
+                     )
+                     raise TagsDefinitionError(msg)
+
+             tag = f"{self._delimiter_start}{tag_name}{self._delimiter_end}"
+             self[tag] = tag_attrs
+
+         logger.info(
+             "The following tags will be used "
+             "to extract metadata from filenames: %s",
+             self,
+         )
+
+     @property
+     def delimiter_start(self) -> str:
+         """Return the tag delimiter start. Defaults to
+         ``DEFAULT_TAG_DELIMITER_START``.
+         """
+         return self._delimiter_start
+
+     @property
+     def delimiter_end(self) -> str:
+         """Return the tag delimiter end. Defaults to
+         ``DEFAULT_TAG_DELIMITER_END``.
+         """
+         return self._delimiter_end
+
+     @property
+     def search_regex(self) -> str:
+         """Return a regex pattern to search for tags inside a string.
+
+         This regex matches any tag using the current start and end delimiters.
+         Used for filename prefix validation or for resolving tags into regex
+         patterns.
+         """
+         return (
+             re.escape(self._delimiter_start)
+             + r"[a-zA-Z0-9]+"
+             + re.escape(self._delimiter_end)
+         )
+
+     @classmethod
+     def from_dict(cls, bundle: BundleDict) -> typing.Self:
+         """Create tags from a bundle dict."""
+         delimiter_start, delimiter_end = None, None
+
+         if "tags" not in bundle:
+             msg = (
+                 "Could not create a Tags object: bundle does not define tags "
+                 "(root key 'tags' missing)."
+             )
+             raise TagsDefinitionError(msg)
+
+         tags = bundle["tags"]
+         match = tags.get("match")
+         tag_delimiter = tags.get("delimiter")
+
+         if tag_delimiter:
+             delimiter_start = tag_delimiter.get("start")
+             delimiter_end = tag_delimiter.get("end")
+
+         return cls(
+             delimiter_start=delimiter_start,
+             delimiter_end=delimiter_end,
+             match=match,
+         )
+
+     @staticmethod
+     def validate_tag_delimiter(s: str) -> bool:
+         """A tag delimiter should only contain one special character,
+         excluding the following: (, ), ?, <, >.
+         """
+         if len(s) != 1:
+             return False
+
+         return not re.match(r"^[\w()<>?]$", s)
+
+     @staticmethod
+     def validate_tag_name(s: str) -> bool:
+         """A tag name should contain at least one alphanumeric character:
+         ``a-z``, ``A-Z`` and ``0-9``.
+
+         :return: True if the tag name is valid, False otherwise.
+         """
+         if len(s) < 1:
+             return False
+
+         return bool(re.match(r"^[^_\W]+$", s))
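A sketch of building a Tags object from a bundle dict, following the structure that ``from_dict`` reads. The contents of ``DEFAULT_TAG2FIELD`` and the default delimiters live in genelastic/import_data/constants.py and are not shown in this diff, so the delimiter and tag definitions below are illustrative only:

from genelastic.import_data.models.tags import Tags

# Illustrative bundle fragment; the real schema is validated elsewhere.
bundle = {
    "tags": {
        "delimiter": {"start": "%", "end": "%"},
        "match": {
            "sample": {"field": "sample_id", "regex": r"[A-Z0-9]+"},
        },
    },
}
tags = Tags.from_dict(bundle)

print(tags.search_regex)   # "%[a-zA-Z0-9]+%" with these delimiters
print("%sample%" in tags)  # True -- keys are stored with their delimiters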
genelastic/import_data/models/unique_list.py
@@ -0,0 +1,109 @@
+ import typing
+ from collections import UserList
+ from typing import SupportsIndex
+
+ from genelastic.common.exceptions import UniqueListDuplicateError
+
+ T = typing.TypeVar("T")
+
+
+ class UniqueList(UserList[T]):
+     """A list that only allows unique elements.
+
+     :param init_list: Optional iterable to initialize the list.
+     """
+
+     def __init__(self, init_list: typing.Iterable[T] | None = None) -> None:
+         super().__init__()
+
+         if init_list:
+             for item in init_list:
+                 self._ensure_unique(item)
+                 super().append(item)
+
+     def __setitem__(
+         self, i: SupportsIndex | slice, item: T | typing.Iterable[T]
+     ) -> None:
+         if isinstance(i, slice):
+             if not isinstance(item, typing.Iterable):
+                 msg = "Expected iterable for slice assignment."
+                 raise TypeError(msg)
+
+             slice_dupes = self._find_dupes(item)
+             if slice_dupes:
+                 formatted_dupes = [str(dupe) for dupe in slice_dupes]
+                 msg = (
+                     f"Duplicate item(s) in slice assignment: "
+                     f"{', '.join(formatted_dupes)}."
+                 )
+                 raise UniqueListDuplicateError(msg)
+             for x in item:
+                 if x in self and x not in self[i]:
+                     msg = f"Duplicate item: {x}."
+                     raise UniqueListDuplicateError(msg)
+             super().__setitem__(i, item)
+         else:
+             self._ensure_unique(typing.cast(T, item))
+             super().__setitem__(i, typing.cast(T, item))
+
+     def __add__(self, other: typing.Iterable[T]) -> "UniqueList[T]":
+         for item in other:
+             self._ensure_unique(item)
+         return UniqueList(super().__add__(other))
+
+     def __iadd__(self, other: typing.Iterable[T]) -> typing.Self:
+         for item in other:
+             self._ensure_unique(item)
+         return super().__iadd__(other)
+
+     def __mul__(self, n: int) -> typing.Self:
+         raise NotImplementedError
+
+     def __imul__(self, n: int) -> typing.Self:
+         raise NotImplementedError
+
+     @staticmethod
+     def _find_dupes(a: typing.Iterable[T]) -> list[T]:
+         seen = set()
+         dupes = []
+         for x in a:
+             if x in seen:
+                 dupes.append(x)
+             else:
+                 seen.add(x)
+         return dupes
+
+     def _ensure_unique(self, item: T) -> None:
+         if item in self:
+             msg = f"Duplicate item: {item}."
+             raise UniqueListDuplicateError(msg)
+
+     def append(self, item: T) -> None:
+         """Appends a unique item to the end of the list.
+
+         :param item: Element to append.
+         :raises UniqueListDuplicateError: If the item already exists in the list.
+         """
+         self._ensure_unique(item)
+         super().append(item)
+
+     def insert(self, i: int, item: T) -> None:
+         """Inserts a unique item at a specified position.
+
+         :param i: Index where the item should be inserted.
+         :param item: Element to insert.
+         :raises UniqueListDuplicateError: If the item already exists in the list.
+         """
+         self._ensure_unique(item)
+         super().insert(i, item)
+
+     def extend(self, other: typing.Iterable[T]) -> None:
+         """Extends the list with unique elements from another iterable.
+
+         :param other: Iterable of elements to add.
+         :raises UniqueListDuplicateError: If any element in the iterable already
+             exists in the list.
+         """
+         for item in other:
+             self._ensure_unique(item)
+         super().extend(other)
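A short sketch of the duplicate-rejection behaviour described above; only a usage illustration, relying on ``UniqueListDuplicateError`` from genelastic.common.exceptions:

from genelastic.common.exceptions import UniqueListDuplicateError
from genelastic.import_data.models.unique_list import UniqueList

samples = UniqueList(["s1", "s2"])
samples.append("s3")

try:
    samples.extend(["s4", "s1"])  # "s1" is already present
except UniqueListDuplicateError as exc:
    print(exc)  # Duplicate item: s1.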
genelastic/import_data/models/validate.py
@@ -0,0 +1,26 @@
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass
+ class ValidationIssue:
+     """Contains context about a bundle validation issue."""
+
+     exc_type: str
+     file_path: Path
+     file_index: int
+     file_count: int
+     doc_index: int | None = None
+     doc_count: int | None = None
+
+     def __str__(self) -> str:
+         if not self.doc_index:
+             return (
+                 f"[{self.exc_type}] "
+                 f"File {self.file_index}/{self.file_count}: {self.file_path}"
+             )
+         return (
+             f"[{self.exc_type}] "
+             f"File {self.file_index}/{self.file_count}: {self.file_path} "
+             f"(in doc #{self.doc_index}/{self.doc_count})"
+         )
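For reference, a small sketch of the string form produced by ``__str__``; the values are made up:

from pathlib import Path
from genelastic.import_data.models.validate import ValidationIssue

issue = ValidationIssue(
    exc_type="TagsDefinitionError",
    file_path=Path("bundles/run42.yml"),
    file_index=3,
    file_count=7,
    doc_index=2,
    doc_count=5,
)
print(issue)  # [TagsDefinitionError] File 3/7: bundles/run42.yml (in doc #2/5)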
genelastic/import_data/patterns.py
@@ -0,0 +1,90 @@
+ import re
+ from pathlib import Path
+
+ from genelastic.common.types import Metadata
+ from genelastic.import_data.constants import TOOLS_SUFFIX_RE
+
+
+ class FilenamePattern:
+     """Utility class to extract metadata from filenames based on a regex
+     pattern.
+     """
+
+     def __init__(self, pattern: str) -> None:
+         """Initializes a FilenamePattern instance.
+
+         :param pattern: The regex pattern used to extract metadata from
+             filenames.
+         """
+         self._re = re.compile(pattern)
+
+     def extract_metadata(self, filename: str) -> Metadata:
+         """Extracts metadata from the given filename using the defined pattern.
+
+         :param filename: The filename from which metadata should be extracted.
+         :raises RuntimeError: If the filename does not match the pattern.
+         :returns: A dictionary containing the extracted metadata.
+         """
+         m = self._re.search(filename)
+         if not m:
+             msg = (
+                 f"Failed parsing filename '{filename}' with pattern "
+                 f"'{self._re.pattern}'."
+             )
+             raise RuntimeError(msg)
+
+         # Convert necessary values.
+         metadata = m.groupdict()
+         if "cov_depth" in metadata:
+             metadata["cov_depth"] = int(metadata["cov_depth"])
+
+         return metadata
+
+     def matches_pattern(self, filename: str) -> bool:
+         """Checks whether the given filename matches the defined pattern.
+
+         :param filename: The filename to check.
+         :returns: True if the filename matches the pattern, False otherwise.
+         """
+         return bool(self._re.fullmatch(filename))
+
+
+ class MetricsPattern:
+     """Utility class to extract tool/version metadata from filenames with a
+     ``.metrics`` suffix.
+     """
+
+     @staticmethod
+     def extract_metadata(file: Path) -> list[dict[str, str]] | None:
+         """Extracts metadata from a filename based on the ``.metrics`` suffix.
+
+         :param file: The path to the file to be analyzed.
+         :raises RuntimeError: If the suffix is malformed or cannot be parsed.
+         :returns:
+             - None if the file does not have a ``.metrics`` suffix,
+             - An empty list if the suffix is present but no metadata is found,
+             - A list of dictionaries with ``tool`` and ``version`` keys if
+               metadata is extracted.
+         """
+         if not file.suffixes or not file.suffixes[0].startswith(".metrics"):
+             return None
+
+         tools_str = file.suffixes[0].replace(".metrics", "")
+         matches = list(re.finditer(TOOLS_SUFFIX_RE, tools_str))
+         matched_str = "".join(m.group(0) for m in matches)
+
+         if matched_str != tools_str:
+             msg = (
+                 f"Failed extracting metrics from filename '{file}': "
+                 f"'{tools_str}' does not fully match pattern "
+                 f"'{TOOLS_SUFFIX_RE}'."
+             )
+             raise RuntimeError(msg)
+
+         return [
+             {
+                 "tool": m.group("tool"),
+                 "version": m.group("version").replace("-", "."),
+             }
+             for m in matches
+         ]
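A sketch of both extractors. The named groups in the FilenamePattern regex below are chosen for illustration; ``TOOLS_SUFFIX_RE`` is defined in genelastic/import_data/constants.py and is not visible in this diff, so only the None case of MetricsPattern is shown:

from pathlib import Path
from genelastic.import_data.patterns import FilenamePattern, MetricsPattern

# Illustrative pattern with named groups; each named group becomes a metadata key.
pattern = FilenamePattern(r"(?P<sample>[A-Z0-9]+)_(?P<cov_depth>\d+)x\.vcf")
meta = pattern.extract_metadata("S01_30x.vcf")
print(meta)  # {'sample': 'S01', 'cov_depth': 30} -- cov_depth is converted to int
print(pattern.matches_pattern("S01_30x.vcf"))  # True

# Files whose first suffix does not start with ".metrics" yield None.
print(MetricsPattern.extract_metadata(Path("S01_30x.vcf")))  # None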