genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. genelastic/api/extends/example.py +2 -3
  2. genelastic/api/routes.py +160 -23
  3. genelastic/api/server.py +30 -22
  4. genelastic/api/settings.py +3 -2
  5. genelastic/common/__init__.py +36 -9
  6. genelastic/common/cli.py +51 -23
  7. genelastic/common/elastic.py +80 -49
  8. genelastic/common/exceptions.py +0 -2
  9. genelastic/common/types.py +20 -15
  10. genelastic/import_data/__init__.py +23 -5
  11. genelastic/import_data/analyses.py +17 -20
  12. genelastic/import_data/analysis.py +69 -65
  13. genelastic/import_data/bi_process.py +7 -5
  14. genelastic/import_data/bi_processes.py +8 -8
  15. genelastic/import_data/cli_gen_data.py +116 -0
  16. genelastic/import_data/cli_import.py +379 -0
  17. genelastic/import_data/{info.py → cli_info.py} +104 -75
  18. genelastic/import_data/cli_integrity.py +384 -0
  19. genelastic/import_data/cli_validate.py +54 -0
  20. genelastic/import_data/constants.py +11 -32
  21. genelastic/import_data/data_file.py +23 -20
  22. genelastic/import_data/filename_pattern.py +26 -32
  23. genelastic/import_data/import_bundle.py +56 -47
  24. genelastic/import_data/import_bundle_factory.py +166 -158
  25. genelastic/import_data/logger.py +22 -18
  26. genelastic/import_data/random_bundle.py +402 -0
  27. genelastic/import_data/tags.py +46 -26
  28. genelastic/import_data/wet_process.py +8 -4
  29. genelastic/import_data/wet_processes.py +13 -8
  30. genelastic/ui/__init__.py +0 -0
  31. genelastic/ui/server.py +87 -0
  32. genelastic/ui/settings.py +11 -0
  33. genelastic-0.7.0.dist-info/METADATA +105 -0
  34. genelastic-0.7.0.dist-info/RECORD +40 -0
  35. {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
  36. genelastic-0.7.0.dist-info/entry_points.txt +6 -0
  37. genelastic/import_data/gen_data.py +0 -194
  38. genelastic/import_data/import_data.py +0 -292
  39. genelastic/import_data/integrity.py +0 -290
  40. genelastic/import_data/validate_data.py +0 -43
  41. genelastic-0.6.1.dist-info/METADATA +0 -41
  42. genelastic-0.6.1.dist-info/RECORD +0 -36
  43. genelastic-0.6.1.dist-info/entry_points.txt +0 -6
  44. {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,3 @@
-# pylint: disable=missing-module-docstring
 import datetime
 import logging
 import time
@@ -13,14 +12,15 @@ from elasticsearch import Elasticsearch
 from .exceptions import DBIntegrityError
 from .types import Bucket, BulkItems

-logger = logging.getLogger('genelastic')
+logger = logging.getLogger("genelastic")


-class ElasticConn(ABC):  # pylint: disable=too-few-public-methods
+class ElasticConn(ABC):
     """Abstract class representing a connector for an Elasticsearch server."""
+
     client: Elasticsearch

-    def __init__(self, url: str, fingerprint: str, **kwargs: Any):
+    def __init__(self, url: str, fingerprint: str, **kwargs: Any) -> None:  # noqa: ANN401
         """Initialize an elasticsearch client instance.

         :url: URL of the Elasticsearch host.
@@ -34,41 +34,49 @@ class ElasticConn(ABC): # pylint: disable=too-few-public-methods
                 ssl_assert_fingerprint=fingerprint,
                 # Verify cert only when the fingerprint is not None.
                 verify_certs=bool(fingerprint),
-                **kwargs
+                **kwargs,
             )
             self.client.info()
-        except (elastic_transport.TransportError, elasticsearch.AuthenticationException) as e:
+        except (
+            elastic_transport.TransportError,
+            elasticsearch.AuthenticationException,
+        ) as e:
             raise SystemExit(e) from e


-class ElasticImportConn(ElasticConn):  # pylint: disable=too-few-public-methods
+class ElasticImportConn(ElasticConn):
     """Connector to import data into an Elasticsearch database."""
-    def import_items(self, bulk_items: BulkItems,
-                     start_time: float,
-                     total_items: int) -> None:
+
+    def import_items(
+        self, bulk_items: BulkItems, start_time: float, total_items: int
+    ) -> None:
         """Import items to the Elasticsearch database."""
         if len(bulk_items) > 0:
             elasticsearch.helpers.bulk(self.client, bulk_items)
             elapsed = time.perf_counter() - start_time
-            logger.info("Imported %d items in %s (%f items/s).", total_items,
-                        datetime.timedelta(seconds=elapsed), total_items / elapsed)
+            logger.info(
+                "Imported %d items in %s (%f items/s).",
+                total_items,
+                datetime.timedelta(seconds=elapsed),
+                total_items / elapsed,
+            )


 class ElasticQueryConn(ElasticConn):
     """Connector to query data from an Elasticsearch database."""

-    def get_indices(self) -> Any | str:
+    def get_indices(self) -> Any | str:  # noqa: ANN401
         """Return all indices."""
         return self.client.cat.indices(format="json").body

-    def get_document_by_id(self, index: str, document_id: str) -> Any | str:
+    def get_document_by_id(self, index: str, document_id: str) -> Any | str:  # noqa: ANN401
         """Return a document by its ID."""
         return self.client.get(index=index, id=document_id).body

-    def run_composite_aggregation(self, index: str, query: dict[str, typing.Any]) \
-            -> list[Bucket]:
-        """
-        Executes a composite aggregation on an Elasticsearch index and
+    def run_composite_aggregation(
+        self, index: str, query: dict[str, typing.Any]
+    ) -> list[Bucket]:
+        """Executes a composite aggregation on an Elasticsearch index and
         returns all paginated results.

         :param index: Name of the index to query.
@@ -77,29 +85,39 @@ class ElasticQueryConn(ElasticConn):
         """
         # Extract the aggregation name from the query dict.
         agg_name = next(iter(query["aggs"]))
-        all_buckets: typing.List[Bucket] = []
+        all_buckets: list[Bucket] = []

         try:
-            logger.debug("Running composite aggregation query %s on index '%s'.", query, index)
+            logger.debug(
+                "Running composite aggregation query %s on index '%s'.",
+                query,
+                index,
+            )
             response = self.client.search(index=index, body=query)
         except elasticsearch.NotFoundError as e:
-            raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+            msg = f"Error: {e.message} for index '{index}'."
+            raise SystemExit(msg) from e

         while True:
             # Extract buckets from the response.
-            buckets: typing.List[Bucket] = response['aggregations'][agg_name]['buckets']
+            buckets: list[Bucket] = response["aggregations"][agg_name][
+                "buckets"
+            ]
             all_buckets.extend(buckets)

             # Check if there are more results to fetch.
-            if 'after_key' in response['aggregations'][agg_name]:
-                after_key = response['aggregations'][agg_name]['after_key']
-                query['aggs'][agg_name]['composite']['after'] = after_key
+            if "after_key" in response["aggregations"][agg_name]:
+                after_key = response["aggregations"][agg_name]["after_key"]
+                query["aggs"][agg_name]["composite"]["after"] = after_key
                 try:
-                    logger.debug("Running query %s on index '%s'.", query, index)
+                    logger.debug(
+                        "Running query %s on index '%s'.", query, index
+                    )
                     # Fetch the next page of results.
                     response = self.client.search(index=index, body=query)
                 except elasticsearch.NotFoundError as e:
-                    raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+                    msg = f"Error: {e.message} for index '{index}'."
+                    raise SystemExit(msg) from e
             else:
                 break

@@ -114,25 +132,34 @@ class ElasticQueryConn(ElasticConn):
             "aggs": {
                 "get_field_values": {
                     "composite": {
-                        "sources": {"values": {"terms": {"field": f"{field_name}.keyword"}}},
+                        "sources": {
+                            "values": {
+                                "terms": {"field": f"{field_name}.keyword"}
+                            }
+                        },
                         "size": 1000,
                     }
                 }
-            }
+            },
         }

-        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
+        buckets: list[Bucket] = self.run_composite_aggregation(index, query)

         for bucket in buckets:
-            values.add(bucket['key']['values'])
+            values.add(bucket["key"]["values"])

         return values

-    def search_by_field_value(self, index: str, field: str, value: str) -> (
-            typing.Dict[str, typing.Any] | None):
+    def search_by_field_value(
+        self, index: str, field: str, value: str
+    ) -> dict[str, typing.Any] | None:
         """Search a document by a value for a certain field."""
-        logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
-                    field, value, index)
+        logger.info(
+            "Searching for field '%s' with value '%s' inside index '%s'.",
+            field,
+            value,
+            index,
+        )
         search_query = {
             "query": {
                 "term": {
@@ -144,22 +171,23 @@ class ElasticQueryConn(ElasticConn):
         response = self.client.search(index=index, body=search_query)

         try:
-            return response['hits']['hits'][0]['_source']  # type: ignore
+            return response["hits"]["hits"][0]["_source"]  # type: ignore[no-any-return]
         except KeyError:
             return None

     def ensure_unique(self, index: str, field: str) -> None:
-        """
-        Ensure that all values of a field in an index are all unique.
+        """Ensure that all values of a field in an index are all unique.

         :param index: Name of the index.
         :param field: Field name to check for value uniqueness.
         :raises genelastic.common.DBIntegrityError:
             Some values of the given field are duplicated in the index.
         """
-
-        logger.info("Ensuring that the field '%s' in the index '%s' only contains unique values...",
-                    field, index)
+        logger.info(
+            "Ensuring that the field '%s' in the index '%s' only contains unique values...",
+            field,
+            index,
+        )
         query = {
             "size": 0,
             "aggs": {
@@ -167,17 +195,20 @@ class ElasticQueryConn(ElasticConn):
                 "terms": {
                     "field": f"{field}.keyword",
                     "size": 10000,
-                    "min_doc_count": 2
+                    "min_doc_count": 2,
                 }
             }
-        }
+            },
+        }
+        buckets: list[Bucket] = self.run_composite_aggregation(index, query)
+        duplicated_processes: set[str] = {
+            str(bucket["key"]) for bucket in buckets
         }
-        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
-        duplicated_processes: typing.Set[str] = set(map(lambda bucket: str(bucket["key"]), buckets))

         if len(duplicated_processes) > 0:
-            raise DBIntegrityError(f"Found non-unique value for field {field} in index '{index}': "
-                                   f"{', '.join(duplicated_processes)}.")
+            msg = f"Found non-unique value for field {field} in index '{index}': {', '.join(duplicated_processes)}."
+            raise DBIntegrityError(msg)

-        logger.info("All values of field '%s' in index '%s' are unique.",
-                    field, index)
+        logger.info(
+            "All values of field '%s' in index '%s' are unique.", field, index
+        )
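For orientation, nothing in ElasticQueryConn's query surface changed in this release; the edits above are formatting, quoting and lint-marker cleanups. A minimal usage sketch of the paginated composite aggregation, assuming a reachable Elasticsearch node; the URL, empty fingerprint, index name and field name below are illustrative placeholders, not values taken from the package:

from genelastic.common.elastic import ElasticQueryConn

# Hypothetical connection settings; an empty fingerprint disables certificate
# verification, mirroring the verify_certs=bool(fingerprint) logic shown above.
conn = ElasticQueryConn("https://localhost:9200", fingerprint="")

# Composite aggregation over a hypothetical "analyses" index and "sample" field.
# run_composite_aggregation() follows "after_key" until every page is collected.
query = {
    "size": 0,
    "aggs": {
        "get_field_values": {
            "composite": {
                "sources": {"values": {"terms": {"field": "sample.keyword"}}},
                "size": 1000,
            }
        }
    },
}
buckets = conn.run_composite_aggregation("analyses", query)
print(f"Collected {len(buckets)} buckets.")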
@@ -1,5 +1,3 @@
-# pylint: disable=missing-module-docstring
-
 class DBIntegrityError(Exception):
     """Represents an integrity error,
     raised when the database content does not match the expected data schema.
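DBIntegrityError itself is unchanged apart from the dropped pylint pragma; it remains the error raised by ElasticQueryConn.ensure_unique when duplicated values are found. A short handling sketch, reusing the hypothetical connection, index and field names from the previous example:

from genelastic.common.elastic import ElasticQueryConn
from genelastic.common.exceptions import DBIntegrityError

conn = ElasticQueryConn("https://localhost:9200", fingerprint="")
try:
    # Hypothetical index and field names.
    conn.ensure_unique(index="analyses", field="analysis_id")
except DBIntegrityError as err:
    print(f"Integrity check failed: {err}")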
@@ -1,20 +1,25 @@
-# pylint: disable=missing-module-docstring
-
 import typing

+# Types related to Elasticsearch data import.
 Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
+BundleDict: typing.TypeAlias = dict[str, typing.Any]
+
+AnalysisMetaData: typing.TypeAlias = dict[str, str | int]
+WetProcessesData: typing.TypeAlias = dict[str, str | int | float]
+BioInfoProcessData: typing.TypeAlias = dict[str, str | list[str]]

-AnalysisMetaData: typing.TypeAlias = typing.Dict[str, str | int]
-WetProcessesData: typing.TypeAlias = typing.Dict[str, str | int | float]
-BioInfoProcessData: typing.TypeAlias = typing.Dict[str, str | typing.List[str]]
-BundleDict: typing.TypeAlias = typing.Dict[str, typing.Any]
+AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]
+MetadataDocument: typing.TypeAlias = dict[
+    str, int | str | list[typing.Any | None]
+]
+ProcessDocument: typing.TypeAlias = (
+    dict[str, str] | WetProcessesData | BioInfoProcessData
+)
+BulkItems: typing.TypeAlias = list[
+    dict[str, str | MetadataDocument | AnalysisDocument | ProcessDocument]
+]

-AnalysisDocument: typing.TypeAlias = typing.Dict[str, str | None | AnalysisMetaData]
-MetadataDocument: typing.TypeAlias = typing.Dict[str, int | str | typing.List[typing.Any | None]]
-ProcessDocument: typing.TypeAlias = (typing.Dict[str, str] |
-                                     WetProcessesData |
-                                     BioInfoProcessData)
-BulkItems: typing.TypeAlias = typing.List[typing.Dict[str, str |
-                                                      MetadataDocument |
-                                                      AnalysisDocument |
-                                                      ProcessDocument]]
+# Types related to random bundle generation.
+RandomBiProcessData: typing.TypeAlias = dict[str, str | list[dict[str, str]]]
+RandomWetProcessData: typing.TypeAlias = dict[str, str | float]
+RandomAnalysisData: typing.TypeAlias = dict[str, str | list[int | str]]
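The aliases now use built-in generics (dict, list) instead of typing.Dict/typing.List, so they read the same way they are written in annotations. A minimal sketch; only the Bucket alias comes from the module, the helper itself is hypothetical:

import typing

from genelastic.common.types import Bucket


def bucket_keys(buckets: list[Bucket]) -> list[dict[typing.Any, typing.Any]]:
    # Each composite-aggregation bucket stores its grouping values under "key".
    return [bucket["key"] for bucket in buckets]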
@@ -1,9 +1,27 @@
 """Genelastic package for importing Genomic data into Elasticsearch."""
+
 from .analysis import Analysis
-from .import_bundle_factory import (make_import_bundle_from_files,
-                                    load_import_bundle_file)
-from .tags import Tags
 from .import_bundle import ImportBundle
+from .import_bundle_factory import (
+    load_import_bundle_file,
+    make_import_bundle_from_files,
+)
+from .random_bundle import (
+    RandomAnalysis,
+    RandomBiProcess,
+    RandomBundle,
+    RandomWetProcess,
+)
+from .tags import Tags

-__all__ = ['Analysis', 'Tags', 'ImportBundle', 'make_import_bundle_from_files',
-           'load_import_bundle_file']
+__all__ = [
+    "Analysis",
+    "ImportBundle",
+    "RandomAnalysis",
+    "RandomBiProcess",
+    "RandomBundle",
+    "RandomWetProcess",
+    "Tags",
+    "load_import_bundle_file",
+    "make_import_bundle_from_files",
+]
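With the expanded __all__, downstream code can import the public names, including the new random-bundle generators, directly from the package root instead of reaching into submodules:

from genelastic.import_data import (
    Analysis,
    ImportBundle,
    RandomBundle,
    Tags,
    load_import_bundle_file,
)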
@@ -1,23 +1,22 @@
-# pylint: disable=missing-module-docstring
 import typing

-from genelastic.common import BundleDict
+from genelastic.common import BundleDict

 from .analysis import Analysis
 from .data_file import DataFile

+
 class Analyses:
-    """Class Analyses is a container of Analysis objects.
-    """
+    """Class Analyses is a container of Analysis objects."""

     def __init__(self) -> None:
-        self._arr: typing.List[Analysis] = []
+        self._arr: list[Analysis] = []
         self._iter_index: int = 0

     def __len__(self) -> int:
         return len(self._arr)

-    def __iter__(self) -> typing.Generator[Analysis, typing.Any, None]:
+    def __iter__(self) -> typing.Iterator[Analysis]:
         yield from self._arr

     @typing.overload
@@ -25,13 +24,13 @@ class Analyses:
         pass

     @typing.overload
-    def __getitem__(self, k: slice) -> typing.List[Analysis]:
+    def __getitem__(self, k: slice) -> list[Analysis]:
         pass

-    def __getitem__(self, k):  # type: ignore
+    def __getitem__(self, k):  # type: ignore[no-untyped-def]
         if isinstance(k, int):
             return self._arr[k]
-        return self._arr[k.start:k.stop]
+        return self._arr[k.start : k.stop]

     def add(self, a: Analysis) -> None:
         """Add one Analysis object."""
@@ -39,20 +38,18 @@ class Analyses:

     def get_nb_files(self, cat: str | None = None) -> int:
         """Get the total number of files as paths."""
-        return len(self.get_data_files(cat = cat))
-
-    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
-        """Get the total number of files as DataFile objects.
-        """
+        return len(self.get_data_files(cat=cat))

-        data_files: typing.List[DataFile] = []
+    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
+        """Get the total number of files as DataFile objects."""
+        data_files: list[DataFile] = []

         for a in self._arr:
-            data_files.extend(a.get_data_files(cat = cat))
+            data_files.extend(a.get_data_files(cat=cat))

         return data_files

-    def get_all_categories(self) -> typing.Set[str]:
+    def get_all_categories(self) -> set[str]:
         """Return all the categories of the analyses."""
         categories = set()
         for a in self._arr:
@@ -60,10 +57,10 @@ class Analyses:
         return categories

     @classmethod
-    def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
-                            ) -> typing.Self:
+    def from_array_of_dicts(
+        cls, arr: typing.Sequence[BundleDict]
+    ) -> typing.Self:
         """Build an Analyses instance."""
-
         analyses = cls()

         for d in arr:
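Analyses stays a small sequence-like container around Analysis objects; the methods shown above are its whole public surface. A usage sketch, assuming bundle dictionaries parsed from an import-bundle YAML (their exact keys are not shown in this diff, so the empty list below is a placeholder):

from genelastic.import_data.analyses import Analyses

# Placeholder: fill with dictionaries parsed from an import bundle YAML file.
bundle_dicts: list[dict] = []

analyses = Analyses.from_array_of_dicts(bundle_dicts)
# "vcf" is one of the file categories used by the filename regex shown below.
print(len(analyses), "analyses,", analyses.get_nb_files(cat="vcf"), "VCF files")
for analysis in analyses:
    print(sorted(analysis.get_all_categories()))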
@@ -1,8 +1,5 @@
-# pylint: disable=missing-module-docstring
 import copy
-import glob
 import logging
-import os
 import re
 import typing
 from pathlib import Path
@@ -14,28 +11,31 @@ from .data_file import DataFile
 from .filename_pattern import FilenamePattern
 from .tags import Tags

-logger = logging.getLogger('genelastic')
+logger = logging.getLogger("genelastic")


 class Analysis:
     """Class Analysis that represents an analysis."""

-    # pylint: disable-next=too-many-arguments, too-many-positional-arguments
-    def __init__(self,
-                 tags: Tags,
-                 root_dir: str = '.',
-                 bundle_file: str | None = None,
-                 file_prefix: str | None = None,
-                 files: typing.Sequence[str] | None = None,
-                 data_path: str | None = None,
-                 **metadata: str | int) -> None:
-        self._bundle_file = bundle_file
+    def __init__(  # noqa: PLR0913
+        self,
+        tags: Tags,
+        root_dir: str = ".",
+        bundle_file: str | None = None,
+        file_prefix: str | None = None,
+        files: typing.Sequence[str] | None = None,
+        data_path: str | None = None,
+        **metadata: str | int,
+    ) -> None:
+        self._bundle_file = Path(bundle_file) if bundle_file else None
         self._file_prefix = file_prefix
         self._files = files
-        self._data_path = Analysis._resolve_data_path(root_dir, data_path)
+        self._data_path = Analysis._resolve_data_path(
+            Path(root_dir), Path(data_path) if data_path else None
+        )
         self._tags = tags
         self._metadata: AnalysisMetaData = metadata
-        self._categories: typing.Set[str] = set()
+        self._categories: set[str] = set()

     @property
     def metadata(self) -> AnalysisMetaData:
@@ -43,17 +43,15 @@ class Analysis:
         return copy.deepcopy(self._metadata)

     @property
-    def bundle_file(self) -> str | None:
+    def bundle_file(self) -> Path | None:
         """Get the bundle file."""
         return self._bundle_file

     @property
     def filename_regex(self) -> str:
-        """
-        Resolve placeholders in a file prefix using metadata
+        """Resolve placeholders in a file prefix using metadata
         and unresolved placeholders are converted to regex groups
         """
-
         x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"

         # Use existing generic prefix
@@ -65,84 +63,87 @@ class Analysis:
                 regex = tag_attrs["regex"]

                 # Build field regex
-                field_regex = (f"(?P<{field}>{self._metadata.get(field)})"
-                               if field in self._metadata else
-                               f"(?P<{field}>{regex})")
+                field_regex = (
+                    f"(?P<{field}>{self._metadata.get(field)})"
+                    if field in self._metadata
+                    else f"(?P<{field}>{regex})"
+                )
                 # Replace tag with field regex
                 x = x.replace(tag_name, field_regex)

         # Check for tags that were not replaced.
         groups = re.findall(self._tags.search_regex, x)
         for match in groups:
-            logger.warning("String '%s' in key 'file_prefix' looks like an undefined tag. "
-                           "If this string is not a tag, you can ignore this warning.",
-                           match)
+            logger.warning(
+                "String '%s' in key 'file_prefix' looks like an undefined tag. "
+                "If this string is not a tag, you can ignore this warning.",
+                match,
+            )

         # Add missing start and end markers
         if not x.startswith("^"):
             x = "^" + x
         if not x.endswith("$"):
-            x += (r"\.(?P<ext>" + '|'.join(ALLOWED_CATEGORIES)
-                  + r")(\.gz)?$")
+            x += r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
         logger.debug("File regex for %s: %s", self._bundle_file, x)

         return x

     def get_nb_files(self, cat: str | None = None) -> int:
-        """Returns the total number of files.
-        """
+        """Returns the total number of files."""
         return len(self.get_file_paths(cat=cat))

-    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
-        """Returns the list of matched files as DataFile objects.
-        """
-
+    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
+        """Returns the list of matched files as DataFile objects."""
         files = self.get_file_paths(cat=cat)
         filename_pattern = FilenamePattern(self.filename_regex)

-        data_files: typing.List[DataFile] = []
+        data_files: list[DataFile] = []

         for f in files:
             try:
-                data_files.append(DataFile.make_from_bundle(
-                    path=f, bundle_path=self._bundle_file,
-                    pattern=filename_pattern))
-            except (IOError, ValueError) as e:
+                data_files.append(
+                    DataFile.make_from_bundle(
+                        path=f,
+                        bundle_path=self._bundle_file,
+                        pattern=filename_pattern,
+                    )
+                )
+            except (OSError, ValueError) as e:
                 logger.error("Error processing file %s: %s", f, str(e))

         return data_files

-    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[str]:
-        """Returns the list of matched files.
-        """
+    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[Path]:
+        """Returns the list of matched files."""
         files, _, _ = self._do_get_file_paths(cat=cat)
         return files

-    def get_unmatched_file_paths(self, cat: str | None = None
-                                 ) -> typing.Sequence[str]:
-        """Returns the list of unmatched files.
-        """
+    def get_unmatched_file_paths(
+        self, cat: str | None = None
+    ) -> typing.Sequence[Path]:
+        """Returns the list of unmatched files."""
         _, files, _ = self._do_get_file_paths(cat=cat)
         return files

-    def get_all_categories(self) -> typing.Set[str]:
+    def get_all_categories(self) -> set[str]:
         """Returns all categories of the analysis."""
         _, _, categories = self._do_get_file_paths()
         return categories

     @staticmethod
-    def _resolve_data_path(root_dir: str, data_path: str | None) -> str:
-        resolved_data_path = '' if data_path is None else data_path
+    def _resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
+        resolved_data_path = Path() if data_path is None else data_path

-        if not os.path.isabs(resolved_data_path):
-            resolved_data_path = os.path.abspath(os.path.join(root_dir, resolved_data_path))
+        if not resolved_data_path.is_absolute():
+            resolved_data_path = (root_dir / resolved_data_path).absolute()

         return resolved_data_path

-    def _get_files_with_allowed_categories(self) -> typing.Dict[str, str]:
+    def _get_files_with_allowed_categories(self) -> dict[Path, str]:
         # Create a dict to store allowed files. Keys are the filepaths,
         # and values are their corresponding category.
-        allowed_files: typing.Dict[str, str] = {}
+        allowed_files: dict[Path, str] = {}
         # If files are listed explicitly in the YAML in the 'files' attribute, process them.
         if self._files is not None:
             abs_filepaths = [Path(self._data_path) / f for f in self._files]
@@ -151,14 +152,14 @@ class Analysis:
                 cat = file.suffixes[0][1:]
                 # Add each matching file and its category to the dict.
                 if cat in ALLOWED_CATEGORIES:
-                    allowed_files[str(file)] = cat
+                    allowed_files[file] = cat
         # Else, look for files on disk using the YAML 'data_path' attribute.
         else:
             # Try to retrieve files matching allowed categories using glob.
             for cat in ALLOWED_CATEGORIES:
-                glob_res = []
-                glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}")))
-                glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}.gz")))
+                glob_res: list[Path] = []
+                glob_res.extend(self._data_path.glob(f"*.{cat}"))
+                glob_res.extend(self._data_path.glob(f"*.{cat}.gz"))

                 # Add each globed file and its category to the dict.
                 for g_file in glob_res:
@@ -166,12 +167,13 @@ class Analysis:

         return allowed_files

-    def _do_get_file_paths(self, cat: str | None = None) \
-            -> tuple[typing.Sequence[str], typing.Sequence[str], typing.Set[str]]:
-
+    def _do_get_file_paths(
+        self, cat: str | None = None
+    ) -> tuple[typing.Sequence[Path], typing.Sequence[Path], set[str]]:
         # Raise an error if the category given as a parameter is not part of the allowed categories.
         if cat is not None and cat not in ALLOWED_CATEGORIES:
-            raise ValueError(f"Unknown category {cat}.")
+            msg = f"Unknown category {cat}."
+            raise ValueError(msg)

         # Obtain a dict of all files matching the allowed categories.
         allowed_files = self._get_files_with_allowed_categories()
@@ -181,16 +183,18 @@ class Analysis:
             files_to_match = allowed_files
         else:
             # A category was given as a parameter, so we match only this specific category.
-            files_to_match = dict((k, v) for (k, v) in allowed_files.items() if v == cat)
+            files_to_match = {
+                k: v for k, v in allowed_files.items() if v == cat
+            }

         filename_pattern = FilenamePattern(self.filename_regex)
-        matching_files: typing.List[str] = []
-        non_matching_files: typing.List[str] = []
+        matching_files: list[Path] = []
+        non_matching_files: list[Path] = []
         categories = set()

         # We filter files by ensuring that they match the filename pattern defined in the analysis.
         for file, category in sorted(files_to_match.items()):
-            if filename_pattern.matches_pattern(os.path.basename(file)):
+            if filename_pattern.matches_pattern(file.name):
                 matching_files.append(file)
                 logger.info("MATCHED file %s.", file)
                 # Add the file category to the categories set.