genelastic 0.6.1__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/extends/example.py +2 -3
- genelastic/api/routes.py +160 -23
- genelastic/api/server.py +30 -22
- genelastic/api/settings.py +3 -2
- genelastic/common/__init__.py +36 -9
- genelastic/common/cli.py +51 -23
- genelastic/common/elastic.py +80 -49
- genelastic/common/exceptions.py +0 -2
- genelastic/common/types.py +20 -15
- genelastic/import_data/__init__.py +23 -5
- genelastic/import_data/analyses.py +17 -20
- genelastic/import_data/analysis.py +69 -65
- genelastic/import_data/bi_process.py +7 -5
- genelastic/import_data/bi_processes.py +8 -8
- genelastic/import_data/cli_gen_data.py +116 -0
- genelastic/import_data/cli_import.py +379 -0
- genelastic/import_data/{info.py → cli_info.py} +104 -75
- genelastic/import_data/cli_integrity.py +384 -0
- genelastic/import_data/cli_validate.py +54 -0
- genelastic/import_data/constants.py +11 -32
- genelastic/import_data/data_file.py +23 -20
- genelastic/import_data/filename_pattern.py +26 -32
- genelastic/import_data/import_bundle.py +56 -47
- genelastic/import_data/import_bundle_factory.py +166 -158
- genelastic/import_data/logger.py +22 -18
- genelastic/import_data/random_bundle.py +402 -0
- genelastic/import_data/tags.py +46 -26
- genelastic/import_data/wet_process.py +8 -4
- genelastic/import_data/wet_processes.py +13 -8
- genelastic/ui/__init__.py +0 -0
- genelastic/ui/server.py +87 -0
- genelastic/ui/settings.py +11 -0
- genelastic-0.7.0.dist-info/METADATA +105 -0
- genelastic-0.7.0.dist-info/RECORD +40 -0
- {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/WHEEL +1 -1
- genelastic-0.7.0.dist-info/entry_points.txt +6 -0
- genelastic/import_data/gen_data.py +0 -194
- genelastic/import_data/import_data.py +0 -292
- genelastic/import_data/integrity.py +0 -290
- genelastic/import_data/validate_data.py +0 -43
- genelastic-0.6.1.dist-info/METADATA +0 -41
- genelastic-0.6.1.dist-info/RECORD +0 -36
- genelastic-0.6.1.dist-info/entry_points.txt +0 -6
- {genelastic-0.6.1.dist-info → genelastic-0.7.0.dist-info}/top_level.txt +0 -0
genelastic/common/elastic.py
CHANGED
@@ -1,4 +1,3 @@
-# pylint: disable=missing-module-docstring
 import datetime
 import logging
 import time
@@ -13,14 +12,15 @@ from elasticsearch import Elasticsearch
 from .exceptions import DBIntegrityError
 from .types import Bucket, BulkItems

-logger = logging.getLogger(
+logger = logging.getLogger("genelastic")


-class ElasticConn(ABC):
+class ElasticConn(ABC):
     """Abstract class representing a connector for an Elasticsearch server."""
+
     client: Elasticsearch

-    def __init__(self, url: str, fingerprint: str, **kwargs: Any):
+    def __init__(self, url: str, fingerprint: str, **kwargs: Any) -> None:  # noqa: ANN401
         """Initialize an elasticsearch client instance.

         :url: URL of the Elasticsearch host.
@@ -34,41 +34,49 @@ class ElasticConn(ABC): # pylint: disable=too-few-public-methods
                 ssl_assert_fingerprint=fingerprint,
                 # Verify cert only when the fingerprint is not None.
                 verify_certs=bool(fingerprint),
-                **kwargs
+                **kwargs,
             )
             self.client.info()
-        except (
+        except (
+            elastic_transport.TransportError,
+            elasticsearch.AuthenticationException,
+        ) as e:
             raise SystemExit(e) from e


-class ElasticImportConn(ElasticConn):
+class ElasticImportConn(ElasticConn):
     """Connector to import data into an Elasticsearch database."""
-
-
-
+
+    def import_items(
+        self, bulk_items: BulkItems, start_time: float, total_items: int
+    ) -> None:
         """Import items to the Elasticsearch database."""
         if len(bulk_items) > 0:
             elasticsearch.helpers.bulk(self.client, bulk_items)
             elapsed = time.perf_counter() - start_time
-            logger.info(
-
+            logger.info(
+                "Imported %d items in %s (%f items/s).",
+                total_items,
+                datetime.timedelta(seconds=elapsed),
+                total_items / elapsed,
+            )


 class ElasticQueryConn(ElasticConn):
     """Connector to query data from an Elasticsearch database."""

-    def get_indices(self) -> Any | str:
+    def get_indices(self) -> Any | str:  # noqa: ANN401
         """Return all indices."""
         return self.client.cat.indices(format="json").body

-    def get_document_by_id(self, index: str, document_id: str) -> Any | str:
+    def get_document_by_id(self, index: str, document_id: str) -> Any | str:  # noqa: ANN401
         """Return a document by its ID."""
         return self.client.get(index=index, id=document_id).body

-    def run_composite_aggregation(
-
-
-        Executes a composite aggregation on an Elasticsearch index and
+    def run_composite_aggregation(
+        self, index: str, query: dict[str, typing.Any]
+    ) -> list[Bucket]:
+        """Executes a composite aggregation on an Elasticsearch index and
         returns all paginated results.

         :param index: Name of the index to query.
@@ -77,29 +85,39 @@ class ElasticQueryConn(ElasticConn):
         """
         # Extract the aggregation name from the query dict.
         agg_name = next(iter(query["aggs"]))
-        all_buckets:
+        all_buckets: list[Bucket] = []

         try:
-            logger.debug(
+            logger.debug(
+                "Running composite aggregation query %s on index '%s'.",
+                query,
+                index,
+            )
             response = self.client.search(index=index, body=query)
         except elasticsearch.NotFoundError as e:
-
+            msg = f"Error: {e.message} for index '{index}'."
+            raise SystemExit(msg) from e

         while True:
             # Extract buckets from the response.
-            buckets:
+            buckets: list[Bucket] = response["aggregations"][agg_name][
+                "buckets"
+            ]
             all_buckets.extend(buckets)

             # Check if there are more results to fetch.
-            if
-                after_key = response[
-                query[
+            if "after_key" in response["aggregations"][agg_name]:
+                after_key = response["aggregations"][agg_name]["after_key"]
+                query["aggs"][agg_name]["composite"]["after"] = after_key
                 try:
-                    logger.debug(
+                    logger.debug(
+                        "Running query %s on index '%s'.", query, index
+                    )
                     # Fetch the next page of results.
                     response = self.client.search(index=index, body=query)
                 except elasticsearch.NotFoundError as e:
-
+                    msg = f"Error: {e.message} for index '{index}'."
+                    raise SystemExit(msg) from e
             else:
                 break

@@ -114,25 +132,34 @@ class ElasticQueryConn(ElasticConn):
             "aggs": {
                 "get_field_values": {
                     "composite": {
-                        "sources": {
+                        "sources": {
+                            "values": {
+                                "terms": {"field": f"{field_name}.keyword"}
+                            }
+                        },
                         "size": 1000,
                     }
                 }
-            }
+            },
         }

-        buckets:
+        buckets: list[Bucket] = self.run_composite_aggregation(index, query)

         for bucket in buckets:
-            values.add(bucket[
+            values.add(bucket["key"]["values"])

         return values

-    def search_by_field_value(
-
+    def search_by_field_value(
+        self, index: str, field: str, value: str
+    ) -> dict[str, typing.Any] | None:
         """Search a document by a value for a certain field."""
-        logger.info(
-
+        logger.info(
+            "Searching for field '%s' with value '%s' inside index '%s'.",
+            field,
+            value,
+            index,
+        )
         search_query = {
             "query": {
                 "term": {
@@ -144,22 +171,23 @@ class ElasticQueryConn(ElasticConn):
         response = self.client.search(index=index, body=search_query)

         try:
-            return response[
+            return response["hits"]["hits"][0]["_source"]  # type: ignore[no-any-return]
         except KeyError:
             return None

     def ensure_unique(self, index: str, field: str) -> None:
-        """
-        Ensure that all values of a field in an index are all unique.
+        """Ensure that all values of a field in an index are all unique.

         :param index: Name of the index.
         :param field: Field name to check for value uniqueness.
         :raises genelastic.common.DBIntegrityError:
             Some values of the given field are duplicated in the index.
         """
-
-
-
+        logger.info(
+            "Ensuring that the field '%s' in the index '%s' only contains unique values...",
+            field,
+            index,
+        )
         query = {
             "size": 0,
             "aggs": {
@@ -167,17 +195,20 @@ class ElasticQueryConn(ElasticConn):
                 "terms": {
                     "field": f"{field}.keyword",
                     "size": 10000,
-                    "min_doc_count": 2
+                    "min_doc_count": 2,
                 }
             }
-            }
+            },
+        }
+        buckets: list[Bucket] = self.run_composite_aggregation(index, query)
+        duplicated_processes: set[str] = {
+            str(bucket["key"]) for bucket in buckets
         }
-        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
-        duplicated_processes: typing.Set[str] = set(map(lambda bucket: str(bucket["key"]), buckets))

         if len(duplicated_processes) > 0:
-
-
+            msg = f"Found non-unique value for field {field} in index '{index}': {', '.join(duplicated_processes)}."
+            raise DBIntegrityError(msg)

-        logger.info(
-
+        logger.info(
+            "All values of field '%s' in index '%s' are unique.", field, index
+        )
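The reworked `run_composite_aggregation` pages through composite-aggregation results by re-issuing the search with the `after_key` carried in each response. Below is a minimal standalone sketch of that pagination pattern; the cluster URL, index name (`samples`) and field (`sample_id`) are illustrative assumptions, not values taken from the package.

```python
# Minimal sketch of composite-aggregation pagination (assumed index/field names).
from elasticsearch import Elasticsearch

es = Elasticsearch("https://localhost:9200")

agg_name = "get_field_values"
query = {
    "size": 0,
    "aggs": {
        agg_name: {
            "composite": {
                "sources": {"values": {"terms": {"field": "sample_id.keyword"}}},
                "size": 1000,  # buckets per page
            }
        }
    },
}

all_buckets = []
response = es.search(index="samples", body=query)
while True:
    agg = response["aggregations"][agg_name]
    all_buckets.extend(agg["buckets"])
    # Same termination test as the diffed method: keep paging while an
    # "after_key" is present, resuming the aggregation after it.
    if "after_key" not in agg:
        break
    query["aggs"][agg_name]["composite"]["after"] = agg["after_key"]
    response = es.search(index="samples", body=query)

print(f"Collected {len(all_buckets)} buckets.")
```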
genelastic/common/exceptions.py
CHANGED
genelastic/common/types.py
CHANGED
@@ -1,20 +1,25 @@
-# pylint: disable=missing-module-docstring
-
 import typing

+# Types related to Elasticsearch data import.
 Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
+BundleDict: typing.TypeAlias = dict[str, typing.Any]
+
+AnalysisMetaData: typing.TypeAlias = dict[str, str | int]
+WetProcessesData: typing.TypeAlias = dict[str, str | int | float]
+BioInfoProcessData: typing.TypeAlias = dict[str, str | list[str]]

-
-
-
-
+AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]
+MetadataDocument: typing.TypeAlias = dict[
+    str, int | str | list[typing.Any | None]
+]
+ProcessDocument: typing.TypeAlias = (
+    dict[str, str] | WetProcessesData | BioInfoProcessData
+)
+BulkItems: typing.TypeAlias = list[
+    dict[str, str | MetadataDocument | AnalysisDocument | ProcessDocument]
+]

-
-
-
-
-                                        BioInfoProcessData)
-BulkItems: typing.TypeAlias = typing.List[typing.Dict[str, str |
-                                                      MetadataDocument |
-                                                      AnalysisDocument |
-                                                      ProcessDocument]]
+# Types related to random bundle generation.
+RandomBiProcessData: typing.TypeAlias = dict[str, str | list[dict[str, str]]]
+RandomWetProcessData: typing.TypeAlias = dict[str, str | float]
+RandomAnalysisData: typing.TypeAlias = dict[str, str | list[int | str]]
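The 0.7.0 aliases drop the deprecated `typing.List`/`typing.Dict` spellings in favour of built-in generics. A small illustration of annotating a document with this style of alias follows; it uses a simplified subset of the aliases, and the keys and values are invented for the example.

```python
import typing

# Simplified subset of the alias shapes shown in the diff above.
AnalysisMetaData: typing.TypeAlias = dict[str, str | int]
AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]

# Hypothetical document annotated with the aliases.
doc: AnalysisDocument = {
    "sample": "S1",
    "pipeline": None,
    "metadata": {"run": 1, "center": "lab-a"},
}
```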
genelastic/import_data/__init__.py
CHANGED
@@ -1,9 +1,27 @@
 """Genelastic package for importing Genomic data into Elasticsearch."""
+
 from .analysis import Analysis
-from .import_bundle_factory import (make_import_bundle_from_files,
-                                    load_import_bundle_file)
-from .tags import Tags
 from .import_bundle import ImportBundle
+from .import_bundle_factory import (
+    load_import_bundle_file,
+    make_import_bundle_from_files,
+)
+from .random_bundle import (
+    RandomAnalysis,
+    RandomBiProcess,
+    RandomBundle,
+    RandomWetProcess,
+)
+from .tags import Tags

-__all__ = [
-
+__all__ = [
+    "Analysis",
+    "ImportBundle",
+    "RandomAnalysis",
+    "RandomBiProcess",
+    "RandomBundle",
+    "RandomWetProcess",
+    "Tags",
+    "load_import_bundle_file",
+    "make_import_bundle_from_files",
+]
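With the reworked `__init__.py`, the random-bundle helpers are re-exported from the package root alongside the factory functions, so downstream code can import them directly. A minimal sketch, assuming only the import surface listed in `__all__` above:

```python
# Names re-exported by genelastic.import_data in 0.7.0, per the __all__ above.
from genelastic.import_data import (
    RandomBundle,
    Tags,
    load_import_bundle_file,
    make_import_bundle_from_files,
)
```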
genelastic/import_data/analyses.py
CHANGED
@@ -1,23 +1,22 @@
-# pylint: disable=missing-module-docstring
 import typing

-from
+from genelastic.common import BundleDict

 from .analysis import Analysis
 from .data_file import DataFile

+
 class Analyses:
-    """Class Analyses is a container of Analysis objects.
-    """
+    """Class Analyses is a container of Analysis objects."""

     def __init__(self) -> None:
-        self._arr:
+        self._arr: list[Analysis] = []
         self._iter_index: int = 0

     def __len__(self) -> int:
         return len(self._arr)

-    def __iter__(self) -> typing.
+    def __iter__(self) -> typing.Iterator[Analysis]:
         yield from self._arr

     @typing.overload
@@ -25,13 +24,13 @@ class Analyses:
         pass

     @typing.overload
-    def __getitem__(self, k: slice) ->
+    def __getitem__(self, k: slice) -> list[Analysis]:
         pass

-    def __getitem__(self, k):
+    def __getitem__(self, k):  # type: ignore[no-untyped-def]
         if isinstance(k, int):
             return self._arr[k]
-        return self._arr[k.start:k.stop]
+        return self._arr[k.start : k.stop]

     def add(self, a: Analysis) -> None:
         """Add one Analysis object."""
@@ -39,20 +38,18 @@ class Analyses:

     def get_nb_files(self, cat: str | None = None) -> int:
         """Get the total number of files as paths."""
-        return len(self.get_data_files(cat
-
-    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
-        """Get the total number of files as DataFile objects.
-        """
+        return len(self.get_data_files(cat=cat))

-
+    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
+        """Get the total number of files as DataFile objects."""
+        data_files: list[DataFile] = []

         for a in self._arr:
-            data_files.extend(a.get_data_files(cat
+            data_files.extend(a.get_data_files(cat=cat))

         return data_files

-    def get_all_categories(self) ->
+    def get_all_categories(self) -> set[str]:
         """Return all the categories of the analyses."""
         categories = set()
         for a in self._arr:
@@ -60,10 +57,10 @@ class Analyses:
         return categories

     @classmethod
-    def from_array_of_dicts(
-
+    def from_array_of_dicts(
+        cls, arr: typing.Sequence[BundleDict]
+    ) -> typing.Self:
         """Build an Analyses instance."""
-
         analyses = cls()

         for d in arr:
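`Analyses.__getitem__` uses `typing.overload` so that integer indexing is typed as a single `Analysis` and slice indexing as a list, while one untyped implementation serves both. A generic sketch of the same pattern, independent of the genelastic classes:

```python
import typing


class Container:
    """Generic container mirroring the Analyses.__getitem__ overload pattern."""

    def __init__(self, items: list[str]) -> None:
        self._arr = items

    @typing.overload
    def __getitem__(self, k: int) -> str: ...

    @typing.overload
    def __getitem__(self, k: slice) -> list[str]: ...

    def __getitem__(self, k):  # type: ignore[no-untyped-def]
        if isinstance(k, int):
            return self._arr[k]
        return self._arr[k.start : k.stop]


c = Container(["a", "b", "c"])
print(c[0])    # typed as str
print(c[0:2])  # typed as list[str]
```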
genelastic/import_data/analysis.py
CHANGED
@@ -1,8 +1,5 @@
-# pylint: disable=missing-module-docstring
 import copy
-import glob
 import logging
-import os
 import re
 import typing
 from pathlib import Path
@@ -14,28 +11,31 @@ from .data_file import DataFile
 from .filename_pattern import FilenamePattern
 from .tags import Tags

-logger = logging.getLogger(
+logger = logging.getLogger("genelastic")


 class Analysis:
     """Class Analysis that represents an analysis."""

-    #
-
-
-
-
-
-
-
-
-
+    def __init__(  # noqa: PLR0913
+        self,
+        tags: Tags,
+        root_dir: str = ".",
+        bundle_file: str | None = None,
+        file_prefix: str | None = None,
+        files: typing.Sequence[str] | None = None,
+        data_path: str | None = None,
+        **metadata: str | int,
+    ) -> None:
+        self._bundle_file = Path(bundle_file) if bundle_file else None
         self._file_prefix = file_prefix
         self._files = files
-        self._data_path = Analysis._resolve_data_path(
+        self._data_path = Analysis._resolve_data_path(
+            Path(root_dir), Path(data_path) if data_path else None
+        )
         self._tags = tags
         self._metadata: AnalysisMetaData = metadata
-        self._categories:
+        self._categories: set[str] = set()

     @property
     def metadata(self) -> AnalysisMetaData:
@@ -43,17 +43,15 @@ class Analysis:
         return copy.deepcopy(self._metadata)

     @property
-    def bundle_file(self) ->
+    def bundle_file(self) -> Path | None:
         """Get the bundle file."""
         return self._bundle_file

     @property
     def filename_regex(self) -> str:
-        """
-        Resolve placeholders in a file prefix using metadata
+        """Resolve placeholders in a file prefix using metadata
         and unresolved placeholders are converted to regex groups
         """
-
         x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"

         # Use existing generic prefix
@@ -65,84 +63,87 @@ class Analysis:
             regex = tag_attrs["regex"]

             # Build field regex
-            field_regex = (
-
-
+            field_regex = (
+                f"(?P<{field}>{self._metadata.get(field)})"
+                if field in self._metadata
+                else f"(?P<{field}>{regex})"
+            )
             # Replace tag with field regex
             x = x.replace(tag_name, field_regex)

         # Check for tags that were not replaced.
         groups = re.findall(self._tags.search_regex, x)
         for match in groups:
-            logger.warning(
-
-
+            logger.warning(
+                "String '%s' in key 'file_prefix' looks like an undefined tag. "
+                "If this string is not a tag, you can ignore this warning.",
+                match,
+            )

         # Add missing start and end markers
         if not x.startswith("^"):
             x = "^" + x
         if not x.endswith("$"):
-            x +=
-                + r")(\.gz)?$")
+            x += r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
         logger.debug("File regex for %s: %s", self._bundle_file, x)

         return x

     def get_nb_files(self, cat: str | None = None) -> int:
-        """Returns the total number of files.
-        """
+        """Returns the total number of files."""
         return len(self.get_file_paths(cat=cat))

-    def get_data_files(self, cat: str | None = None) ->
-        """Returns the list of matched files as DataFile objects.
-        """
-
+    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
+        """Returns the list of matched files as DataFile objects."""
         files = self.get_file_paths(cat=cat)
         filename_pattern = FilenamePattern(self.filename_regex)

-        data_files:
+        data_files: list[DataFile] = []

         for f in files:
             try:
-                data_files.append(
-
-
-
+                data_files.append(
+                    DataFile.make_from_bundle(
+                        path=f,
+                        bundle_path=self._bundle_file,
+                        pattern=filename_pattern,
+                    )
+                )
+            except (OSError, ValueError) as e:
                 logger.error("Error processing file %s: %s", f, str(e))

         return data_files

-    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[
-        """Returns the list of matched files.
-        """
+    def get_file_paths(self, cat: str | None = None) -> typing.Sequence[Path]:
+        """Returns the list of matched files."""
         files, _, _ = self._do_get_file_paths(cat=cat)
         return files

-    def get_unmatched_file_paths(
-
-
-        """
+    def get_unmatched_file_paths(
+        self, cat: str | None = None
+    ) -> typing.Sequence[Path]:
+        """Returns the list of unmatched files."""
         _, files, _ = self._do_get_file_paths(cat=cat)
         return files

-    def get_all_categories(self) ->
+    def get_all_categories(self) -> set[str]:
         """Returns all categories of the analysis."""
         _, _, categories = self._do_get_file_paths()
         return categories

     @staticmethod
-    def _resolve_data_path(root_dir:
-        resolved_data_path =
+    def _resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
+        resolved_data_path = Path() if data_path is None else data_path

-        if not
-            resolved_data_path =
+        if not resolved_data_path.is_absolute():
+            resolved_data_path = (root_dir / resolved_data_path).absolute()

         return resolved_data_path

-    def _get_files_with_allowed_categories(self) ->
+    def _get_files_with_allowed_categories(self) -> dict[Path, str]:
         # Create a dict to store allowed files. Keys are the filepaths,
         # and values are their corresponding category.
-        allowed_files:
+        allowed_files: dict[Path, str] = {}
         # If files are listed explicitly in the YAML in the 'files' attribute, process them.
         if self._files is not None:
             abs_filepaths = [Path(self._data_path) / f for f in self._files]
@@ -151,14 +152,14 @@ class Analysis:
                 cat = file.suffixes[0][1:]
                 # Add each matching file and its category to the dict.
                 if cat in ALLOWED_CATEGORIES:
-                    allowed_files[
+                    allowed_files[file] = cat
         # Else, look for files on disk using the YAML 'data_path' attribute.
         else:
             # Try to retrieve files matching allowed categories using glob.
             for cat in ALLOWED_CATEGORIES:
-                glob_res = []
-                glob_res.extend(
-                glob_res.extend(
+                glob_res: list[Path] = []
+                glob_res.extend(self._data_path.glob(f"*.{cat}"))
+                glob_res.extend(self._data_path.glob(f"*.{cat}.gz"))

                 # Add each globed file and its category to the dict.
                 for g_file in glob_res:
@@ -166,12 +167,13 @@ class Analysis:

         return allowed_files

-    def _do_get_file_paths(
-
-
+    def _do_get_file_paths(
+        self, cat: str | None = None
+    ) -> tuple[typing.Sequence[Path], typing.Sequence[Path], set[str]]:
         # Raise an error if the category given as a parameter is not part of the allowed categories.
         if cat is not None and cat not in ALLOWED_CATEGORIES:
-
+            msg = f"Unknown category {cat}."
+            raise ValueError(msg)

         # Obtain a dict of all files matching the allowed categories.
         allowed_files = self._get_files_with_allowed_categories()
@@ -181,16 +183,18 @@ class Analysis:
             files_to_match = allowed_files
         else:
             # A category was given as a parameter, so we match only this specific category.
-            files_to_match =
+            files_to_match = {
+                k: v for k, v in allowed_files.items() if v == cat
+            }

         filename_pattern = FilenamePattern(self.filename_regex)
-        matching_files:
-        non_matching_files:
+        matching_files: list[Path] = []
+        non_matching_files: list[Path] = []
         categories = set()

         # We filter files by ensuring that they match the filename pattern defined in the analysis.
         for file, category in sorted(files_to_match.items()):
-            if filename_pattern.matches_pattern(
+            if filename_pattern.matches_pattern(file.name):
                 matching_files.append(file)
                 logger.info("MATCHED file %s.", file)
                 # Add the file category to the categories set.
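The rewritten `_resolve_data_path` moves from `os`/`glob` string handling to `pathlib`: a missing or relative `data_path` is anchored at `root_dir` and made absolute. A standalone sketch of that rule with illustrative paths:

```python
from pathlib import Path


def resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
    """Standalone mirror of the resolution rule shown in the diff above."""
    resolved = Path() if data_path is None else data_path
    if not resolved.is_absolute():
        # Relative (or missing) data paths are anchored at the bundle's root directory.
        resolved = (root_dir / resolved).absolute()
    return resolved


print(resolve_data_path(Path("/bundles/run1"), None))          # /bundles/run1
print(resolve_data_path(Path("/bundles/run1"), Path("data")))  # /bundles/run1/data
print(resolve_data_path(Path("/bundles/run1"), Path("/abs")))  # /abs
```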