genelastic 0.8.0-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +2 -2
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -20
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +8 -3
- genelastic/api/specification.yml +343 -181
- genelastic/common/__init__.py +0 -44
- genelastic/common/cli.py +48 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +9 -1
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +10 -8
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +4 -2
- genelastic/ui/routes.py +289 -42
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +96 -9
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +114 -9
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +205 -2
- genelastic/ui/templates/layout.html +148 -29
- genelastic/ui/templates/version.html +19 -7
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +114 -9
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.8.0.dist-info/METADATA +0 -109
- genelastic-0.8.0.dist-info/RECORD +0 -52
- genelastic-0.8.0.dist-info/WHEEL +0 -5
- genelastic-0.8.0.dist-info/entry_points.txt +0 -8
- genelastic-0.8.0.dist-info/top_level.txt +0 -1

genelastic/import_data/cli/info.py

@@ -0,0 +1,247 @@
+import argparse
+import logging
+from datetime import datetime
+
+from genelastic.common.cli import (
+    add_es_connection_args,
+    add_verbose_control_args,
+    add_version_arg,
+)
+from genelastic.common.elastic import ElasticQueryConn
+from genelastic.import_data.logger import configure_logging
+
+logger = logging.getLogger("genelastic")
+logging.getLogger("elastic_transport").setLevel(
+    logging.WARNING
+)  # Disable excessive logging
+
+
+def read_args() -> argparse.Namespace:
+    """Read arguments from the command line."""
+    parser = argparse.ArgumentParser(
+        description="ElasticSearch database info.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_version_arg(parser)
+    add_verbose_control_args(parser)
+    add_es_connection_args(parser)
+    parser.add_argument(
+        "-a",
+        "--list-analyses",
+        action="store_true",
+        help="List all imported analyses.",
+    )
+    parser.add_argument(
+        "-w",
+        "--list-wet-processes",
+        action="store_true",
+        help="List all imported wet processes.",
+    )
+    parser.add_argument(
+        "-b",
+        "--list-bi-processes",
+        action="store_true",
+        help="List all imported bio info processes.",
+    )
+    parser.add_argument(
+        "-B",
+        "--list-bundles",
+        action="store_true",
+        help="List YAML bundles and associated analyses.",
+    )
+
+    return parser.parse_args()
+
+
+def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
+    """List all processes."""
+    process_ids = es_query_conn.get_field_values(index, "proc_id")
+
+    if len(process_ids) == 0:
+        logger.info("Empty response.")
+        return
+
+    for process_id in process_ids:
+        logger.info("- %s", process_id)
+
+
+def list_wet_processes(es_query_conn: ElasticQueryConn) -> None:
+    """List all wet processes."""
+    logger.info("Imported wet processes")
+    logger.info("======================")
+    list_processes(es_query_conn, es_query_conn.wet_processes_index)
+
+
+def list_bi_processes(es_query_conn: ElasticQueryConn) -> None:
+    """List all bio info processes."""
+    logger.info("Imported bi processes")
+    logger.info("=====================")
+    list_processes(es_query_conn, es_query_conn.bi_processes_index)
+
+
+def list_analyses(es_query_conn: ElasticQueryConn) -> None:
+    """List all imported analyses and their associated data files."""
+    query = {
+        "size": 0,
+        "aggs": {
+            "by_analysis": {
+                "composite": {
+                    "size": 1000,
+                    "sources": [
+                        {
+                            "analysis_id": {
+                                "terms": {"field": "analysis_id.keyword"}
+                            }
+                        }
+                    ],
+                },
+                "aggs": {
+                    "data_files": {
+                        "top_hits": {
+                            "size": 100,
+                        }
+                    }
+                },
+            }
+        },
+    }
+
+    buckets = es_query_conn.run_composite_aggregation(
+        es_query_conn.data_files_index, query
+    )
+
+    if not buckets:
+        logger.info("No data files found.")
+        return
+
+    logger.info("Data files per YAML bundle")
+    logger.info("=" * 80)
+
+    for i, bucket in enumerate(buckets):
+        analysis_id = bucket["key"]["analysis_id"]
+        hits = bucket["data_files"]["hits"]["hits"]
+        doc_count = len(hits)
+
+        logger.info(
+            "[%d] Analysis ID: %s (%d file%s)",
+            i + 1,
+            analysis_id,
+            doc_count,
+            "s" if doc_count > 1 else "",
+        )
+        logger.info("-" * 80)
+
+        for j, hit in enumerate(hits):
+            source = hit["_source"]
+
+            created_at = datetime.fromisoformat(source["created_at"])
+            created_at_formatted = created_at.strftime("%Y-%m-%d")
+
+            logger.info(" File %d of %d:", j + 1, doc_count)
+            logger.info(" created_at : %s", created_at_formatted)
+            logger.info(" bundle_file : %s", source["bundle_file"])
+            logger.info(" path : %s", source["path"])
+
+
+def list_bundles(es_query_conn: ElasticQueryConn) -> None:
+    """List bundle_file → associated analysis_id (clean visual CLI output)."""
+    query = {
+        "size": 0,
+        "aggs": {
+            "by_bundle": {
+                "composite": {
+                    "size": 2000,
+                    "sources": [
+                        {
+                            "bundle_file": {
+                                "terms": {"field": "bundle_file.keyword"}
+                            }
+                        }
+                    ],
+                },
+                "aggs": {
+                    "analyses": {
+                        "terms": {
+                            "field": "analysis_id.keyword",
+                            "size": 2000,
+                        }
+                    }
+                },
+            }
+        },
+    }
+
+    buckets = es_query_conn.run_composite_aggregation(
+        es_query_conn.data_files_index, query
+    )
+
+    if not buckets:
+        logger.info("No bundles found.")
+        return
+
+    # Sort bundles by bundle_file path
+    buckets = sorted(buckets, key=lambda b: b["key"]["bundle_file"])
+
+    logger.info("========================================")
+    logger.info(" BUNDLES AND ASSOCIATED ANALYSES")
+    logger.info("========================================")
+    logger.info("")
+
+    for idx, bucket in enumerate(buckets, start=1):
+        bundle = bucket["key"]["bundle_file"]
+        analyses = bucket["analyses"]["buckets"]
+
+        logger.info("#%d %s", idx, bundle)
+        if not analyses:
+            logger.info(" (no analyses)")
+        else:
+            for a in analyses:
+                logger.info(" • %s", a["key"])
+
+        logger.info("----------------------------------------")
+
+
+def main() -> None:
+    """Entry point of the info script."""
+    args = read_args()
+
+    configure_logging(args.verbose)
+    logger.debug("Arguments: %s", args)
+
+    addr = f"https://{args.es_host}:{args.es_port}"
+    logger.info("Connecting to Elasticsearch at %s...", addr)
+    es_query_conn = ElasticQueryConn(
+        addr,
+        args.es_cert_fp,
+        args.es_index_prefix,
+        basic_auth=(args.es_usr, args.es_pwd),
+    )
+
+    list_call_count = 0
+
+    if args.list_bundles:
+        list_bundles(es_query_conn)
+        list_call_count += 1
+
+    if args.list_analyses:
+        list_analyses(es_query_conn)
+        list_call_count += 1
+
+    if args.list_wet_processes:
+        list_wet_processes(es_query_conn)
+        list_call_count += 1
+
+    if args.list_bi_processes:
+        list_bi_processes(es_query_conn)
+        list_call_count += 1
+
+    if list_call_count == 0:
+        logger.debug("No list option specified, listing everything.")
+        list_analyses(es_query_conn)
+        list_wet_processes(es_query_conn)
+        list_bi_processes(es_query_conn)
+
+
+if __name__ == "__main__":
+    main()

genelastic/import_data/cli/integrity.py

@@ -1,17 +1,20 @@
 import argparse
 import logging
+import typing
 
 from elasticsearch import NotFoundError
 
-from genelastic.common import (
-    Bucket,
-    DBIntegrityError,
-    ElasticQueryConn,
+from genelastic.common.cli import (
     add_es_connection_args,
     add_verbose_control_args,
+    add_version_arg,
 )
+from genelastic.common.elastic import ElasticQueryConn
+from genelastic.common.exceptions import DBIntegrityError
+from genelastic.import_data.logger import configure_logging
 
-
+if typing.TYPE_CHECKING:
+    from genelastic.common.types import Bucket
 
 logger = logging.getLogger("genelastic")
 logging.getLogger("elastic_transport").setLevel(
@@ -27,6 +30,7 @@ def read_args() -> argparse.Namespace:
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         allow_abbrev=False,
     )
+    add_version_arg(parser)
     add_verbose_control_args(parser)
     add_es_connection_args(parser)
     return parser.parse_args()
@@ -42,6 +46,13 @@ def check_for_undefined_file_indices(
     :raises genelastic.common.DBIntegrityError:
         Some files indices are used in the analyses index but are undefined.
     """
+    if not es_query_conn.client:
+        logger.info(
+            "[Dryrun] check_for_undefined_file_indices: "
+            "no Elasticsearch client."
+        )
+        return
+
     logger.info(
         "Checking for references to undefined file indices in the index '%s'...",
         analyses_index,
@@ -217,6 +228,13 @@ def check_for_unused_file_indices(
     :returns: 1 if some file indices exists but are unused in the analyses index,
         and 0 otherwise.
     """
+    if not es_query_conn.client:
+        logger.info(
+            "[Dryrun] check_for_unused_file_indices: "
+            "no Elasticsearch client."
+        )
+        return -1
+
     json_indices = es_query_conn.client.cat.indices(
         index=f"{index_prefix}-file-*", format="json"
     ).body
@@ -349,9 +367,13 @@ def main() -> None:
     bi_processes_index = f"{args.es_index_prefix}-bi_processes"
 
     addr = f"https://{args.es_host}:{args.es_port}"
-    logger.info("
+    logger.info("Connecting to Elasticsearch at %s...", addr)
     es_query_conn = ElasticQueryConn(
-        addr,
+        addr,
+        args.es_cert_fp,
+        args.es_index_prefix,
+        args.dryrun,
+        basic_auth=(args.es_usr, args.es_pwd),
     )
 
     # Fatal errors

genelastic/import_data/cli/validate.py

@@ -0,0 +1,146 @@
+import argparse
+import logging
+from pathlib import Path
+
+from genelastic.common.cli import add_verbose_control_args, add_version_arg
+from genelastic.common.exceptions import (
+    ValidationError,
+    YAMLFileReadError,
+)
+from genelastic.import_data.import_bundle_factory import (
+    load_yaml_file,
+    validate_doc,
+)
+from genelastic.import_data.logger import configure_logging
+from genelastic.import_data.models.validate import ValidationIssue
+
+logger = logging.getLogger("genelastic")
+
+
+def read_args() -> argparse.Namespace:
+    """Read arguments from command line."""
+    parser = argparse.ArgumentParser(
+        description="Statically validates YAML bundles: "
+        "ensure they comply to the bundle schema.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        allow_abbrev=False,
+    )
+    add_version_arg(parser)
+    add_verbose_control_args(parser)
+    parser.add_argument(
+        "files",
+        type=Path,
+        nargs="+",
+        default=None,
+        help="Paths to YAML files containing bundles to validate.",
+    )
+    parser.add_argument(
+        "-x",
+        "--fail-fast",
+        dest="fail_fast",
+        action="store_true",
+        help="Stop validating files after the first error is encountered.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Entry point of the validate script."""
+    args = read_args()
+    configure_logging(args.verbose)
+
+    validation_issues = []
+    file_count = len(args.files)
+
+    for file_index, file_path in enumerate(args.files):
+        resolved_file_path = file_path.resolve()
+
+        logger.info(
+            "[%s/%s] Validating bundle(s) from file '%s'.",
+            file_index + 1,
+            file_count,
+            resolved_file_path,
+        )
+        logger.info("Loading YAML file...")
+
+        try:
+            docs = load_yaml_file(resolved_file_path)
+        except YAMLFileReadError as e:
+            logger.error(e)
+
+            if args.fail_fast:
+                raise SystemExit(1) from None
+
+            validation_issues.append(
+                ValidationIssue(
+                    exc_type=type(e).__name__,
+                    file_path=resolved_file_path,
+                    file_index=file_index + 1,
+                    file_count=file_count,
+                )
+            )
+            continue
+
+        logger.info("-> YAML file successfully loaded.")
+
+        doc_count = len(docs)
+        logger.info("Found %s document(s) in the YAML file.", doc_count)
+
+        for doc_index, doc in enumerate(docs):
+            logger.info(
+                " Validating bundle format for document #%s/%s...",
+                doc_index + 1,
+                doc_count,
+            )
+
+            try:
+                validate_doc(doc)
+            except ValidationError as e:
+                logger.error(e)
+
+                if args.fail_fast:
+                    raise SystemExit(1) from None
+
+                validation_issues.append(
+                    ValidationIssue(
+                        exc_type=type(e).__name__,
+                        file_path=resolved_file_path,
+                        file_index=file_index + 1,
+                        file_count=file_count,
+                        doc_index=doc_index + 1,
+                        doc_count=doc_count,
+                    )
+                )
+                continue
+
+            logger.info(" -> Bundle format is valid.")
+
+        logger.info("")
+
+    if len(validation_issues) > 0:
+        logger.error("Some files raised exceptions:")
+        for issue in validation_issues:
+            logger.error(" - %s", issue)
+
+        ret_code = 1
+    else:
+        logger.info("All bundles respect the genelastic YAML bundle format.")
+        ret_code = 0
+
+    files_failing_validation = len(
+        {issue.file_path for issue in validation_issues}
+    )
+    files_passing_validation = file_count - files_failing_validation
+
+    logger.info(
+        "Out of %s file(s), validation passed for %s and failed for %s.",
+        file_count,
+        files_passing_validation,
+        files_failing_validation,
+    )
+
+    return ret_code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

genelastic/import_data/collect.py

@@ -0,0 +1,185 @@
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+
+from genelastic.common.exceptions import DataFileCollectorError
+from genelastic.common.types import Metadata
+from genelastic.import_data.models.data_file import DataFile
+from genelastic.import_data.models.tags import Tags
+from genelastic.import_data.patterns import FilenamePattern
+from genelastic.import_data.resolve import resolve_analysis_id
+
+logger = logging.getLogger("genelastic")
+
+
+def collect_files(data_path: Path) -> set[Path]:
+    """Collect files for a given analysis.
+    All files directly under ``data_path`` are returned.
+
+    :param data_path: Directory containing the files.
+    :raises DataFileCollectorError: If ``data_path`` is not an existing
+        directory.
+    :return: Set of absolute paths to collected files.
+    """
+    try:
+        collected_files = {x for x in data_path.iterdir() if x.is_file()}
+    except OSError as e:
+        msg = f"Error collecting files: data directory is invalid. {e}."
+        raise DataFileCollectorError(msg) from e
+    return collected_files
+
+
+def match_files(
+    files: set[Path],
+    filename_pattern: FilenamePattern,
+) -> tuple[set[Path], set[Path]]:
+    """Splits a set of files into those that match a given filename pattern and
+    those that don't.
+
+    This function applies the provided ``filename_pattern`` to each file name
+    in ``files``, and returns two sets: one containing files that match the
+    pattern, and one containing those that do not.
+
+    :param files: A set of file paths to check.
+    :param filename_pattern: The filename pattern used for matching.
+
+    :returns: A tuple containing in first position a set of files that match
+        the pattern, and in second position a set of files that do not match the
+        pattern.
+    """
+    matched_files = {
+        f for f in files if filename_pattern.matches_pattern(f.name)
+    }
+    return matched_files, files - matched_files
+
+
+def extract_analysis_metadata(
+    data_path: Path,
+    file_prefix: str,
+    tags: Tags,
+    filename_pattern: FilenamePattern,
+) -> dict[str, Metadata]:
+    analysis = {}
+
+    for file in collect_files(data_path):
+        if not filename_pattern.matches_pattern(file.name):
+            logger.debug("File '%s' was not matched.", file.name)
+            continue
+
+        filename_metadata = filename_pattern.extract_metadata(file.name)
+        analysis_id = resolve_analysis_id(file_prefix, tags, filename_metadata)
+        analysis[analysis_id] = filename_metadata
+
+    return analysis
+
+
+def init_data_files(
+    analysis_id: str,
+    files: set[Path],
+    filename_pattern: FilenamePattern,
+    bundle_file: Path,
+) -> set[DataFile]:
+    """Instantiate ``DataFile`` objects from a set of file paths associated
+    with an analysis.
+
+    :param analysis_id: ID of the analysis, shared by all created ``DataFile``
+        instances.
+    :param files: Set of file paths associated with the analysis.
+    :param filename_pattern: Pattern used to extract metadata from filenames.
+        The extracted metadata is included in each ``DataFile``.
+    :param bundle_file: Path to the YAML bundle file from which the analysis is
+        defined.
+    :raises DataFileCollectorError: If metadata extraction or instantiation
+        of a data file objet fails for a given file.
+    :return: A set of successfully instantiated ``DataFile`` objects.
+    """
+    data_files = set()
+    for file in files:
+        try:
+            metadata = filename_pattern.extract_metadata(file.name)
+            data_file = DataFile(
+                analysis_id=analysis_id,
+                path=file,
+                bundle_file=bundle_file,
+                metadata=metadata,
+            )
+            data_files.add(data_file)
+        except RuntimeError as e:
+            msg = f"Error instantiating data files: {e}"
+            raise DataFileCollectorError(msg) from None
+    return data_files
+
+
+@dataclass(frozen=True)
+class DataFileCollectorResult:
+    """Result of a data file collection."""
+
+    matched_files: set[Path]
+    unmatched_files: set[Path]
+    data_files: set[DataFile]
+
+
+class DataFileCollector:
+    """Collect all data files belonging to an analysis."""
+
+    def __init__(
+        self,
+        analysis_id: str,
+        bundle_file: Path,
+        data_path: Path,
+        filename_pattern: FilenamePattern,
+        *,
+        multi_match: bool = False,
+    ) -> None:
+        self._analysis_id = analysis_id
+        self._bundle_file = bundle_file
+        self._data_path = data_path
+        self._filename_pattern = filename_pattern
+        self._multi_match = multi_match
+
+    def run(self) -> DataFileCollectorResult:
+        """Collects files from the analysis data path, matches them against the
+        analysis filename pattern, and instantiates ``DataFile`` objects for
+        each matched file.
+
+        :raises DataFileCollectorError: If the ``data_path`` is not an existing
+            directory or if metadata extraction or instantiation of a data file
+            objet fails for a given file.
+        :return: A ``DataFileCollectorResult`` containing the sets of matched
+            and unmatched files, as well as a set of instantiated ``DataFile``
+            objects.
+        """
+        files = collect_files(self._data_path)
+        logger.debug(
+            " -> Collected %s file(s):",
+            len(files),
+        )
+        for path in sorted(files):
+            logger.debug(" - '%s'", path.name)
+
+        matched_files, unmatched_files = match_files(
+            files, self._filename_pattern
+        )
+        logger.info(" -> Found %s matching file(s):", len(matched_files))
+        for path in sorted(matched_files):
+            logger.info(" - '%s'", path.name)
+
+        logger.info(
+            " -> Found %s non-matching file(s):",
+            len(unmatched_files),
+        )
+        for path in sorted(unmatched_files):
+            logger.info(" - '%s'", path.name)
+
+        data_files = init_data_files(
+            self._analysis_id,
+            matched_files,
+            self._filename_pattern,
+            self._bundle_file,
+        )
+
+        return DataFileCollectorResult(
+            matched_files=matched_files,
+            unmatched_files=unmatched_files,
+            data_files=data_files,
+        )
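
A minimal usage sketch of the new collect.py API introduced in 0.9.0, based only on the signatures shown in the hunk above. The FilenamePattern value and the literal analysis ID and paths are illustrative assumptions (the FilenamePattern constructor is not shown in this diff), not code from the package.

    from pathlib import Path

    from genelastic.import_data.collect import DataFileCollector
    from genelastic.import_data.patterns import FilenamePattern

    # Assumption: a FilenamePattern built elsewhere (e.g. from the bundle's
    # filename pattern); its constructor does not appear in this diff.
    pattern: FilenamePattern = ...

    collector = DataFileCollector(
        analysis_id="example-analysis",   # hypothetical analysis ID
        bundle_file=Path("bundle.yml"),   # hypothetical bundle path
        data_path=Path("/data/example"),  # hypothetical data directory
        filename_pattern=pattern,
    )
    result = collector.run()
    # result.matched_files / result.unmatched_files are sets of Path;
    # result.data_files is a set of DataFile objects ready for import.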