genelastic-0.8.0-py3-none-any.whl → genelastic-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +2 -2
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -20
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +8 -3
  8. genelastic/api/specification.yml +343 -181
  9. genelastic/common/__init__.py +0 -44
  10. genelastic/common/cli.py +48 -0
  11. genelastic/common/elastic.py +374 -46
  12. genelastic/common/exceptions.py +34 -2
  13. genelastic/common/server.py +9 -1
  14. genelastic/common/types.py +1 -14
  15. genelastic/import_data/__init__.py +0 -27
  16. genelastic/import_data/checker.py +99 -0
  17. genelastic/import_data/checker_observer.py +13 -0
  18. genelastic/import_data/cli/__init__.py +0 -0
  19. genelastic/import_data/cli/cli_check.py +136 -0
  20. genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
  21. genelastic/import_data/cli/import_data.py +346 -0
  22. genelastic/import_data/cli/info.py +247 -0
  23. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  24. genelastic/import_data/cli/validate.py +146 -0
  25. genelastic/import_data/collect.py +185 -0
  26. genelastic/import_data/constants.py +136 -11
  27. genelastic/import_data/import_bundle.py +102 -59
  28. genelastic/import_data/import_bundle_factory.py +70 -149
  29. genelastic/import_data/importers/__init__.py +0 -0
  30. genelastic/import_data/importers/importer_base.py +131 -0
  31. genelastic/import_data/importers/importer_factory.py +85 -0
  32. genelastic/import_data/importers/importer_types.py +223 -0
  33. genelastic/import_data/logger.py +2 -1
  34. genelastic/import_data/models/__init__.py +0 -0
  35. genelastic/import_data/models/analyses.py +178 -0
  36. genelastic/import_data/models/analysis.py +144 -0
  37. genelastic/import_data/models/data_file.py +110 -0
  38. genelastic/import_data/models/process.py +45 -0
  39. genelastic/import_data/models/processes.py +84 -0
  40. genelastic/import_data/models/tags.py +170 -0
  41. genelastic/import_data/models/unique_list.py +109 -0
  42. genelastic/import_data/models/validate.py +26 -0
  43. genelastic/import_data/patterns.py +90 -0
  44. genelastic/import_data/random_bundle.py +10 -8
  45. genelastic/import_data/resolve.py +157 -0
  46. genelastic/ui/.env +1 -0
  47. genelastic/ui/cli_start_ui.py +4 -2
  48. genelastic/ui/routes.py +289 -42
  49. genelastic/ui/static/cea-cnrgh.ico +0 -0
  50. genelastic/ui/static/cea.ico +0 -0
  51. genelastic/ui/static/layout.ico +0 -0
  52. genelastic/ui/static/novaseq6000.png +0 -0
  53. genelastic/ui/static/style.css +430 -0
  54. genelastic/ui/static/ui.js +458 -0
  55. genelastic/ui/templates/analyses.html +96 -9
  56. genelastic/ui/templates/analysis_detail.html +44 -0
  57. genelastic/ui/templates/bi_process_detail.html +129 -0
  58. genelastic/ui/templates/bi_processes.html +114 -9
  59. genelastic/ui/templates/explorer.html +356 -0
  60. genelastic/ui/templates/home.html +205 -2
  61. genelastic/ui/templates/layout.html +148 -29
  62. genelastic/ui/templates/version.html +19 -7
  63. genelastic/ui/templates/wet_process_detail.html +131 -0
  64. genelastic/ui/templates/wet_processes.html +114 -9
  65. genelastic-0.9.0.dist-info/METADATA +686 -0
  66. genelastic-0.9.0.dist-info/RECORD +76 -0
  67. genelastic-0.9.0.dist-info/WHEEL +4 -0
  68. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  69. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  70. genelastic/import_data/analyses.py +0 -69
  71. genelastic/import_data/analysis.py +0 -205
  72. genelastic/import_data/bi_process.py +0 -27
  73. genelastic/import_data/bi_processes.py +0 -49
  74. genelastic/import_data/cli_import.py +0 -379
  75. genelastic/import_data/cli_info.py +0 -256
  76. genelastic/import_data/cli_validate.py +0 -54
  77. genelastic/import_data/data_file.py +0 -87
  78. genelastic/import_data/filename_pattern.py +0 -57
  79. genelastic/import_data/tags.py +0 -123
  80. genelastic/import_data/wet_process.py +0 -28
  81. genelastic/import_data/wet_processes.py +0 -53
  82. genelastic-0.8.0.dist-info/METADATA +0 -109
  83. genelastic-0.8.0.dist-info/RECORD +0 -52
  84. genelastic-0.8.0.dist-info/WHEEL +0 -5
  85. genelastic-0.8.0.dist-info/entry_points.txt +0 -8
  86. genelastic-0.8.0.dist-info/top_level.txt +0 -1
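
The headline change in 0.9.0 is a restructuring of both packages: the flat cli_*.py scripts move into a cli/ package, per-entity modules move under models/ and importers/, and the genelastic.common root re-exports are split into dedicated submodules. A minimal sketch of the resulting import migration, taken from the cli/integrity.py hunk below (only the symbols shown in that hunk; anything else would be an assumption):

    # Before (0.8.0): everything re-exported from the package root.
    from genelastic.common import (
        Bucket,
        DBIntegrityError,
        ElasticQueryConn,
        add_es_connection_args,
        add_verbose_control_args,
    )

    # After (0.9.0): one submodule per concern.
    from genelastic.common.cli import add_es_connection_args, add_verbose_control_args
    from genelastic.common.elastic import ElasticQueryConn
    from genelastic.common.exceptions import DBIntegrityError
    from genelastic.common.types import Bucket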
genelastic/import_data/cli/info.py
@@ -0,0 +1,247 @@
+ import argparse
+ import logging
+ from datetime import datetime
+
+ from genelastic.common.cli import (
+     add_es_connection_args,
+     add_verbose_control_args,
+     add_version_arg,
+ )
+ from genelastic.common.elastic import ElasticQueryConn
+ from genelastic.import_data.logger import configure_logging
+
+ logger = logging.getLogger("genelastic")
+ logging.getLogger("elastic_transport").setLevel(
+     logging.WARNING
+ )  # Disable excessive logging
+
+
+ def read_args() -> argparse.Namespace:
+     """Read arguments from the command line."""
+     parser = argparse.ArgumentParser(
+         description="ElasticSearch database info.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         allow_abbrev=False,
+     )
+     add_version_arg(parser)
+     add_verbose_control_args(parser)
+     add_es_connection_args(parser)
+     parser.add_argument(
+         "-a",
+         "--list-analyses",
+         action="store_true",
+         help="List all imported analyses.",
+     )
+     parser.add_argument(
+         "-w",
+         "--list-wet-processes",
+         action="store_true",
+         help="List all imported wet processes.",
+     )
+     parser.add_argument(
+         "-b",
+         "--list-bi-processes",
+         action="store_true",
+         help="List all imported bio info processes.",
+     )
+     parser.add_argument(
+         "-B",
+         "--list-bundles",
+         action="store_true",
+         help="List YAML bundles and associated analyses.",
+     )
+
+     return parser.parse_args()
+
+
+ def list_processes(es_query_conn: ElasticQueryConn, index: str) -> None:
+     """List all processes."""
+     process_ids = es_query_conn.get_field_values(index, "proc_id")
+
+     if len(process_ids) == 0:
+         logger.info("Empty response.")
+         return
+
+     for process_id in process_ids:
+         logger.info("- %s", process_id)
+
+
+ def list_wet_processes(es_query_conn: ElasticQueryConn) -> None:
+     """List all wet processes."""
+     logger.info("Imported wet processes")
+     logger.info("======================")
+     list_processes(es_query_conn, es_query_conn.wet_processes_index)
+
+
+ def list_bi_processes(es_query_conn: ElasticQueryConn) -> None:
+     """List all bio info processes."""
+     logger.info("Imported bi processes")
+     logger.info("=====================")
+     list_processes(es_query_conn, es_query_conn.bi_processes_index)
+
+
+ def list_analyses(es_query_conn: ElasticQueryConn) -> None:
+     """List all imported analyses and their associated data files."""
+     query = {
+         "size": 0,
+         "aggs": {
+             "by_analysis": {
+                 "composite": {
+                     "size": 1000,
+                     "sources": [
+                         {
+                             "analysis_id": {
+                                 "terms": {"field": "analysis_id.keyword"}
+                             }
+                         }
+                     ],
+                 },
+                 "aggs": {
+                     "data_files": {
+                         "top_hits": {
+                             "size": 100,
+                         }
+                     }
+                 },
+             }
+         },
+     }
+
+     buckets = es_query_conn.run_composite_aggregation(
+         es_query_conn.data_files_index, query
+     )
+
+     if not buckets:
+         logger.info("No data files found.")
+         return
+
+     logger.info("Data files per YAML bundle")
+     logger.info("=" * 80)
+
+     for i, bucket in enumerate(buckets):
+         analysis_id = bucket["key"]["analysis_id"]
+         hits = bucket["data_files"]["hits"]["hits"]
+         doc_count = len(hits)
+
+         logger.info(
+             "[%d] Analysis ID: %s (%d file%s)",
+             i + 1,
+             analysis_id,
+             doc_count,
+             "s" if doc_count > 1 else "",
+         )
+         logger.info("-" * 80)
+
+         for j, hit in enumerate(hits):
+             source = hit["_source"]
+
+             created_at = datetime.fromisoformat(source["created_at"])
+             created_at_formatted = created_at.strftime("%Y-%m-%d")
+
+             logger.info(" File %d of %d:", j + 1, doc_count)
+             logger.info(" created_at : %s", created_at_formatted)
+             logger.info(" bundle_file : %s", source["bundle_file"])
+             logger.info(" path : %s", source["path"])
+
+
+ def list_bundles(es_query_conn: ElasticQueryConn) -> None:
+     """List bundle_file → associated analysis_id (clean visual CLI output)."""
+     query = {
+         "size": 0,
+         "aggs": {
+             "by_bundle": {
+                 "composite": {
+                     "size": 2000,
+                     "sources": [
+                         {
+                             "bundle_file": {
+                                 "terms": {"field": "bundle_file.keyword"}
+                             }
+                         }
+                     ],
+                 },
+                 "aggs": {
+                     "analyses": {
+                         "terms": {
+                             "field": "analysis_id.keyword",
+                             "size": 2000,
+                         }
+                     }
+                 },
+             }
+         },
+     }
+
+     buckets = es_query_conn.run_composite_aggregation(
+         es_query_conn.data_files_index, query
+     )
+
+     if not buckets:
+         logger.info("No bundles found.")
+         return
+
+     # Sort bundles by bundle_file path
+     buckets = sorted(buckets, key=lambda b: b["key"]["bundle_file"])
+
+     logger.info("========================================")
+     logger.info(" BUNDLES AND ASSOCIATED ANALYSES")
+     logger.info("========================================")
+     logger.info("")
+
+     for idx, bucket in enumerate(buckets, start=1):
+         bundle = bucket["key"]["bundle_file"]
+         analyses = bucket["analyses"]["buckets"]
+
+         logger.info("#%d %s", idx, bundle)
+         if not analyses:
+             logger.info(" (no analyses)")
+         else:
+             for a in analyses:
+                 logger.info(" • %s", a["key"])
+
+         logger.info("----------------------------------------")
+
+
+ def main() -> None:
+     """Entry point of the info script."""
+     args = read_args()
+
+     configure_logging(args.verbose)
+     logger.debug("Arguments: %s", args)
+
+     addr = f"https://{args.es_host}:{args.es_port}"
+     logger.info("Connecting to Elasticsearch at %s...", addr)
+     es_query_conn = ElasticQueryConn(
+         addr,
+         args.es_cert_fp,
+         args.es_index_prefix,
+         basic_auth=(args.es_usr, args.es_pwd),
+     )
+
+     list_call_count = 0
+
+     if args.list_bundles:
+         list_bundles(es_query_conn)
+         list_call_count += 1
+
+     if args.list_analyses:
+         list_analyses(es_query_conn)
+         list_call_count += 1
+
+     if args.list_wet_processes:
+         list_wet_processes(es_query_conn)
+         list_call_count += 1
+
+     if args.list_bi_processes:
+         list_bi_processes(es_query_conn)
+         list_call_count += 1
+
+     if list_call_count == 0:
+         logger.debug("No list option specified, listing everything.")
+         list_analyses(es_query_conn)
+         list_wet_processes(es_query_conn)
+         list_bi_processes(es_query_conn)
+
+
+ if __name__ == "__main__":
+     main()
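
Both listing commands delegate pagination to ElasticQueryConn.run_composite_aggregation, whose implementation is not part of this hunk (it lands in genelastic/common/elastic.py). As a rough idea of the call shape, a minimal sketch of such a helper against the official elasticsearch client, looping on the composite aggregation's after_key, might look like this — the method body is an assumption; only the name, arguments, and the returned bucket list come from the diff:

    from typing import Any

    from elasticsearch import Elasticsearch


    def run_composite_aggregation(
        client: Elasticsearch, index: str, query: dict[str, Any]
    ) -> list[dict[str, Any]]:
        """Collect all buckets of the single composite aggregation in ``query``."""
        agg_name = next(iter(query["aggs"]))  # e.g. "by_analysis" or "by_bundle"
        buckets: list[dict[str, Any]] = []
        while True:
            resp = client.search(index=index, body=query)
            agg = resp["aggregations"][agg_name]
            buckets.extend(agg["buckets"])
            after_key = agg.get("after_key")
            if after_key is None:  # Last page reached.
                return buckets
            # Resume the next page where the previous one stopped.
            query["aggs"][agg_name]["composite"]["after"] = after_key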
genelastic/import_data/{cli_integrity.py → cli/integrity.py}
@@ -1,17 +1,20 @@
  import argparse
  import logging
+ import typing

  from elasticsearch import NotFoundError

- from genelastic.common import (
-     Bucket,
-     DBIntegrityError,
-     ElasticQueryConn,
+ from genelastic.common.cli import (
      add_es_connection_args,
      add_verbose_control_args,
+     add_version_arg,
  )
+ from genelastic.common.elastic import ElasticQueryConn
+ from genelastic.common.exceptions import DBIntegrityError
+ from genelastic.import_data.logger import configure_logging

- from .logger import configure_logging
+ if typing.TYPE_CHECKING:
+     from genelastic.common.types import Bucket

  logger = logging.getLogger("genelastic")
  logging.getLogger("elastic_transport").setLevel(
@@ -27,6 +30,7 @@ def read_args() -> argparse.Namespace:
          formatter_class=argparse.ArgumentDefaultsHelpFormatter,
          allow_abbrev=False,
      )
+     add_version_arg(parser)
      add_verbose_control_args(parser)
      add_es_connection_args(parser)
      return parser.parse_args()
@@ -42,6 +46,13 @@ def check_for_undefined_file_indices(
      :raises genelastic.common.DBIntegrityError:
          Some file indices are used in the analyses index but are undefined.
      """
+     if not es_query_conn.client:
+         logger.info(
+             "[Dryrun] check_for_undefined_file_indices: "
+             "no Elasticsearch client."
+         )
+         return
+
      logger.info(
          "Checking for references to undefined file indices in the index '%s'...",
          analyses_index,
@@ -217,6 +228,13 @@ def check_for_unused_file_indices(
      :returns: 1 if some file indices exist but are unused in the analyses index,
          and 0 otherwise.
      """
+     if not es_query_conn.client:
+         logger.info(
+             "[Dryrun] check_for_unused_file_indices: "
+             "no Elasticsearch client."
+         )
+         return -1
+
      json_indices = es_query_conn.client.cat.indices(
          index=f"{index_prefix}-file-*", format="json"
      ).body
@@ -349,9 +367,13 @@ def main() -> None:
      bi_processes_index = f"{args.es_index_prefix}-bi_processes"

      addr = f"https://{args.es_host}:{args.es_port}"
-     logger.info("Trying to connect to Elasticsearch at %s...", addr)
+     logger.info("Connecting to Elasticsearch at %s...", addr)
      es_query_conn = ElasticQueryConn(
-         addr, args.es_cert_fp, basic_auth=(args.es_usr, args.es_pwd)
+         addr,
+         args.es_cert_fp,
+         args.es_index_prefix,
+         args.dryrun,
+         basic_auth=(args.es_usr, args.es_pwd),
      )

      # Fatal errors
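
The new guards make each check a no-op when ElasticQueryConn is constructed with the dryrun flag, which, judging from the `if not es_query_conn.client` tests above, leaves the connection's client attribute unset. A hypothetical sketch of the constructor behaviour these guards rely on — nothing below is shown in the diff except the argument order:

    from elasticsearch import Elasticsearch


    class ElasticQueryConn:
        """Hypothetical sketch: connection wrapper with a dry-run mode."""

        def __init__(
            self,
            addr: str,
            cert_fp: str,
            index_prefix: str,
            dryrun: bool = False,
            *,
            basic_auth: tuple[str, str] | None = None,
        ) -> None:
            # In dry-run mode no client is created, so callers such as
            # check_for_unused_file_indices() can test `if not conn.client`.
            self.client: Elasticsearch | None = None
            if not dryrun:
                self.client = Elasticsearch(
                    addr, ssl_assert_fingerprint=cert_fp, basic_auth=basic_auth
                )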
genelastic/import_data/cli/validate.py
@@ -0,0 +1,146 @@
+ import argparse
+ import logging
+ from pathlib import Path
+
+ from genelastic.common.cli import add_verbose_control_args, add_version_arg
+ from genelastic.common.exceptions import (
+     ValidationError,
+     YAMLFileReadError,
+ )
+ from genelastic.import_data.import_bundle_factory import (
+     load_yaml_file,
+     validate_doc,
+ )
+ from genelastic.import_data.logger import configure_logging
+ from genelastic.import_data.models.validate import ValidationIssue
+
+ logger = logging.getLogger("genelastic")
+
+
+ def read_args() -> argparse.Namespace:
+     """Read arguments from the command line."""
+     parser = argparse.ArgumentParser(
+         description="Statically validates YAML bundles: "
+         "ensure they comply with the bundle schema.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         allow_abbrev=False,
+     )
+     add_version_arg(parser)
+     add_verbose_control_args(parser)
+     parser.add_argument(
+         "files",
+         type=Path,
+         nargs="+",
+         default=None,
+         help="Paths to YAML files containing bundles to validate.",
+     )
+     parser.add_argument(
+         "-x",
+         "--fail-fast",
+         dest="fail_fast",
+         action="store_true",
+         help="Stop validating files after the first error is encountered.",
+     )
+     return parser.parse_args()
+
+
+ def main() -> int:
+     """Entry point of the validate script."""
+     args = read_args()
+     configure_logging(args.verbose)
+
+     validation_issues = []
+     file_count = len(args.files)
+
+     for file_index, file_path in enumerate(args.files):
+         resolved_file_path = file_path.resolve()
+
+         logger.info(
+             "[%s/%s] Validating bundle(s) from file '%s'.",
+             file_index + 1,
+             file_count,
+             resolved_file_path,
+         )
+         logger.info("Loading YAML file...")
+
+         try:
+             docs = load_yaml_file(resolved_file_path)
+         except YAMLFileReadError as e:
+             logger.error(e)
+
+             if args.fail_fast:
+                 raise SystemExit(1) from None
+
+             validation_issues.append(
+                 ValidationIssue(
+                     exc_type=type(e).__name__,
+                     file_path=resolved_file_path,
+                     file_index=file_index + 1,
+                     file_count=file_count,
+                 )
+             )
+             continue
+
+         logger.info("-> YAML file successfully loaded.")
+
+         doc_count = len(docs)
+         logger.info("Found %s document(s) in the YAML file.", doc_count)
+
+         for doc_index, doc in enumerate(docs):
+             logger.info(
+                 " Validating bundle format for document #%s/%s...",
+                 doc_index + 1,
+                 doc_count,
+             )
+
+             try:
+                 validate_doc(doc)
+             except ValidationError as e:
+                 logger.error(e)
+
+                 if args.fail_fast:
+                     raise SystemExit(1) from None
+
+                 validation_issues.append(
+                     ValidationIssue(
+                         exc_type=type(e).__name__,
+                         file_path=resolved_file_path,
+                         file_index=file_index + 1,
+                         file_count=file_count,
+                         doc_index=doc_index + 1,
+                         doc_count=doc_count,
+                     )
+                 )
+                 continue
+
+             logger.info(" -> Bundle format is valid.")
+
+         logger.info("")
+
+     if len(validation_issues) > 0:
+         logger.error("Some files raised exceptions:")
+         for issue in validation_issues:
+             logger.error(" - %s", issue)
+
+         ret_code = 1
+     else:
+         logger.info("All bundles respect the genelastic YAML bundle format.")
+         ret_code = 0
+
+     files_failing_validation = len(
+         {issue.file_path for issue in validation_issues}
+     )
+     files_passing_validation = file_count - files_failing_validation
+
+     logger.info(
+         "Out of %s file(s), validation passed for %s and failed for %s.",
+         file_count,
+         files_passing_validation,
+         files_failing_validation,
+     )
+
+     return ret_code
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
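
ValidationIssue itself lives in the new models/validate.py (+26 lines, not shown in this diff). Given the keyword arguments used above — doc_index and doc_count are omitted for file-level failures — a plausible shape is a small dataclass along these lines; this is a sketch, and the field types and __str__ rendering are assumptions:

    from dataclasses import dataclass
    from pathlib import Path


    @dataclass(frozen=True)
    class ValidationIssue:
        """One failed validation, keyed by file and (optionally) document."""

        exc_type: str
        file_path: Path
        file_index: int
        file_count: int
        doc_index: int | None = None   # Absent for file-level failures.
        doc_count: int | None = None

        def __str__(self) -> str:
            location = f"file {self.file_index}/{self.file_count}: {self.file_path}"
            if self.doc_index is not None:
                location += f", document {self.doc_index}/{self.doc_count}"
            return f"{self.exc_type} ({location})"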
genelastic/import_data/collect.py
@@ -0,0 +1,185 @@
+ import logging
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from genelastic.common.exceptions import DataFileCollectorError
+ from genelastic.common.types import Metadata
+ from genelastic.import_data.models.data_file import DataFile
+ from genelastic.import_data.models.tags import Tags
+ from genelastic.import_data.patterns import FilenamePattern
+ from genelastic.import_data.resolve import resolve_analysis_id
+
+ logger = logging.getLogger("genelastic")
+
+
+ def collect_files(data_path: Path) -> set[Path]:
+     """Collect files for a given analysis.
+     All files directly under ``data_path`` are returned.
+
+     :param data_path: Directory containing the files.
+     :raises DataFileCollectorError: If ``data_path`` is not an existing
+         directory.
+     :return: Set of absolute paths to collected files.
+     """
+     try:
+         collected_files = {x for x in data_path.iterdir() if x.is_file()}
+     except OSError as e:
+         msg = f"Error collecting files: data directory is invalid. {e}."
+         raise DataFileCollectorError(msg) from e
+     return collected_files
+
+
+ def match_files(
+     files: set[Path],
+     filename_pattern: FilenamePattern,
+ ) -> tuple[set[Path], set[Path]]:
+     """Splits a set of files into those that match a given filename pattern and
+     those that don't.
+
+     This function applies the provided ``filename_pattern`` to each file name
+     in ``files``, and returns two sets: one containing files that match the
+     pattern, and one containing those that do not.
+
+     :param files: A set of file paths to check.
+     :param filename_pattern: The filename pattern used for matching.
+
+     :returns: A tuple containing in first position a set of files that match
+         the pattern, and in second position a set of files that do not match the
+         pattern.
+     """
+     matched_files = {
+         f for f in files if filename_pattern.matches_pattern(f.name)
+     }
+     return matched_files, files - matched_files
+
+
+ def extract_analysis_metadata(
+     data_path: Path,
+     file_prefix: str,
+     tags: Tags,
+     filename_pattern: FilenamePattern,
+ ) -> dict[str, Metadata]:
+     analysis = {}
+
+     for file in collect_files(data_path):
+         if not filename_pattern.matches_pattern(file.name):
+             logger.debug("File '%s' was not matched.", file.name)
+             continue
+
+         filename_metadata = filename_pattern.extract_metadata(file.name)
+         analysis_id = resolve_analysis_id(file_prefix, tags, filename_metadata)
+         analysis[analysis_id] = filename_metadata
+
+     return analysis
+
+
+ def init_data_files(
+     analysis_id: str,
+     files: set[Path],
+     filename_pattern: FilenamePattern,
+     bundle_file: Path,
+ ) -> set[DataFile]:
+     """Instantiate ``DataFile`` objects from a set of file paths associated
+     with an analysis.
+
+     :param analysis_id: ID of the analysis, shared by all created ``DataFile``
+         instances.
+     :param files: Set of file paths associated with the analysis.
+     :param filename_pattern: Pattern used to extract metadata from filenames.
+         The extracted metadata is included in each ``DataFile``.
+     :param bundle_file: Path to the YAML bundle file from which the analysis is
+         defined.
+     :raises DataFileCollectorError: If metadata extraction or instantiation
+         of a data file object fails for a given file.
+     :return: A set of successfully instantiated ``DataFile`` objects.
+     """
+     data_files = set()
+     for file in files:
+         try:
+             metadata = filename_pattern.extract_metadata(file.name)
+             data_file = DataFile(
+                 analysis_id=analysis_id,
+                 path=file,
+                 bundle_file=bundle_file,
+                 metadata=metadata,
+             )
+             data_files.add(data_file)
+         except RuntimeError as e:
+             msg = f"Error instantiating data files: {e}"
+             raise DataFileCollectorError(msg) from None
+     return data_files
+
+
+ @dataclass(frozen=True)
+ class DataFileCollectorResult:
+     """Result of a data file collection."""
+
+     matched_files: set[Path]
+     unmatched_files: set[Path]
+     data_files: set[DataFile]
+
+
+ class DataFileCollector:
+     """Collect all data files belonging to an analysis."""
+
+     def __init__(
+         self,
+         analysis_id: str,
+         bundle_file: Path,
+         data_path: Path,
+         filename_pattern: FilenamePattern,
+         *,
+         multi_match: bool = False,
+     ) -> None:
+         self._analysis_id = analysis_id
+         self._bundle_file = bundle_file
+         self._data_path = data_path
+         self._filename_pattern = filename_pattern
+         self._multi_match = multi_match
+
+     def run(self) -> DataFileCollectorResult:
+         """Collects files from the analysis data path, matches them against the
+         analysis filename pattern, and instantiates ``DataFile`` objects for
+         each matched file.
+
+         :raises DataFileCollectorError: If the ``data_path`` is not an existing
+             directory or if metadata extraction or instantiation of a data file
+             object fails for a given file.
+         :return: A ``DataFileCollectorResult`` containing the sets of matched
+             and unmatched files, as well as a set of instantiated ``DataFile``
+             objects.
+         """
+         files = collect_files(self._data_path)
+         logger.debug(
+             " -> Collected %s file(s):",
+             len(files),
+         )
+         for path in sorted(files):
+             logger.debug(" - '%s'", path.name)
+
+         matched_files, unmatched_files = match_files(
+             files, self._filename_pattern
+         )
+         logger.info(" -> Found %s matching file(s):", len(matched_files))
+         for path in sorted(matched_files):
+             logger.info(" - '%s'", path.name)
+
+         logger.info(
+             " -> Found %s non-matching file(s):",
+             len(unmatched_files),
+         )
+         for path in sorted(unmatched_files):
+             logger.info(" - '%s'", path.name)
+
+         data_files = init_data_files(
+             self._analysis_id,
+             matched_files,
+             self._filename_pattern,
+             self._bundle_file,
+         )
+
+         return DataFileCollectorResult(
+             matched_files=matched_files,
+             unmatched_files=unmatched_files,
+             data_files=data_files,
+         )
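
Putting the collector together, a typical call site would look like the sketch below. The constructor signature and the run() result come from the hunk above; the paths, the analysis ID, and the FilenamePattern constructor argument are hypothetical, since patterns.py is not shown in this diff:

    from pathlib import Path

    from genelastic.import_data.collect import DataFileCollector
    from genelastic.import_data.patterns import FilenamePattern

    # Hypothetical pattern and paths, for illustration only.
    pattern = FilenamePattern(r"(?P<sample>[A-Z0-9]+)_(?P<lane>L\d+)\.fastq\.gz")
    collector = DataFileCollector(
        analysis_id="run42-sampleA",
        bundle_file=Path("bundles/run42.yml"),
        data_path=Path("/data/run42"),
        filename_pattern=pattern,
    )
    # Raises DataFileCollectorError if data_path is not an existing directory.
    result = collector.run()
    print(len(result.matched_files), "matched,", len(result.unmatched_files), "unmatched")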