genelastic-0.8.0-py3-none-any.whl → genelastic-0.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +2 -2
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -20
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +8 -3
  8. genelastic/api/specification.yml +343 -181
  9. genelastic/common/__init__.py +0 -44
  10. genelastic/common/cli.py +52 -11
  11. genelastic/common/elastic.py +374 -46
  12. genelastic/common/exceptions.py +34 -2
  13. genelastic/common/server.py +9 -1
  14. genelastic/common/types.py +1 -14
  15. genelastic/import_data/__init__.py +0 -27
  16. genelastic/import_data/checker.py +99 -0
  17. genelastic/import_data/checker_observer.py +13 -0
  18. genelastic/import_data/cli/__init__.py +0 -0
  19. genelastic/import_data/cli/cli_check.py +135 -0
  20. genelastic/import_data/{cli_gen_data.py → cli/gen_data.py} +4 -4
  21. genelastic/import_data/cli/import_data.py +345 -0
  22. genelastic/import_data/cli/info.py +246 -0
  23. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -8
  24. genelastic/import_data/cli/validate.py +146 -0
  25. genelastic/import_data/collect.py +185 -0
  26. genelastic/import_data/constants.py +136 -11
  27. genelastic/import_data/import_bundle.py +102 -59
  28. genelastic/import_data/import_bundle_factory.py +70 -149
  29. genelastic/import_data/importers/__init__.py +0 -0
  30. genelastic/import_data/importers/importer_base.py +136 -0
  31. genelastic/import_data/importers/importer_factory.py +85 -0
  32. genelastic/import_data/importers/importer_types.py +234 -0
  33. genelastic/import_data/logger.py +2 -1
  34. genelastic/import_data/models/__init__.py +0 -0
  35. genelastic/import_data/models/analyses.py +178 -0
  36. genelastic/import_data/models/analysis.py +144 -0
  37. genelastic/import_data/models/data_file.py +110 -0
  38. genelastic/import_data/models/process.py +45 -0
  39. genelastic/import_data/models/processes.py +84 -0
  40. genelastic/import_data/models/tags.py +170 -0
  41. genelastic/import_data/models/unique_list.py +109 -0
  42. genelastic/import_data/models/validate.py +26 -0
  43. genelastic/import_data/patterns.py +90 -0
  44. genelastic/import_data/random_bundle.py +10 -8
  45. genelastic/import_data/resolve.py +157 -0
  46. genelastic/ui/.env +1 -0
  47. genelastic/ui/cli_start_ui.py +4 -2
  48. genelastic/ui/routes.py +289 -42
  49. genelastic/ui/static/cea-cnrgh.ico +0 -0
  50. genelastic/ui/static/cea.ico +0 -0
  51. genelastic/ui/static/layout.ico +0 -0
  52. genelastic/ui/static/novaseq6000.png +0 -0
  53. genelastic/ui/static/style.css +430 -0
  54. genelastic/ui/static/ui.js +458 -0
  55. genelastic/ui/templates/analyses.html +96 -9
  56. genelastic/ui/templates/analysis_detail.html +44 -0
  57. genelastic/ui/templates/bi_process_detail.html +129 -0
  58. genelastic/ui/templates/bi_processes.html +114 -9
  59. genelastic/ui/templates/explorer.html +356 -0
  60. genelastic/ui/templates/home.html +205 -2
  61. genelastic/ui/templates/layout.html +148 -29
  62. genelastic/ui/templates/version.html +19 -7
  63. genelastic/ui/templates/wet_process_detail.html +131 -0
  64. genelastic/ui/templates/wet_processes.html +114 -9
  65. genelastic-0.10.0.dist-info/METADATA +686 -0
  66. genelastic-0.10.0.dist-info/RECORD +76 -0
  67. {genelastic-0.8.0.dist-info → genelastic-0.10.0.dist-info}/WHEEL +1 -2
  68. genelastic-0.10.0.dist-info/entry_points.txt +10 -0
  69. genelastic-0.10.0.dist-info/licenses/LICENSE +519 -0
  70. genelastic/import_data/analyses.py +0 -69
  71. genelastic/import_data/analysis.py +0 -205
  72. genelastic/import_data/bi_process.py +0 -27
  73. genelastic/import_data/bi_processes.py +0 -49
  74. genelastic/import_data/cli_import.py +0 -379
  75. genelastic/import_data/cli_info.py +0 -256
  76. genelastic/import_data/cli_validate.py +0 -54
  77. genelastic/import_data/data_file.py +0 -87
  78. genelastic/import_data/filename_pattern.py +0 -57
  79. genelastic/import_data/tags.py +0 -123
  80. genelastic/import_data/wet_process.py +0 -28
  81. genelastic/import_data/wet_processes.py +0 -53
  82. genelastic-0.8.0.dist-info/METADATA +0 -109
  83. genelastic-0.8.0.dist-info/RECORD +0 -52
  84. genelastic-0.8.0.dist-info/entry_points.txt +0 -8
  85. genelastic-0.8.0.dist-info/top_level.txt +0 -1
genelastic/common/server.py
@@ -5,17 +5,25 @@ import sys
  import uvicorn


- def start_dev_server(app_module: str, args: argparse.Namespace) -> None:
+ def start_dev_server(
+     app_module: str,
+     args: argparse.Namespace,
+     reload_includes: list[str] | None = None,
+ ) -> None:
      """Start the development server using Uvicorn.
      :args app_module: The module containing the Flask server to start.
      :args argparse.Namespace: The parsed arguments.
      """
+     if reload_includes is None:
+         reload_includes = []
+
      uvicorn.run(
          app_module,
          host=args.host,
          port=args.port,
          log_level=args.log_level,
          reload=True,
+         reload_includes=reload_includes,
      )

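The new optional reload_includes parameter forwards extra watch globs to Uvicorn's reloader. A minimal calling sketch; the app module string and the namespace fields are assumptions standing in for what the cli_start_api/cli_start_ui wrappers actually pass:

    from argparse import Namespace

    from genelastic.common.server import start_dev_server

    # Hypothetical values; real ones come from the CLI argument parsers.
    args = Namespace(host="127.0.0.1", port=8000, log_level="info")
    start_dev_server(
        "genelastic.api.server:app",  # assumed ASGI application path
        args,
        reload_includes=["*.yml"],  # e.g. also reload when specs change
    )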
genelastic/common/types.py
@@ -4,20 +4,7 @@ import typing
  Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
  BundleDict: typing.TypeAlias = dict[str, typing.Any]

- AnalysisMetaData: typing.TypeAlias = dict[str, str | int]
- WetProcessesData: typing.TypeAlias = dict[str, str | int | float]
- BioInfoProcessData: typing.TypeAlias = dict[str, str | list[str]]
-
- AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]
- MetadataDocument: typing.TypeAlias = dict[
-     str, int | str | list[typing.Any | None]
- ]
- ProcessDocument: typing.TypeAlias = (
-     dict[str, str] | WetProcessesData | BioInfoProcessData
- )
- BulkItems: typing.TypeAlias = list[
-     dict[str, str | MetadataDocument | AnalysisDocument | ProcessDocument]
- ]
+ Metadata: typing.TypeAlias = dict[str, str | int]

  # Types related to random bundle generation.
  RandomBiProcessData: typing.TypeAlias = dict[str, str | list[dict[str, str]]]
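The 0.8.0 document-level aliases are collapsed into a single Metadata alias for string-or-integer metadata fields. A tiny usage sketch (the function is illustrative, not part of the package):

    from genelastic.common.types import Metadata

    def summarize(meta: Metadata) -> str:
        # Metadata maps string keys to str | int values.
        return ", ".join(f"{key}={value}" for key, value in meta.items())

    print(summarize({"sample_name": "S1", "read_length": 150}))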
genelastic/import_data/__init__.py (deleted)
@@ -1,27 +0,0 @@
- """Genelastic package for importing Genomic data into Elasticsearch."""
-
- from .analysis import Analysis
- from .import_bundle import ImportBundle
- from .import_bundle_factory import (
-     load_import_bundle_file,
-     make_import_bundle_from_files,
- )
- from .random_bundle import (
-     RandomAnalysis,
-     RandomBiProcess,
-     RandomBundle,
-     RandomWetProcess,
- )
- from .tags import Tags
-
- __all__ = [
-     "Analysis",
-     "ImportBundle",
-     "RandomAnalysis",
-     "RandomBiProcess",
-     "RandomBundle",
-     "RandomWetProcess",
-     "Tags",
-     "load_import_bundle_file",
-     "make_import_bundle_from_files",
- ]
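With these package-level re-exports removed, downstream code now imports from the concrete modules, matching the paths used by the new CLI entry points later in this diff:

    # Before (0.8.0):
    #   from genelastic.import_data import make_import_bundle_from_files
    # After (0.10.0):
    from genelastic.import_data.import_bundle_factory import (
        make_import_bundle_from_files,
    )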
genelastic/import_data/checker.py (new file)
@@ -0,0 +1,99 @@
+ import logging
+
+ from genelastic.common.elastic import ElasticQueryConn
+ from genelastic.import_data.checker_observer import CheckerObserver
+ from genelastic.import_data.models.analyses import Analyses
+ from genelastic.import_data.models.processes import Processes
+
+ logger = logging.getLogger("genelastic")
+
+
+ class Checker:
+     """Validate coherence between YAML metadata and Elasticsearch,
+     using a project-specific observer mechanism.
+     """
+
+     def __init__(self, es: ElasticQueryConn, *, strict: bool = False) -> None:
+         """Initialize the Checker.
+
+         Args:
+             es: Elasticsearch connection instance.
+             strict: Treat ES-only entries as errors when True.
+         """
+         self.es = es
+         self.strict = strict
+         self.errors_detected = False
+         self._observers: list[CheckerObserver] = []
+
+     def attach(self, observer: CheckerObserver) -> None:
+         """Register an observer to receive Checker notifications."""
+         self._observers.append(observer)
+
+     def detach(self, observer: CheckerObserver) -> None:
+         """Unregister an observer so it no longer receives notifications."""
+         self._observers.remove(observer)
+
+     def _notify_missing(self, label: str, missing: list[str]) -> None:
+         """Notify observers about missing IDs."""
+         self.errors_detected = True
+         for obs in self._observers:
+             obs.notify_missing(label, missing)
+
+     def _notify_extra(self, label: str, extra: list[str]) -> None:
+         """Notify observers about extra IDs."""
+         self.errors_detected = True
+         for obs in self._observers:
+             obs.notify_extra(label, extra)
+
+     def _check_generic(
+         self, label: str, ids_yaml: set[str], ids_es: set[str]
+     ) -> None:
+         """Compare YAML IDs vs Elasticsearch IDs for a given entity type."""
+         logger.info("Checking %s...", label)
+
+         missing = sorted(ids_yaml - ids_es)
+         extra = sorted(ids_es - ids_yaml)
+
+         if missing:
+             logger.error("Missing %s in ES: %s", label, missing)
+             self._notify_missing(label, missing)
+
+         if extra:
+             if self.strict:
+                 logger.error(
+                     "%s in ES but missing from YAML: %s",
+                     label.capitalize(),
+                     extra,
+                 )
+                 self._notify_extra(label, extra)
+             else:
+                 logger.info("Extra %s ignored (non-strict mode).", label)
+
+         if not missing and (self.strict and not extra):
+             logger.info("OK ✓ All %s match exactly.", label)
+         elif not missing and not self.strict:
+             logger.info("OK ✓ YAML %s present (extra ignored).", label)
+
+     def check_analyses(self, analyses: Analyses) -> None:
+         """Check analysis IDs between YAML and Elasticsearch."""
+         ids_yaml = {a.id for a in analyses}
+         ids_es = set(
+             self.es.get_field_values(self.es.data_files_index, "analysis_id")
+         )
+         self._check_generic("analyses", ids_yaml, ids_es)
+
+     def check_wet_processes(self, processes: Processes) -> None:
+         """Check wet process IDs between YAML and Elasticsearch."""
+         ids_yaml = set(processes.keys())
+         ids_es = set(
+             self.es.get_field_values(self.es.wet_processes_index, "proc_id")
+         )
+         self._check_generic("wet processes", ids_yaml, ids_es)
+
+     def check_bi_processes(self, processes: Processes) -> None:
+         """Check biological process IDs between YAML and Elasticsearch."""
+         ids_yaml = set(processes.keys())
+         ids_es = set(
+             self.es.get_field_values(self.es.bi_processes_index, "proc_id")
+         )
+         self._check_generic("bi processes", ids_yaml, ids_es)
genelastic/import_data/checker_observer.py (new file)
@@ -0,0 +1,13 @@
+ from typing import Protocol
+
+
+ class CheckerObserver(Protocol):
+     """Protocol for classes observing Checker events."""
+
+     def notify_missing(self, label: str, missing: list[str]) -> None:
+         """Called when expected IDs are missing in Elasticsearch."""
+         ...
+
+     def notify_extra(self, label: str, extra: list[str]) -> None:
+         """Called when unexpected IDs exist in Elasticsearch."""
+         ...
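Since CheckerObserver is a typing.Protocol, any object with matching notify_missing/notify_extra methods can be attached to a Checker; no inheritance is required. A minimal wiring sketch, assuming a reachable Elasticsearch instance (connection arguments elided):

    from genelastic.common.elastic import ElasticQueryConn
    from genelastic.import_data.checker import Checker

    class CollectingObserver:
        # Satisfies CheckerObserver structurally; collects IDs instead of logging.
        def __init__(self) -> None:
            self.missing: dict[str, list[str]] = {}
            self.extra: dict[str, list[str]] = {}

        def notify_missing(self, label: str, missing: list[str]) -> None:
            self.missing.setdefault(label, []).extend(missing)

        def notify_extra(self, label: str, extra: list[str]) -> None:
            self.extra.setdefault(label, []).extend(extra)

    checker = Checker(ElasticQueryConn(...), strict=True)  # connection args elided
    obs = CollectingObserver()
    checker.attach(obs)
    # checker.check_analyses(bundle.analyses) fills obs.missing / obs.extra
    # and flips checker.errors_detected on any mismatch.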
genelastic/import_data/cli/__init__.py (new file, no content)
genelastic/import_data/cli/cli_check.py (new file)
@@ -0,0 +1,135 @@
+ import argparse
+ import logging
+ import sys
+ from pathlib import Path
+
+ from genelastic.common.cli import (
+     add_es_connection_args,
+     add_verbose_control_args,
+     add_version_arg,
+ )
+ from genelastic.common.elastic import ElasticQueryConn
+ from genelastic.import_data.checker import Checker
+ from genelastic.import_data.import_bundle_factory import (
+     make_import_bundle_from_files,
+ )
+ from genelastic.import_data.logger import configure_logging
+
+ logger = logging.getLogger("genelastic")
+ logging.getLogger("elastic_transport").setLevel(logging.WARNING)
+
+
+ class CLICheckObserver:
+     """Observer used by the CLI to log Checker errors."""
+
+     def __init__(self) -> None:
+         self._logger = logger
+
+     def notify_missing(self, label: str, missing: list[str]) -> None:
+         """Handle missing IDs by logging an error."""
+         self._logger.error("[CHECKER] Missing %s in ES: %s", label, missing)
+
+     def notify_extra(self, label: str, extra: list[str]) -> None:
+         """Handle extra IDs by logging an error."""
+         self._logger.error("[CHECKER] Extra %s in ES: %s", label, extra)
+
+
+ def read_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(
+         description="Check database coherency against one or more YAML bundles.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+
+     add_version_arg(parser)
+     add_verbose_control_args(parser)
+     add_es_connection_args(parser)
+
+     parser.add_argument(
+         "files",
+         type=Path,
+         nargs="+",
+         help="Paths to YAML bundle files to validate.",
+     )
+
+     parser.add_argument(
+         "--strict",
+         action="store_true",
+         help=(
+             "Enable strict mode: also report entries present in Elasticsearch "
+             "but missing from YAML bundles."
+         ),
+     )
+
+     parser.add_argument(
+         "-A",
+         "--check-analyses",
+         action="store_true",
+         help="Check only analyses coherence.",
+     )
+
+     parser.add_argument(
+         "-W",
+         "--check-wet",
+         action="store_true",
+         help="Check only wet processes coherence.",
+     )
+
+     parser.add_argument(
+         "-B",
+         "--check-bi",
+         action="store_true",
+         help="Check only biological processes coherence.",
+     )
+
+     parser.add_argument(
+         "-X",
+         "--all",
+         action="store_true",
+         help="Check all entities (analyses, wet processes and bi processes).",
+     )
+
+     return parser.parse_args()
+
+
+ def main() -> None:
+     args = read_args()
+     configure_logging(args.verbose)
+
+     logger.info(
+         "Connecting to Elasticsearch at %s...",
+         args.es_url,
+     )
+
+     es = ElasticQueryConn(
+         args.es_url,
+         args.es_cert_fp,
+         args.es_index_prefix,
+         basic_auth=(args.es_usr, args.es_pwd),
+     )
+
+     import_bundle = make_import_bundle_from_files(args.files)
+
+     checker = Checker(es, strict=args.strict)
+     checker.attach(CLICheckObserver())
+
+     run_all = args.all or not (
+         args.check_analyses or args.check_wet or args.check_bi
+     )
+
+     if args.check_analyses or run_all:
+         checker.check_analyses(import_bundle.analyses)
+
+     if args.check_wet or run_all:
+         checker.check_wet_processes(import_bundle.wet_processes)
+
+     if args.check_bi or run_all:
+         checker.check_bi_processes(import_bundle.bi_processes)
+
+     if checker.errors_detected:
+         sys.exit(1)
+
+     sys.exit(0)
+
+
+ if __name__ == "__main__":
+     main()
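When none of -A/-W/-B is given (or when -X is), all three checks run, and the process exits 1 if any mismatch was reported to an observer. A hedged sketch of driving this CLI programmatically; the Elasticsearch connection flags are assumed to mirror the attribute names set by add_es_connection_args and are elided here:

    import sys

    from genelastic.import_data.cli import cli_check

    # Hypothetical argv: strict mode, analyses only, one bundle file.
    sys.argv = ["cli_check", "--strict", "-A", "bundle.yml"]
    try:
        cli_check.main()
    except SystemExit as exc:
        print("coherent" if exc.code == 0 else "mismatches detected")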
genelastic/import_data/cli_gen_data.py → genelastic/import_data/cli/gen_data.py (renamed)
@@ -4,10 +4,9 @@ from pathlib import Path

  from biophony import DEFAULT_RATE, MutSimParams

- from genelastic.common import add_verbose_control_args
-
- from .logger import configure_logging
- from .random_bundle import (
+ from genelastic.common.cli import add_verbose_control_args, add_version_arg
+ from genelastic.import_data.logger import configure_logging
+ from genelastic.import_data.random_bundle import (
      RandomBundle,
  )

@@ -26,6 +25,7 @@ def read_args() -> argparse.Namespace:
          formatter_class=argparse.ArgumentDefaultsHelpFormatter,
          allow_abbrev=False,
      )
+     add_version_arg(parser)
      add_verbose_control_args(parser)
      parser.add_argument(
          "output_dir",
genelastic/import_data/cli/import_data.py (new file)
@@ -0,0 +1,345 @@
+ # vi: se tw=80
+
+ # Elasticsearch Python API:
+ # https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/overview.html
+ # https://elasticsearch-py.readthedocs.io/en/latest/api.html
+
+ import argparse
+ import logging
+ import sys
+ from datetime import UTC, datetime
+ from pathlib import Path
+
+ from genelastic.common.cli import (
+     add_es_connection_args,
+     add_verbose_control_args,
+     add_version_arg,
+     log_item,
+     log_section,
+     log_subsection,
+     positive_int,
+ )
+ from genelastic.common.elastic import ElasticImportConn
+ from genelastic.import_data.import_bundle_factory import (
+     make_import_bundle_from_files,
+ )
+ from genelastic.import_data.importers.importer_base import ImporterError
+ from genelastic.import_data.importers.importer_factory import ImporterFactory
+ from genelastic.import_data.logger import configure_logging
+ from genelastic.import_data.models.analysis import Analysis
+ from genelastic.import_data.models.data_file import DataFile
+ from genelastic.import_data.models.processes import Processes
+
+ logger = logging.getLogger("genelastic")
+ logging.getLogger("elastic_transport").setLevel(
+     logging.WARNING
+ )  # Disable excessive logging
+ logging.getLogger("urllib3").setLevel(
+     logging.WARNING
+ )  # Disable excessive logging
+
+
+ def read_args() -> argparse.Namespace:
+     """Read arguments from command line."""
+     parser = argparse.ArgumentParser(
+         description="Genetics data importer.",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+         allow_abbrev=False,
+     )
+     add_version_arg(parser)
+     add_verbose_control_args(parser)
+     add_es_connection_args(parser)
+     parser.add_argument(
+         "-D",
+         "--dry-run",
+         dest="dryrun",
+         action="count",
+         default=0,
+         help=(
+             "Dry-run level. -D for data files loading (VCF, coverage, etc) "
+             "without connecting or importing to database. "
+             "-DD for metadata YAML files loading only (no loading of data files)."
+         ),
+     )
+     parser.add_argument(
+         "--log-file", dest="log_file", help="Path to a log file."
+     )
+     parser.add_argument(
+         "--no-list",
+         dest="no_list",
+         action="store_true",
+         help="Do not print list of files to be imported.",
+     )
+     parser.add_argument(
+         "--no-confirm",
+         dest="no_confirm",
+         action="store_true",
+         help="Do not ask confirmation before importing.",
+     )
+     parser.add_argument(
+         "-t",
+         "--threads",
+         dest="thread_count",
+         type=positive_int,
+         default=4,
+         help="Number of threads to use for parallel data files import.",
+     )
+     parser.add_argument(
+         "--multi-match",
+         dest="multi_match",
+         action="store_true",
+         help=(
+             "Enable grouping of files from the same 'data_path' into multiple "
+             "analyses by extracting variable metadata fields directly from "
+             "filenames using the file prefix. If some metadata fields (e.g., "
+             "sample_name, wet_process, bi_process) are not defined in the YAML "
+             "bundle, the importer detects all analyses sharing the same "
+             "defined metadata, but differing by the undefined fields. This "
+             "allows importing and filtering several analyses at once from a "
+             "single directory, based on the metadata present in filenames. "
+             "When disabled (default), only files matching the fixed filename "
+             "pattern (where all metadata fields are defined in the YAML) are "
+             "grouped into a single analysis; other files are ignored."
+         ),
+     )
+     parser.add_argument(
+         "files",
+         type=Path,
+         nargs="+",
+         default=None,
+         help="Data files that describe what to import.",
+     )
+     return parser.parse_args()
+
+
+ def import_analysis(
+     es_import_conn: ElasticImportConn,
+     analysis: Analysis,
+ ) -> None:
+     """Import analysis into a dedicated index."""
+     logger.info(
+         " -> Importing analysis '%s' metadata into index '%s'...",
+         analysis.id,
+         es_import_conn.analyses_index,
+     )
+
+     documents = [
+         {
+             "_index": es_import_conn.analyses_index,
+             "_source": {
+                 "created_at": datetime.now(UTC).isoformat(),
+                 "analysis_id": analysis.id,
+                 "bundle_file": str(analysis.bundle_file),
+                 "data_path": str(analysis.data_path),
+                 "metadata": analysis.metadata,
+             },
+         }
+     ]
+
+     es_import_conn.bulk_import(documents)
+
+
+ def import_data_file(
+     es_import_conn: ElasticImportConn,
+     data_file: DataFile,
+ ) -> None:
+     """Import data files into a dedicated index."""
+     logger.info(
+         " -> Importing metadata into index '%s'...",
+         es_import_conn.data_files_index,
+     )
+
+     documents = [
+         {
+             "_index": es_import_conn.data_files_index,
+             "_source": {
+                 "created_at": datetime.now(UTC).isoformat(),
+                 "analysis_id": data_file.analysis_id,
+                 "path": str(data_file.path),
+                 "bundle_file": str(data_file.bundle_file),
+                 "metadata": data_file.metadata,
+                 "metrics": data_file.metrics,
+             },
+         }
+     ]
+
+     es_import_conn.bulk_import(documents)
+
+
+ def import_data_file_content(
+     es_import_conn: ElasticImportConn,
+     data_file: DataFile,
+     thread_count: int,
+     dry_run: int,
+ ) -> None:
+     """Import data file content into a dedicated index,
+     based on their extension and type.
+     """
+     # -DD: no file processing, no import.
+     if dry_run > 1:
+         logger.info("[Dryrun] Data file neither processed nor imported.")
+         return
+
+     try:
+         logger.info(
+             " -> Processing file content for import...",
+         )
+         importer = ImporterFactory.get_importer(
+             data_file, es_import_conn, thread_count
+         )
+
+         # -D: only process files, no import.
+         if dry_run == 1:
+             logger.info("[Dryrun] Data file processed but not imported.")
+             return
+
+         logger.info(
+             " -> Importing file content into index '%s'...",
+             importer.target_index,
+         )
+         importer.import_docs()
+     except ImporterError as e:
+         logger.error(e)
+
+
+ def import_processes(
+     es_import_conn: ElasticImportConn,
+     index: str,
+     processes: Processes,
+ ) -> None:
+     """Import processes into a dedicated index, based on their type."""
+     documents = [
+         {
+             "_index": index,
+             "_source": {
+                 "proc_id": process.id,
+                 "type": process.type,
+                 "metadata": process.data,
+             },
+         }
+         for process in processes.values()
+     ]
+
+     es_import_conn.bulk_import(documents)
+
+
+ def main() -> None:
+     """Entry point of the import script."""
+     # Read command line arguments
+     args = read_args()
+
+     # Configure logging
+     configure_logging(args.verbose, log_file=args.log_file)
+     logger.debug("Arguments: %s", args)
+     logger.debug("LOGGERS: %s", logging.root.manager.loggerDict)
+
+     # Open connection to ES
+     logger.info("Connecting to Elasticsearch at %s...", args.es_url)
+     es_import_conn = ElasticImportConn(
+         args.es_url,
+         args.es_cert_fp,
+         args.es_index_prefix,
+         args.dryrun,
+         basic_auth=(args.es_usr, args.es_pwd),
+     )
+
+     log_section("LOAD DATA")
+     logger.info("")
+     import_bundle = make_import_bundle_from_files(
+         args.files, multi_match=args.multi_match, check=True
+     )
+     all_bundled_files = import_bundle.analyses.get_data_files()
+
+     if not all_bundled_files:
+         logger.warning("No matching data files found from import bundle(s) !")
+
+     log_section("IMPORT DATA")
+     # List files before importing.
+     if not args.no_list:
+         logger.info("")
+         logger.info(
+             "The following %s file(s) will be imported:", len(all_bundled_files)
+         )
+
+         for data_file in all_bundled_files:
+             logger.info("- '%s'", data_file.path)
+     else:
+         logger.debug(
+             "'--no-list' argument provided: "
+             "not listing files about to be imported."
+         )
+
+     # Ask confirmation for importing
+     if not args.no_confirm:
+         answer: str = "maybe"
+         while answer not in ["", "n", "y"]:
+             answer = input("Import (y/N)? ").lower()
+         if answer != "y":
+             logger.info("Import canceled.")
+             sys.exit(0)
+     else:
+         logger.debug(
+             "'--no-confirm' argument provided: "
+             "not asking for confirmation before importing files."
+         )
+
+     # Start import.
+     log_subsection("Importing wet processes...")
+     logger.info(
+         "-> Importing %s wet process(es) into index '%s': %s.",
+         len(import_bundle.wet_processes),
+         es_import_conn.wet_processes_index,
+         ", ".join(import_bundle.wet_processes.keys()),
+     )
+     import_processes(
+         es_import_conn,
+         es_import_conn.wet_processes_index,
+         import_bundle.wet_processes,
+     )
+     log_subsection("Importing bioinformatics processes...")
+     logger.info(
+         "-> Importing %s bioinformatics process(es) into index '%s': %s.",
+         len(import_bundle.bi_processes),
+         es_import_conn.bi_processes_index,
+         ", ".join(import_bundle.bi_processes.keys()),
+     )
+     import_processes(
+         es_import_conn,
+         es_import_conn.bi_processes_index,
+         import_bundle.bi_processes,
+     )
+
+     log_subsection("Importing analysis metadata...")
+     for i, analysis in enumerate(sorted(import_bundle.analyses)):
+         log_item(
+             "Analysis",
+             i + 1,
+             len(import_bundle.analyses),
+         )
+         import_analysis(es_import_conn, analysis)
+
+     log_subsection("Importing data files...")
+     counter = 1
+     for ext in sorted(import_bundle.analyses.extensions):
+         data_files = import_bundle.analyses.get_data_files(ext)
+         logger.info("[ %s data files ]", ext.upper())
+
+         for data_file in data_files:
+             logger.info(
+                 " -> Processing data file #%s/%s: '%s'...",
+                 counter,
+                 len(import_bundle.analyses.get_data_files()),
+                 data_file.path.name,
+             )
+             import_data_file(es_import_conn, data_file)
+             import_data_file_content(
+                 es_import_conn, data_file, args.thread_count, args.dryrun
+             )
+             logger.info("")
+             counter += 1
+
+     logger.info("=> Done.")
+
+
+ if __name__ == "__main__":
+     main()
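Each import helper assembles plain Elasticsearch bulk actions (a target "_index" plus a "_source" document) and delegates to bulk_import. An illustrative action as import_processes would build it; the index name and field values here are made up:

    # Hypothetical bulk action; real index names come from es_import_conn.
    action = {
        "_index": "genelastic-wet-processes",
        "_source": {
            "proc_id": "wp-001",
            "type": "wet",
            "metadata": {"kit": "TruSeq"},
        },
    }

Note the two dry-run levels: -D still runs the importers over the data files (useful for validating VCF/coverage parsing without touching the cluster), while -DD stops after loading the YAML metadata.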