genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. genelastic/api/.env +4 -0
  2. genelastic/api/cli_start_api.py +18 -0
  3. genelastic/api/errors.py +52 -0
  4. genelastic/api/extends/example.py +0 -6
  5. genelastic/api/extends/example.yml +0 -0
  6. genelastic/api/routes.py +313 -181
  7. genelastic/api/server.py +34 -26
  8. genelastic/api/settings.py +5 -9
  9. genelastic/api/specification.yml +512 -0
  10. genelastic/common/__init__.py +0 -39
  11. genelastic/common/cli.py +100 -0
  12. genelastic/common/elastic.py +374 -46
  13. genelastic/common/exceptions.py +34 -2
  14. genelastic/common/server.py +59 -0
  15. genelastic/common/types.py +1 -14
  16. genelastic/import_data/__init__.py +0 -27
  17. genelastic/import_data/checker.py +99 -0
  18. genelastic/import_data/checker_observer.py +13 -0
  19. genelastic/import_data/cli/__init__.py +0 -0
  20. genelastic/import_data/cli/cli_check.py +136 -0
  21. genelastic/import_data/cli/gen_data.py +143 -0
  22. genelastic/import_data/cli/import_data.py +346 -0
  23. genelastic/import_data/cli/info.py +247 -0
  24. genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
  25. genelastic/import_data/cli/validate.py +146 -0
  26. genelastic/import_data/collect.py +185 -0
  27. genelastic/import_data/constants.py +136 -11
  28. genelastic/import_data/import_bundle.py +102 -59
  29. genelastic/import_data/import_bundle_factory.py +70 -149
  30. genelastic/import_data/importers/__init__.py +0 -0
  31. genelastic/import_data/importers/importer_base.py +131 -0
  32. genelastic/import_data/importers/importer_factory.py +85 -0
  33. genelastic/import_data/importers/importer_types.py +223 -0
  34. genelastic/import_data/logger.py +2 -1
  35. genelastic/import_data/models/__init__.py +0 -0
  36. genelastic/import_data/models/analyses.py +178 -0
  37. genelastic/import_data/models/analysis.py +144 -0
  38. genelastic/import_data/models/data_file.py +110 -0
  39. genelastic/import_data/models/process.py +45 -0
  40. genelastic/import_data/models/processes.py +84 -0
  41. genelastic/import_data/models/tags.py +170 -0
  42. genelastic/import_data/models/unique_list.py +109 -0
  43. genelastic/import_data/models/validate.py +26 -0
  44. genelastic/import_data/patterns.py +90 -0
  45. genelastic/import_data/random_bundle.py +79 -54
  46. genelastic/import_data/resolve.py +157 -0
  47. genelastic/ui/.env +1 -0
  48. genelastic/ui/cli_start_ui.py +20 -0
  49. genelastic/ui/routes.py +333 -0
  50. genelastic/ui/server.py +9 -82
  51. genelastic/ui/settings.py +2 -6
  52. genelastic/ui/static/cea-cnrgh.ico +0 -0
  53. genelastic/ui/static/cea.ico +0 -0
  54. genelastic/ui/static/layout.ico +0 -0
  55. genelastic/ui/static/novaseq6000.png +0 -0
  56. genelastic/ui/static/style.css +430 -0
  57. genelastic/ui/static/ui.js +458 -0
  58. genelastic/ui/templates/analyses.html +98 -0
  59. genelastic/ui/templates/analysis_detail.html +44 -0
  60. genelastic/ui/templates/bi_process_detail.html +129 -0
  61. genelastic/ui/templates/bi_processes.html +116 -0
  62. genelastic/ui/templates/explorer.html +356 -0
  63. genelastic/ui/templates/home.html +207 -0
  64. genelastic/ui/templates/layout.html +153 -0
  65. genelastic/ui/templates/version.html +21 -0
  66. genelastic/ui/templates/wet_process_detail.html +131 -0
  67. genelastic/ui/templates/wet_processes.html +116 -0
  68. genelastic-0.9.0.dist-info/METADATA +686 -0
  69. genelastic-0.9.0.dist-info/RECORD +76 -0
  70. genelastic-0.9.0.dist-info/WHEEL +4 -0
  71. genelastic-0.9.0.dist-info/entry_points.txt +10 -0
  72. genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
  73. genelastic/import_data/analyses.py +0 -69
  74. genelastic/import_data/analysis.py +0 -205
  75. genelastic/import_data/bi_process.py +0 -27
  76. genelastic/import_data/bi_processes.py +0 -49
  77. genelastic/import_data/cli_gen_data.py +0 -116
  78. genelastic/import_data/cli_import.py +0 -379
  79. genelastic/import_data/cli_info.py +0 -256
  80. genelastic/import_data/cli_validate.py +0 -54
  81. genelastic/import_data/data_file.py +0 -87
  82. genelastic/import_data/filename_pattern.py +0 -57
  83. genelastic/import_data/tags.py +0 -123
  84. genelastic/import_data/wet_process.py +0 -28
  85. genelastic/import_data/wet_processes.py +0 -53
  86. genelastic-0.7.0.dist-info/METADATA +0 -105
  87. genelastic-0.7.0.dist-info/RECORD +0 -40
  88. genelastic-0.7.0.dist-info/WHEEL +0 -5
  89. genelastic-0.7.0.dist-info/entry_points.txt +0 -6
  90. genelastic-0.7.0.dist-info/top_level.txt +0 -1
@@ -0,0 +1,686 @@
1
+ Metadata-Version: 2.4
2
+ Name: genelastic
3
+ Version: 0.9.0
4
+ Summary: Generate and store genetic data into an Elasticsearch database.
5
+ Keywords: CNRGH,genelastic,generation,storage,elasticsearch,database
6
+ Author: CNRGH, Pierrick ROGER, Maxime BLANCHON
7
+ Author-email: Pierrick ROGER <pierrick.roger@cnrgh.fr>, Maxime BLANCHON <maxime.blanchon@cnrgh.fr>
8
+ License-Expression: CECILL-2.1
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Requires-Dist: elasticsearch>=8.18.1,<8.19.0
14
+ Requires-Dist: pyvcf3>=1.0.3,<2.0.0
15
+ Requires-Dist: schema>=0.7.7,<0.8.0
16
+ Requires-Dist: pyyaml>=6.0.2,<7.0.0
17
+ Requires-Dist: biophony>=1.3.0,<1.4.0
18
+ Requires-Dist: colorlog>=6.9.0,<7.0.0
19
+ Requires-Dist: tqdm>=4.67.1,<5.0.0
20
+ Requires-Dist: flask>=3.1.0,<4.0.0 ; extra == 'api'
21
+ Requires-Dist: environs>=14.1.1,<15.0.0 ; extra == 'api'
22
+ Requires-Dist: connexion[flask,swagger-ui,uvicorn]>=3.2.0,<4.0.0 ; extra == 'api'
23
+ Requires-Dist: gunicorn>=23.0.0,<24.0.0 ; extra == 'api'
24
+ Requires-Dist: flask>=3.1.0,<4.0.0 ; extra == 'ui'
25
+ Requires-Dist: requests>=2.32.3,<3.0.0 ; extra == 'ui'
26
+ Requires-Dist: environs>=14.1.1,<15.0.0 ; extra == 'ui'
27
+ Requires-Dist: uvicorn>=0.34.0,<0.35.0 ; extra == 'ui'
28
+ Requires-Dist: asgiref>=3.8.1,<4.0.0 ; extra == 'ui'
29
+ Requires-Dist: gunicorn>=23.0.0,<24.0.0 ; extra == 'ui'
30
+ Requires-Python: >=3.11
31
+ Provides-Extra: api
32
+ Provides-Extra: ui
33
+ Description-Content-Type: text/markdown
34
+
35
+ # genelastic
36
+
37
+ **Genelastic** is a set of tools for comparing genetic technologies.
38
+
39
+ It includes a set of scripts to import and manage genetic data in an
40
+ Elasticsearch database, as well as a REST API serving a dedicated user
41
+ interface to query, visualize and compare imported data.
42
+
43
+ ## Table of contents
44
+
45
+ - [I - Prerequisites](#i---prerequisites)
46
+ - [II - Installation](#ii---installation)
47
+ - [III - Core concepts](#iii---core-concepts)
48
+ - [IV - Bundle file definition](#iv---bundle-file-definition)
49
+ - [V - Scripts usage](#v---scripts-usage)
50
+ - [VI - Servers usage](#vi---servers-usage)
51
+ - [VII - For developers](#vii---for-developers)
52
+
53
+ ## I - Prerequisites
54
+
55
+ - `python` >= 3.11
56
+
57
+ ## II - Installation
58
+
59
+ - With **pipx** (recommended):
60
+
61
+ ```bash
62
+ pipx install genelastic
63
+ ```
64
+
65
+ `pipx` installs `genelastic` in its own isolated
66
+ virtual environment and makes all scripts (`gnl-*`) available globally.
67
+
68
+ - Or with **pip**:
69
+
70
+ ```bash
71
+ python -m venv .venv
72
+ source .venv/bin/activate # Activate the virtual environment
73
+ pip install genelastic
74
+ ```
75
+
76
+ It is recommended to install `genelastic` inside its own virtual
77
+ environment to avoid conflicts with globally installed Python packages.
78
+ To run the scripts, you need to activate the environment first.
79
+
80
+ Test the installation by running one of the genelastic scripts:
81
+
82
+ ```bash
83
+ gnl-import -h # Print the help message and exit.
84
+ ```
85
+
86
+ ## III - Core concepts
87
+
88
+ This section introduces the main ideas behind **Genelastic**. It explains how
89
+ a bundle (YAML manifest) describes analyses, how files are located with
90
+ file prefixes and tags, which data file types are supported, and how they are
91
+ imported into Elasticsearch.
92
+
93
+ ### Bundle
94
+
95
+ A **bundle** is a YAML manifest that describes one or several analyses.\
96
+ It defines:
97
+
98
+ - metadata for each analysis,
99
+ - the path where the related files are stored,
100
+ - the naming rules (file prefix and tags) to automatically locate those files.
101
+
102
+ The bundle itself is **not** imported into Elasticsearch.
103
+ Instead, `gnl-import` uses it to build filename patterns (regular expressions
104
+ with named groups), retrieve matching files, and import both the files'
105
+ contents and the associated metadata into the database. Because the regex is
106
+ built from tags, the metadata can always be re-extracted from the filenames if
107
+ needed.
108
+
109
+ ### Analysis
110
+
111
+ An **analysis** is the central unit described in a bundle.\
112
+ It combines:
113
+
114
+ - a set of metadata fields (sample, source, reference genome, etc.),
115
+ - one wet lab and one bioinformatics process,
116
+ - a `data_path` where files are located,
117
+ - and a `file_prefix` that defines the naming pattern of all files belonging
118
+ to the analysis.
119
+
120
+ The analysis acts as a template: it tells Genelastic how to find the right
121
+ files and how to attach them to the correct metadata.
122
+
123
+ ### File prefix and tags
124
+
125
+ A **file prefix** is a naming template made of tags, each tag representing a
126
+ metadata field. When processing a bundle, Genelastic replaces tags with their
127
+ values to build a filename pattern, a regular expression used to automatically
128
+ retrieve files in `data_path`.
129
+
130
+ By default, tags use `%` as a start delimiter and `""` (empty string) as an end
131
+ delimiter. Both delimiters can be overridden if needed.
132
+
133
+ For example, the following tags are all valid:
134
+
135
+ - `%S` uses default start delimiter (`%`) and default end delimiter (`""`),
136
+ - `%S%` uses default start delimiter (`%`) and custom end delimiter (`%`),
137
+ - `$S$` uses custom start delimiter (`$`) and custom end delimiter (`$`).
138
+
139
+ However, not all characters are allowed as delimiters (see [Tags](#tags)).
140
+
141
+ **Default tags** provided by Genelastic:
142
+
143
+ - `%S` => `sample_name`
144
+ - `%F` => `source`
145
+ - `%W` => `wet_process`
146
+ - `%B` => `bi_process`
147
+ - `%D` => `cov_depth`
148
+ - `%A` => `barcode`
149
+ - `%R` => `reference_genome`
150
+
151
+ Custom tags can also be defined.
152
+
153
+ #### Example
154
+
155
+ Suppose the bundle defines the following analysis:
156
+
157
+ ```yaml
158
+ ---
159
+ - analyses:
160
+ - file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
161
+ data_path: "/data/"
162
+ sample_name: "HG002" # %S
163
+ source: "CNRGH" # %F
164
+ wet_process: "novaseqxplus-25b" # %W
165
+ bi_process: "dragen-4123" # %B
166
+ cov_depth: 30 # %D
167
+ reference_genome: "hg38" # %R
168
+ # ...
169
+ ```
170
+
171
+ Genelastic expands tags and automatically retrieves matching files:
172
+
173
+ ```text
174
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.cov
175
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf.gz
176
+ ```
177
+
178
+ ### Data files
179
+
180
+ A **data file** is any file belonging to an analysis that is imported into the
181
+ database. Each data file has a **type** and an **extension**.
182
+
183
+ Supported **raw** data file types:
184
+
185
+ - **VCF** (gzipped or not): type = `vcf`, ext = `vcf`,
186
+ - **Coverage** in TSV format: type = `cov`, ext = `cov`.
187
+
188
+ Supported **metrics** data file types:
189
+
190
+ - **QC**: type = `qc`, ext = `yml` / `yaml`,
191
+ - **SV** (gzipped or not): type = `sv`, ext = `json`,
192
+ - **Smallvar** (gzipped or not): type = `smallvar`, ext = `json`.
193
+
194
+ During import, Genelastic automatically handles gzipped files.
195
+ Each line of each data file is parsed and transformed into a document, enriched
196
+ with the analysis metadata, and indexed into the Elasticsearch index
197
+ corresponding to the file type.
198
+
199
+ ### Processes
200
+
201
+ An analysis can also reference:
202
+
203
+ - a **wet lab process** (sequencing metadata),
204
+ - and a **bioinformatics process** (pipeline metadata).
205
+
206
+ These processes provide contextual metadata about how the data was generated
207
+ (wet lab) and processed (bioinformatics). They enable filtering and comparison
208
+ of analyses based on production and pipeline characteristics.
209
+
210
+ ### Benefits
211
+
212
+ Using bundles brings several advantages:
213
+
214
+ - Metadata is defined once per analysis, not repeated for each file,
215
+ - Files are retrieved automatically using filename patterns built from file
216
+ prefixes, avoiding manual lists,
217
+ - Metadata and file contents are imported consistently into Elasticsearch,
218
+ - Analyses remain reproducible and traceable thanks to regex-based matching.
219
+
220
+ ## IV - Bundle file definition
221
+
222
+ ### Bundle
223
+
224
+ *Attributes:*
225
+
226
+ - `version` *(int)*: **required**, version of the bundle. Currently, Genelastic
227
+ only supports **version 3**, which is specified in this document,
228
+ - `analyses` *(List\[[Analysis](#analysis-1)\])*: **optional**,
229
+ list of analyses to import,
230
+ - `wet_processes` *(List\[[WetProcess](#wetprocess)\])*: **optional**,
231
+ wet lab process metadata,
232
+ - `bi_processes` *(List\[[BiProcess](#biprocess)\])*: **optional**,
233
+ bioinformatics process metadata,
234
+ - `tags` *([Tags](#tags))*: **optional**, custom tags used in the file prefix.
235
+
236
+ *Bundle example:*
237
+
238
+ ```yaml
239
+ ---
240
+ version: 3
241
+ analyses:
242
+ - # First analysis definition
243
+ - # Second analysis definition
244
+ - # etc...
245
+ wet_processes:
246
+ - # First wet lab process definition.
247
+ - # etc...
248
+ bi_processes:
249
+ - # First bioinformatics process definition.
250
+ - # etc...
251
+ tags:
252
+ # Tags definition.
253
+ ```
254
+
255
+ ### Analysis
256
+
257
+ *Attributes:*
258
+
259
+ - `file_prefix` *(str)*: **required**, file prefix to identify analysis files.
260
+ The file prefix describes the naming convention of the analysis data files
261
+ using tags. It represents the fixed part of the filename, thus must not
262
+ include regular expressions. To account for variable filename parts, use the
263
+ `suffix` attribute.
264
+ - `suffix` *(str)*: **optional**, suffix appended to the file prefix to match
265
+ data files with varying filename suffix (default: `""`),
266
+ - `data_path` *(str)*: **optional**, path to the directory where analysis
267
+ files are stored. If it is relative, the full path is resolved relative to
268
+ the bundle file location (default: `bundle file location`),
269
+ - `wet_process` *(str)*: **optional**, identifier of the wet lab process
270
+ used for the analysis,
271
+ - `bi_process` *(str)*: **optional**, identifier of the bioinformatics
272
+ process used for the analysis,
273
+ - `sample_name` *(str)*: **optional**, metadata field to define sample name,
274
+ - `source` *(str)*: **optional**, metadata field to define source,
275
+ - `barcode` *(str)*: **optional**, metadata field to define barcode,
276
+ - `reference_genome` *(str)*: **optional**, metadata field to define
277
+ reference genome,
278
+ - `flowcell` *(str)*: **optional**, metadata field to define flowcell,
279
+ - `lanes` *(List[int])*: **optional**, metadata field to define
280
+ lanes,
281
+ - `seq_indices` *(List[str])*: **optional**, metadata field to define
282
+ sequencing indices,
283
+ - `cov_depth` *(int)*: **optional**, metadata field to define
284
+ coverage depth,
285
+ - `qc_comment` *(str)*: **optional**, metadata field to define
286
+ quality control comment.
287
+
288
+ *Analysis example:*
289
+
290
+ ```yaml
291
+ ---
292
+ - analyses:
293
+ - file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
294
+ suffix: "_(?P<type>sv|smallvar|qc)"
295
+ data_path: "/data/"
296
+ sample_name: "HG002" # %S
297
+ source: "CNRGH" # %F
298
+ wet_process: "novaseqxplus-25b" # %W
299
+ bi_process: "dragen-4123" # %B
300
+ cov_depth: 30 # %D
301
+ reference_genome: "hg38" # %R
302
+ ```
303
+
304
+ ### WetProcess
305
+
306
+ *Attributes:*
307
+
308
+ - `proc_id` *(str)*: **required**, identifier of the wet lab process,
309
+ - `manufacturer` *(str)*: **required**, sequencer manufacturer,
310
+ - `sequencer` *(str)*: **required**, sequencer model,
311
+ - `generic_kit` *(str)*: **required**, generic kit name,
312
+ - `fragmentation` *(int)*: **required**, fragment size (bp),
313
+ - `reads_size` *(int)*: **required**, reads size,
314
+ - `input_type` *(str)*: **required**, input type,
315
+ - `amplification` *(str)*: **required**, amplification method,
316
+ - `flowcell_type` *(str)*: **required**, flowcell type,
317
+ - `sequencing_type` *(str)*: **required**, sequencing type,
318
+ - `desc` *(str)*: **optional**, description of the wet lab process,
319
+ - `library_kit` *(str)*: **optional**, library kit name,
320
+ - `sequencing_kit` *(str)*: **optional**, sequencing kit name,
321
+ - `error_rate_expected` *(float)*: **optional**, expected error rate.
322
+
323
+ *WetProcess Example:*
324
+
325
+ ```yaml
326
+ ---
327
+ wet_processes:
328
+ - proc_id: "novaseqxplus-25b"
329
+ manufacturer: "illumina"
330
+ sequencer: "novaseqxplus"
331
+ generic_kit: "truseq-illumina"
332
+ fragmentation: 350
333
+ reads_size: 300
334
+ input_type: "gdna"
335
+ amplification: "pcr-free"
336
+ flowcell_type: "25b"
337
+ sequencing_type: "wgs"
338
+ ```
339
+
340
+ ### BiProcess
341
+
342
+ *Attributes:*
343
+
344
+ - `proc_id` *(str)*: **required**, identifier of the bioinformatics process,
345
+ - `name` *(str)*: **required**, name of the bioinformatics process,
346
+ - `pipeline_version` *(str)*: **required**, version of the bioinformatics
347
+ pipeline,
348
+ - `sequencing_type` *(str)*: **required**, sequencing type,
349
+ - `steps` *(List\[[BiProcess.Step](#biprocessstep)\])*: **optional**,
350
+ list of steps in the bioinformatics pipeline,
351
+ - `desc` *(str)*: **optional**, description of the bioinformatics process.
352
+
353
+ #### BiProcess.Step
354
+
355
+ *Attributes:*
356
+
357
+ - `name` *(str)*: **required**, name of the step,
358
+ - `cmd` *(str)*: **required**, command used in the step,
359
+ - `version` *(str)*: **optional**, version of the command,
360
+ - `output` *(str)*: **optional**, output of the command.
361
+
362
+ *BiProcess example:*
363
+
364
+ ```yaml
365
+ ---
366
+ bi_processes:
367
+ - proc_id: "dragen-4123"
368
+ name: dragen
369
+ pipeline_version: "4.1.2.3"
370
+ steps:
371
+ - {name: basecalling, cmd: bclconvert, version: "3.9.3.2"}
372
+ - {name: trimming, cmd: dragen}
373
+ - {name: mapping, cmd: dragmap}
374
+ - {name: postmapping, cmd: dragen, version: "4.1.23"}
375
+ - {name: smallvarcalling, cmd: dragen, version: "4.1.23"}
376
+ - {name: svcalling, cmd: dragen, version: "4.1.23"}
377
+ - {name: secondary_qc, cmd: dragen, version: "4.1.23"}
378
+ sequencing_type: "wgs"
379
+ ```
380
+
381
+ ### Tags
382
+
383
+ *Attributes:*
384
+
385
+ - `delimiter` *([Tags.Delimiter](#tagsdelimiter))*: **optional**,
386
+ defines the special characters used to delimit tags within a file prefix.
387
+ Each tag is identified by a start and an optional end delimiter, surrounding
388
+ a tag name that maps to a metadata field.
389
+ - `match` *(Dict\[str, [Tags.Match](#tagsmatch)\])*: **optional**,
390
+ custom tags definition. Keys are the tag names, and values are the
391
+ corresponding tag definitions. A tag name must contain at least one
392
+ alphanumeric character: `a-z`, `A-Z` and `0-9`.
393
+
394
+ #### Tags.Delimiter
395
+
396
+ *Attributes:*
397
+
398
+ - `start` *(str)*: **optional**, character marking the **beginning of a tag**.
399
+ It must be one special character, excluding the following:
400
+ `(`, `)`, `?`, `<`, `>` (default: `%`),
401
+ - `end` *(str)*: **optional**, character marking the **end of a tag**.
402
+ It must be one special character, excluding the following:
403
+ `(`, `)`, `?`, `<`, `>` (default: `""`). If omitted or empty, the tag ends
404
+ immediately after the tag name, i.e. there is no explicit end delimiter.
405
+
406
+ #### Tags.Match
407
+
408
+ *Attributes:*
409
+
410
+ - `field` *(str)*: **required**, metadata field name associated with the tag,
411
+ - `regex` *(str)*: **required**, regular expression to match the expected
412
+ metadata value in the filename.
413
+
414
+ *Tags example:*
415
+
416
+ ```yaml
417
+ ---
418
+ tags:
419
+ delimiter:
420
+ start: "#"
421
+ end: "#"
422
+ match:
423
+ Z:
424
+ field: custom_field
425
+ regex: "[^_-]+"
426
+ ```
427
+
428
+ ## V - Scripts usage
429
+
430
+ **Genelastic** provides the following scripts:
431
+
432
+ - `gnl-data`: Create a bundle with randomly generated analyses, metadata,
433
+ processes, and data files for testing,
434
+ - `gnl-validate`: Standalone script that statically validates YAML bundles.
435
+ Useful locally or in CI pipelines to ensure bundles follow the
436
+ [expected schema](#iv---bundle-file-definition) before integrating them into
437
+ a repository,
438
+ - `gnl-import`: Import bundles to an Elasticsearch database,
439
+ - `gnl-info`: Query information about genetic data already imported,
440
+ - `gnl-integrity`: Check the integrity of previously imported data.
441
+
442
+ ### Import behavior
443
+
444
+ Among these, `gnl-import` is the central script. It starts by statically
445
+ validating YAML bundles, like `gnl-validate` does. Then, it parses the bundles' data
446
+ files, and imports the resulting documents (records built from the files and
447
+ metadata) into the Elasticsearch database.
448
+
449
+ #### Dry-run
450
+
451
+ `gnl-import` can run in "dry-run" mode, letting you check what would happen
452
+ without touching Elasticsearch.
453
+
454
+ - `-D`: parse the files, build the documents, but stop before import.
455
+ Useful to check that the right files are selected and can be read correctly.
456
+ - `-DD`: only list the files matching your file prefix, without parsing or
457
+ import. Useful to check that your `file_prefix` is correct.
458
+
459
+ #### Single-match vs multi-match
460
+
461
+ By default, `gnl-import` runs in **single-match** mode:
462
+
463
+ - Every tag used in the `file_prefix` must have its corresponding metadata
464
+ field explicitly defined.
465
+ - For example, if the file prefix contains `%S`, then the field `sample_name`
466
+ must be provided in the analysis. Otherwise, an error is raised.
467
+ - Only files that exactly match the declared metadata are imported.
468
+
469
+ This mode is best suited for environments where data files belonging to a
470
+ single analysis are grouped together in a dedicated directory.
471
+
472
+ With the option `--multi-match`, the behavior changes:
473
+
474
+ - Undefined metadata fields are tolerated,
475
+ - When a tag has no defined value, it is replaced by its corresponding
476
+ regular expression. For example, `%S` normally maps to `sample_name`.
477
+ If this metadata field is not defined, `%S` expands to its default regex
478
+ (`[^_]+`), which matches any non-underscore sequence. Users can override
479
+ these defaults in the bundle (via [`Tags.Match`](#tagsmatch)).
480
+ - All files matching the defined metadata and regex expansions are collected,
481
+ - Each **unique combination of values** found in the filenames results in a new
482
+ analysis with its own ID.
483
+
484
+ This mode is best suited for environments where multiple analyses share a
485
+ single directory of data files. It is more flexible than single-match, but
486
+ also carries the risk of importing more files than expected if regex patterns
487
+ are too permissive. **Carefully check matches in dry-run mode
488
+ (`-D` / `-DD`) before importing.**
489
+
490
+ **Example:**
491
+
492
+ ```yaml
493
+ ---
494
+ - analyses:
495
+ - file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
496
+ data_path: "/data/"
497
+ # sample_name (%S) is omitted
498
+ source: "CNRGH" # %F
499
+ wet_process: "novaseqxplus-25b" # %W
500
+ bi_process: "dragen-4123" # %B
501
+ cov_depth: 30 # %D
502
+ reference_genome: "hg38" # %R
503
+ ```
504
+
505
+ In multi-match mode, the `%S` tag expands to its regex (`[^_]+`), so all
506
+ sample names are accepted. The following files are matched:
507
+
508
+ ```text
509
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf
510
+ - HG003_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf
511
+ - HG004_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf
512
+ ```
513
+
514
+ Three analyses are automatically created, each with its own `sample_name`
515
+ derived from the filename:
516
+
517
+ - `HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1`,
518
+ - `HG003_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1`,
519
+ - `HG004_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1`
520
+
521
+ #### Metrics
522
+
523
+ In addition to raw data files (`.vcf`, `.cov`), Genelastic also supports
524
+ metrics data files.
525
+
526
+ A **metrics data file** is identified by the `.metrics` suffix, which appears
527
+ **before the file extension**. This suffix can optionally include metadata
528
+ about the tools used to generate the metrics and their versions.
529
+
530
+ The format for this metadata is:
531
+
532
+ ```text
533
+ .metrics_<TOOL>-<VERSION>[_<TOOL>-<VERSION>...]
534
+ ```
535
+
536
+ - Multiple tools can be listed, separated by underscores (`_`),
537
+ - Each version must contain at least one number and is separated from the tool
538
+ name by a hyphen (`-`).
539
+
540
+ For raw files, their `type` is the same as their `extension`.
541
+
542
+ For metrics files, however, multiple metrics types may share the same extension
543
+ (e.g. `.json`). To distinguish them, the `suffix` attribute in the analysis
544
+ definition must specify a regex that extracts the `type` from the filename.
545
+ If no `suffix` is defined, Genelastic will raise an error.
546
+
547
+ **Example:**
548
+
549
+ ```yaml
550
+ ---
551
+ - analyses:
552
+ - file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
553
+ suffix: "_(?P<type>sv|smallvar|qc)"
554
+ data_path: "/data/"
555
+ sample_name: "HG002" # %S
556
+ source: "CNRGH" # %F
557
+ wet_process: "novaseqxplus-25b" # %W
558
+ bi_process: "dragen-4123" # %B
559
+ cov_depth: 30 # %D
560
+ reference_genome: "hg38" # %R
561
+ ```
562
+
563
+ This will match files such as:
564
+
565
+ ```text
566
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_smallvar.vcf
567
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_smallvar.cov
568
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_smallvar.metrics_happy-2-0-0_giab-3-0-0.json
569
+ - HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_qc.metrics.json
570
+ ```
571
+
572
+ In this example, the analysis includes both raw data files and metrics files.
573
+ The value of the `metrics` field in the imported document depends on the file
574
+ type:
575
+
576
+ - **Non-metrics files** (e.g. VCF, Coverage):
577
+
578
+ ```json
579
+ {
580
+ "metrics": null
581
+ }
582
+ ```
583
+
584
+ - **Metrics files without tool metadata** (e.g. QC):
585
+
586
+ ```json
587
+ {
588
+ "metrics": []
589
+ }
590
+ ```
591
+
592
+ - **Metrics files with tool metadata** (e.g. Smallvar):
593
+
594
+ ```json
595
+ {
596
+ "metrics": [
597
+ {
598
+ "tool": "happy",
599
+ "version": "2.0.0"
600
+ },
601
+ {
602
+ "tool": "giab",
603
+ "version": "3.0.0"
604
+ }
605
+ ]
606
+ }
607
+ ```
608
+
609
+ ## VI - Servers usage
610
+
611
+ Genelastic includes two servers: an **API server** and a **UI server**.
612
+ The UI does not communicate directly with Elasticsearch: it always goes
613
+ through the API server, which acts as a gateway and provides HTTP endpoints.
614
+
615
+ ### API server
616
+
617
+ The API server needs to know how to connect to Elasticsearch. Configure it
618
+ through the following environment variables:
619
+
620
+ - `GENAPI_ES_URL`: URL of the Elasticsearch server,
621
+ - `GENAPI_ES_ENCODED_API_KEY`: Encoded API key,
622
+ - `GENAPI_ES_INDEX_PREFIX`: Prefix to identify indices of interest,
623
+ - `GENAPI_ES_CERT_FP`: Certificate fingerprint of the Elasticsearch server.
624
+
625
+ Start the API server in development mode:
626
+
627
+ ```bash
628
+ gnl-start-api dev
629
+ ```
630
+
631
+ ### UI server
632
+
633
+ The UI server only needs the address of the API server:
634
+
635
+ - `GENUI_API_URL`: URL of the API server.
636
+
637
+ Start the UI server in development mode:
638
+
639
+ ```bash
640
+ gnl-start-ui dev
641
+ ```
642
+
643
+ ### Development vs production
644
+
645
+ Both `gnl-start-api` and `gnl-start-ui` support two modes:
646
+
647
+ - dev: development mode (hot reload, debug logs, not optimized),
648
+ - prod: production mode (optimized build, suitable for deployment).
649
+
650
+ ## VII - For developers
651
+
652
+ ### Prerequisites
653
+
654
+ - `python` >= 3.11
655
+ - `uv` >= 0.9
656
+ - `make`
657
+
658
+ ### Installation
659
+
660
+ To install development dependencies, run the following commands:
661
+
662
+ ```bash
663
+ python -m venv .venv
664
+ source .venv/bin/activate
665
+ make
666
+ ```
667
+
668
+ ### Pre-commit hooks setup
669
+
670
+ This project uses [pre-commit](https://pre-commit.com/) to manage Git hook
671
+ scripts. To install project hooks, run:
672
+
673
+ ```bash
674
+ pre-commit install
675
+ ```
676
+
677
+ After that, each commit will succeed only if all hooks (defined in
678
+ `.pre-commit-config.yaml`) pass.
679
+
680
+ If necessary (though not recommended),
681
+ you can skip these hooks by using the `--no-verify` / `-n` option when
682
+ committing:
683
+
684
+ ```bash
685
+ git commit -m "My commit message" --no-verify # This commit will not run installed hooks.
686
+ ```