genelastic 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genelastic/api/.env +4 -0
- genelastic/api/cli_start_api.py +18 -0
- genelastic/api/errors.py +52 -0
- genelastic/api/extends/example.py +0 -6
- genelastic/api/extends/example.yml +0 -0
- genelastic/api/routes.py +313 -181
- genelastic/api/server.py +34 -26
- genelastic/api/settings.py +5 -9
- genelastic/api/specification.yml +512 -0
- genelastic/common/__init__.py +0 -39
- genelastic/common/cli.py +100 -0
- genelastic/common/elastic.py +374 -46
- genelastic/common/exceptions.py +34 -2
- genelastic/common/server.py +59 -0
- genelastic/common/types.py +1 -14
- genelastic/import_data/__init__.py +0 -27
- genelastic/import_data/checker.py +99 -0
- genelastic/import_data/checker_observer.py +13 -0
- genelastic/import_data/cli/__init__.py +0 -0
- genelastic/import_data/cli/cli_check.py +136 -0
- genelastic/import_data/cli/gen_data.py +143 -0
- genelastic/import_data/cli/import_data.py +346 -0
- genelastic/import_data/cli/info.py +247 -0
- genelastic/import_data/{cli_integrity.py → cli/integrity.py} +29 -7
- genelastic/import_data/cli/validate.py +146 -0
- genelastic/import_data/collect.py +185 -0
- genelastic/import_data/constants.py +136 -11
- genelastic/import_data/import_bundle.py +102 -59
- genelastic/import_data/import_bundle_factory.py +70 -149
- genelastic/import_data/importers/__init__.py +0 -0
- genelastic/import_data/importers/importer_base.py +131 -0
- genelastic/import_data/importers/importer_factory.py +85 -0
- genelastic/import_data/importers/importer_types.py +223 -0
- genelastic/import_data/logger.py +2 -1
- genelastic/import_data/models/__init__.py +0 -0
- genelastic/import_data/models/analyses.py +178 -0
- genelastic/import_data/models/analysis.py +144 -0
- genelastic/import_data/models/data_file.py +110 -0
- genelastic/import_data/models/process.py +45 -0
- genelastic/import_data/models/processes.py +84 -0
- genelastic/import_data/models/tags.py +170 -0
- genelastic/import_data/models/unique_list.py +109 -0
- genelastic/import_data/models/validate.py +26 -0
- genelastic/import_data/patterns.py +90 -0
- genelastic/import_data/random_bundle.py +79 -54
- genelastic/import_data/resolve.py +157 -0
- genelastic/ui/.env +1 -0
- genelastic/ui/cli_start_ui.py +20 -0
- genelastic/ui/routes.py +333 -0
- genelastic/ui/server.py +9 -82
- genelastic/ui/settings.py +2 -6
- genelastic/ui/static/cea-cnrgh.ico +0 -0
- genelastic/ui/static/cea.ico +0 -0
- genelastic/ui/static/layout.ico +0 -0
- genelastic/ui/static/novaseq6000.png +0 -0
- genelastic/ui/static/style.css +430 -0
- genelastic/ui/static/ui.js +458 -0
- genelastic/ui/templates/analyses.html +98 -0
- genelastic/ui/templates/analysis_detail.html +44 -0
- genelastic/ui/templates/bi_process_detail.html +129 -0
- genelastic/ui/templates/bi_processes.html +116 -0
- genelastic/ui/templates/explorer.html +356 -0
- genelastic/ui/templates/home.html +207 -0
- genelastic/ui/templates/layout.html +153 -0
- genelastic/ui/templates/version.html +21 -0
- genelastic/ui/templates/wet_process_detail.html +131 -0
- genelastic/ui/templates/wet_processes.html +116 -0
- genelastic-0.9.0.dist-info/METADATA +686 -0
- genelastic-0.9.0.dist-info/RECORD +76 -0
- genelastic-0.9.0.dist-info/WHEEL +4 -0
- genelastic-0.9.0.dist-info/entry_points.txt +10 -0
- genelastic-0.9.0.dist-info/licenses/LICENSE +519 -0
- genelastic/import_data/analyses.py +0 -69
- genelastic/import_data/analysis.py +0 -205
- genelastic/import_data/bi_process.py +0 -27
- genelastic/import_data/bi_processes.py +0 -49
- genelastic/import_data/cli_gen_data.py +0 -116
- genelastic/import_data/cli_import.py +0 -379
- genelastic/import_data/cli_info.py +0 -256
- genelastic/import_data/cli_validate.py +0 -54
- genelastic/import_data/data_file.py +0 -87
- genelastic/import_data/filename_pattern.py +0 -57
- genelastic/import_data/tags.py +0 -123
- genelastic/import_data/wet_process.py +0 -28
- genelastic/import_data/wet_processes.py +0 -53
- genelastic-0.7.0.dist-info/METADATA +0 -105
- genelastic-0.7.0.dist-info/RECORD +0 -40
- genelastic-0.7.0.dist-info/WHEEL +0 -5
- genelastic-0.7.0.dist-info/entry_points.txt +0 -6
- genelastic-0.7.0.dist-info/top_level.txt +0 -1
|
@@ -0,0 +1,686 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genelastic
|
|
3
|
+
Version: 0.9.0
|
|
4
|
+
Summary: Generate and store genetic data into an Elasticsearch database.
|
|
5
|
+
Keywords: CNRGH,genelastic,generation,storage,elasticsearch,database
|
|
6
|
+
Author: CNRGH, Pierrick ROGER, Maxime BLANCHON
|
|
7
|
+
Author-email: Pierrick ROGER <pierrick.roger@cnrgh.fr>, Maxime BLANCHON <maxime.blanchon@cnrgh.fr>
|
|
8
|
+
License-Expression: CECILL-2.1
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Requires-Dist: elasticsearch>=8.18.1,<8.19.0
|
|
14
|
+
Requires-Dist: pyvcf3>=1.0.3,<2.0.0
|
|
15
|
+
Requires-Dist: schema>=0.7.7,<0.8.0
|
|
16
|
+
Requires-Dist: pyyaml>=6.0.2,<7.0.0
|
|
17
|
+
Requires-Dist: biophony>=1.3.0,<1.4.0
|
|
18
|
+
Requires-Dist: colorlog>=6.9.0,<7.0.0
|
|
19
|
+
Requires-Dist: tqdm>=4.67.1,<5.0.0
|
|
20
|
+
Requires-Dist: flask>=3.1.0,<4.0.0 ; extra == 'api'
|
|
21
|
+
Requires-Dist: environs>=14.1.1,<15.0.0 ; extra == 'api'
|
|
22
|
+
Requires-Dist: connexion[flask,swagger-ui,uvicorn]>=3.2.0,<4.0.0 ; extra == 'api'
|
|
23
|
+
Requires-Dist: gunicorn>=23.0.0,<24.0.0 ; extra == 'api'
|
|
24
|
+
Requires-Dist: flask>=3.1.0,<4.0.0 ; extra == 'ui'
|
|
25
|
+
Requires-Dist: requests>=2.32.3,<3.0.0 ; extra == 'ui'
|
|
26
|
+
Requires-Dist: environs>=14.1.1,<15.0.0 ; extra == 'ui'
|
|
27
|
+
Requires-Dist: uvicorn>=0.34.0,<0.35.0 ; extra == 'ui'
|
|
28
|
+
Requires-Dist: asgiref>=3.8.1,<4.0.0 ; extra == 'ui'
|
|
29
|
+
Requires-Dist: gunicorn>=23.0.0,<24.0.0 ; extra == 'ui'
|
|
30
|
+
Requires-Python: >=3.11
|
|
31
|
+
Provides-Extra: api
|
|
32
|
+
Provides-Extra: ui
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# genelastic
|
|
36
|
+
|
|
37
|
+
**Genelastic** is a set of tools for comparing genetic technologies.
|
|
38
|
+
|
|
39
|
+
It includes a set of scripts to import and manage genetics data in an
|
|
40
|
+
Elasticsearch database, as well as a REST API serving a dedicated user
|
|
41
|
+
interface to query, visualize and compare imported data.
|
|
42
|
+
|
|
43
|
+
## Table of contents
|
|
44
|
+
|
|
45
|
+
- [I - Prerequisites](#i---prerequisites)
|
|
46
|
+
- [II - Installation](#ii---installation)
|
|
47
|
+
- [III - Core concepts](#iii---core-concepts)
|
|
48
|
+
- [IV - Bundle file definition](#iv---bundle-file-definition)
|
|
49
|
+
- [V - Scripts usage](#v---scripts-usage)
|
|
50
|
+
- [VI - Servers usage](#vi---servers-usage)
|
|
51
|
+
- [VII - For developers](#vii---for-developers)
|
|
52
|
+
|
|
53
|
+
## I - Prerequisites
|
|
54
|
+
|
|
55
|
+
- `python` >= 3.11
|
|
56
|
+
|
|
57
|
+
## II - Installation
|
|
58
|
+
|
|
59
|
+
- With **pipx** (recommended):
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pipx install genelastic
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
`pipx` installs `genelastic` in its own isolated
|
|
66
|
+
virtual environment and makes all scripts (`gnl-*`) available globally.
|
|
67
|
+
|
|
68
|
+
- Or with **pip**:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
python -m venv .venv
|
|
72
|
+
source .venv/bin/activate # Activate the virtual environment
|
|
73
|
+
pip install genelastic
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
It is recommended to install `genelastic` inside its own virtual
|
|
77
|
+
environment to avoid conflicts with globally installed Python packages.
|
|
78
|
+
To run the scripts, you need to activate the environment first.
|
|
79
|
+
|
|
80
|
+
Test the installation by running one of the genelastic scripts:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
gnl-import -h # Print the help message and exit.
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## III - Core concepts
|
|
87
|
+
|
|
88
|
+
This section introduces the main ideas behind **Genelastic**. It explains how
|
|
89
|
+
a bundle (YAML manifest) describes analyses, how files are located with
|
|
90
|
+
file prefixes and tags, which data file types are supported, and how they are
|
|
91
|
+
imported into Elasticsearch.
|
|
92
|
+
|
|
93
|
+
### Bundle
|
|
94
|
+
|
|
95
|
+
A **bundle** is a YAML manifest that describes one or several analyses.\
|
|
96
|
+
It defines:
|
|
97
|
+
|
|
98
|
+
- metadata for each analysis,
|
|
99
|
+
- the path where the related files are stored,
|
|
100
|
+
- the naming rules (file prefix and tags) to automatically locate those files.
|
|
101
|
+
|
|
102
|
+
The bundle itself is **not** imported into Elasticsearch.
|
|
103
|
+
Instead, `gnl-import` uses it to build filename patterns (regular expressions
|
|
104
|
+
with named groups), retrieve matching files, and import both the files'
|
|
105
|
+
contents and the associated metadata into the database. Because the regex is
|
|
106
|
+
built from tags, the metadata can always be re-extracted from the filenames if
|
|
107
|
+
needed.
|
|
108
|
+
|
|
109
|
+
### Analysis
|
|
110
|
+
|
|
111
|
+
An **analysis** is the central unit described in a bundle.\
|
|
112
|
+
It combines:
|
|
113
|
+
|
|
114
|
+
- a set of metadata fields (sample, source, reference genome, etc.),
|
|
115
|
+
- one wet lab and one bioinformatics process,
|
|
116
|
+
- a `data_path` where files are located,
|
|
117
|
+
- and a `file_prefix` that defines the naming pattern of all files belonging
|
|
118
|
+
to the analysis.
|
|
119
|
+
|
|
120
|
+
The analysis acts as a template: it tells Genelastic how to find the right
|
|
121
|
+
files and how to attach them to the correct metadata.
|
|
122
|
+
|
|
123
|
+
### File prefix and tags
|
|
124
|
+
|
|
125
|
+
A **file prefix** is a naming template made of tags, each tag representing a
|
|
126
|
+
metadata field. When processing a bundle, Genelastic replaces tags with their
|
|
127
|
+
values to build a filename pattern, a regular expression used to automatically
|
|
128
|
+
retrieve files in `data_path`.
|
|
129
|
+
|
|
130
|
+
By default, tags use `%` as a start delimiter and `""` (empty string) as an end
|
|
131
|
+
delimiter. Both delimiters can be overridden if needed.
|
|
132
|
+
|
|
133
|
+
For example, the following tags are all valid:
|
|
134
|
+
|
|
135
|
+
- `%S` uses default start delimiter (`%`) and default end delimiter (`""`),
|
|
136
|
+
- `%S%` uses default start delimiter (`%`) and custom end delimiter (`%`),
|
|
137
|
+
- `$S$` uses custom start delimiter (`$`) and custom end delimiter (`$`).
|
|
138
|
+
|
|
139
|
+
However, not all characters are allowed as delimiters (see [Tags](#tags)).
|
|
140
|
+
|
|
141
|
+
**Default tags** provided by Genelastic:
|
|
142
|
+
|
|
143
|
+
- `%S` => `sample_name`
|
|
144
|
+
- `%F` => `source`
|
|
145
|
+
- `%W` => `wet_process`
|
|
146
|
+
- `%B` => `bi_process`
|
|
147
|
+
- `%D` => `cov_depth`
|
|
148
|
+
- `%A` => `barcode`
|
|
149
|
+
- `%R` => `reference_genome`
|
|
150
|
+
|
|
151
|
+
Custom tags can also be defined.
|
|
152
|
+
|
|
153
|
+
#### Example
|
|
154
|
+
|
|
155
|
+
Suppose the bundle defines the following analysis:
|
|
156
|
+
|
|
157
|
+
```yaml
|
|
158
|
+
---
|
|
159
|
+
- analyses:
|
|
160
|
+
- file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
|
|
161
|
+
data_path: "/data/"
|
|
162
|
+
sample_name: "HG002" # %S
|
|
163
|
+
source: "CNRGH" # %F
|
|
164
|
+
wet_process: "novaseqxplus-25b" # %W
|
|
165
|
+
bi_process: "dragen-4123" # %B
|
|
166
|
+
cov_depth: 30 # %D
|
|
167
|
+
reference_genome: "hg38" # %R
|
|
168
|
+
# ...
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Genelastic expands tags and automatically retrieves matching files:
|
|
172
|
+
|
|
173
|
+
```text
|
|
174
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.cov
|
|
175
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf.gz
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Data files
|
|
179
|
+
|
|
180
|
+
A **data file** is any file belonging to an analysis that is imported into the
|
|
181
|
+
database. Each data file has a **type** and an **extension**.
|
|
182
|
+
|
|
183
|
+
Supported **raw** data file types:
|
|
184
|
+
|
|
185
|
+
- **VCF** (gzipped or not): type = `vcf`, ext = `vcf`,
|
|
186
|
+
- **Coverage** in TSV format: type = `cov`, ext = `cov`.
|
|
187
|
+
|
|
188
|
+
Supported **metrics** data file types:
|
|
189
|
+
|
|
190
|
+
- **QC**: type = `qc`, ext = `yml` / `yaml`,
|
|
191
|
+
- **SV** (gzipped or not): type = `sv`, ext = `json`,
|
|
192
|
+
- **Smallvar** (gzipped or not): type = `smallvar`, ext = `json`.
|
|
193
|
+
|
|
194
|
+
During import, Genelastic automatically handles gzipped files.
|
|
195
|
+
Each line of each data file is parsed and transformed into a document, enriched
|
|
196
|
+
with the analysis metadata, and indexed into the Elasticsearch index
|
|
197
|
+
corresponding to the file type.
|
|
198
|
+
|
|
199
|
+
### Processes
|
|
200
|
+
|
|
201
|
+
An analysis can also reference:
|
|
202
|
+
|
|
203
|
+
- a **wet lab process** (sequencing metadata),
|
|
204
|
+
- and a **bioinformatics process** (pipeline metadata).
|
|
205
|
+
|
|
206
|
+
These processes provide contextual metadata about how the data was generated
|
|
207
|
+
(wet lab) and processed (bioinformatics). They enable filtering and comparison
|
|
208
|
+
of analyses based on production and pipeline characteristics.
|
|
209
|
+
|
|
210
|
+
### Benefits
|
|
211
|
+
|
|
212
|
+
Using bundles brings several advantages:
|
|
213
|
+
|
|
214
|
+
- Metadata is defined once per analysis, not repeated for each file,
|
|
215
|
+
- Files are retrieved automatically using filename patterns built from file
|
|
216
|
+
prefixes, avoiding manual lists,
|
|
217
|
+
- Metadata and file contents are imported consistently into Elasticsearch,
|
|
218
|
+
- Analyses remain reproducible and traceable thanks to regex-based matching.
|
|
219
|
+
|
|
220
|
+
## IV - Bundle file definition
|
|
221
|
+
|
|
222
|
+
### Bundle
|
|
223
|
+
|
|
224
|
+
*Attributes:*
|
|
225
|
+
|
|
226
|
+
- `version` *(int)*: **required**, version of the bundle. Currently, Genelastic
|
|
227
|
+
only supports **version 3**, which is specified in this document,
|
|
228
|
+
- `analyses` *(List\[[Analysis](#analysis-1)\])*: **optional**,
|
|
229
|
+
list of analyses to import,
|
|
230
|
+
- `wet_processes` *(List\[[WetProcess](#wetprocess)\])*: **optional**,
|
|
231
|
+
wet lab process metadata,
|
|
232
|
+
- `bi_processes` *(List\[[BiProcess](#biprocess)\])*: **optional**,
|
|
233
|
+
bioinformatics process metadata,
|
|
234
|
+
- `tags` *([Tags](#tags))*: **optional**, custom tags used in the file prefix.
|
|
235
|
+
|
|
236
|
+
*Bundle example:*
|
|
237
|
+
|
|
238
|
+
```yaml
|
|
239
|
+
---
|
|
240
|
+
version: 3
|
|
241
|
+
analyses:
|
|
242
|
+
- # First analysis definition
|
|
243
|
+
- # Second analysis definition
|
|
244
|
+
- # etc...
|
|
245
|
+
wet_processes:
|
|
246
|
+
- # First wet lab process definition.
|
|
247
|
+
- # etc...
|
|
248
|
+
bi_processes:
|
|
249
|
+
- # First bioinformatics process definition.
|
|
250
|
+
- # etc...
|
|
251
|
+
tags:
|
|
252
|
+
# Tags definition.
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
### Analysis
|
|
256
|
+
|
|
257
|
+
*Attributes:*
|
|
258
|
+
|
|
259
|
+
- `file_prefix` *(str)*: **required**, file prefix to identify analysis files.
|
|
260
|
+
The file prefix describes the naming convention of the analysis data files
|
|
261
|
+
using tags. It represents the fixed part of the filename, thus must not
|
|
262
|
+
include regular expressions. To account for variable filename parts, use the
|
|
263
|
+
`suffix` attribute.
|
|
264
|
+
- `suffix` *(str)*: **optional**, suffix appended to the file prefix to match
|
|
265
|
+
data files with varying filename suffix (default: `""`),
|
|
266
|
+
- `data_path` *(str)*: **optional**, path to the directory where analysis
|
|
267
|
+
files are stored. If it is relative, the full path is resolved relative to
|
|
268
|
+
the bundle file location (default: `bundle file location`),
|
|
269
|
+
- `wet_process` *(str)*: **optional**, identifier of the wet lab process
|
|
270
|
+
used for the analysis,
|
|
271
|
+
- `bi_process` *(str)*: **optional**, identifier of the bioinformatics
|
|
272
|
+
process used for the analysis,
|
|
273
|
+
- `sample_name` *(str)*: **optional**, metadata field to define sample name,
|
|
274
|
+
- `source` *(str)*: **optional**, metadata field to define source,
|
|
275
|
+
- `barcode` *(str)*: **optional**, metadata field to define barcode,
|
|
276
|
+
- `reference_genome` *(str)*: **optional**, metadata field to define
|
|
277
|
+
reference genome,
|
|
278
|
+
- `flowcell` *(str)*: **optional**, metadata field to define flowcell,
|
|
279
|
+
- `lanes` *(List[int])*: **optional**, metadata field to define
|
|
280
|
+
lanes,
|
|
281
|
+
- `seq_indices` *(List[str])*: **optional**, metadata field to define
|
|
282
|
+
sequencing indices,
|
|
283
|
+
- `cov_depth` *(int)*: **optional**, metadata field to define
|
|
284
|
+
coverage depth,
|
|
285
|
+
- `qc_comment` *(str)*: **optional**, metadata field to define
|
|
286
|
+
quality control comment.
|
|
287
|
+
|
|
288
|
+
*Analysis example:*
|
|
289
|
+
|
|
290
|
+
```yaml
|
|
291
|
+
---
|
|
292
|
+
- analyses:
|
|
293
|
+
- file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
|
|
294
|
+
suffix: "_(?P<type>sv|smallvar|qc)"
|
|
295
|
+
data_path: "/data/"
|
|
296
|
+
sample_name: "HG002" # %S
|
|
297
|
+
source: "CNRGH" # %F
|
|
298
|
+
wet_process: "novaseqxplus-25b" # %W
|
|
299
|
+
bi_process: "dragen-4123" # %B
|
|
300
|
+
cov_depth: 30 # %D
|
|
301
|
+
reference_genome: "hg38" # %R
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### WetProcess
|
|
305
|
+
|
|
306
|
+
*Attributes:*
|
|
307
|
+
|
|
308
|
+
- `proc_id` *(str)*: **required**, identifier of the wet lab process,
|
|
309
|
+
- `manufacturer` *(str)*: **required**, sequencer manufacturer,
|
|
310
|
+
- `sequencer` *(str)*: **required**, sequencer model,
|
|
311
|
+
- `generic_kit` *(str)*: **required**, generic kit name,
|
|
312
|
+
- `fragmentation` *(int)*: **required**, fragment size (bp),
|
|
313
|
+
- `reads_size` *(int)*: **required**, reads size,
|
|
314
|
+
- `input_type` *(str)*: **required**, input type,
|
|
315
|
+
- `amplification` *(str)*: **required**, amplification method,
|
|
316
|
+
- `flowcell_type` *(str)*: **required**, flowcell type,
|
|
317
|
+
- `sequencing_type` *(str)*: **required**, sequencing type,
|
|
318
|
+
- `desc` *(str)*: **optional**, description of the wet lab process,
|
|
319
|
+
- `library_kit` *(str)*: **optional**, library kit name,
|
|
320
|
+
- `sequencing_kit` *(str)*: **optional**, sequencing kit name,
|
|
321
|
+
- `error_rate_expected` *(float)*: **optional**, expected error rate.
|
|
322
|
+
|
|
323
|
+
*WetProcess Example:*
|
|
324
|
+
|
|
325
|
+
```yaml
|
|
326
|
+
---
|
|
327
|
+
wet_processes:
|
|
328
|
+
- proc_id: "novaseqxplus-25b"
|
|
329
|
+
manufacturer: "illumina"
|
|
330
|
+
sequencer: "novaseqxplus"
|
|
331
|
+
generic_kit: "truseq-illumina"
|
|
332
|
+
fragmentation: 350
|
|
333
|
+
reads_size: 300
|
|
334
|
+
input_type: "gdna"
|
|
335
|
+
amplification: "pcr-free"
|
|
336
|
+
flowcell_type: "25b"
|
|
337
|
+
sequencing_type: "wgs"
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
### BiProcess
|
|
341
|
+
|
|
342
|
+
*Attributes:*
|
|
343
|
+
|
|
344
|
+
- `proc_id` *(str)*: **required**, identifier of the bioinformatics process,
|
|
345
|
+
- `name` *(str)*: **required**, name of the bioinformatics process,
|
|
346
|
+
- `pipeline_version` *(str)*: **required**, version of the bioinformatics
|
|
347
|
+
pipeline,
|
|
348
|
+
- `sequencing_type` *(str)*: **required**, sequencing type,
|
|
349
|
+
- `steps` *(List\[[BiProcess.Step](#biprocessstep)\])*: **optional**,
|
|
350
|
+
list of steps in the bioinformatics pipeline,
|
|
351
|
+
- `desc` *(str)*: **optional**, description of the bioinformatics process.
|
|
352
|
+
|
|
353
|
+
#### BiProcess.Step
|
|
354
|
+
|
|
355
|
+
*Attributes:*
|
|
356
|
+
|
|
357
|
+
- `name` *(str)*: **required**, name of the step,
|
|
358
|
+
- `cmd` *(str)*: **required**, command used in the step,
|
|
359
|
+
- `version` *(str)*: **optional**, version of the command,
|
|
360
|
+
- `output` *(str)*: **optional**, output of the command.
|
|
361
|
+
|
|
362
|
+
*BiProcess example:*
|
|
363
|
+
|
|
364
|
+
```yaml
|
|
365
|
+
---
|
|
366
|
+
bi_processes:
|
|
367
|
+
- proc_id: "dragen-4123"
|
|
368
|
+
name: dragen
|
|
369
|
+
pipeline_version: "4.1.2.3"
|
|
370
|
+
steps:
|
|
371
|
+
- {name: basecalling, cmd: bclconvert, version: "3.9.3.2"}
|
|
372
|
+
- {name: trimming, cmd: dragen}
|
|
373
|
+
- {name: mapping, cmd: dragmap}
|
|
374
|
+
- {name: postmapping, cmd: dragen, version: "4.1.23"}
|
|
375
|
+
- {name: smallvarcalling, cmd: dragen, version: "4.1.23"}
|
|
376
|
+
- {name: svcalling, cmd: dragen, version: "4.1.23"}
|
|
377
|
+
- {name: secondary_qc, cmd: dragen, version: "4.1.23"}
|
|
378
|
+
sequencing_type: "wgs"
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
### Tags
|
|
382
|
+
|
|
383
|
+
*Attributes:*
|
|
384
|
+
|
|
385
|
+
- `delimiter` *([Tags.Delimiter](#tagsdelimiter))*: **optional**,
|
|
386
|
+
defines the special characters used to delimit tags within a file prefix.
|
|
387
|
+
Each tag is identified by a start and an optional end delimiter, surrounding
|
|
388
|
+
a tag name that maps to a metadata field.
|
|
389
|
+
- `match` *(Dict\[str, [Tags.Match](#tagsmatch)\])*: **optional**,
|
|
390
|
+
custom tags definition. Keys are the tag names, and values are the
|
|
391
|
+
corresponding tag definitions. A tag name must contain at least one
|
|
392
|
+
alphanumeric character: `a-z`, `A-Z` and `0-9`.
|
|
393
|
+
|
|
394
|
+
#### Tags.Delimiter
|
|
395
|
+
|
|
396
|
+
*Attributes:*
|
|
397
|
+
|
|
398
|
+
- `start` *(str)*: **optional**, character marking the **beginning of a tag**.
|
|
399
|
+
It must be one special character, excluding the following:
|
|
400
|
+
`(`, `)`, `?`, `<`, `>` (default: `%`),
|
|
401
|
+
- `end` *(str)*: **optional**, character marking the **end of a tag**.
|
|
402
|
+
It must be one special character, excluding the following:
|
|
403
|
+
`(`, `)`, `?`, `<`, `>` (default: `""`). If omitted or empty, the tag ends
|
|
404
|
+
immediately after the tag name (no explicit end delimiter).
|
|
405
|
+
|
|
406
|
+
#### Tags.Match
|
|
407
|
+
|
|
408
|
+
*Attributes:*
|
|
409
|
+
|
|
410
|
+
- `field` *(str)*: **required**, metadata field name associated with the tag,
|
|
411
|
+
- `regex` *(str)*: **required**, regular expression to match the expected
|
|
412
|
+
metadata value in the filename.
|
|
413
|
+
|
|
414
|
+
*Tags example:*
|
|
415
|
+
|
|
416
|
+
```yaml
|
|
417
|
+
---
|
|
418
|
+
tags:
|
|
419
|
+
delimiter:
|
|
420
|
+
start: "#"
|
|
421
|
+
end: "#"
|
|
422
|
+
match:
|
|
423
|
+
Z:
|
|
424
|
+
field: custom_field
|
|
425
|
+
regex: "[^_-]+"
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
## V - Scripts usage
|
|
429
|
+
|
|
430
|
+
**Genelastic** provides the following scripts:
|
|
431
|
+
|
|
432
|
+
- `gnl-data`: Create a bundle with randomly generated analyses, metadata,
|
|
433
|
+
processes, and data files for testing,
|
|
434
|
+
- `gnl-validate`: Standalone script that statically validates YAML bundles.
|
|
435
|
+
Useful locally or in CI pipelines to ensure bundles follow the
|
|
436
|
+
[expected schema](#iv---bundle-file-definition) before integrating them into
|
|
437
|
+
a repository,
|
|
438
|
+
- `gnl-import`: Import bundles to an Elasticsearch database,
|
|
439
|
+
- `gnl-info`: Query information about genetic data already imported,
|
|
440
|
+
- `gnl-integrity`: Check the integrity of previously imported data.
|
|
441
|
+
|
|
442
|
+
### Import behavior
|
|
443
|
+
|
|
444
|
+
Among these, `gnl-import` is the central script. It starts by statically
|
|
445
|
+
validating YAML bundles, like `gnl-validate` does. Then, it parses the bundles' data
|
|
446
|
+
files, and imports the resulting documents (records built from the files and
|
|
447
|
+
metadata) into the Elasticsearch database.
|
|
448
|
+
|
|
449
|
+
#### Dry-run
|
|
450
|
+
|
|
451
|
+
`gnl-import` can run in "dry-run" mode, letting you check what would happen
|
|
452
|
+
without touching Elasticsearch.
|
|
453
|
+
|
|
454
|
+
- `-D`: parse the files, build the documents, but stop before import.
|
|
455
|
+
Useful to check that the right files are selected and can be read correctly.
|
|
456
|
+
- `-DD`: only list the files matching your file prefix, without parsing or
|
|
457
|
+
import. Useful to check that your `file_prefix` is correct.
|
|
458
|
+
|
|
459
|
+
#### Single-match vs multi-match
|
|
460
|
+
|
|
461
|
+
By default, `gnl-import` runs in **single-match** mode:
|
|
462
|
+
|
|
463
|
+
- Every tag used in the `file_prefix` must have its corresponding metadata
|
|
464
|
+
field explicitly defined.
|
|
465
|
+
- For example, if the file prefix contains `%S`, then the field `sample_name`
|
|
466
|
+
must be provided in the analysis. Otherwise, an error is raised,
|
|
467
|
+
- Only files that exactly match the declared metadata are imported.
|
|
468
|
+
|
|
469
|
+
This mode is best suited for environments where data files belonging to a
|
|
470
|
+
single analysis are grouped together in a dedicated directory.
|
|
471
|
+
|
|
472
|
+
With the option `--multi-match`, the behavior changes:
|
|
473
|
+
|
|
474
|
+
- Undefined metadata fields are tolerated,
|
|
475
|
+
- When a tag has no defined value, it is replaced by its corresponding
|
|
476
|
+
regular expression. For example, `%S` normally maps to `sample_name`.
|
|
477
|
+
If this metadata field is not defined, `%S` expands to its default regex
|
|
478
|
+
(`[^_]+`), which matches any non-underscore sequence. Users can override
|
|
479
|
+
these defaults in the bundle (via [`Tags.Match`](#tagsmatch)).
|
|
480
|
+
- All files matching the defined metadata and regex expansions are collected,
|
|
481
|
+
- Each **unique combination of values** found in the filenames results in a new
|
|
482
|
+
analysis with its own ID.
|
|
483
|
+
|
|
484
|
+
This mode is best suited for environments where multiple analyses share a
|
|
485
|
+
single directory of data files. It is more flexible than single-match, but
|
|
486
|
+
also carries the risk of importing more files than expected if regex patterns
|
|
487
|
+
are too permissive. **Carefully check matches in dry-run mode
|
|
488
|
+
(`-D` / `-DD`) before importing.**
|
|
489
|
+
|
|
490
|
+
**Example:**
|
|
491
|
+
|
|
492
|
+
```yaml
|
|
493
|
+
---
|
|
494
|
+
- analyses:
|
|
495
|
+
- file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
|
|
496
|
+
data_path: "/data/"
|
|
497
|
+
# sample_name (%S) is omitted
|
|
498
|
+
source: "CNRGH" # %F
|
|
499
|
+
wet_process: "novaseqxplus-25b" # %W
|
|
500
|
+
bi_process: "dragen-4123" # %B
|
|
501
|
+
cov_depth: 30 # %D
|
|
502
|
+
reference_genome: "hg38" # %R
|
|
503
|
+
```
|
|
504
|
+
|
|
505
|
+
In multi-match mode, the `%S` tag expands to its regex (`[^_]+`), so all
|
|
506
|
+
sample names are accepted. The following files are matched:
|
|
507
|
+
|
|
508
|
+
```text
|
|
509
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf
|
|
510
|
+
- HG003_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf
|
|
511
|
+
- HG004_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1.vcf
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
Three analyses are automatically created, each with its own `sample_name`
|
|
515
|
+
derived from the filename:
|
|
516
|
+
|
|
517
|
+
- `HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1`,
|
|
518
|
+
- `HG003_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1`,
|
|
519
|
+
- `HG004_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1`.
|
|
520
|
+
|
|
521
|
+
#### Metrics
|
|
522
|
+
|
|
523
|
+
In addition to raw data files (`.vcf`, `.cov`), Genelastic also supports
|
|
524
|
+
metrics data files.
|
|
525
|
+
|
|
526
|
+
A **metrics data file** is identified by the `.metrics` suffix, which appears
|
|
527
|
+
**before the file extension**. This suffix can optionally include metadata
|
|
528
|
+
about the tools used to generate the metrics and their versions.
|
|
529
|
+
|
|
530
|
+
The format for this metadata is:
|
|
531
|
+
|
|
532
|
+
```text
|
|
533
|
+
.metrics_<TOOL>-<VERSION>[_<TOOL>-<VERSION>...]
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
- Multiple tools can be listed, separated by underscores (`_`),
|
|
537
|
+
- Each version must contain at least one number and is separated from the tool
|
|
538
|
+
name by a hyphen (`-`).
|
|
539
|
+
|
|
540
|
+
For raw files, their `type` is the same as their `extension`.
|
|
541
|
+
|
|
542
|
+
For metrics files, however, multiple metrics types may share the same extension
|
|
543
|
+
(e.g. `.json`). To distinguish them, the `suffix` attribute in the analysis
|
|
544
|
+
definition must specify a regex that extracts the `type` from the filename.
|
|
545
|
+
If no `suffix` is defined, Genelastic will raise an error.
|
|
546
|
+
|
|
547
|
+
**Example:**
|
|
548
|
+
|
|
549
|
+
```yaml
|
|
550
|
+
---
|
|
551
|
+
- analyses:
|
|
552
|
+
- file_prefix: "%S_%F_%W_%B_%D_%R_rep-1"
|
|
553
|
+
suffix: "_(?P<type>sv|smallvar|qc)"
|
|
554
|
+
data_path: "/data/"
|
|
555
|
+
sample_name: "HG002" # %S
|
|
556
|
+
source: "CNRGH" # %F
|
|
557
|
+
wet_process: "novaseqxplus-25b" # %W
|
|
558
|
+
bi_process: "dragen-4123" # %B
|
|
559
|
+
cov_depth: 30 # %D
|
|
560
|
+
reference_genome: "hg38" # %R
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
This will match files such as:
|
|
564
|
+
|
|
565
|
+
```text
|
|
566
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_smallvar.vcf
|
|
567
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_smallvar.cov
|
|
568
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_smallvar.metrics_happy-2-0-0_giab-3-0-0.json
|
|
569
|
+
- HG002_CNRGH_novaseqxplus-25b_dragen-4123_30_hg38_rep-1_qc.metrics.json
|
|
570
|
+
```
|
|
571
|
+
|
|
572
|
+
In this example, the analysis includes both raw data files and metrics files.
|
|
573
|
+
The value of the `metrics` field in the imported document depends on the file
|
|
574
|
+
type:
|
|
575
|
+
|
|
576
|
+
- **Non-metrics files** (e.g. VCF, Coverage):
|
|
577
|
+
|
|
578
|
+
```json
|
|
579
|
+
{
|
|
580
|
+
"metrics": null
|
|
581
|
+
}
|
|
582
|
+
```
|
|
583
|
+
|
|
584
|
+
- **Metrics files without tool metadata** (e.g. QC):
|
|
585
|
+
|
|
586
|
+
```json
|
|
587
|
+
{
|
|
588
|
+
"metrics": []
|
|
589
|
+
}
|
|
590
|
+
```
|
|
591
|
+
|
|
592
|
+
- **Metrics files with tool metadata** (e.g. Smallvar):
|
|
593
|
+
|
|
594
|
+
```json
|
|
595
|
+
{
|
|
596
|
+
"metrics": [
|
|
597
|
+
{
|
|
598
|
+
"tool": "happy",
|
|
599
|
+
"version": "2.0.0"
|
|
600
|
+
},
|
|
601
|
+
{
|
|
602
|
+
"tool": "giab",
|
|
603
|
+
"version": "3.0.0"
|
|
604
|
+
}
|
|
605
|
+
]
|
|
606
|
+
}
|
|
607
|
+
```
|
|
608
|
+
|
|
609
|
+
## VI - Servers usage
|
|
610
|
+
|
|
611
|
+
Genelastic includes two servers: an **API server** and a **UI server**.
|
|
612
|
+
The UI does not communicate directly with Elasticsearch: it always goes
|
|
613
|
+
through the API server, which acts as a gateway and provides HTTP endpoints.
|
|
614
|
+
|
|
615
|
+
### API server
|
|
616
|
+
|
|
617
|
+
The API server needs to know how to connect to Elasticsearch. Configure it
|
|
618
|
+
through the following environment variables:
|
|
619
|
+
|
|
620
|
+
- `GENAPI_ES_URL`: URL of the Elasticsearch server,
|
|
621
|
+
- `GENAPI_ES_ENCODED_API_KEY`: Encoded API key,
|
|
622
|
+
- `GENAPI_ES_INDEX_PREFIX`: Prefix to identify indices of interest,
|
|
623
|
+
- `GENAPI_ES_CERT_FP`: Certificate fingerprint of the Elasticsearch server.
|
|
624
|
+
|
|
625
|
+
Start the API server in development mode:
|
|
626
|
+
|
|
627
|
+
```bash
|
|
628
|
+
gnl-start-api dev
|
|
629
|
+
```
|
|
630
|
+
|
|
631
|
+
### UI server
|
|
632
|
+
|
|
633
|
+
The UI server only needs the address of the API server:
|
|
634
|
+
|
|
635
|
+
- `GENUI_API_URL`: URL of the API server.
|
|
636
|
+
|
|
637
|
+
Start the UI server in development mode:
|
|
638
|
+
|
|
639
|
+
```bash
|
|
640
|
+
gnl-start-ui dev
|
|
641
|
+
```
|
|
642
|
+
|
|
643
|
+
### Development vs production
|
|
644
|
+
|
|
645
|
+
Both `gnl-start-api` and `gnl-start-ui` support two modes:
|
|
646
|
+
|
|
647
|
+
- dev: development mode (hot reload, debug logs, not optimized),
|
|
648
|
+
- prod: production mode (optimized build, suitable for deployment).
|
|
649
|
+
|
|
650
|
+
## VII - For developers
|
|
651
|
+
|
|
652
|
+
### Prerequisites
|
|
653
|
+
|
|
654
|
+
- `python` >= 3.11
|
|
655
|
+
- `uv` >= 0.9
|
|
656
|
+
- `make`
|
|
657
|
+
|
|
658
|
+
### Installation
|
|
659
|
+
|
|
660
|
+
To install development dependencies, run the following commands:
|
|
661
|
+
|
|
662
|
+
```bash
|
|
663
|
+
python -m venv .venv
|
|
664
|
+
source .venv/bin/activate
|
|
665
|
+
make
|
|
666
|
+
```
|
|
667
|
+
|
|
668
|
+
### Pre-commit hooks setup
|
|
669
|
+
|
|
670
|
+
This project uses [pre-commit](https://pre-commit.com/) to manage Git hooks
|
|
671
|
+
scripts. To install project hooks, run:
|
|
672
|
+
|
|
673
|
+
```bash
|
|
674
|
+
pre-commit install
|
|
675
|
+
```
|
|
676
|
+
|
|
677
|
+
After that, each commit will succeed only if all hooks (defined in
|
|
678
|
+
`.pre-commit-config.yaml`) pass.
|
|
679
|
+
|
|
680
|
+
If necessary (though not recommended),
|
|
681
|
+
you can skip these hooks by using the `--no-verify` / `-n` option when
|
|
682
|
+
committing:
|
|
683
|
+
|
|
684
|
+
```bash
|
|
685
|
+
git commit -m "My commit message" --no-verify # This commit will not run installed hooks.
|
|
686
|
+
```
|