genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (56)
  1. genelastic/api/cli_start_api.py +18 -0
  2. genelastic/api/extends/example.py +2 -3
  3. genelastic/api/extends/example.yml +20 -0
  4. genelastic/api/routes.py +160 -23
  5. genelastic/api/server.py +42 -31
  6. genelastic/api/settings.py +5 -8
  7. genelastic/api/specification.yml +350 -0
  8. genelastic/common/__init__.py +41 -9
  9. genelastic/common/cli.py +103 -23
  10. genelastic/common/elastic.py +80 -49
  11. genelastic/common/exceptions.py +0 -2
  12. genelastic/common/server.py +51 -0
  13. genelastic/common/types.py +20 -15
  14. genelastic/import_data/__init__.py +23 -5
  15. genelastic/import_data/analyses.py +17 -20
  16. genelastic/import_data/analysis.py +69 -65
  17. genelastic/import_data/bi_process.py +7 -5
  18. genelastic/import_data/bi_processes.py +8 -8
  19. genelastic/import_data/cli_gen_data.py +143 -0
  20. genelastic/import_data/cli_import.py +379 -0
  21. genelastic/import_data/{info.py → cli_info.py} +104 -75
  22. genelastic/import_data/cli_integrity.py +384 -0
  23. genelastic/import_data/cli_validate.py +54 -0
  24. genelastic/import_data/constants.py +11 -32
  25. genelastic/import_data/data_file.py +23 -20
  26. genelastic/import_data/filename_pattern.py +26 -32
  27. genelastic/import_data/import_bundle.py +56 -47
  28. genelastic/import_data/import_bundle_factory.py +166 -158
  29. genelastic/import_data/logger.py +22 -18
  30. genelastic/import_data/random_bundle.py +425 -0
  31. genelastic/import_data/tags.py +46 -26
  32. genelastic/import_data/wet_process.py +8 -4
  33. genelastic/import_data/wet_processes.py +13 -8
  34. genelastic/ui/__init__.py +0 -0
  35. genelastic/ui/cli_start_ui.py +18 -0
  36. genelastic/ui/routes.py +86 -0
  37. genelastic/ui/server.py +14 -0
  38. genelastic/ui/settings.py +7 -0
  39. genelastic/ui/templates/analyses.html +11 -0
  40. genelastic/ui/templates/bi_processes.html +11 -0
  41. genelastic/ui/templates/home.html +4 -0
  42. genelastic/ui/templates/layout.html +34 -0
  43. genelastic/ui/templates/version.html +9 -0
  44. genelastic/ui/templates/wet_processes.html +11 -0
  45. genelastic-0.8.0.dist-info/METADATA +109 -0
  46. genelastic-0.8.0.dist-info/RECORD +52 -0
  47. {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/WHEEL +1 -1
  48. genelastic-0.8.0.dist-info/entry_points.txt +8 -0
  49. genelastic/import_data/gen_data.py +0 -194
  50. genelastic/import_data/import_data.py +0 -292
  51. genelastic/import_data/integrity.py +0 -290
  52. genelastic/import_data/validate_data.py +0 -43
  53. genelastic-0.6.1.dist-info/METADATA +0 -41
  54. genelastic-0.6.1.dist-info/RECORD +0 -36
  55. genelastic-0.6.1.dist-info/entry_points.txt +0 -6
  56. {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,5 @@
1
- # pylint: disable=missing-module-docstring
2
1
  import copy
3
- import glob
4
2
  import logging
5
- import os
6
3
  import re
7
4
  import typing
8
5
  from pathlib import Path
@@ -14,28 +11,31 @@ from .data_file import DataFile
14
11
  from .filename_pattern import FilenamePattern
15
12
  from .tags import Tags
16
13
 
17
- logger = logging.getLogger('genelastic')
14
+ logger = logging.getLogger("genelastic")
18
15
 
19
16
 
20
17
  class Analysis:
21
18
  """Class Analysis that represents an analysis."""
22
19
 
23
- # pylint: disable-next=too-many-arguments, too-many-positional-arguments
24
- def __init__(self,
25
- tags: Tags,
26
- root_dir: str = '.',
27
- bundle_file: str | None = None,
28
- file_prefix: str | None = None,
29
- files: typing.Sequence[str] | None = None,
30
- data_path: str | None = None,
31
- **metadata: str | int) -> None:
32
- self._bundle_file = bundle_file
20
+ def __init__( # noqa: PLR0913
21
+ self,
22
+ tags: Tags,
23
+ root_dir: str = ".",
24
+ bundle_file: str | None = None,
25
+ file_prefix: str | None = None,
26
+ files: typing.Sequence[str] | None = None,
27
+ data_path: str | None = None,
28
+ **metadata: str | int,
29
+ ) -> None:
30
+ self._bundle_file = Path(bundle_file) if bundle_file else None
33
31
  self._file_prefix = file_prefix
34
32
  self._files = files
35
- self._data_path = Analysis._resolve_data_path(root_dir, data_path)
33
+ self._data_path = Analysis._resolve_data_path(
34
+ Path(root_dir), Path(data_path) if data_path else None
35
+ )
36
36
  self._tags = tags
37
37
  self._metadata: AnalysisMetaData = metadata
38
- self._categories: typing.Set[str] = set()
38
+ self._categories: set[str] = set()
39
39
 
40
40
  @property
41
41
  def metadata(self) -> AnalysisMetaData:
@@ -43,17 +43,15 @@ class Analysis:
43
43
  return copy.deepcopy(self._metadata)
44
44
 
45
45
  @property
46
- def bundle_file(self) -> str | None:
46
+ def bundle_file(self) -> Path | None:
47
47
  """Get the bundle file."""
48
48
  return self._bundle_file
49
49
 
50
50
  @property
51
51
  def filename_regex(self) -> str:
52
- """
53
- Resolve placeholders in a file prefix using metadata
52
+ """Resolve placeholders in a file prefix using metadata
54
53
  and unresolved placeholders are converted to regex groups
55
54
  """
56
-
57
55
  x: str = r"^.+\.(?P<ext>vcf|cov)(\.gz)?$"
58
56
 
59
57
  # Use existing generic prefix
@@ -65,84 +63,87 @@ class Analysis:
65
63
  regex = tag_attrs["regex"]
66
64
 
67
65
  # Build field regex
68
- field_regex = (f"(?P<{field}>{self._metadata.get(field)})"
69
- if field in self._metadata else
70
- f"(?P<{field}>{regex})")
66
+ field_regex = (
67
+ f"(?P<{field}>{self._metadata.get(field)})"
68
+ if field in self._metadata
69
+ else f"(?P<{field}>{regex})"
70
+ )
71
71
  # Replace tag with field regex
72
72
  x = x.replace(tag_name, field_regex)
73
73
 
74
74
  # Check for tags that were not replaced.
75
75
  groups = re.findall(self._tags.search_regex, x)
76
76
  for match in groups:
77
- logger.warning("String '%s' in key 'file_prefix' looks like an undefined tag. "
78
- "If this string is not a tag, you can ignore this warning.",
79
- match)
77
+ logger.warning(
78
+ "String '%s' in key 'file_prefix' looks like an undefined tag. "
79
+ "If this string is not a tag, you can ignore this warning.",
80
+ match,
81
+ )
80
82
 
81
83
  # Add missing start and end markers
82
84
  if not x.startswith("^"):
83
85
  x = "^" + x
84
86
  if not x.endswith("$"):
85
- x += (r"\.(?P<ext>" + '|'.join(ALLOWED_CATEGORIES)
86
- + r")(\.gz)?$")
87
+ x += r"\.(?P<ext>" + "|".join(ALLOWED_CATEGORIES) + r")(\.gz)?$"
87
88
  logger.debug("File regex for %s: %s", self._bundle_file, x)
88
89
 
89
90
  return x
90
91
 
91
92
  def get_nb_files(self, cat: str | None = None) -> int:
92
- """Returns the total number of files.
93
- """
93
+ """Returns the total number of files."""
94
94
  return len(self.get_file_paths(cat=cat))
95
95
 
96
- def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
97
- """Returns the list of matched files as DataFile objects.
98
- """
99
-
96
+ def get_data_files(self, cat: str | None = None) -> list[DataFile]:
97
+ """Returns the list of matched files as DataFile objects."""
100
98
  files = self.get_file_paths(cat=cat)
101
99
  filename_pattern = FilenamePattern(self.filename_regex)
102
100
 
103
- data_files: typing.List[DataFile] = []
101
+ data_files: list[DataFile] = []
104
102
 
105
103
  for f in files:
106
104
  try:
107
- data_files.append(DataFile.make_from_bundle(
108
- path=f, bundle_path=self._bundle_file,
109
- pattern=filename_pattern))
110
- except (IOError, ValueError) as e:
105
+ data_files.append(
106
+ DataFile.make_from_bundle(
107
+ path=f,
108
+ bundle_path=self._bundle_file,
109
+ pattern=filename_pattern,
110
+ )
111
+ )
112
+ except (OSError, ValueError) as e:
111
113
  logger.error("Error processing file %s: %s", f, str(e))
112
114
 
113
115
  return data_files
114
116
 
115
- def get_file_paths(self, cat: str | None = None) -> typing.Sequence[str]:
116
- """Returns the list of matched files.
117
- """
117
+ def get_file_paths(self, cat: str | None = None) -> typing.Sequence[Path]:
118
+ """Returns the list of matched files."""
118
119
  files, _, _ = self._do_get_file_paths(cat=cat)
119
120
  return files
120
121
 
121
- def get_unmatched_file_paths(self, cat: str | None = None
122
- ) -> typing.Sequence[str]:
123
- """Returns the list of unmatched files.
124
- """
122
+ def get_unmatched_file_paths(
123
+ self, cat: str | None = None
124
+ ) -> typing.Sequence[Path]:
125
+ """Returns the list of unmatched files."""
125
126
  _, files, _ = self._do_get_file_paths(cat=cat)
126
127
  return files
127
128
 
128
- def get_all_categories(self) -> typing.Set[str]:
129
+ def get_all_categories(self) -> set[str]:
129
130
  """Returns all categories of the analysis."""
130
131
  _, _, categories = self._do_get_file_paths()
131
132
  return categories
132
133
 
133
134
  @staticmethod
134
- def _resolve_data_path(root_dir: str, data_path: str | None) -> str:
135
- resolved_data_path = '' if data_path is None else data_path
135
+ def _resolve_data_path(root_dir: Path, data_path: Path | None) -> Path:
136
+ resolved_data_path = Path() if data_path is None else data_path
136
137
 
137
- if not os.path.isabs(resolved_data_path):
138
- resolved_data_path = os.path.abspath(os.path.join(root_dir, resolved_data_path))
138
+ if not resolved_data_path.is_absolute():
139
+ resolved_data_path = (root_dir / resolved_data_path).absolute()
139
140
 
140
141
  return resolved_data_path
141
142
 
142
- def _get_files_with_allowed_categories(self) -> typing.Dict[str, str]:
143
+ def _get_files_with_allowed_categories(self) -> dict[Path, str]:
143
144
  # Create a dict to store allowed files. Keys are the filepaths,
144
145
  # and values are their corresponding category.
145
- allowed_files: typing.Dict[str, str] = {}
146
+ allowed_files: dict[Path, str] = {}
146
147
  # If files are listed explicitly in the YAML in the 'files' attribute, process them.
147
148
  if self._files is not None:
148
149
  abs_filepaths = [Path(self._data_path) / f for f in self._files]
@@ -151,14 +152,14 @@ class Analysis:
151
152
  cat = file.suffixes[0][1:]
152
153
  # Add each matching file and its category to the dict.
153
154
  if cat in ALLOWED_CATEGORIES:
154
- allowed_files[str(file)] = cat
155
+ allowed_files[file] = cat
155
156
  # Else, look for files on disk using the YAML 'data_path' attribute.
156
157
  else:
157
158
  # Try to retrieve files matching allowed categories using glob.
158
159
  for cat in ALLOWED_CATEGORIES:
159
- glob_res = []
160
- glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}")))
161
- glob_res.extend(glob.glob(os.path.join(self._data_path, f"*.{cat}.gz")))
160
+ glob_res: list[Path] = []
161
+ glob_res.extend(self._data_path.glob(f"*.{cat}"))
162
+ glob_res.extend(self._data_path.glob(f"*.{cat}.gz"))
162
163
 
163
164
  # Add each globed file and its category to the dict.
164
165
  for g_file in glob_res:
@@ -166,12 +167,13 @@ class Analysis:
166
167
 
167
168
  return allowed_files
168
169
 
169
- def _do_get_file_paths(self, cat: str | None = None) \
170
- -> tuple[typing.Sequence[str], typing.Sequence[str], typing.Set[str]]:
171
-
170
+ def _do_get_file_paths(
171
+ self, cat: str | None = None
172
+ ) -> tuple[typing.Sequence[Path], typing.Sequence[Path], set[str]]:
172
173
  # Raise an error if the category given as a parameter is not part of the allowed categories.
173
174
  if cat is not None and cat not in ALLOWED_CATEGORIES:
174
- raise ValueError(f"Unknown category {cat}.")
175
+ msg = f"Unknown category {cat}."
176
+ raise ValueError(msg)
175
177
 
176
178
  # Obtain a dict of all files matching the allowed categories.
177
179
  allowed_files = self._get_files_with_allowed_categories()
@@ -181,16 +183,18 @@ class Analysis:
181
183
  files_to_match = allowed_files
182
184
  else:
183
185
  # A category was given as a parameter, so we match only this specific category.
184
- files_to_match = dict((k, v) for (k, v) in allowed_files.items() if v == cat)
186
+ files_to_match = {
187
+ k: v for k, v in allowed_files.items() if v == cat
188
+ }
185
189
 
186
190
  filename_pattern = FilenamePattern(self.filename_regex)
187
- matching_files: typing.List[str] = []
188
- non_matching_files: typing.List[str] = []
191
+ matching_files: list[Path] = []
192
+ non_matching_files: list[Path] = []
189
193
  categories = set()
190
194
 
191
195
  # We filter files by ensuring that they match the filename pattern defined in the analysis.
192
196
  for file, category in sorted(files_to_match.items()):
193
- if filename_pattern.matches_pattern(os.path.basename(file)):
197
+ if filename_pattern.matches_pattern(file.name):
194
198
  matching_files.append(file)
195
199
  logger.info("MATCHED file %s.", file)
196
200
  # Add the file category to the categories set.
@@ -1,15 +1,17 @@
1
- # pylint: disable=missing-module-docstring
2
1
  import copy
3
- import typing
4
2
 
5
3
  from genelastic.common import BioInfoProcessData
6
4
 
7
5
 
8
6
  class BioInfoProcess:
9
7
  """Class representing a bio process."""
10
- def __init__(self, proc_id: str,
11
- bundle_file: str | None = None,
12
- **data: str | typing.List[str]) -> None:
8
+
9
+ def __init__(
10
+ self,
11
+ proc_id: str,
12
+ bundle_file: str | None = None,
13
+ **data: str | list[str],
14
+ ) -> None:
13
15
  self._proc_id = proc_id
14
16
  self._bundle_file = bundle_file
15
17
  self._data: BioInfoProcessData = data
@@ -1,4 +1,3 @@
1
- # pylint: disable=missing-module-docstring
2
1
  import logging
3
2
  import typing
4
3
 
@@ -6,14 +5,14 @@ from genelastic.common import BundleDict
6
5
 
7
6
  from .bi_process import BioInfoProcess
8
7
 
9
- logger = logging.getLogger('genelastic')
8
+ logger = logging.getLogger("genelastic")
10
9
 
11
10
 
12
11
  class BioInfoProcesses:
13
12
  """Class BioInfoProcesses is a container of BioInfoProcess objects."""
14
13
 
15
14
  def __init__(self) -> None:
16
- self._dict: typing.Dict[str, BioInfoProcess] = {}
15
+ self._dict: dict[str, BioInfoProcess] = {}
17
16
 
18
17
  def __len__(self) -> int:
19
18
  return len(self._dict)
@@ -27,20 +26,21 @@ class BioInfoProcesses:
27
26
  the program exits.
28
27
  """
29
28
  if process.id in self._dict:
30
- raise ValueError(f"A bi process with the id '{process.id}' is already present.")
29
+ msg = f"A bi process with the id '{process.id}' is already present."
30
+ raise ValueError(msg)
31
31
 
32
32
  # Add one WetProcess object.
33
33
  self._dict[process.id] = process
34
34
 
35
- def get_process_ids(self) -> typing.Set[str]:
35
+ def get_process_ids(self) -> set[str]:
36
36
  """Get a list of the bio processes IDs."""
37
37
  return set(self._dict.keys())
38
38
 
39
39
  @classmethod
40
- def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
41
- ) -> typing.Self:
40
+ def from_array_of_dicts(
41
+ cls, arr: typing.Sequence[BundleDict]
42
+ ) -> typing.Self:
42
43
  """Build a BioInfoProcesses instance."""
43
-
44
44
  bi_processes = cls()
45
45
 
46
46
  for d in arr:
@@ -0,0 +1,143 @@
1
+ import argparse
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ from biophony import DEFAULT_RATE, MutSimParams
6
+
7
+ from genelastic.common import add_verbose_control_args
8
+
9
+ from .logger import configure_logging
10
+ from .random_bundle import (
11
+ RandomBundle,
12
+ )
13
+
14
+ logger = logging.getLogger("genelastic")
15
+
16
+
17
+ def read_args() -> argparse.Namespace:
18
+ """Read arguments from the command line."""
19
+ parser = argparse.ArgumentParser(
20
+ description="Random bundle generator. "
21
+ "A bundle is a YAML file format used to import genetic data into an Elasticsearch database. "
22
+ "It can contain one or more analyses; "
23
+ "each analysis including metadata, references to "
24
+ "a wet lab and bioinformatics process "
25
+ "and paths to a VCF file and optionally to a coverage file.",
26
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
27
+ allow_abbrev=False,
28
+ )
29
+ add_verbose_control_args(parser)
30
+ parser.add_argument(
31
+ "output_dir",
32
+ help="Path where analyses VCF and coverage files will be generated.",
33
+ type=Path,
34
+ )
35
+ parser.add_argument("--log-file", help="Path to a log file.")
36
+ parser.add_argument(
37
+ "-n",
38
+ "--chrom-nb",
39
+ type=int,
40
+ default=5,
41
+ help="Number of chromosomes to include in the generated VCF file.",
42
+ )
43
+ parser.add_argument(
44
+ "-o",
45
+ "--output-bundle",
46
+ default=None,
47
+ help="Path where the YAML bundle file will be written. "
48
+ "If no path is provided, the bundle is written to stdout.",
49
+ type=Path,
50
+ )
51
+ parser.add_argument(
52
+ "-l",
53
+ "--sequence-length",
54
+ type=int,
55
+ default=2000,
56
+ help="Sequence length (number of nucleotides) generated for each chromosome.",
57
+ )
58
+ parser.add_argument(
59
+ "-c",
60
+ "--coverage",
61
+ action="store_true",
62
+ help="Generate a coverage file for each analysis.",
63
+ )
64
+ parser.add_argument(
65
+ "-a",
66
+ "--analyses",
67
+ help="Number of analyses to generate. "
68
+ "Each analysis will reference a wet lab and bioinformatics process, "
69
+ "a VCF file and optionally a coverage file.",
70
+ default=1,
71
+ type=int,
72
+ )
73
+ parser.add_argument(
74
+ "-p",
75
+ "--processes",
76
+ help="Number of wet lab and bioinformatics processes to generate.",
77
+ default=1,
78
+ type=int,
79
+ )
80
+ parser.add_argument(
81
+ "-s",
82
+ "--snp-rate",
83
+ help="Generated VCF SNP rate.",
84
+ type=float,
85
+ default=DEFAULT_RATE,
86
+ )
87
+ parser.add_argument(
88
+ "-i",
89
+ "--ins-rate",
90
+ help="Generated VCF insertion rate.",
91
+ type=float,
92
+ default=DEFAULT_RATE,
93
+ )
94
+ parser.add_argument(
95
+ "-d",
96
+ "--del-rate",
97
+ help="Generated VCF deletion rate.",
98
+ type=float,
99
+ default=DEFAULT_RATE,
100
+ )
101
+ return parser.parse_args()
102
+
103
+
104
+ def main() -> None:
105
+ """Entry point of the gen-data script."""
106
+ # Read command line arguments
107
+ args = read_args()
108
+ output_dir = args.output_dir.resolve()
109
+
110
+ if not output_dir.is_dir():
111
+ msg = f"ERROR: '{output_dir}' does not exist or is not a directory."
112
+ raise SystemExit(msg)
113
+
114
+ if args.analyses < 1:
115
+ msg = "Analyses count must be at least 1."
116
+ raise SystemExit(msg)
117
+
118
+ if args.processes < 1:
119
+ msg = "Processes count must be at least 1."
120
+ raise SystemExit(msg)
121
+
122
+ # Configure logging
123
+ configure_logging(args.verbose, log_file=args.log_file)
124
+ logger.debug("Arguments: %s", args)
125
+
126
+ # Write to stdout or file
127
+ RandomBundle(
128
+ output_dir,
129
+ args.analyses,
130
+ args.processes,
131
+ args.chrom_nb,
132
+ args.sequence_length,
133
+ MutSimParams(
134
+ snp_rate=args.snp_rate,
135
+ ins_rate=args.ins_rate,
136
+ del_rate=args.del_rate,
137
+ ),
138
+ do_gen_coverage=args.coverage,
139
+ ).to_yaml(args.output_bundle)
140
+
141
+
142
+ if __name__ == "__main__":
143
+ main()