genelastic 0.6.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. genelastic/api/cli_start_api.py +18 -0
  2. genelastic/api/extends/example.py +2 -3
  3. genelastic/api/extends/example.yml +20 -0
  4. genelastic/api/routes.py +160 -23
  5. genelastic/api/server.py +42 -31
  6. genelastic/api/settings.py +5 -8
  7. genelastic/api/specification.yml +350 -0
  8. genelastic/common/__init__.py +41 -9
  9. genelastic/common/cli.py +103 -23
  10. genelastic/common/elastic.py +80 -49
  11. genelastic/common/exceptions.py +0 -2
  12. genelastic/common/server.py +51 -0
  13. genelastic/common/types.py +20 -15
  14. genelastic/import_data/__init__.py +23 -5
  15. genelastic/import_data/analyses.py +17 -20
  16. genelastic/import_data/analysis.py +69 -65
  17. genelastic/import_data/bi_process.py +7 -5
  18. genelastic/import_data/bi_processes.py +8 -8
  19. genelastic/import_data/cli_gen_data.py +143 -0
  20. genelastic/import_data/cli_import.py +379 -0
  21. genelastic/import_data/{info.py → cli_info.py} +104 -75
  22. genelastic/import_data/cli_integrity.py +384 -0
  23. genelastic/import_data/cli_validate.py +54 -0
  24. genelastic/import_data/constants.py +11 -32
  25. genelastic/import_data/data_file.py +23 -20
  26. genelastic/import_data/filename_pattern.py +26 -32
  27. genelastic/import_data/import_bundle.py +56 -47
  28. genelastic/import_data/import_bundle_factory.py +166 -158
  29. genelastic/import_data/logger.py +22 -18
  30. genelastic/import_data/random_bundle.py +425 -0
  31. genelastic/import_data/tags.py +46 -26
  32. genelastic/import_data/wet_process.py +8 -4
  33. genelastic/import_data/wet_processes.py +13 -8
  34. genelastic/ui/__init__.py +0 -0
  35. genelastic/ui/cli_start_ui.py +18 -0
  36. genelastic/ui/routes.py +86 -0
  37. genelastic/ui/server.py +14 -0
  38. genelastic/ui/settings.py +7 -0
  39. genelastic/ui/templates/analyses.html +11 -0
  40. genelastic/ui/templates/bi_processes.html +11 -0
  41. genelastic/ui/templates/home.html +4 -0
  42. genelastic/ui/templates/layout.html +34 -0
  43. genelastic/ui/templates/version.html +9 -0
  44. genelastic/ui/templates/wet_processes.html +11 -0
  45. genelastic-0.8.0.dist-info/METADATA +109 -0
  46. genelastic-0.8.0.dist-info/RECORD +52 -0
  47. {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/WHEEL +1 -1
  48. genelastic-0.8.0.dist-info/entry_points.txt +8 -0
  49. genelastic/import_data/gen_data.py +0 -194
  50. genelastic/import_data/import_data.py +0 -292
  51. genelastic/import_data/integrity.py +0 -290
  52. genelastic/import_data/validate_data.py +0 -43
  53. genelastic-0.6.1.dist-info/METADATA +0 -41
  54. genelastic-0.6.1.dist-info/RECORD +0 -36
  55. genelastic-0.6.1.dist-info/entry_points.txt +0 -6
  56. {genelastic-0.6.1.dist-info → genelastic-0.8.0.dist-info}/top_level.txt +0 -0
genelastic/common/elastic.py

@@ -1,4 +1,3 @@
-# pylint: disable=missing-module-docstring
 import datetime
 import logging
 import time
@@ -13,14 +12,15 @@ from elasticsearch import Elasticsearch
 from .exceptions import DBIntegrityError
 from .types import Bucket, BulkItems
 
-logger = logging.getLogger('genelastic')
+logger = logging.getLogger("genelastic")
 
 
-class ElasticConn(ABC):  # pylint: disable=too-few-public-methods
+class ElasticConn(ABC):
     """Abstract class representing a connector for an Elasticsearch server."""
+
     client: Elasticsearch
 
-    def __init__(self, url: str, fingerprint: str, **kwargs: Any):
+    def __init__(self, url: str, fingerprint: str, **kwargs: Any) -> None:  # noqa: ANN401
         """Initialize an elasticsearch client instance.
 
         :url: URL of the Elasticsearch host.
@@ -34,41 +34,49 @@ class ElasticConn(ABC): # pylint: disable=too-few-public-methods
                 ssl_assert_fingerprint=fingerprint,
                 # Verify cert only when the fingerprint is not None.
                 verify_certs=bool(fingerprint),
-                **kwargs
+                **kwargs,
             )
             self.client.info()
-        except (elastic_transport.TransportError, elasticsearch.AuthenticationException) as e:
+        except (
+            elastic_transport.TransportError,
+            elasticsearch.AuthenticationException,
+        ) as e:
             raise SystemExit(e) from e
 
 
-class ElasticImportConn(ElasticConn):  # pylint: disable=too-few-public-methods
+class ElasticImportConn(ElasticConn):
     """Connector to import data into an Elasticsearch database."""
-    def import_items(self, bulk_items: BulkItems,
-                     start_time: float,
-                     total_items: int) -> None:
+
+    def import_items(
+        self, bulk_items: BulkItems, start_time: float, total_items: int
+    ) -> None:
         """Import items to the Elasticsearch database."""
         if len(bulk_items) > 0:
             elasticsearch.helpers.bulk(self.client, bulk_items)
             elapsed = time.perf_counter() - start_time
-            logger.info("Imported %d items in %s (%f items/s).", total_items,
-                        datetime.timedelta(seconds=elapsed), total_items / elapsed)
+            logger.info(
+                "Imported %d items in %s (%f items/s).",
+                total_items,
+                datetime.timedelta(seconds=elapsed),
+                total_items / elapsed,
+            )
 
 
 class ElasticQueryConn(ElasticConn):
     """Connector to query data from an Elasticsearch database."""
 
-    def get_indices(self) -> Any | str:
+    def get_indices(self) -> Any | str:  # noqa: ANN401
         """Return all indices."""
         return self.client.cat.indices(format="json").body
 
-    def get_document_by_id(self, index: str, document_id: str) -> Any | str:
+    def get_document_by_id(self, index: str, document_id: str) -> Any | str:  # noqa: ANN401
         """Return a document by its ID."""
         return self.client.get(index=index, id=document_id).body
 
-    def run_composite_aggregation(self, index: str, query: dict[str, typing.Any]) \
-            -> list[Bucket]:
-        """
-        Executes a composite aggregation on an Elasticsearch index and
+    def run_composite_aggregation(
+        self, index: str, query: dict[str, typing.Any]
+    ) -> list[Bucket]:
+        """Executes a composite aggregation on an Elasticsearch index and
         returns all paginated results.
 
         :param index: Name of the index to query.
@@ -77,29 +85,39 @@ class ElasticQueryConn(ElasticConn):
         """
         # Extract the aggregation name from the query dict.
         agg_name = next(iter(query["aggs"]))
-        all_buckets: typing.List[Bucket] = []
+        all_buckets: list[Bucket] = []
 
         try:
-            logger.debug("Running composite aggregation query %s on index '%s'.", query, index)
+            logger.debug(
+                "Running composite aggregation query %s on index '%s'.",
+                query,
+                index,
+            )
             response = self.client.search(index=index, body=query)
         except elasticsearch.NotFoundError as e:
-            raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+            msg = f"Error: {e.message} for index '{index}'."
+            raise SystemExit(msg) from e
 
         while True:
             # Extract buckets from the response.
-            buckets: typing.List[Bucket] = response['aggregations'][agg_name]['buckets']
+            buckets: list[Bucket] = response["aggregations"][agg_name][
+                "buckets"
+            ]
            all_buckets.extend(buckets)
 
             # Check if there are more results to fetch.
-            if 'after_key' in response['aggregations'][agg_name]:
-                after_key = response['aggregations'][agg_name]['after_key']
-                query['aggs'][agg_name]['composite']['after'] = after_key
+            if "after_key" in response["aggregations"][agg_name]:
+                after_key = response["aggregations"][agg_name]["after_key"]
+                query["aggs"][agg_name]["composite"]["after"] = after_key
                 try:
-                    logger.debug("Running query %s on index '%s'.", query, index)
+                    logger.debug(
+                        "Running query %s on index '%s'.", query, index
+                    )
                     # Fetch the next page of results.
                     response = self.client.search(index=index, body=query)
                 except elasticsearch.NotFoundError as e:
-                    raise SystemExit(f"Error: {e.message} for index '{index}'.") from e
+                    msg = f"Error: {e.message} for index '{index}'."
+                    raise SystemExit(msg) from e
             else:
                 break
 
@@ -114,25 +132,34 @@ class ElasticQueryConn(ElasticConn):
             "aggs": {
                 "get_field_values": {
                     "composite": {
-                        "sources": {"values": {"terms": {"field": f"{field_name}.keyword"}}},
+                        "sources": {
+                            "values": {
+                                "terms": {"field": f"{field_name}.keyword"}
+                            }
+                        },
                         "size": 1000,
                     }
                 }
-            }
+            },
         }
 
-        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
+        buckets: list[Bucket] = self.run_composite_aggregation(index, query)
 
         for bucket in buckets:
-            values.add(bucket['key']['values'])
+            values.add(bucket["key"]["values"])
 
         return values
 
-    def search_by_field_value(self, index: str, field: str, value: str) -> (
-            typing.Dict[str, typing.Any] | None):
+    def search_by_field_value(
+        self, index: str, field: str, value: str
+    ) -> dict[str, typing.Any] | None:
         """Search a document by a value for a certain field."""
-        logger.info("Searching for field '%s' with value '%s' inside index '%s'.",
-                    field, value, index)
+        logger.info(
+            "Searching for field '%s' with value '%s' inside index '%s'.",
+            field,
+            value,
+            index,
+        )
         search_query = {
             "query": {
                 "term": {
@@ -144,22 +171,23 @@ class ElasticQueryConn(ElasticConn):
         response = self.client.search(index=index, body=search_query)
 
         try:
-            return response['hits']['hits'][0]['_source']  # type: ignore
+            return response["hits"]["hits"][0]["_source"]  # type: ignore[no-any-return]
         except KeyError:
             return None
 
     def ensure_unique(self, index: str, field: str) -> None:
-        """
-        Ensure that all values of a field in an index are all unique.
+        """Ensure that all values of a field in an index are all unique.
 
         :param index: Name of the index.
         :param field: Field name to check for value uniqueness.
         :raises genelastic.common.DBIntegrityError:
            Some values of the given field are duplicated in the index.
        """
-
-        logger.info("Ensuring that the field '%s' in the index '%s' only contains unique values...",
-                    field, index)
+        logger.info(
+            "Ensuring that the field '%s' in the index '%s' only contains unique values...",
+            field,
+            index,
+        )
         query = {
             "size": 0,
             "aggs": {
@@ -167,17 +195,20 @@ class ElasticQueryConn(ElasticConn):
                     "terms": {
                         "field": f"{field}.keyword",
                         "size": 10000,
-                        "min_doc_count": 2
+                        "min_doc_count": 2,
                     }
                 }
-            }
+            },
+        }
+        buckets: list[Bucket] = self.run_composite_aggregation(index, query)
+        duplicated_processes: set[str] = {
+            str(bucket["key"]) for bucket in buckets
         }
-        buckets: typing.List[Bucket] = self.run_composite_aggregation(index, query)
-        duplicated_processes: typing.Set[str] = set(map(lambda bucket: str(bucket["key"]), buckets))
 
         if len(duplicated_processes) > 0:
-            raise DBIntegrityError(f"Found non-unique value for field {field} in index '{index}': "
-                                   f"{', '.join(duplicated_processes)}.")
+            msg = f"Found non-unique value for field {field} in index '{index}': {', '.join(duplicated_processes)}."
+            raise DBIntegrityError(msg)
 
-        logger.info("All values of field '%s' in index '%s' are unique.",
-                    field, index)
+        logger.info(
+            "All values of field '%s' in index '%s' are unique.", field, index
+        )
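Note on the pagination pattern above: a composite aggregation returns at most `size` buckets per call and includes an `after_key` cursor while more pages remain, which is the loop `run_composite_aggregation` implements. A minimal standalone sketch of the same loop; the client URL, index name, and aggregated field are placeholders, not values from this package:

    import typing

    from elasticsearch import Elasticsearch

    Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]

    client = Elasticsearch("https://localhost:9200")  # placeholder URL
    query: dict[str, typing.Any] = {
        "size": 0,
        "aggs": {
            "get_field_values": {
                "composite": {
                    # Placeholder field; the package aggregates on f"{field_name}.keyword".
                    "sources": {"values": {"terms": {"field": "sample.keyword"}}},
                    "size": 1000,
                }
            }
        },
    }

    all_buckets: list[Bucket] = []
    response = client.search(index="my-index", body=query)
    while True:
        agg = response["aggregations"]["get_field_values"]
        all_buckets.extend(agg["buckets"])
        if "after_key" not in agg:  # no cursor means this was the last page
            break
        # Resume the aggregation right after the last bucket of this page.
        query["aggs"]["get_field_values"]["composite"]["after"] = agg["after_key"]
        response = client.search(index="my-index", body=query)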

genelastic/common/exceptions.py

@@ -1,5 +1,3 @@
-# pylint: disable=missing-module-docstring
-
 class DBIntegrityError(Exception):
     """Represents an integrity error,
     raised when the database content does not match the expected data schema.

genelastic/common/server.py

@@ -0,0 +1,51 @@
+import argparse
+import subprocess
+import sys
+
+import uvicorn
+
+
+def start_dev_server(app_module: str, args: argparse.Namespace) -> None:
+    """Start the development server using Uvicorn.
+    :args app_module: The module containing the Flask server to start.
+    :args argparse.Namespace: The parsed arguments.
+    """
+    uvicorn.run(
+        app_module,
+        host=args.host,
+        port=args.port,
+        log_level=args.log_level,
+        reload=True,
+    )
+
+
+def start_prod_server(app_module: str, args: argparse.Namespace) -> None:
+    """Start the production server using Gunicorn.
+    It will spawn one primary process and workers
+    :args app_module: The module containing the Flask server to start.
+    :args argparse.Namespace: The parsed arguments.
+    :raises subprocess.CalledProcessError: If gunicorn exits with a non-zero status code.
+    """
+    cmd = [
+        sys.executable,
+        "-m",
+        "gunicorn",
+        "-k",
+        "uvicorn.workers.UvicornWorker",
+        "--workers",
+        str(args.workers),
+        "--log-level",
+        args.log_level,
+        "-b",
+        f"{args.host}:{args.port}",
+        "--capture-output",
+        app_module,
+    ]
+
+    if args.log_file:
+        cmd.extend(["--log-file", args.log_file])
+
+    if args.access_logfile:
+        cmd.extend(["--access-logfile", args.access_logfile])
+
+    subprocess.run(cmd, check=True)  # noqa: S603
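These two helpers only read attributes off the parsed namespace, so any CLI that defines matching options can drive them. A hypothetical wiring is sketched below; the option names mirror the attributes the helpers read (host, port, log_level, workers, log_file, access_logfile), and the "genelastic.api.server:app" module path follows the usual uvicorn/gunicorn convention rather than anything confirmed by this diff:

    import argparse

    from genelastic.common.server import start_dev_server, start_prod_server

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--log-level", default="info")
    parser.add_argument("--workers", type=int, default=4)
    parser.add_argument("--log-file", default=None)
    parser.add_argument("--access-logfile", default=None)
    parser.add_argument("--dev", action="store_true")
    args = parser.parse_args()

    if args.dev:
        # Uvicorn with auto-reload, for development.
        start_dev_server("genelastic.api.server:app", args)
    else:
        # Gunicorn primary process plus UvicornWorker workers.
        start_prod_server("genelastic.api.server:app", args)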

genelastic/common/types.py

@@ -1,20 +1,25 @@
-# pylint: disable=missing-module-docstring
-
 import typing
 
+# Types related to Elasticsearch data import.
 Bucket: typing.TypeAlias = dict[str, dict[typing.Any, typing.Any]]
+BundleDict: typing.TypeAlias = dict[str, typing.Any]
+
+AnalysisMetaData: typing.TypeAlias = dict[str, str | int]
+WetProcessesData: typing.TypeAlias = dict[str, str | int | float]
+BioInfoProcessData: typing.TypeAlias = dict[str, str | list[str]]
 
-AnalysisMetaData: typing.TypeAlias = typing.Dict[str, str | int]
-WetProcessesData: typing.TypeAlias = typing.Dict[str, str | int | float]
-BioInfoProcessData: typing.TypeAlias = typing.Dict[str, str | typing.List[str]]
-BundleDict: typing.TypeAlias = typing.Dict[str, typing.Any]
+AnalysisDocument: typing.TypeAlias = dict[str, str | None | AnalysisMetaData]
+MetadataDocument: typing.TypeAlias = dict[
+    str, int | str | list[typing.Any | None]
+]
+ProcessDocument: typing.TypeAlias = (
+    dict[str, str] | WetProcessesData | BioInfoProcessData
+)
+BulkItems: typing.TypeAlias = list[
+    dict[str, str | MetadataDocument | AnalysisDocument | ProcessDocument]
+]
 
-AnalysisDocument: typing.TypeAlias = typing.Dict[str, str | None | AnalysisMetaData]
-MetadataDocument: typing.TypeAlias = typing.Dict[str, int | str | typing.List[typing.Any | None]]
-ProcessDocument: typing.TypeAlias = (typing.Dict[str, str] |
-                                     WetProcessesData |
-                                     BioInfoProcessData)
-BulkItems: typing.TypeAlias = typing.List[typing.Dict[str, str |
-                                                      MetadataDocument |
-                                                      AnalysisDocument |
-                                                      ProcessDocument]]
+# Types related to random bundle generation.
+RandomBiProcessData: typing.TypeAlias = dict[str, str | list[dict[str, str]]]
+RandomWetProcessData: typing.TypeAlias = dict[str, str | float]
+RandomAnalysisData: typing.TypeAlias = dict[str, str | list[int | str]]
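For illustration, BulkItems matches the action format accepted by elasticsearch.helpers.bulk, which ElasticImportConn.import_items forwards to. The index and field names below are invented for the example, not taken from this package:

    from genelastic.common.types import AnalysisDocument, BulkItems

    # One bulk action per document; "_index" routes it, "_source" is the payload.
    doc: AnalysisDocument = {"path": "/data/sample1.vcf.gz", "md5": None}
    bulk_items: BulkItems = [
        {"_index": "analyses", "_source": doc},
    ]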

genelastic/import_data/__init__.py

@@ -1,9 +1,27 @@
 """Genelastic package for importing Genomic data into Elasticsearch."""
+
 from .analysis import Analysis
-from .import_bundle_factory import (make_import_bundle_from_files,
-                                    load_import_bundle_file)
-from .tags import Tags
 from .import_bundle import ImportBundle
+from .import_bundle_factory import (
+    load_import_bundle_file,
+    make_import_bundle_from_files,
+)
+from .random_bundle import (
+    RandomAnalysis,
+    RandomBiProcess,
+    RandomBundle,
+    RandomWetProcess,
+)
+from .tags import Tags
 
-__all__ = ['Analysis', 'Tags', 'ImportBundle', 'make_import_bundle_from_files',
-           'load_import_bundle_file']
+__all__ = [
+    "Analysis",
+    "ImportBundle",
+    "RandomAnalysis",
+    "RandomBiProcess",
+    "RandomBundle",
+    "RandomWetProcess",
+    "Tags",
+    "load_import_bundle_file",
+    "make_import_bundle_from_files",
+]

genelastic/import_data/analyses.py

@@ -1,23 +1,22 @@
-# pylint: disable=missing-module-docstring
 import typing
 
-from genelastic.common import BundleDict
+from genelastic.common import BundleDict
 
 from .analysis import Analysis
 from .data_file import DataFile
 
+
 class Analyses:
-    """Class Analyses is a container of Analysis objects.
-    """
+    """Class Analyses is a container of Analysis objects."""
 
     def __init__(self) -> None:
-        self._arr: typing.List[Analysis] = []
+        self._arr: list[Analysis] = []
         self._iter_index: int = 0
 
     def __len__(self) -> int:
         return len(self._arr)
 
-    def __iter__(self) -> typing.Generator[Analysis, typing.Any, None]:
+    def __iter__(self) -> typing.Iterator[Analysis]:
         yield from self._arr
 
     @typing.overload
@@ -25,13 +24,13 @@ class Analyses:
         pass
 
     @typing.overload
-    def __getitem__(self, k: slice) -> typing.List[Analysis]:
+    def __getitem__(self, k: slice) -> list[Analysis]:
         pass
 
-    def __getitem__(self, k):  # type: ignore
+    def __getitem__(self, k):  # type: ignore[no-untyped-def]
         if isinstance(k, int):
             return self._arr[k]
-        return self._arr[k.start:k.stop]
+        return self._arr[k.start : k.stop]
 
     def add(self, a: Analysis) -> None:
         """Add one Analysis object."""
@@ -39,20 +38,18 @@ class Analyses:
 
     def get_nb_files(self, cat: str | None = None) -> int:
         """Get the total number of files as paths."""
-        return len(self.get_data_files(cat = cat))
-
-    def get_data_files(self, cat: str | None = None) -> typing.List[DataFile]:
-        """Get the total number of files as DataFile objects.
-        """
+        return len(self.get_data_files(cat=cat))
 
-        data_files: typing.List[DataFile] = []
+    def get_data_files(self, cat: str | None = None) -> list[DataFile]:
+        """Get the total number of files as DataFile objects."""
+        data_files: list[DataFile] = []
 
         for a in self._arr:
-            data_files.extend(a.get_data_files(cat = cat))
+            data_files.extend(a.get_data_files(cat=cat))
 
         return data_files
 
-    def get_all_categories(self) -> typing.Set[str]:
+    def get_all_categories(self) -> set[str]:
         """Return all the categories of the analyses."""
         categories = set()
         for a in self._arr:
@@ -60,10 +57,10 @@ class Analyses:
         return categories
 
     @classmethod
-    def from_array_of_dicts(cls, arr: typing.Sequence[BundleDict]
-                            ) -> typing.Self:
+    def from_array_of_dicts(
+        cls, arr: typing.Sequence[BundleDict]
+    ) -> typing.Self:
         """Build an Analyses instance."""
-
         analyses = cls()
 
         for d in arr:
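The `@typing.overload` pairing kept in this file is worth restating outside the diff: two decorated stubs give the type checker precise int-in/element-out and slice-in/list-out signatures, while the single untyped implementation does the runtime dispatch. A generic sketch of the same pattern, with a placeholder class and element type rather than genelastic code:

    import typing


    class IntContainer:
        """Placeholder container demonstrating the int/slice overload split."""

        def __init__(self) -> None:
            self._arr: list[int] = []

        @typing.overload
        def __getitem__(self, k: int) -> int: ...

        @typing.overload
        def __getitem__(self, k: slice) -> list[int]: ...

        def __getitem__(self, k):  # type: ignore[no-untyped-def]
            # Runtime dispatch; the overloads above exist only for the checker.
            if isinstance(k, int):
                return self._arr[k]
            return self._arr[k.start : k.stop]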