runnable-0.25.0-py3-none-any.whl → runnable-0.27.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
+ import logging
+ from abc import abstractmethod
+ from typing import Any, Dict
+
+ from runnable import defaults, exceptions
+ from runnable.datastore import BaseRunLogStore, RunLog
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ class AnyPathRunLogStore(BaseRunLogStore):
+     """
+     In this type of Run Log store, we use a file system to store the JSON run log.
+
+     Every single run is stored as a different file which makes it compatible across other store types.
+
+     When to use:
+         When locally testing a pipeline and have the need to compare across runs.
+         Its fully featured and perfectly fine if your local environment is where you would do everything.
+
+     Do not use:
+         If you need parallelization on local, this run log would not support it.
+
+     Example config:
+
+     run_log:
+       type: file-system
+       config:
+         log_folder: The folder to out the logs. Defaults to .run_log_store
+
+     """
+
+     service_name: str = "file-system"
+     log_folder: str = defaults.LOG_LOCATION_FOLDER
+
+     @property
+     def log_folder_name(self):
+         return self.log_folder
+
+     def get_summary(self) -> Dict[str, Any]:
+         summary = {"Type": self.service_name, "Location": self.log_folder}
+
+         return summary
+
+     @abstractmethod
+     def write_to_path(self, run_log: RunLog): ...
+
+     @abstractmethod
+     def read_from_path(self, run_id: str) -> RunLog: ...
+
+     def create_run_log(
+         self,
+         run_id: str,
+         dag_hash: str = "",
+         use_cached: bool = False,
+         tag: str = "",
+         original_run_id: str = "",
+         status: str = defaults.CREATED,
+     ) -> RunLog:
+         """
+         # Creates a Run log
+         # Adds it to the db
+         """
+
+         try:
+             self.get_run_log_by_id(run_id=run_id, full=False)
+             raise exceptions.RunLogExistsError(run_id=run_id)
+         except exceptions.RunLogNotFoundError:
+             pass
+
+         logger.info(f"{self.service_name} Creating a Run Log for : {run_id}")
+         run_log = RunLog(
+             run_id=run_id,
+             dag_hash=dag_hash,
+             tag=tag,
+             status=status,
+         )
+         self.write_to_path(run_log)
+         return run_log
+
+     def get_run_log_by_id(
+         self,
+         run_id: str,
+         full: bool = False,
+     ) -> RunLog:
+         """
+         # Returns the run_log defined by id
+         # Raises Exception if not found
+         """
+         try:
+             logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
+             run_log = self.read_from_path(run_id)
+             return run_log
+         except FileNotFoundError as e:
+             raise exceptions.RunLogNotFoundError(run_id) from e
+
+     def put_run_log(self, run_log: RunLog):
+         """
+         # Puts the run_log into the database
+         """
+         logger.info(
+             f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
+         )
+         self.write_to_path(run_log)
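
The new AnyPathRunLogStore keeps the create/get/put run-log logic in one place and leaves only the two path primitives to subclasses. A minimal sketch of a concrete store built on it, assuming only what the hunk above shows; the class name and the temp-directory location are illustrative, not part of the package:

    import json
    import tempfile
    from pathlib import Path

    from extensions.run_log_store.any_path import AnyPathRunLogStore
    from runnable.datastore import RunLog


    class ScratchRunLogStore(AnyPathRunLogStore):
        """Illustrative only: keeps each run log as <run_id>.json in the temp directory."""

        service_name: str = "scratch"  # hypothetical service name

        def write_to_path(self, run_log: RunLog):
            # Persist the serialised run log under the run id.
            path = Path(tempfile.gettempdir()) / f"{run_log.run_id}.json"
            path.write_text(json.dumps(run_log.model_dump(), ensure_ascii=True, indent=4))

        def read_from_path(self, run_id: str) -> RunLog:
            # A missing file raises FileNotFoundError, which the base class
            # translates into RunLogNotFoundError inside get_run_log_by_id.
            path = Path(tempfile.gettempdir()) / f"{run_id}.json"
            return RunLog(**json.loads(path.read_text()))

The file-system store further down and the new MinIO store follow the same pattern, backed by a local folder and an S3 bucket respectively.
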
@@ -2,14 +2,16 @@ import json
  import logging
  from pathlib import Path
  from string import Template
- from typing import Any, Dict, Optional, Sequence, Union
+ from typing import Any, Dict, Optional, Union
+
+ from cloudpathlib import CloudPath

  from extensions.run_log_store.generic_chunked import ChunkedRunLogStore
  from runnable import defaults, utils

  logger = logging.getLogger(defaults.LOGGER_NAME)

- T = Union[str, Path]
+ MixT = Union[CloudPath, Path]


  class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
@@ -28,7 +30,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):

      def get_matches(
          self, run_id: str, name: str, multiple_allowed: bool = False
-     ) -> Optional[Union[Sequence[T], T]]:
+     ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
          """
          Get contents of files matching the pattern name*

@@ -78,7 +80,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):

          return str(name) + ".json"

-     def _store(self, run_id: str, contents: dict, name: Union[Path, str], insert=False):
+     def _store(self, run_id: str, contents: dict, name: MixT, insert=False):
          """
          Store the contents against the name in the folder.

@@ -87,15 +89,16 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
              contents (dict): The dict to store
              name (str): The name to store as
          """
+         log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)
          if insert:
-             name = self.log_folder_with_run_id(run_id=run_id) / name
+             name = log_folder_with_run_id / name

-         utils.safe_make_dir(self.log_folder_with_run_id(run_id=run_id))
+         utils.safe_make_dir(log_folder_with_run_id)

-         with open(self.safe_suffix_json(name), "w") as fw:
+         with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "w") as fw:
              json.dump(contents, fw, ensure_ascii=True, indent=4)

-     def _retrieve(self, name: Union[str, Path]) -> dict:
+     def _retrieve(self, run_id: str, name: MixT) -> dict:
          """
          Does the job of retrieving from the folder.

@@ -106,8 +109,9 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
              dict: The contents
          """
          contents: dict = {}
+         log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)

-         with open(self.safe_suffix_json(name), "r") as fr:
+         with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "r") as fr:
              contents = json.load(fr)

          return contents
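
The new MixT alias is what lets the chunked store address local folders and cloud buckets with the same code: pathlib.Path and cloudpathlib.CloudPath share the "/" joining and .name semantics that _store and _retrieve now rely on, and _retrieve additionally receives run_id so the store can resolve the run folder itself. A small illustration, assuming cloudpathlib is installed; the folder, bucket and file names are made up:

    from pathlib import Path

    from cloudpathlib import S3Path

    local = Path(".run_log_store/example-run") / "StepLog-train-1717000000.json"
    remote = S3Path("s3://runnable/run-logs/example-run") / "StepLog-train-1717000000.json"

    # Both flavours expose the same joining, .name and .suffix behaviour,
    # so log_folder_with_run_id / safe_suffix_json(name.name) works for either.
    for p in (local, remote):
        print(type(p).__name__, p.name, p.suffix)
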
@@ -3,13 +3,14 @@ import logging
  from pathlib import Path
  from typing import Any, Dict

- from runnable import defaults, exceptions, utils
- from runnable.datastore import BaseRunLogStore, RunLog
+ from extensions.run_log_store.any_path import AnyPathRunLogStore
+ from runnable import defaults, utils
+ from runnable.datastore import RunLog

  logger = logging.getLogger(defaults.LOGGER_NAME)


- class FileSystemRunLogstore(BaseRunLogStore):
+ class FileSystemRunLogstore(AnyPathRunLogStore):
      """
      In this type of Run Log store, we use a file system to store the JSON run log.

@@ -43,7 +44,7 @@ class FileSystemRunLogstore(BaseRunLogStore):

          return summary

-     def write_to_folder(self, run_log: RunLog):
+     def write_to_path(self, run_log: RunLog):
          """
          Write the run log to the folder

@@ -60,7 +61,7 @@ class FileSystemRunLogstore(BaseRunLogStore):
          with json_file_path.open("w") as fw:
              json.dump(run_log.model_dump(), fw, ensure_ascii=True, indent=4) # pylint: disable=no-member

-     def get_from_folder(self, run_id: str) -> RunLog:
+     def read_from_path(self, run_id: str) -> RunLog:
          """
          Look into the run log folder for the run log for the run id.

@@ -88,58 +89,3 @@ class FileSystemRunLogstore(BaseRunLogStore):
              json_str = json.load(fr)
              run_log = RunLog(**json_str) # pylint: disable=no-member
          return run_log
-
-     def create_run_log(
-         self,
-         run_id: str,
-         dag_hash: str = "",
-         use_cached: bool = False,
-         tag: str = "",
-         original_run_id: str = "",
-         status: str = defaults.CREATED,
-     ) -> RunLog:
-         """
-         # Creates a Run log
-         # Adds it to the db
-         """
-
-         try:
-             self.get_run_log_by_id(run_id=run_id, full=False)
-             raise exceptions.RunLogExistsError(run_id=run_id)
-         except exceptions.RunLogNotFoundError:
-             pass
-
-         logger.info(f"{self.service_name} Creating a Run Log for : {run_id}")
-         run_log = RunLog(
-             run_id=run_id,
-             dag_hash=dag_hash,
-             tag=tag,
-             status=status,
-         )
-         self.write_to_folder(run_log)
-         return run_log
-
-     def get_run_log_by_id(
-         self,
-         run_id: str,
-         full: bool = False,
-     ) -> RunLog:
-         """
-         # Returns the run_log defined by id
-         # Raises Exception if not found
-         """
-         try:
-             logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
-             run_log = self.get_from_folder(run_id)
-             return run_log
-         except FileNotFoundError as e:
-             raise exceptions.RunLogNotFoundError(run_id) from e
-
-     def put_run_log(self, run_log: RunLog):
-         """
-         # Puts the run_log into the database
-         """
-         logger.info(
-             f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
-         )
-         self.write_to_folder(run_log)
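
With create_run_log, get_run_log_by_id and put_run_log deleted here, the file-system store becomes a thin subclass that only supplies write_to_path and read_from_path. A usage sketch, assuming the module is importable as extensions.run_log_store.file_system; the run id and folder are illustrative:

    from extensions.run_log_store.file_system import FileSystemRunLogstore

    store = FileSystemRunLogstore(log_folder=".run_log_store")

    # create_run_log and get_run_log_by_id are now inherited from
    # AnyPathRunLogStore and delegate to the two methods defined in this file.
    created = store.create_run_log(run_id="example-run-001")
    fetched = store.get_run_log_by_id(run_id="example-run-001")
    assert fetched.run_id == created.run_id
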
@@ -4,7 +4,9 @@ from abc import abstractmethod
  from enum import Enum
  from pathlib import Path
  from string import Template
- from typing import Any, Dict, Optional, Sequence, Union
+ from typing import Any, Dict, Optional, Union
+
+ from cloudpathlib import CloudPath

  from runnable import defaults, exceptions
  from runnable.datastore import (
@@ -21,7 +23,7 @@ from runnable.datastore import (
  logger = logging.getLogger(defaults.LOGGER_NAME)


- T = Union[str, Path] # Holds str, path
+ MixT = Union[CloudPath, Path] # Holds str, path


  class EntityNotFoundError(Exception):
@@ -87,7 +89,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
      @abstractmethod
      def get_matches(
          self, run_id: str, name: str, multiple_allowed: bool = False
-     ) -> Optional[Union[Sequence[T], T]]:
+     ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
          """
          Get contents of persistence layer matching the pattern name*

@@ -98,7 +100,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
          ...

      @abstractmethod
-     def _store(self, run_id: str, contents: dict, name: T, insert: bool = False):
+     def _store(self, run_id: str, contents: dict, name: MixT, insert: bool = False):
          """
          Store the contents against the name in the persistence layer.

@@ -110,7 +112,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
          ...

      @abstractmethod
-     def _retrieve(self, name: T) -> dict:
+     def _retrieve(self, run_id: str, name: MixT) -> dict:
          """
          Does the job of retrieving from the persistent layer.

@@ -140,7 +142,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
          insert = False

          if match:
-             existing_contents = self._retrieve(name=match) # type: ignore
+             existing_contents = self._retrieve(run_id=run_id, name=match) # type: ignore
              contents = dict(existing_contents, **contents)
              name_to_give = match # type: ignore
          else:
@@ -149,7 +151,9 @@ class ChunkedRunLogStore(BaseRunLogStore):
              )
              insert = True

-         self._store(run_id=run_id, contents=contents, name=name_to_give, insert=insert)
+         self._store(
+             run_id=run_id, contents=contents, name=Path(name_to_give), insert=insert
+         )

      def retrieve(
          self, run_id: str, log_type: LogTypes, name: str = "", multiple_allowed=False
@@ -190,13 +194,13 @@ class ChunkedRunLogStore(BaseRunLogStore):

          if matches:
              if not multiple_allowed:
-                 contents = self._retrieve(name=matches) # type: ignore
+                 contents = self._retrieve(run_id=run_id, name=matches) # type: ignore
                  model = self.ModelTypes[log_type.name].value
                  return model(**contents)

              models = []
              for match in matches: # type: ignore
-                 contents = self._retrieve(name=match)
+                 contents = self._retrieve(run_id=run_id, name=match)
                  model = self.ModelTypes[log_type.name].value
                  models.append(model(**contents))
              return models
@@ -225,7 +229,9 @@ class ChunkedRunLogStore(BaseRunLogStore):
              # No branch logs are found
              return {}
          # Forcing get_matches to always return a list is a better design
-         epoch_created = [str(match).split("-")[-1] for match in matches] # type: ignore
+
+         assert isinstance(matches, list)
+         epoch_created = [str(match).split("-")[-1] for match in matches]

          # sort matches by epoch created
          epoch_created, matches = zip(*sorted(zip(epoch_created, matches))) # type: ignore
@@ -234,7 +240,7 @@ class ChunkedRunLogStore(BaseRunLogStore):

          for match in matches:
              model = self.ModelTypes[log_type.name].value
-             log_model = model(**self._retrieve(match))
+             log_model = model(**self._retrieve(run_id=run_id, name=match))
              logs[log_model.internal_name] = log_model # type: ignore

          return logs
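
For orientation, the epoch-suffix ordering the last two hunks rely on: chunk names produced by the store's naming Template end in "-<epoch>", and matches are sorted by that suffix before being turned into models. A standalone illustration with made-up names:

    # Illustrative chunk names; real ones come from the store's naming Template.
    matches = [
        "StepLog-train-1717000200",
        "StepLog-train-1717000100",
    ]

    epoch_created = [str(match).split("-")[-1] for match in matches]

    # Same zip/sort idiom as in the diff: order matches by creation epoch.
    epoch_created, ordered = zip(*sorted(zip(epoch_created, matches)))
    print(ordered)  # ('StepLog-train-1717000100', 'StepLog-train-1717000200')
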
@@ -0,0 +1,111 @@
+ import json
+ import logging
+ from functools import lru_cache
+ from typing import Any, Dict
+
+ from cloudpathlib import S3Client, S3Path
+ from pydantic import Field, SecretStr
+
+ from extensions.run_log_store.any_path import AnyPathRunLogStore
+ from runnable import defaults
+ from runnable.datastore import RunLog
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ @lru_cache
+ def get_minio_client(
+     endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+ ) -> S3Client:
+     return S3Client(
+         endpoint_url=endpoint_url,
+         aws_access_key_id=aws_access_key_id,
+         aws_secret_access_key=aws_secret_access_key,
+     )
+
+
+ class MinioRunLogStore(AnyPathRunLogStore):
+     """
+     In this type of Run Log store, we use a file system to store the JSON run log.
+
+     Every single run is stored as a different file which makes it compatible across other store types.
+
+     When to use:
+         When locally testing a pipeline and have the need to compare across runs.
+         Its fully featured and perfectly fine if your local environment is where you would do everything.
+
+     Do not use:
+         If you need parallelization on local, this run log would not support it.
+
+     Example config:
+
+     run_log:
+       type: file-system
+       config:
+         log_folder: The folder to out the logs. Defaults to .run_log_store
+
+     """
+
+     service_name: str = "file-system"
+
+     endpoint_url: str = Field(default="http://localhost:9002")
+     aws_access_key_id: SecretStr = SecretStr(secret_value="minioadmin")
+     aws_secret_access_key: SecretStr = SecretStr(secret_value="minioadmin")
+     bucket: str = Field(default="runnable/run-logs")
+
+     def get_summary(self) -> Dict[str, Any]:
+         summary = {"Type": self.service_name, "Location": self.log_folder}
+
+         return summary
+
+     def get_run_log_bucket(self) -> S3Path:
+         run_id = self._context.run_id
+
+         return S3Path(
+             f"s3://{self.bucket}/{run_id}/",
+             client=get_minio_client(
+                 self.endpoint_url,
+                 self.aws_access_key_id.get_secret_value(),
+                 self.aws_secret_access_key.get_secret_value(),
+             ),
+         )
+
+     def write_to_path(self, run_log: RunLog):
+         """
+         Write the run log to the folder
+
+         Args:
+             run_log (RunLog): The run log to be added to the database
+         """
+         run_log_bucket = self.get_run_log_bucket()
+         run_log_bucket.mkdir(parents=True, exist_ok=True)
+
+         run_log_object = run_log_bucket / f"{run_log.run_id}.json"
+         run_log_object.write_text(
+             json.dumps(run_log.model_dump_json(), ensure_ascii=True, indent=4)
+         )
+
+     def read_from_path(self, run_id: str) -> RunLog:
+         """
+         Look into the run log folder for the run log for the run id.
+
+         If the run log does not exist, raise an exception. If it does, decode it
+         as a RunLog and return it
+
+         Args:
+             run_id (str): The requested run id to retrieve the run log store
+
+         Raises:
+             FileNotFoundError: If the Run Log has not been found.
+
+         Returns:
+             RunLog: The decoded Run log
+         """
+         run_log_bucket = self.get_run_log_bucket()
+
+         run_log_object = run_log_bucket / f"{run_id}.json"
+
+         run_log_text = json.loads(run_log_object.read_text())
+         run_log = RunLog(**json.loads(run_log_text))
+
+         return run_log
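
The MinIO store reuses the same two primitives over an S3 bucket. The fields that actually configure it are endpoint_url, aws_access_key_id, aws_secret_access_key and bucket, and get_minio_client is wrapped in lru_cache so one S3Client is shared per endpoint/credential combination. A sketch of the underlying cloudpathlib wiring, using the defaults above and assuming a reachable local MinIO server with the bucket already created; the run id is illustrative:

    from cloudpathlib import S3Client, S3Path

    # Same construction as get_minio_client above; values are the class defaults.
    client = S3Client(
        endpoint_url="http://localhost:9002",
        aws_access_key_id="minioadmin",
        aws_secret_access_key="minioadmin",
    )

    # write_to_path/read_from_path operate on objects shaped like this one.
    run_log_object = S3Path(
        "s3://runnable/run-logs/example-run-001/example-run-001.json", client=client
    )
    run_log_object.write_text('{"run_id": "example-run-001"}')
    print(run_log_object.read_text())
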
runnable/catalog.py CHANGED
@@ -2,7 +2,7 @@ import logging
  from abc import ABC, abstractmethod
  from typing import Any, Dict, List, Optional

- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel, ConfigDict, Field

  import runnable.context as context
  from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):

      service_name: str = ""
      service_type: str = "catalog"
+
+     compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
+
      model_config = ConfigDict(extra="forbid")

      @abstractmethod
@@ -52,14 +55,8 @@ class BaseCatalog(ABC, BaseModel):
      def _context(self):
          return context.run_context

-     @property
-     def compute_data_folder(self) -> str:
-         return defaults.COMPUTE_DATA_FOLDER
-
      @abstractmethod
-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the catalog item by 'name' for the 'run id' and store it in compute data folder.

@@ -79,14 +76,7 @@ class BaseCatalog(ABC, BaseModel):
          raise NotImplementedError

      @abstractmethod
-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.

@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
      def get_summary(self) -> Dict[str, Any]:
          return {}

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing in get")
          return []

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
-         logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
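
The catalog interface is now name-only: run_id, compute_data_folder and synced_catalogs are no longer threaded through every call, since compute_data_folder is a regular field on BaseCatalog and the run id comes from the run context. A minimal sketch against the new signatures; the class is illustrative and subclasses DoNothingCatalog so the remaining BaseCatalog members keep their no-op behaviour:

    from typing import List

    from runnable.catalog import DataCatalog, DoNothingCatalog


    class LoggingCatalog(DoNothingCatalog):
        """Illustrative only: prints what get/put would move."""

        service_name: str = "logging"  # hypothetical service name

        def get(self, name: str) -> List[DataCatalog]:
            print(f"get: would fetch '{name}' into {self.compute_data_folder}")
            return []

        def put(self, name: str) -> List[DataCatalog]:
            print(f"put: would store '{name}' from {self.compute_data_folder}")
            return []
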
runnable/datastore.py CHANGED
@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):

          # If the object was serialised, get it from the catalog
          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.get(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.get(name=self.file_name)
          obj = context.run_context.pickler.load(path=self.file_name)
          os.remove(self.file_name) # Remove after loading
          return obj
@@ -128,7 +128,7 @@ class ObjectParameter(BaseModel):
          context.run_context.pickler.dump(data=data, path=self.file_name)

          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.put(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.put(name=self.file_name)
          os.remove(self.file_name) # Remove after loading


runnable/executor.py CHANGED
@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
          """
          ...

-     @abstractmethod
-     def get_effective_compute_data_folder(self) -> Optional[str]:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             Optional[str]: The compute data folder as defined by catalog handler or the node or None.
-         """
-         ...
-
      @abstractmethod
      def _sync_catalog(
          self, stage: str, synced_catalogs=None
runnable/tasks.py CHANGED
@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
              pm.execute_notebook(**kwds)
              task_console.print(out_file.getvalue())

-             context.run_context.catalog_handler.put(
-                 name=notebook_output_path, run_id=context.run_context.run_id
-             )
+             context.run_context.catalog_handler.put(name=notebook_output_path)

              client = PloomberClient.from_path(path=notebook_output_path)
              namespace = client.get_namespace()
runnable/utils.py CHANGED
@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
      return diff


- def hash_bytestr_iter(bytesiter, hasher, ashexstr=True): # pylint: disable=C0116
-     """Hashes the given bytesiter using the given hasher."""
-     for block in bytesiter: # pragma: no cover
-         hasher.update(block)
-     return hasher.hexdigest() if ashexstr else hasher.digest() # pragma: no cover
+ # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True): # pylint: disable=C0116
+ #     """Hashes the given bytesiter using the given hasher."""
+ #     for block in bytesiter: # pragma: no cover
+ #         hasher.update(block)
+ #     return hasher.hexdigest() if ashexstr else hasher.digest() # pragma: no cover


- def file_as_blockiter(afile, blocksize=65536): # pylint: disable=C0116
-     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
-     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
+ # def file_as_blockiter(afile, blocksize=65536): # pylint: disable=C0116
+ #     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
+ #     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.

-     """
-     with afile: # pragma: no cover
-         block = afile.read(blocksize)
-         while len(block) > 0:
-             yield block
-             block = afile.read(blocksize)
+ #     """
+ #     with afile: # pragma: no cover
+ #         block = afile.read(blocksize)
+ #         while len(block) > 0:
+ #             yield block
+ #             block = afile.read(blocksize)


- def get_data_hash(file_name: str):
+ def get_data_hash(file_name: str) -> str:
      """Returns the hash of the data file.

      Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
      """
      # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
      # TODO: For a big file, we should only hash the first few bytes
-     return hash_bytestr_iter(
-         file_as_blockiter(open(file_name, "rb")), hashlib.sha256()
-     ) # pragma: no cover
+     with open(file_name, "rb") as f:
+         file_hash = hashlib.md5()
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+
+     return file_hash.hexdigest()


  # TODO: This is not the right place for this.
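
get_data_hash now streams the file through hashlib.md5 in 4 KB chunks instead of using the removed SHA-256 helpers, so hashes recorded before and after this change will differ for the same file. A small usage sketch; the file name is illustrative:

    from runnable import utils

    # Create a small file to hash.
    with open("example.bin", "wb") as f:
        f.write(b"hello runnable")

    print(utils.get_data_hash("example.bin"))  # 32-character MD5 hex digest
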