runnable 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl

@@ -0,0 +1,104 @@
+ import logging
+ from abc import abstractmethod
+ from typing import Any, Dict
+
+ from runnable import defaults, exceptions
+ from runnable.datastore import BaseRunLogStore, RunLog
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ class AnyPathRunLogStore(BaseRunLogStore):
+     """
+     In this type of Run Log store, we use a file system to store the JSON run log.
+
+     Every single run is stored as a different file which makes it compatible across other store types.
+
+     When to use:
+         When locally testing a pipeline and have the need to compare across runs.
+         Its fully featured and perfectly fine if your local environment is where you would do everything.
+
+     Do not use:
+         If you need parallelization on local, this run log would not support it.
+
+     Example config:
+
+     run_log:
+       type: file-system
+       config:
+         log_folder: The folder to out the logs. Defaults to .run_log_store
+
+     """
+
+     service_name: str = "file-system"
+     log_folder: str = defaults.LOG_LOCATION_FOLDER
+
+     @property
+     def log_folder_name(self):
+         return self.log_folder
+
+     def get_summary(self) -> Dict[str, Any]:
+         summary = {"Type": self.service_name, "Location": self.log_folder}
+
+         return summary
+
+     @abstractmethod
+     def write_to_path(self, run_log: RunLog): ...
+
+     @abstractmethod
+     def read_from_path(self, run_id: str) -> RunLog: ...
+
+     def create_run_log(
+         self,
+         run_id: str,
+         dag_hash: str = "",
+         use_cached: bool = False,
+         tag: str = "",
+         original_run_id: str = "",
+         status: str = defaults.CREATED,
+     ) -> RunLog:
+         """
+         # Creates a Run log
+         # Adds it to the db
+         """
+
+         try:
+             self.get_run_log_by_id(run_id=run_id, full=False)
+             raise exceptions.RunLogExistsError(run_id=run_id)
+         except exceptions.RunLogNotFoundError:
+             pass
+
+         logger.info(f"{self.service_name} Creating a Run Log for : {run_id}")
+         run_log = RunLog(
+             run_id=run_id,
+             dag_hash=dag_hash,
+             tag=tag,
+             status=status,
+         )
+         self.write_to_path(run_log)
+         return run_log
+
+     def get_run_log_by_id(
+         self,
+         run_id: str,
+         full: bool = False,
+     ) -> RunLog:
+         """
+         # Returns the run_log defined by id
+         # Raises Exception if not found
+         """
+         try:
+             logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
+             run_log = self.read_from_path(run_id)
+             return run_log
+         except FileNotFoundError as e:
+             raise exceptions.RunLogNotFoundError(run_id) from e
+
+     def put_run_log(self, run_log: RunLog):
+         """
+         # Puts the run_log into the database
+         """
+         logger.info(
+             f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
+         )
+         self.write_to_path(run_log)
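The new AnyPathRunLogStore base pulls the shared bookkeeping (create_run_log, get_run_log_by_id, put_run_log) out of the concrete stores; a backend only decides how a run log is written to and read from its path. A minimal sketch of a custom subclass, assuming an in-memory dict as the backing store — the class, its service_name and the logs field are illustrative, not part of the package:

# Illustrative subclass, not part of runnable: it keeps run logs in a dict
# instead of a real path, but satisfies the two abstract hooks of the new base.
from typing import Dict

from pydantic import Field

from extensions.run_log_store.any_path import AnyPathRunLogStore
from runnable.datastore import RunLog


class InMemoryRunLogStore(AnyPathRunLogStore):
    service_name: str = "in-memory"  # hypothetical name, not a registered plugin
    logs: Dict[str, dict] = Field(default_factory=dict)

    def write_to_path(self, run_log: RunLog):
        # Serialise the pydantic model and key it by run_id.
        self.logs[run_log.run_id] = run_log.model_dump()

    def read_from_path(self, run_id: str) -> RunLog:
        try:
            return RunLog(**self.logs[run_id])
        except KeyError as e:
            # The inherited get_run_log_by_id maps FileNotFoundError
            # to exceptions.RunLogNotFoundError.
            raise FileNotFoundError(run_id) from e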
@@ -2,14 +2,16 @@ import json
  import logging
  from pathlib import Path
  from string import Template
- from typing import Any, Dict, Optional, Sequence, Union
+ from typing import Any, Dict, Optional, Union
+
+ from cloudpathlib import CloudPath

  from extensions.run_log_store.generic_chunked import ChunkedRunLogStore
  from runnable import defaults, utils

  logger = logging.getLogger(defaults.LOGGER_NAME)

- T = Union[str, Path]
+ MixT = Union[CloudPath, Path]


  class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
@@ -28,7 +30,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):

      def get_matches(
          self, run_id: str, name: str, multiple_allowed: bool = False
-     ) -> Optional[Union[Sequence[T], T]]:
+     ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
          """
          Get contents of files matching the pattern name*

@@ -78,7 +80,7 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):

          return str(name) + ".json"

-     def _store(self, run_id: str, contents: dict, name: Union[Path, str], insert=False):
+     def _store(self, run_id: str, contents: dict, name: MixT, insert=False):
          """
          Store the contents against the name in the folder.

@@ -87,15 +89,16 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
              contents (dict): The dict to store
              name (str): The name to store as
          """
+         log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)
          if insert:
-             name = self.log_folder_with_run_id(run_id=run_id) / name
+             name = log_folder_with_run_id / name

-         utils.safe_make_dir(self.log_folder_with_run_id(run_id=run_id))
+         utils.safe_make_dir(log_folder_with_run_id)

-         with open(self.safe_suffix_json(name), "w") as fw:
+         with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "w") as fw:
              json.dump(contents, fw, ensure_ascii=True, indent=4)

-     def _retrieve(self, name: Union[str, Path]) -> dict:
+     def _retrieve(self, run_id: str, name: MixT) -> dict:
          """
          Does the job of retrieving from the folder.

@@ -106,8 +109,9 @@ class ChunkedFileSystemRunLogStore(ChunkedRunLogStore):
              dict: The contents
          """
          contents: dict = {}
+         log_folder_with_run_id = self.log_folder_with_run_id(run_id=run_id)

-         with open(self.safe_suffix_json(name), "r") as fr:
+         with open(log_folder_with_run_id / self.safe_suffix_json(name.name), "r") as fr:
              contents = json.load(fr)

          return contents
@@ -3,13 +3,14 @@ import logging
  from pathlib import Path
  from typing import Any, Dict

- from runnable import defaults, exceptions, utils
- from runnable.datastore import BaseRunLogStore, RunLog
+ from extensions.run_log_store.any_path import AnyPathRunLogStore
+ from runnable import defaults, utils
+ from runnable.datastore import RunLog

  logger = logging.getLogger(defaults.LOGGER_NAME)


- class FileSystemRunLogstore(BaseRunLogStore):
+ class FileSystemRunLogstore(AnyPathRunLogStore):
      """
      In this type of Run Log store, we use a file system to store the JSON run log.

@@ -43,7 +44,7 @@ class FileSystemRunLogstore(BaseRunLogStore):

          return summary

-     def write_to_folder(self, run_log: RunLog):
+     def write_to_path(self, run_log: RunLog):
          """
          Write the run log to the folder

@@ -60,7 +61,7 @@ class FileSystemRunLogstore(BaseRunLogStore):
          with json_file_path.open("w") as fw:
              json.dump(run_log.model_dump(), fw, ensure_ascii=True, indent=4)  # pylint: disable=no-member

-     def get_from_folder(self, run_id: str) -> RunLog:
+     def read_from_path(self, run_id: str) -> RunLog:
          """
          Look into the run log folder for the run log for the run id.

@@ -88,58 +89,3 @@ class FileSystemRunLogstore(BaseRunLogStore):
              json_str = json.load(fr)
              run_log = RunLog(**json_str)  # pylint: disable=no-member
          return run_log
-
-     def create_run_log(
-         self,
-         run_id: str,
-         dag_hash: str = "",
-         use_cached: bool = False,
-         tag: str = "",
-         original_run_id: str = "",
-         status: str = defaults.CREATED,
-     ) -> RunLog:
-         """
-         # Creates a Run log
-         # Adds it to the db
-         """
-
-         try:
-             self.get_run_log_by_id(run_id=run_id, full=False)
-             raise exceptions.RunLogExistsError(run_id=run_id)
-         except exceptions.RunLogNotFoundError:
-             pass
-
-         logger.info(f"{self.service_name} Creating a Run Log for : {run_id}")
-         run_log = RunLog(
-             run_id=run_id,
-             dag_hash=dag_hash,
-             tag=tag,
-             status=status,
-         )
-         self.write_to_folder(run_log)
-         return run_log
-
-     def get_run_log_by_id(
-         self,
-         run_id: str,
-         full: bool = False,
-     ) -> RunLog:
-         """
-         # Returns the run_log defined by id
-         # Raises Exception if not found
-         """
-         try:
-             logger.info(f"{self.service_name} Getting a Run Log for : {run_id}")
-             run_log = self.get_from_folder(run_id)
-             return run_log
-         except FileNotFoundError as e:
-             raise exceptions.RunLogNotFoundError(run_id) from e
-
-     def put_run_log(self, run_log: RunLog):
-         """
-         # Puts the run_log into the database
-         """
-         logger.info(
-             f"{self.service_name} Putting the run log in the DB: {run_log.run_id}"
-         )
-         self.write_to_folder(run_log)
@@ -4,7 +4,9 @@ from abc import abstractmethod
  from enum import Enum
  from pathlib import Path
  from string import Template
- from typing import Any, Dict, Optional, Sequence, Union
+ from typing import Any, Dict, Optional, Union
+
+ from cloudpathlib import CloudPath

  from runnable import defaults, exceptions
  from runnable.datastore import (
@@ -21,7 +23,7 @@ from runnable.datastore import (
  logger = logging.getLogger(defaults.LOGGER_NAME)


- T = Union[str, Path]  # Holds str, path
+ MixT = Union[CloudPath, Path]  # Holds str, path


  class EntityNotFoundError(Exception):
@@ -87,7 +89,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
      @abstractmethod
      def get_matches(
          self, run_id: str, name: str, multiple_allowed: bool = False
-     ) -> Optional[Union[Sequence[T], T]]:
+     ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
          """
          Get contents of persistence layer matching the pattern name*

@@ -98,7 +100,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
          ...

      @abstractmethod
-     def _store(self, run_id: str, contents: dict, name: T, insert: bool = False):
+     def _store(self, run_id: str, contents: dict, name: MixT, insert: bool = False):
          """
          Store the contents against the name in the persistence layer.

@@ -110,7 +112,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
          ...

      @abstractmethod
-     def _retrieve(self, name: T) -> dict:
+     def _retrieve(self, run_id: str, name: MixT) -> dict:
          """
          Does the job of retrieving from the persistent layer.

@@ -140,7 +142,7 @@ class ChunkedRunLogStore(BaseRunLogStore):
          insert = False

          if match:
-             existing_contents = self._retrieve(name=match)  # type: ignore
+             existing_contents = self._retrieve(run_id=run_id, name=match)  # type: ignore
              contents = dict(existing_contents, **contents)
              name_to_give = match  # type: ignore
          else:
@@ -149,7 +151,9 @@ class ChunkedRunLogStore(BaseRunLogStore):
              )
              insert = True

-         self._store(run_id=run_id, contents=contents, name=name_to_give, insert=insert)
+         self._store(
+             run_id=run_id, contents=contents, name=Path(name_to_give), insert=insert
+         )

      def retrieve(
          self, run_id: str, log_type: LogTypes, name: str = "", multiple_allowed=False
@@ -190,13 +194,13 @@ class ChunkedRunLogStore(BaseRunLogStore):

          if matches:
              if not multiple_allowed:
-                 contents = self._retrieve(name=matches)  # type: ignore
+                 contents = self._retrieve(run_id=run_id, name=matches)  # type: ignore
                  model = self.ModelTypes[log_type.name].value
                  return model(**contents)

              models = []
              for match in matches:  # type: ignore
-                 contents = self._retrieve(name=match)
+                 contents = self._retrieve(run_id=run_id, name=match)
                  model = self.ModelTypes[log_type.name].value
                  models.append(model(**contents))
              return models
@@ -225,7 +229,9 @@ class ChunkedRunLogStore(BaseRunLogStore):
              # No branch logs are found
              return {}
          # Forcing get_matches to always return a list is a better design
-         epoch_created = [str(match).split("-")[-1] for match in matches]  # type: ignore
+
+         assert isinstance(matches, list)
+         epoch_created = [str(match).split("-")[-1] for match in matches]

          # sort matches by epoch created
          epoch_created, matches = zip(*sorted(zip(epoch_created, matches)))  # type: ignore
@@ -234,7 +240,7 @@ class ChunkedRunLogStore(BaseRunLogStore):

          for match in matches:
              model = self.ModelTypes[log_type.name].value
-             log_model = model(**self._retrieve(match))
+             log_model = model(**self._retrieve(run_id=run_id, name=match))
              logs[log_model.internal_name] = log_model  # type: ignore

          return logs
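For third-party chunked stores, the visible interface change in ChunkedRunLogStore is that run_id is now threaded through _store and _retrieve, and the old T = Union[str, Path] alias becomes MixT = Union[CloudPath, Path]. A sketch of the hook signatures a custom backend would now override (stubs only; the class name and service_name are hypothetical, and the backing mechanism is left open):

# Sketch of the updated abstract hooks of ChunkedRunLogStore; bodies elided.
from pathlib import Path
from typing import Optional, Union

from cloudpathlib import CloudPath

from extensions.run_log_store.generic_chunked import ChunkedRunLogStore

MixT = Union[CloudPath, Path]


class MyChunkedStore(ChunkedRunLogStore):
    service_name: str = "my-chunked-store"  # illustrative, not a registered plugin

    def get_matches(
        self, run_id: str, name: str, multiple_allowed: bool = False
    ) -> Optional[Union[list[Path], list[CloudPath], MixT]]:
        # Return the stored names matching "name*" for this run.
        ...

    def _store(self, run_id: str, contents: dict, name: MixT, insert: bool = False):
        # Persist contents under name, scoped to run_id.
        ...

    def _retrieve(self, run_id: str, name: MixT) -> dict:
        # Load the contents previously stored under name for run_id.
        ...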
@@ -0,0 +1,111 @@
+ import json
+ import logging
+ from functools import lru_cache
+ from typing import Any, Dict
+
+ from cloudpathlib import S3Client, S3Path
+ from pydantic import Field, SecretStr
+
+ from extensions.run_log_store.any_path import AnyPathRunLogStore
+ from runnable import defaults
+ from runnable.datastore import RunLog
+
+ logger = logging.getLogger(defaults.LOGGER_NAME)
+
+
+ @lru_cache
+ def get_minio_client(
+     endpoint_url: str, aws_access_key_id: str, aws_secret_access_key: str
+ ) -> S3Client:
+     return S3Client(
+         endpoint_url=endpoint_url,
+         aws_access_key_id=aws_access_key_id,
+         aws_secret_access_key=aws_secret_access_key,
+     )
+
+
+ class MinioRunLogStore(AnyPathRunLogStore):
+     """
+     In this type of Run Log store, we use a file system to store the JSON run log.
+
+     Every single run is stored as a different file which makes it compatible across other store types.
+
+     When to use:
+         When locally testing a pipeline and have the need to compare across runs.
+         Its fully featured and perfectly fine if your local environment is where you would do everything.
+
+     Do not use:
+         If you need parallelization on local, this run log would not support it.
+
+     Example config:
+
+     run_log:
+       type: file-system
+       config:
+         log_folder: The folder to out the logs. Defaults to .run_log_store
+
+     """
+
+     service_name: str = "file-system"
+
+     endpoint_url: str = Field(default="http://localhost:9002")
+     aws_access_key_id: SecretStr = SecretStr(secret_value="minioadmin")
+     aws_secret_access_key: SecretStr = SecretStr(secret_value="minioadmin")
+     bucket: str = Field(default="runnable/run-logs")
+
+     def get_summary(self) -> Dict[str, Any]:
+         summary = {"Type": self.service_name, "Location": self.log_folder}
+
+         return summary
+
+     def get_run_log_bucket(self) -> S3Path:
+         run_id = self._context.run_id
+
+         return S3Path(
+             f"s3://{self.bucket}/{run_id}/",
+             client=get_minio_client(
+                 self.endpoint_url,
+                 self.aws_access_key_id.get_secret_value(),
+                 self.aws_secret_access_key.get_secret_value(),
+             ),
+         )
+
+     def write_to_path(self, run_log: RunLog):
+         """
+         Write the run log to the folder
+
+         Args:
+             run_log (RunLog): The run log to be added to the database
+         """
+         run_log_bucket = self.get_run_log_bucket()
+         run_log_bucket.mkdir(parents=True, exist_ok=True)
+
+         run_log_object = run_log_bucket / f"{run_log.run_id}.json"
+         run_log_object.write_text(
+             json.dumps(run_log.model_dump_json(), ensure_ascii=True, indent=4)
+         )
+
+     def read_from_path(self, run_id: str) -> RunLog:
+         """
+         Look into the run log folder for the run log for the run id.
+
+         If the run log does not exist, raise an exception. If it does, decode it
+         as a RunLog and return it
+
+         Args:
+             run_id (str): The requested run id to retrieve the run log store
+
+         Raises:
+             FileNotFoundError: If the Run Log has not been found.
+
+         Returns:
+             RunLog: The decoded Run log
+         """
+         run_log_bucket = self.get_run_log_bucket()
+
+         run_log_object = run_log_bucket / f"{run_id}.json"
+
+         run_log_text = json.loads(run_log_object.read_text())
+         run_log = RunLog(**json.loads(run_log_text))
+
+         return run_log
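Note that the docstring and config example above are carried over verbatim from the file-system store (the class even keeps service_name = "file-system"), so there is no documented YAML example for the MinIO store yet. A sketch of constructing it directly from its declared pydantic fields, assuming a local MinIO server with the default credentials and that the module lives at extensions.run_log_store.minio (the import path is an assumption):

# Illustrative only: wires up MinioRunLogStore from its declared fields.
from extensions.run_log_store.minio import MinioRunLogStore  # assumed module path

store = MinioRunLogStore(
    endpoint_url="http://localhost:9002",
    aws_access_key_id="minioadmin",  # pydantic coerces plain strings to SecretStr
    aws_secret_access_key="minioadmin",
    bucket="runnable/run-logs",
)
print(store.get_summary())  # {"Type": "file-system", "Location": <inherited log_folder>}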
runnable/catalog.py CHANGED
@@ -2,7 +2,7 @@ import logging
  from abc import ABC, abstractmethod
  from typing import Any, Dict, List, Optional

- from pydantic import BaseModel, ConfigDict
+ from pydantic import BaseModel, ConfigDict, Field

  import runnable.context as context
  from runnable import defaults
@@ -43,6 +43,9 @@ class BaseCatalog(ABC, BaseModel):

      service_name: str = ""
      service_type: str = "catalog"
+
+     compute_data_folder: str = Field(default=defaults.COMPUTE_DATA_FOLDER)
+
      model_config = ConfigDict(extra="forbid")

      @abstractmethod
@@ -52,14 +55,8 @@ class BaseCatalog(ABC, BaseModel):
      def _context(self):
          return context.run_context

-     @property
-     def compute_data_folder(self) -> str:
-         return defaults.COMPUTE_DATA_FOLDER
-
      @abstractmethod
-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Get the catalog item by 'name' for the 'run id' and store it in compute data folder.

@@ -79,14 +76,7 @@ class BaseCatalog(ABC, BaseModel):
          raise NotImplementedError

      @abstractmethod
-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Put the file by 'name' from the 'compute_data_folder' in the catalog for the run_id.

@@ -140,23 +130,14 @@ class DoNothingCatalog(BaseCatalog):
      def get_summary(self) -> Dict[str, Any]:
          return {}

-     def get(
-         self, name: str, run_id: str, compute_data_folder: str = "", **kwargs
-     ) -> List[DataCatalog]:
+     def get(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing in get")
          return []

-     def put(
-         self,
-         name: str,
-         run_id: str,
-         compute_data_folder: str = "",
-         synced_catalogs: Optional[List[DataCatalog]] = None,
-         **kwargs,
-     ) -> List[DataCatalog]:
+     def put(self, name: str) -> List[DataCatalog]:
          """
          Does nothing
          """
@@ -168,4 +149,3 @@ class DoNothingCatalog(BaseCatalog):
          Does nothing
          """
          logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
-         logger.info("Using a do-nothing catalog, doing nothing while sync between runs")
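With this change the catalog interface no longer takes run_id, compute_data_folder or synced_catalogs: the run id comes from the active run context and compute_data_folder is now a regular pydantic field on BaseCatalog. A sketch of a custom catalog against the narrowed signatures (the class is hypothetical; sync_between_runs is assumed to be the remaining hook, as suggested by DoNothingCatalog above, and DataCatalog is assumed to live in runnable.datastore):

# Hypothetical catalog against the new, narrower BaseCatalog interface.
from typing import Any, Dict, List

from runnable.catalog import BaseCatalog
from runnable.datastore import DataCatalog  # assumed location of DataCatalog


class LoggingCatalog(BaseCatalog):
    service_name: str = "logging-catalog"  # illustrative, not a registered plugin

    def get_summary(self) -> Dict[str, Any]:
        return {"compute_data_folder": self.compute_data_folder}

    def get(self, name: str) -> List[DataCatalog]:
        # run_id is taken from the run context instead of being passed in.
        print(f"get {name} for run {self._context.run_id}")
        return []

    def put(self, name: str) -> List[DataCatalog]:
        print(f"put {name} for run {self._context.run_id}")
        return []

    def sync_between_runs(self, previous_run_id: str, run_id: str):
        print(f"sync catalog from {previous_run_id} to {run_id}")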
runnable/datastore.py CHANGED
@@ -114,7 +114,7 @@ class ObjectParameter(BaseModel):

          # If the object was serialised, get it from the catalog
          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.get(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.get(name=self.file_name)
          obj = context.run_context.pickler.load(path=self.file_name)
          os.remove(self.file_name)  # Remove after loading
          return obj
@@ -128,7 +128,7 @@ class ObjectParameter(BaseModel):
          context.run_context.pickler.dump(data=data, path=self.file_name)

          catalog_handler = context.run_context.catalog_handler
-         catalog_handler.put(name=self.file_name, run_id=context.run_context.run_id)
+         catalog_handler.put(name=self.file_name)
          os.remove(self.file_name)  # Remove after loading

runnable/executor.py CHANGED
@@ -173,23 +173,6 @@ class BasePipelineExecutor(BaseExecutor):
          """
          ...

-     @abstractmethod
-     def get_effective_compute_data_folder(self) -> Optional[str]:
-         """
-         Get the effective compute data folder for the given stage.
-         If there is nothing to catalog, we return None.
-
-         The default is the compute data folder of the catalog but this can be over-ridden by the node.
-
-         Args:
-             stage (str): The stage we are in the process of cataloging
-
-
-         Returns:
-             Optional[str]: The compute data folder as defined by catalog handler or the node or None.
-         """
-         ...
-
      @abstractmethod
      def _sync_catalog(
          self, stage: str, synced_catalogs=None
runnable/tasks.py CHANGED
@@ -501,9 +501,7 @@ class NotebookTaskType(BaseTaskType):
              pm.execute_notebook(**kwds)
              task_console.print(out_file.getvalue())

-             context.run_context.catalog_handler.put(
-                 name=notebook_output_path, run_id=context.run_context.run_id
-             )
+             context.run_context.catalog_handler.put(name=notebook_output_path)

              client = PloomberClient.from_path(path=notebook_output_path)
              namespace = client.get_namespace()
runnable/utils.py CHANGED
@@ -359,26 +359,26 @@ def diff_dict(d1: Dict[str, Any], d2: Dict[str, Any]) -> Dict[str, Any]:
      return diff


- def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
-     """Hashes the given bytesiter using the given hasher."""
-     for block in bytesiter:  # pragma: no cover
-         hasher.update(block)
-     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover
+ # def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):  # pylint: disable=C0116
+ #     """Hashes the given bytesiter using the given hasher."""
+ #     for block in bytesiter:  # pragma: no cover
+ #         hasher.update(block)
+ #     return hasher.hexdigest() if ashexstr else hasher.digest()  # pragma: no cover


- def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
-     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
-     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.
+ # def file_as_blockiter(afile, blocksize=65536):  # pylint: disable=C0116
+ #     """From a StackOverflow answer: that is used to generate a MD5 hash of a large files.
+ #     # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file.

-     """
-     with afile:  # pragma: no cover
-         block = afile.read(blocksize)
-         while len(block) > 0:
-             yield block
-             block = afile.read(blocksize)
+ #     """
+ #     with afile:  # pragma: no cover
+ #         block = afile.read(blocksize)
+ #         while len(block) > 0:
+ #             yield block
+ #             block = afile.read(blocksize)


- def get_data_hash(file_name: str):
+ def get_data_hash(file_name: str) -> str:
      """Returns the hash of the data file.

      Args:
@@ -389,9 +389,12 @@ def get_data_hash(file_name: str):
      """
      # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
      # TODO: For a big file, we should only hash the first few bytes
-     return hash_bytestr_iter(
-         file_as_blockiter(open(file_name, "rb")), hashlib.sha256()
-     )  # pragma: no cover
+     with open(file_name, "rb") as f:
+         file_hash = hashlib.md5()
+         for chunk in iter(lambda: f.read(4096), b""):
+             file_hash.update(chunk)
+
+     return file_hash.hexdigest()


  # TODO: This is not the right place for this.
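get_data_hash now inlines the hashing instead of going through the removed helpers, reading the file in 4 KiB chunks; note the digest also switches from SHA-256 to MD5, so hashes recorded by earlier versions will not match. A small stand-alone sketch of the same chunked pattern (the temporary file and the data_hash name are only for illustration):

# Stand-alone illustration of the chunked MD5 pattern used above.
import hashlib
import tempfile


def data_hash(file_name: str) -> str:
    # Read the file in 4 KiB blocks so large files never sit fully in memory.
    with open(file_name, "rb") as f:
        digest = hashlib.md5()
        for chunk in iter(lambda: f.read(4096), b""):
            digest.update(chunk)
    return digest.hexdigest()


with tempfile.NamedTemporaryFile("wb", delete=False, suffix=".csv") as tmp:
    tmp.write(b"a,b\n1,2\n")

print(data_hash(tmp.name))  # identical file contents always give the same digest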