memodisk 0.0.6.dev0__tar.gz → 0.1.1.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. memodisk-0.1.1.dev0/.coverage +0 -0
  2. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.github/workflows/python-publish.yml +2 -2
  3. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.github/workflows/run-tests.yml +1 -1
  4. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/PKG-INFO +52 -47
  5. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/README.md +46 -41
  6. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk/__init__.py +10 -5
  7. memodisk-0.1.1.dev0/memodisk/_version.py +24 -0
  8. memodisk-0.1.1.dev0/memodisk/memodisk.py +2129 -0
  9. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk.egg-info/PKG-INFO +52 -47
  10. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk.egg-info/SOURCES.txt +7 -0
  11. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk.egg-info/requires.txt +2 -3
  12. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/pyproject.toml +12 -13
  13. memodisk-0.1.1.dev0/run_tests_in_python_venvs.ps1 +88 -0
  14. memodisk-0.1.1.dev0/setup_python_venvs.ps1 +103 -0
  15. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/code_test_code_dep_numba_a.py +3 -2
  16. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/code_test_code_dep_numba_b.py +3 -2
  17. memodisk-0.1.1.dev0/tests/debug_tracer.py +27 -0
  18. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/non_pure_functions.py +2 -1
  19. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_closure.py +2 -2
  20. memodisk-0.1.1.dev0/tests/test_code_dependency.py +401 -0
  21. memodisk-0.1.1.dev0/tests/test_comprehensive.py +1318 -0
  22. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_data_dependency_change.py +44 -0
  23. memodisk-0.1.1.dev0/tests/test_debug_tracer.py +30 -0
  24. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_decorator.py +4 -6
  25. memodisk-0.1.1.dev0/tests/test_failure_modes.py +1357 -0
  26. memodisk-0.1.1.dev0/tests/test_internals.py +26 -0
  27. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_non_pure_functions.py +43 -5
  28. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_numba.py +4 -4
  29. memodisk-0.1.1.dev0/uv.lock +848 -0
  30. memodisk-0.0.6.dev0/memodisk/_version.py +0 -21
  31. memodisk-0.0.6.dev0/memodisk/memodisk.py +0 -903
  32. memodisk-0.0.6.dev0/tests/test_code_dependency.py +0 -33
  33. memodisk-0.0.6.dev0/tests/test_internals.py +0 -47
  34. memodisk-0.0.6.dev0/uv.lock +0 -818
  35. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.github/workflows/python-package.yml +0 -0
  36. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.gitignore +0 -0
  37. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.vscode/launch.json +0 -0
  38. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/LICENSE +0 -0
  39. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/examples/example_1.py +0 -0
  40. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/examples/example_2.py +0 -0
  41. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/examples/example_3.py +0 -0
  42. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk/development.md +0 -0
  43. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk/py.typed +0 -0
  44. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk.egg-info/dependency_links.txt +0 -0
  45. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk.egg-info/top_level.txt +0 -0
  46. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/setup.cfg +0 -0
  47. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.f2_48fdc14256d5fb8d_dependencies.json +0 -0
  48. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.f2_48fdc14256d5fb8d_result.pkl +0 -0
  49. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.f3_4a6f12b45303cb21_dependencies.json +0 -0
  50. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.f3_4a6f12b45303cb21_result.pkl +0 -0
  51. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.fun_a_a18bf514f8e6e6e6_dependencies.json +0 -0
  52. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.fun_a_a18bf514f8e6e6e6_result.pkl +0 -0
  53. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.fun_b_1eba499228544c31_dependencies.json +0 -0
  54. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/__memodisk__/__main__.fun_b_1eba499228544c31_result.pkl +0 -0
  55. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/code_test_code_dep_1_a.py +0 -0
  56. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/code_test_code_dep_1_b.py +0 -0
  57. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_class.py +0 -0
  58. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_data_dependency_change_monkey_patch.py +0 -0
  59. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_dependency_inheritance.py +0 -0
  60. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_dependency_inheritance_a.py +0 -0
  61. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_dependency_inheritance_b.py +0 -0
  62. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_memoize_random_state.py +0 -0
  63. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_method_dependency.py +0 -0
  64. {memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/tests/test_nested_memoize.py +0 -0
memodisk-0.1.1.dev0/.coverage

Binary file (contents not shown)
{memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.github/workflows/python-publish.yml

@@ -14,7 +14,7 @@ jobs:
   verify-tests:
     uses: ./.github/workflows/run-tests.yml
     with:
-      python-version: '["3.10"]' # Run with a single version to be faster
+      python-version: '["3.12", "3.13", "3.14"]'
 
   pypi-publish:
     name: upload release to PyPI
@@ -31,7 +31,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v3
         with:
-          python-version: '3.10'
+          python-version: '3.12'
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
{memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/.github/workflows/run-tests.yml

@@ -6,7 +6,7 @@ on:
       python-version:
         required: false
         type: string
-        default: '["3.10"]'
+        default: '["3.12", "3.13", "3.14"]'
         description: 'Python versions to test with'
 
 jobs:
{memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: memodisk
-Version: 0.0.6.dev0
+Version: 0.1.1.dev0
 Summary: A python module to memoize function results on disk with python code and data dependencies tracking
 Author-email: Martin de La Gorce <martin.delagorce@gmail.com>
 Maintainer-email: Martin de La Gorce <martin.delagorce@gmail.com>
@@ -34,15 +34,15 @@ Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3.10
-Requires-Python: ==3.10.*
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: typing-extensions>=4.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=6.0; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
-Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: ruff>=0.15.0; extra == "dev"
 Requires-Dist: mypy; extra == "dev"
 Requires-Dist: pre-commit; extra == "dev"
 Requires-Dist: build; extra == "dev"
@@ -51,30 +51,30 @@ Provides-Extra: test
 Requires-Dist: pytest>=6.0; extra == "test"
 Requires-Dist: pytest-cov; extra == "test"
 Requires-Dist: numpy>1.24.4; extra == "test"
-Requires-Dist: numba>=0.61; extra == "test"
+Requires-Dist: numba>=0.65; extra == "test"
 Requires-Dist: opencv-python; extra == "test"
 Dynamic: license-file
 
 # memodisk
 
-**memodisk** is a python module to memoize function results on disk with python code and data dependencies tracking.
+A Python package to cache function results on disk with dependency changes tracking.
 
 ![Python package](https://github.com/martinResearch/memodisk/workflows/Python%20package/badge.svg)
 
 ## Goal
 
-This package provides a python decorator to save on disk and reuse the results of functions that are long to execute. This can be referred to as *persistent memoization*.
+This package provides a Python decorator to save on disk and reuse the results of functions that are long to execute. This can be referred to as *persistent memoization*.
 
 The result of a decorated function is loaded from disk if and only if
 
 * the function has been called previously with the same input argument
-* there are no changes in the python code dependencies or data dependency files.
+* there are no changes in the Python code dependencies or data dependency files.
 
-The use of the second condition differentiates this library from most other python persistent memoization libraries. It is a useful feature when prototyping code with frequent code changes.
+The use of the second condition differentiates this library from most other Python persistent memoization libraries. It is a useful feature when prototyping code with frequent code changes.
 
 ## Warning
 
-This is a prototype with limited testing and works with python 3.10 only. There could be security risks related to the use of Pickle. Some data, code or global variables dependencies could be not detected leading the memoization to return stale results (see the limitations section). If you find failure modes that are not listed in the limitations, please create an issue with a minimal code example to reproduce the problem.
+This is a prototype with limited testing and currently targets Python 3.12+. There could be security risks related to the use of Pickle. Some data, code or global variables dependencies could be not detected leading the memoization to return stale results (see the limitations section). If you find failure modes that are not listed in the limitations, please create an issue with a minimal code example to reproduce the problem.
 
 ## Installation
 
@@ -87,9 +87,11 @@ pip install memodisk
 Using the memoization is as simple as adding the `@memoize` decorator to the function you want to use memoization with. In general you want to use memoization on a function whose execution time is longer than the time it takes to check that this function has already been called with the same argument (compute the input hashes) and load the result from disk.
 To get the largest speedups thanks to memoization, it might be necessary to refactor the code to move the parts that are long to execute into functions that take limited size inputs.
 
+The decorator also accepts options such as `mode=...`, `external_process_mode=...`, `ignore_code_changes=True`, `condition=...`, `serializer=...`, `argument_hasher=...`, and `store_call_arguments=True`. The `condition` callable receives the same arguments as the memoized function and allows caching only for selected invocations. The `serializer` option customizes how cached results are written and read back, while argument hashing remains unchanged. The `argument_hasher` option customizes cache-key generation for `args` and `kwargs`, which is useful for very large inputs or objects that are not picklable by default. The `store_call_arguments` option writes a companion pickle file containing the original `args` and `kwargs` so a cached call can be replayed later. The helper `load_cached_call_arguments(...)` reads those stored arguments back from a dependencies JSON file. The `external_process_mode` option controls subprocess and `os.system` handling: `manual` disables automatic executable tracking, `direct` records the launched executable path but marks subprocess coverage as incomplete, and `strict` raises when traced code launches external processes without complete dependency coverage. The `mode` policy also governs common ambient time APIs such as `time.time()` and `datetime.datetime.now()`: `safe` recomputes and skips cache writes, `strict` raises, and `optimistic` allows caching.
+
 ### Example 1: code dependency changes
 
-let run the code in [example_1.py](./tests/example_1.py) several times.
+let's run the code in [example_1.py](./tests/example_1.py) several times.
 
 ```python
 from memodisk import memoize
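The options paragraph added in this hunk names several new decorator keywords. Below is a minimal sketch of how they might combine; the `cache_only_big_calls` predicate and its threshold are hypothetical, and the keyword semantics are taken from the README wording rather than verified against the 0.1.1.dev0 source:

```python
from memodisk import memoize


def cache_only_big_calls(n: int) -> bool:
    # Hypothetical predicate: only memoize calls that are worth the disk I/O.
    return n > 1_000_000


@memoize(condition=cache_only_big_calls, store_call_arguments=True)
def sum_of_squares(n: int) -> int:
    return sum(i * i for i in range(n))
```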
@@ -128,7 +130,7 @@ fun_b(5) = 77
 
 As you can see the function fun_b is executed only once. The second time the function is called, the result is loaded from a file on disk and the function is not executed.
 
-Let now edit the file and replace `x * x * 3` by `x * x * 2` and execute again. As expected we now get
+Let's now edit the file and replace `x * x * 3` by `x * x * 2` and execute again. As expected we now get
 
 ```
 executing fun_b
@@ -140,7 +142,7 @@ The change in the body of the function `fun_a`, that is a dependency of `fun_b`,
 
 ### Example 2: data dependency changes
 
-The example [example_2.py](./tests/example_2.py) illustrates how to keep track of data dependencies access using the build-in `open` function.
+The example [example_2.py](./tests/example_2.py) illustrates how to keep track of data dependencies access using the built-in `open` function.
 
 ```python
 from memodisk import memoize
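The `open`-wrapping mechanism this example relies on can be illustrated standalone. This is the general monkey-patching technique the README describes, not memodisk's actual implementation:

```python
import builtins

_original_open = builtins.open
_read_dependencies: set[str] = set()


def _tracking_open(file, mode="r", *args, **kwargs):
    # Record every file opened for reading so it can later be stored
    # as a data dependency of the function being traced.
    if "r" in mode:
        _read_dependencies.add(str(file))
    return _original_open(file, mode, *args, **kwargs)


builtins.open = _tracking_open
```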
@@ -167,13 +169,13 @@ if __name__ == "__main__":
     assert load_file() == "b"
 ```
 
-When we call `save_file("b")` we overwrite the data in `data_file.txt`. This change in the file content gets detected when calling `load_file` for the second time. This is done by automatically replacing the built-in python function *open* with a wrapper around this function that keeps track of files that are accessed for reading.
+When we call `save_file("b")` we overwrite the data in `data_file.txt`. This change in the file content gets detected when calling `load_file` for the second time. This is done by automatically replacing the built-in Python function *open* with a wrapper around this function that keeps track of files that are accessed for reading.
 
 ### Example 3: file access monkey patching
 
-The built-in `open` function is not the only way the code can access data. For example images can be loaded using opencv's `imread` function. If some data is loaded with another function than the build-in `open` function then the data dependency will not be automatically detected.
+The built-in `open` function is not the only way the code can access data. For example images can be loaded using opencv's `imread` function. If some data is loaded with another function than the built-in `open` function then the data dependency will not be automatically detected.
 
-We provide a function `add_data_dependency` that the user can call from his code next to the line of code that loads the data, with the path of the file that contains the data as input. However this can be error prone as it is very easy to forget calling `add_data_dependency` in some places.
+We provide a function `add_data_dependency` that the user can call from their code next to the line of code that loads the data, with the path of the file that contains the data as input. However this can be error prone as it is very easy to forget calling `add_data_dependency` in some places.
 
 We provide a less error-prone mechanism, through a functor called `DataLoaderWrapper`. The functor allows the user to replace any function accessing data (opencv's `imread` function for example) by a wrapper around this function so that it automatically calls the function add_data_dependency each time `imread` is used. This in-memory modification is called *monkey-patching* and is done in [example_3.py](./tests/example_3.py) using the `DataLoaderWrapper` functor.
 
@@ -187,17 +189,22 @@ cv2.imread = DataLoaderWrapper(cv2.imread)
 
 ## How this works
 
-For each function that is decorated with the `memoize` decorator we keep track of all the python functions it depends on at runtime. Similarly to what is done in the [coverage](https://coverage.readthedocs.io/en/6.0.2/), we provide through `sys.settrace` a callback function to the python interpreter that gets called each time the interpreter calls a new function or executes a new line of code. Doing run time analysis allows us to keep as dependencies only the functions that are required to evaluate the memoize function for the specific arguments. In contrast, static analysis would yield unnecessary dependencies by detecting dependencies for all possible code paths, even the one not executed for the specific set of input arguments. In order to keep the list of dependency files reasonable we exclude from this list of dependencies the functions defined in files under the python lib folder, assuming these will not get modified. The user can also provide an additional list of files he wants to exclude.
+For each function that is decorated with the `memoize` decorator we keep track of all the Python functions it depends on at runtime. Similarly to what is done in the [coverage](https://coverage.readthedocs.io/en/6.0.2/), we register a callback with Python's runtime monitoring API (`sys.monitoring` in Python 3.12+) so the interpreter notifies memodisk when Python functions start executing. Doing runtime analysis allows us to keep as dependencies only the functions that are required to evaluate the memoize function for the specific arguments. In contrast, static analysis would yield unnecessary dependencies by detecting dependencies for all possible code paths, even the one not executed for the specific set of input arguments. In order to keep the list of dependency files reasonable we exclude from this list of dependencies the functions defined in files under the Python lib folder, assuming these will not get modified. The user can also provide an additional list of files to exclude.
 
-For each function listed in the dependencies we compute a hash from its bytecode
-Using the bytecode instead of the function body text allows the user to modify the comments or the formatting a the function without invalidating the previously cached results for this function or the functions that depends on it. We could use instead the hash of the Abstract Syntax Tree of the function (see the [ast module](https://docs.python.org/3/library/ast.html)), but that would rely on the assumption that the code source is not modified during execution of the script (unless there is a way to get access to the AST from when the execution was started). One current limitation resulting from hashing the bytecode is that the python debugger modifies the bytecode when adding breakpoint, which leads to cache misses. This could be potentially resolved by filtering out the lines added by the python debugger before computing the hash of a function.
+For each function listed in the dependencies we compute a hash from its bytecode.
+Using the bytecode instead of the function body text allows the user to modify the comments or the formatting of the function without invalidating the previously cached results for this function or the functions that depend on it. We could use instead the hash of the Abstract Syntax Tree of the function (see the [ast module](https://docs.python.org/3/library/ast.html)), but that would rely on the assumption that the code source is not modified during execution of the script (unless there is a way to get access to the AST from when the execution was started). On current Python 3.12+ with `debugpy`, line breakpoints did not reproduce a bytecode hash change in our tests.
+We also store the source file modification time for each code dependency so cache validation can skip recomputing bytecode hashes when the dependency file has not changed.
+For globals or imported callables backed by packages installed in `site-packages`, we also store the distribution version so upgrading a dependency invalidates stale cache entries even when the package code itself is not traced as a user dependency.
+For compiled extension modules that are already loaded under those same top-level packages, we also store the extension file modification times so in-place `.pyd`/`.so`/`.dll` swaps invalidate the cache.
 
 We keep track of the data dependencies by storing the last modification date of the files that have been opened in read mode.
-The code dependencies hashes and data dependencies last modification dates are saved in a human readable json file in the folder `memodisk_cache` in the user's temp folder (this can be modified at the module level by changing the model variable `disk_cache_dir` ) while the result of the function is pickled in a binary file.
+When traced code launches external processes through `subprocess` or `os.system`, memodisk can also store the resolved top-level executable path as a data dependency so changing that executable invalidates stale cache entries. This is only direct executable tracking: memodisk does not attempt to discover the full runtime dependency graph of that process such as transitively loaded `.dll`/`.so` files, plugins, shell expansion, or other environment-dependent behavior. Cache entries also record whether external-process tracking was complete. In `safe` mode, incomplete external-process coverage forces recomputation instead of serving a cached result, and in `strict` mode it raises.
+When traced code calls common ambient time APIs such as `time.time()`, `time.time_ns()`, `datetime.datetime.now()`, `datetime.datetime.utcnow()`, `datetime.datetime.today()`, or `datetime.date.today()`, memodisk records that the result depended on ambient time. This tracking is intentionally treated as incomplete: in `safe` mode memodisk recomputes instead of serving or writing a cache entry, and in `strict` mode it raises. The robust way to memoize time-sensitive code is to pass the relevant timestamp or date in as an explicit function argument.
+The code dependencies hashes and data dependencies last modification dates are saved in a human readable json file in the folder `memodisk_cache` in the user's temp folder (this can be modified at the module level by changing the module variable `disk_cache_dir` ) while the result of the function is saved in a binary file using pickle by default or a custom serializer when provided.
 The names of the two generated files differ only by the extension (json and pkl) and are formatted
 by default as `{function_name}_{arguments_hash}.json` and `{function_name}_{arguments_hash}.pkl`.
 There are some subtle details to take into consideration when accessing data from a file in order to guarantee that the caching will not provide stale results.
-The data dependency "version" is obtained by recording the modification date of the accessed file. The modification date resolution is limited and it is possible that a file gets modified during the lapse of time during which the modification date is not incremented. We guard against this by locking the file for a time that is greater than the modification date quantization step.
+The data dependency "version" is obtained by recording the modification date of the accessed file. The modification date resolution is limited and it is possible that a file gets modified during the lapse of time during which the modification date is not incremented. We guard against this by locking the file for a time that is greater than the modification date quantization step.
 
 The hash of the arguments is obtained by pickling the arguments. This can be slow if the input is large or made of many small objects.
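The rewritten paragraph above swaps the old `sys.settrace` hook for `sys.monitoring`. A minimal standalone sketch of that mechanism (illustrative only, not memodisk's actual callback):

```python
import sys

# Claim a tool id and ask to be notified whenever a Python function starts
# executing (Python 3.12+).
TOOL_ID = sys.monitoring.PROFILER_ID
sys.monitoring.use_tool_id(TOOL_ID, "dependency-tracer")

seen: set[tuple[str, str]] = set()


def on_py_start(code, instruction_offset):
    # Record which functions ran so they can become code dependencies.
    seen.add((code.co_qualname, code.co_filename))


sys.monitoring.register_callback(TOOL_ID, sys.monitoring.events.PY_START, on_py_start)
sys.monitoring.set_events(TOOL_ID, sys.monitoring.events.PY_START)
```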
 
@@ -213,7 +220,10 @@ Here is an example of a generated dependencies json file:
 "filename": "D:\\repos\\memodisk\\tests\\test_data_dependency_change.py",
 "bytecode_hash": "f7b971c6ea7997dc5c5222f74cec2a249e8680293511c6a8350b621643af2d07",
 "global_vars": {},
-"closure_vars": {}
+"closure_vars": {},
+"package_versions": {},
+"compiled_dependencies": {},
+"file_last_modified_date_str": "2026-04-04 15:33:14.682488"
 }
 ],
 "data": [
@@ -222,35 +232,39 @@ Here is an example of a generated dependencies json file:
 "last_modified_date_str": "2020-03-04 15:33:14.682488"
 }
 ],
-"random_states": null
+"random_states": null,
+"external_processes": [],
+"ambient_time_sources": [],
+"ambient_environment_sources": [],
+"ignore_code_changes": false,
+"result_serializer": "pickle",
+"argument_hasher": null,
+"store_call_arguments": false,
+"call_arguments_file": null
 }
 ```
 
-We do not try to detect changes in the global variables but an error will be raised if any of the functions listed in the dependencies uses global variables.
+We do not try to detect changes in global variables but an error will be raised if any of the functions listed in the dependencies uses global variables.
 
 ## Using numpy's random generator
 
-A function that uses the default numpy generator is problematic from caching two reasons: 1) its output depends on the random generator state that is not provided as an explicit input to the function 2) the function modifies the state of the random generator for the functions that get called after it.
-When retrieving a cached results for such a function we want to use the state of the random generator when entering the function in the hash of the inputs and after retrieveing cached results we want to set the random state to the same state as the one we would get by running the the cached function.
+A function that uses the default numpy generator is problematic for caching for two reasons: 1) its output depends on the random generator state that is not provided as an explicit input to the function 2) the function modifies the state of the random generator for the functions that get called after it.
+When retrieving cached results for such a function we want to use the state of the random generator when entering the function in the hash of the inputs and after retrieving cached results we want to set the random state to the same state as the one we would get by running the cached function.
 The use of the random generator is detected by comparing the state of the random generator state before and after executing the function.
 The input state and output state of the random generator are saved in the json file and the memoized result is loaded in subsequent run only if the random state is identical to the one saved in the json file i.e. when entering the function at the first run, the result is loaded from the pickle file and the random state is modified to match the random state after execution of the function at the first run.
 
-This mechanism can fail in some cases, if a function access the random generator but restore the generator in the same state as it was when entring the function for example and thus we recommend to avoid using the default "global" numpy random generator, but instead to use instances of `numpy.random.Generator` that are passed as arguments to the functions that use the random number in order to reduce the risk of getting stale results from the memoize decorator.
+This mechanism can fail in some cases, if a function accesses the random generator but restores the generator in the same state as it was when entering the function for example and thus we recommend to avoid using the default "global" numpy random generator, but instead to use instances of `numpy.random.Generator` that are passed as arguments to the functions that use the random number in order to reduce the risk of getting stale results from the memoize decorator.
 
 If the same function is called multiple times with the same input arguments but with different random states, then a single memoization file is used and gets overwritten. We could add an argument to the memoize decorator to tell the memoize decorator to use the random state when computing the hash of the input arguments to allow the use of multiple memoization files for the same function with one file for each state of the random generator.
 
 ## Limitations
 
-* does not support the property decorator in some cases as `getattr` is trying to execute the function.
-* requires all the function arguments of the memoized function to be serializable using pickle.
+* by default, requires all the function arguments of the memoized function to be serializable using pickle. This can be overridden with `argument_hasher=...` for cache-key generation.
 * may not detect all global variables dependencies.
-* will detect an added breakpoint in a function as a change in the code because the python debugger adds line in the function bytecode when using breakpoint.
-* does not detect non determinism due to use of time.
-* does not detect changes in C/C++ extension modules or external executables, unless the pyd file, dll or executable dependency is explicitly specified through the `add_data_dependency` function.
+* ambient time detection covers common direct calls such as `time.time()` and `datetime.datetime.now()`, but may still miss indirect wrappers, custom clocks, or time values obtained outside the memoized call and captured through globals or closures.
+* external-process tracking is limited to the directly resolved executable path. It does not cover full transitive native dependencies such as `.dll`/`.so` chains, plugins, registry/config driven behavior, or other runtime environment changes.
+* may miss some shell-invoked external commands when the real executable cannot be resolved from the command string. In those cases the executable should still be declared explicitly with `add_data_dependency`.
 * does not detect changes in remote dependencies fetched from the network.
-* is not thread safe. It does not support multi-threading
-* will not detect if an aliased import is modified.
-* computes argument hash using pickled object strings, which does not always produce the same string for identical objects. Could use [compute_fixed_hash](https://github.com/QUVA-Lab/artemis/blob/84d3b1daf0de363cc823d99f978e2861ed400b5b/artemis/general/hashing.py#L25).
 * has no configurable cache size.
 * will memoize only the decorated functions.
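The before/after state comparison described in this hunk can be sketched as follows; this mirrors the detection idea, not memodisk's code:

```python
import numpy as np


def state_fingerprint() -> tuple:
    kind, keys, pos, has_gauss, cached = np.random.get_state()
    return (kind, keys.tobytes(), pos, has_gauss, cached)


before = state_fingerprint()
_ = np.random.rand(3)   # consumes randomness from the global generator
after = state_fingerprint()
assert before != after  # the call is detected as using the global generator
```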
 
@@ -258,20 +272,10 @@ Some of these failure modes can be reproduced using scripts in the [failure_mode
 
 ## TODO
 
-* add the module name of the functions in the code dependencies description.
-* filter out the lines added by the python debugger when using breakpoint before computing the hash of a function.
-* add the file modification date in the code dependencies and a test to skip checking functions hashes if the file modification date did not change.
-* add an option to save the full input arguments in the pickle file in order to be able to re-run a function directly from the pickled data
-* save the versions of module dependencies that in the lib/site-packages folder and add an option to remove function dependencies from packages under site-packages and use only module version to detect a dependency change, assuming the package in site-packages does not get modified. maybe use the file modification date for code under site-packages instead of functions ast
 * improve the detection of non-pure function so that it works when using a compiled third party module
-* allow the use of a different serialization library than pickle. It could be provided at module level to disc_memoize or as argument to the memoize decorator
-* add the possibility to provide a condition in the decorator to memoize or not
-* add a less intrusive alternative to the use of decorator by registering a function in a list of function names provided directly to disc_memoize
+* add a less intrusive alternative to the use of a decorator by registering a function in a list of function names provided directly to disk_memoize
 * implement an automatic memoization of function that are long to evaluate using similar criterion to IncPy (see references) to decide if a function should be memoize or not
-* make the tool thread-safe
-* see if we can detect compiled module loading and compiled module calling to add the compiled module as dependency.
 * publish module on [pypi.org](pypi.org)
-* see if we can make the hashing more deterministic using the method implemented in [charmonium.cache](https://pypi.org/project/charmonium.cache).
 
 ## Alternatives
 
@@ -283,3 +287,4 @@
 * [Artemis.fileman.disk_memoize](https://github.com/QUVA-Lab/artemis/blob/master/artemis/fileman/disk_memoize.py) It does not detect changes in the code or data dependencies. [pdf](http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=59BEC4646686E70CFD2428EF9786B9D0?doi=10.1.1.224.164&rep=rep1&type=pdf)
 * [noWorkflow](http://gems-uff.github.io/noworkflow/). *noWorkflow: a Tool for Collecting, Analyzing, and Managing Provenance from Python Scripts* [pdf](https://par.nsf.gov/servlets/purl/10048452). Library that allows to track how data has been generated. It bears some similarity with the library as it also requires to keep track of dependencies.
 * [klepto](https://mmckerns.github.io/project/pathos/wiki/klepto.html). Allows caching of python function results to files or database archive. The detection of code change is not mentioned.
+* [exca](https://github.com/facebookresearch/exca). Developed by Facebook Research. The detection of code change is not mentioned.
{memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/README.md

@@ -1,23 +1,23 @@
 # memodisk
 
-**memodisk** is a python module to memoize function results on disk with python code and data dependencies tracking.
+A Python package to cache function results on disk with dependency changes tracking.
 
 ![Python package](https://github.com/martinResearch/memodisk/workflows/Python%20package/badge.svg)
 
 ## Goal
 
-This package provides a python decorator to save on disk and reuse the results of functions that are long to execute. This can be referred to as *persistent memoization*.
+This package provides a Python decorator to save on disk and reuse the results of functions that are long to execute. This can be referred to as *persistent memoization*.
 
 The result of a decorated function is loaded from disk if and only if
 
 * the function has been called previously with the same input argument
-* there are no changes in the python code dependencies or data dependency files.
+* there are no changes in the Python code dependencies or data dependency files.
 
-The use of the second condition differentiates this library from most other python persistent memoization libraries. It is a useful feature when prototyping code with frequent code changes.
+The use of the second condition differentiates this library from most other Python persistent memoization libraries. It is a useful feature when prototyping code with frequent code changes.
 
 ## Warning
 
-This is a prototype with limited testing and works with python 3.10 only. There could be security risks related to the use of Pickle. Some data, code or global variables dependencies could be not detected leading the memoization to return stale results (see the limitations section). If you find failure modes that are not listed in the limitations, please create an issue with a minimal code example to reproduce the problem.
+This is a prototype with limited testing and currently targets Python 3.12+. There could be security risks related to the use of Pickle. Some data, code or global variables dependencies could be not detected leading the memoization to return stale results (see the limitations section). If you find failure modes that are not listed in the limitations, please create an issue with a minimal code example to reproduce the problem.
 
 ## Installation
 
@@ -30,9 +30,11 @@ pip install memodisk
 Using the memoization is as simple as adding the `@memoize` decorator to the function you want to use memoization with. In general you want to use memoization on a function whose execution time is longer than the time it takes to check that this function has already been called with the same argument (compute the input hashes) and load the result from disk.
 To get the largest speedups thanks to memoization, it might be necessary to refactor the code to move the parts that are long to execute into functions that take limited size inputs.
 
+The decorator also accepts options such as `mode=...`, `external_process_mode=...`, `ignore_code_changes=True`, `condition=...`, `serializer=...`, `argument_hasher=...`, and `store_call_arguments=True`. The `condition` callable receives the same arguments as the memoized function and allows caching only for selected invocations. The `serializer` option customizes how cached results are written and read back, while argument hashing remains unchanged. The `argument_hasher` option customizes cache-key generation for `args` and `kwargs`, which is useful for very large inputs or objects that are not picklable by default. The `store_call_arguments` option writes a companion pickle file containing the original `args` and `kwargs` so a cached call can be replayed later. The helper `load_cached_call_arguments(...)` reads those stored arguments back from a dependencies JSON file. The `external_process_mode` option controls subprocess and `os.system` handling: `manual` disables automatic executable tracking, `direct` records the launched executable path but marks subprocess coverage as incomplete, and `strict` raises when traced code launches external processes without complete dependency coverage. The `mode` policy also governs common ambient time APIs such as `time.time()` and `datetime.datetime.now()`: `safe` recomputes and skips cache writes, `strict` raises, and `optimistic` allows caching.
+
 ### Example 1: code dependency changes
 
-let run the code in [example_1.py](./tests/example_1.py) several times.
+let's run the code in [example_1.py](./tests/example_1.py) several times.
 
 ```python
 from memodisk import memoize
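Complementing the options paragraph above, here is a hypothetical `argument_hasher` for large numpy inputs; the exact callback signature memodisk expects is an assumption based on the README wording:

```python
import hashlib

import numpy as np
from memodisk import memoize


def hash_arrays(args, kwargs):  # assumed (args, kwargs) signature
    digest = hashlib.sha256()
    for name, value in [(None, a) for a in args] + sorted(kwargs.items()):
        digest.update(repr(name).encode())
        if isinstance(value, np.ndarray):
            digest.update(value.tobytes())  # cheaper than pickling big arrays
        else:
            digest.update(repr(value).encode())
    return digest.hexdigest()


@memoize(argument_hasher=hash_arrays)
def total(values: np.ndarray) -> float:
    return float(values.sum())
```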
@@ -71,7 +73,7 @@ fun_b(5) = 77
 
 As you can see the function fun_b is executed only once. The second time the function is called, the result is loaded from a file on disk and the function is not executed.
 
-Let now edit the file and replace `x * x * 3` by `x * x * 2` and execute again. As expected we now get
+Let's now edit the file and replace `x * x * 3` by `x * x * 2` and execute again. As expected we now get
 
 ```
 executing fun_b
@@ -83,7 +85,7 @@ The change in the body of the function `fun_a`, that is a dependency of `fun_b`,
 
 ### Example 2: data dependency changes
 
-The example [example_2.py](./tests/example_2.py) illustrates how to keep track of data dependencies access using the build-in `open` function.
+The example [example_2.py](./tests/example_2.py) illustrates how to keep track of data dependencies access using the built-in `open` function.
 
 ```python
 from memodisk import memoize
@@ -110,13 +112,13 @@ if __name__ == "__main__":
     assert load_file() == "b"
 ```
 
-When we call `save_file("b")` we overwrite the data in `data_file.txt`. This change in the file content gets detected when calling `load_file` for the second time. This is done by automatically replacing the built-in python function *open* with a wrapper around this function that keeps track of files that are accessed for reading.
+When we call `save_file("b")` we overwrite the data in `data_file.txt`. This change in the file content gets detected when calling `load_file` for the second time. This is done by automatically replacing the built-in Python function *open* with a wrapper around this function that keeps track of files that are accessed for reading.
 
 ### Example 3: file access monkey patching
 
-The built-in `open` function is not the only way the code can access data. For example images can be loaded using opencv's `imread` function. If some data is loaded with another function than the build-in `open` function then the data dependency will not be automatically detected.
+The built-in `open` function is not the only way the code can access data. For example images can be loaded using opencv's `imread` function. If some data is loaded with another function than the built-in `open` function then the data dependency will not be automatically detected.
 
-We provide a function `add_data_dependency` that the user can call from his code next to the line of code that loads the data, with the path of the file that contains the data as input. However this can be error prone as it is very easy to forget calling `add_data_dependency` in some places.
+We provide a function `add_data_dependency` that the user can call from their code next to the line of code that loads the data, with the path of the file that contains the data as input. However this can be error prone as it is very easy to forget calling `add_data_dependency` in some places.
 
 We provide a less error-prone mechanism, through a functor called `DataLoaderWrapper`. The functor allows the user to replace any function accessing data (opencv's `imread` function for example) by a wrapper around this function so that it automatically calls the function add_data_dependency each time `imread` is used. This in-memory modification is called *monkey-patching* and is done in [example_3.py](./tests/example_3.py) using the `DataLoaderWrapper` functor.
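A short sketch of the manual route described above, using a loader that does not go through the patched built-in `open` (here `np.fromfile`, which typically reads at the C level):

```python
import numpy as np
from memodisk import add_data_dependency, memoize


@memoize
def bounding_box(path: str):
    # np.fromfile typically bypasses the patched built-in open, so the
    # file must be declared as a data dependency explicitly.
    add_data_dependency(path)
    points = np.fromfile(path, dtype=np.float64).reshape(-1, 3)
    return points.min(axis=0), points.max(axis=0)
```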
 
@@ -130,17 +132,22 @@ cv2.imread = DataLoaderWrapper(cv2.imread)
 
 ## How this works
 
-For each function that is decorated with the `memoize` decorator we keep track of all the python functions it depends on at runtime. Similarly to what is done in the [coverage](https://coverage.readthedocs.io/en/6.0.2/), we provide through `sys.settrace` a callback function to the python interpreter that gets called each time the interpreter calls a new function or executes a new line of code. Doing run time analysis allows us to keep as dependencies only the functions that are required to evaluate the memoize function for the specific arguments. In contrast, static analysis would yield unnecessary dependencies by detecting dependencies for all possible code paths, even the one not executed for the specific set of input arguments. In order to keep the list of dependency files reasonable we exclude from this list of dependencies the functions defined in files under the python lib folder, assuming these will not get modified. The user can also provide an additional list of files he wants to exclude.
+For each function that is decorated with the `memoize` decorator we keep track of all the Python functions it depends on at runtime. Similarly to what is done in the [coverage](https://coverage.readthedocs.io/en/6.0.2/), we register a callback with Python's runtime monitoring API (`sys.monitoring` in Python 3.12+) so the interpreter notifies memodisk when Python functions start executing. Doing runtime analysis allows us to keep as dependencies only the functions that are required to evaluate the memoize function for the specific arguments. In contrast, static analysis would yield unnecessary dependencies by detecting dependencies for all possible code paths, even the one not executed for the specific set of input arguments. In order to keep the list of dependency files reasonable we exclude from this list of dependencies the functions defined in files under the Python lib folder, assuming these will not get modified. The user can also provide an additional list of files to exclude.
 
-For each function listed in the dependencies we compute a hash from its bytecode
-Using the bytecode instead of the function body text allows the user to modify the comments or the formatting a the function without invalidating the previously cached results for this function or the functions that depends on it. We could use instead the hash of the Abstract Syntax Tree of the function (see the [ast module](https://docs.python.org/3/library/ast.html)), but that would rely on the assumption that the code source is not modified during execution of the script (unless there is a way to get access to the AST from when the execution was started). One current limitation resulting from hashing the bytecode is that the python debugger modifies the bytecode when adding breakpoint, which leads to cache misses. This could be potentially resolved by filtering out the lines added by the python debugger before computing the hash of a function.
+For each function listed in the dependencies we compute a hash from its bytecode.
+Using the bytecode instead of the function body text allows the user to modify the comments or the formatting of the function without invalidating the previously cached results for this function or the functions that depend on it. We could use instead the hash of the Abstract Syntax Tree of the function (see the [ast module](https://docs.python.org/3/library/ast.html)), but that would rely on the assumption that the code source is not modified during execution of the script (unless there is a way to get access to the AST from when the execution was started). On current Python 3.12+ with `debugpy`, line breakpoints did not reproduce a bytecode hash change in our tests.
+We also store the source file modification time for each code dependency so cache validation can skip recomputing bytecode hashes when the dependency file has not changed.
+For globals or imported callables backed by packages installed in `site-packages`, we also store the distribution version so upgrading a dependency invalidates stale cache entries even when the package code itself is not traced as a user dependency.
+For compiled extension modules that are already loaded under those same top-level packages, we also store the extension file modification times so in-place `.pyd`/`.so`/`.dll` swaps invalidate the cache.
 
 We keep track of the data dependencies by storing the last modification date of the files that have been opened in read mode.
-The code dependencies hashes and data dependencies last modification dates are saved in a human readable json file in the folder `memodisk_cache` in the user's temp folder (this can be modified at the module level by changing the model variable `disk_cache_dir` ) while the result of the function is pickled in a binary file.
+When traced code launches external processes through `subprocess` or `os.system`, memodisk can also store the resolved top-level executable path as a data dependency so changing that executable invalidates stale cache entries. This is only direct executable tracking: memodisk does not attempt to discover the full runtime dependency graph of that process such as transitively loaded `.dll`/`.so` files, plugins, shell expansion, or other environment-dependent behavior. Cache entries also record whether external-process tracking was complete. In `safe` mode, incomplete external-process coverage forces recomputation instead of serving a cached result, and in `strict` mode it raises.
+When traced code calls common ambient time APIs such as `time.time()`, `time.time_ns()`, `datetime.datetime.now()`, `datetime.datetime.utcnow()`, `datetime.datetime.today()`, or `datetime.date.today()`, memodisk records that the result depended on ambient time. This tracking is intentionally treated as incomplete: in `safe` mode memodisk recomputes instead of serving or writing a cache entry, and in `strict` mode it raises. The robust way to memoize time-sensitive code is to pass the relevant timestamp or date in as an explicit function argument.
+The code dependencies hashes and data dependencies last modification dates are saved in a human readable json file in the folder `memodisk_cache` in the user's temp folder (this can be modified at the module level by changing the module variable `disk_cache_dir` ) while the result of the function is saved in a binary file using pickle by default or a custom serializer when provided.
 The names of the two generated files differ only by the extension (json and pkl) and are formatted
 by default as `{function_name}_{arguments_hash}.json` and `{function_name}_{arguments_hash}.pkl`.
 There are some subtle details to take into consideration when accessing data from a file in order to guarantee that the caching will not provide stale results.
-The data dependency "version" is obtained by recording the modification date of the accessed file. The modification date resolution is limited and it is possible that a file gets modified during the lapse of time during which the modification date is not incremented. We guard against this by locking the file for a time that is greater than the modification date quantization step.
+The data dependency "version" is obtained by recording the modification date of the accessed file. The modification date resolution is limited and it is possible that a file gets modified during the lapse of time during which the modification date is not incremented. We guard against this by locking the file for a time that is greater than the modification date quantization step.
 
 The hash of the arguments is obtained by pickling the arguments. This can be slow if the input is large or made of many small objects.
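The bytecode-hashing idea above can be illustrated in isolation; a real implementation would likely also fold in constants and names, but `co_code` alone already shows why comments and formatting do not invalidate the cache:

```python
import hashlib


def bytecode_hash(func) -> str:
    return hashlib.sha256(func.__code__.co_code).hexdigest()


def f(x):
    return x * x * 3


def g(x):
    # Same logic, different comments and spacing: co_code is unchanged
    # because comments and line numbers live outside the bytecode.
    return x * x * 3


assert bytecode_hash(f) == bytecode_hash(g)
```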
 
@@ -156,7 +163,10 @@ Here is an example of a generated dependencies json file:
 "filename": "D:\\repos\\memodisk\\tests\\test_data_dependency_change.py",
 "bytecode_hash": "f7b971c6ea7997dc5c5222f74cec2a249e8680293511c6a8350b621643af2d07",
 "global_vars": {},
-"closure_vars": {}
+"closure_vars": {},
+"package_versions": {},
+"compiled_dependencies": {},
+"file_last_modified_date_str": "2026-04-04 15:33:14.682488"
 }
 ],
 "data": [
@@ -165,35 +175,39 @@ Here is an example of a generated dependencies json file:
 "last_modified_date_str": "2020-03-04 15:33:14.682488"
 }
 ],
-"random_states": null
+"random_states": null,
+"external_processes": [],
+"ambient_time_sources": [],
+"ambient_environment_sources": [],
+"ignore_code_changes": false,
+"result_serializer": "pickle",
+"argument_hasher": null,
+"store_call_arguments": false,
+"call_arguments_file": null
 }
 ```
 
-We do not try to detect changes in the global variables but an error will be raised if any of the functions listed in the dependencies uses global variables.
+We do not try to detect changes in global variables but an error will be raised if any of the functions listed in the dependencies uses global variables.
 
 ## Using numpy's random generator
 
-A function that uses the default numpy generator is problematic from caching two reasons: 1) its output depends on the random generator state that is not provided as an explicit input to the function 2) the function modifies the state of the random generator for the functions that get called after it.
-When retrieving a cached results for such a function we want to use the state of the random generator when entering the function in the hash of the inputs and after retrieveing cached results we want to set the random state to the same state as the one we would get by running the the cached function.
+A function that uses the default numpy generator is problematic for caching for two reasons: 1) its output depends on the random generator state that is not provided as an explicit input to the function 2) the function modifies the state of the random generator for the functions that get called after it.
+When retrieving cached results for such a function we want to use the state of the random generator when entering the function in the hash of the inputs and after retrieving cached results we want to set the random state to the same state as the one we would get by running the cached function.
 The use of the random generator is detected by comparing the state of the random generator state before and after executing the function.
 The input state and output state of the random generator are saved in the json file and the memoized result is loaded in subsequent run only if the random state is identical to the one saved in the json file i.e. when entering the function at the first run, the result is loaded from the pickle file and the random state is modified to match the random state after execution of the function at the first run.
 
-This mechanism can fail in some cases, if a function access the random generator but restore the generator in the same state as it was when entring the function for example and thus we recommend to avoid using the default "global" numpy random generator, but instead to use instances of `numpy.random.Generator` that are passed as arguments to the functions that use the random number in order to reduce the risk of getting stale results from the memoize decorator.
+This mechanism can fail in some cases, if a function accesses the random generator but restores the generator in the same state as it was when entering the function for example and thus we recommend to avoid using the default "global" numpy random generator, but instead to use instances of `numpy.random.Generator` that are passed as arguments to the functions that use the random number in order to reduce the risk of getting stale results from the memoize decorator.
 
 If the same function is called multiple times with the same input arguments but with different random states, then a single memoization file is used and gets overwritten. We could add an argument to the memoize decorator to tell the memoize decorator to use the random state when computing the hash of the input arguments to allow the use of multiple memoization files for the same function with one file for each state of the random generator.
 
 ## Limitations
 
-* does not support the property decorator in some cases as `getattr` is trying to execute the function.
-* requires all the function arguments of the memoized function to be serializable using pickle.
+* by default, requires all the function arguments of the memoized function to be serializable using pickle. This can be overridden with `argument_hasher=...` for cache-key generation.
 * may not detect all global variables dependencies.
-* will detect an added breakpoint in a function as a change in the code because the python debugger adds line in the function bytecode when using breakpoint.
-* does not detect non determinism due to use of time.
-* does not detect changes in C/C++ extension modules or external executables, unless the pyd file, dll or executable dependency is explicitly specified through the `add_data_dependency` function.
+* ambient time detection covers common direct calls such as `time.time()` and `datetime.datetime.now()`, but may still miss indirect wrappers, custom clocks, or time values obtained outside the memoized call and captured through globals or closures.
+* external-process tracking is limited to the directly resolved executable path. It does not cover full transitive native dependencies such as `.dll`/`.so` chains, plugins, registry/config driven behavior, or other runtime environment changes.
+* may miss some shell-invoked external commands when the real executable cannot be resolved from the command string. In those cases the executable should still be declared explicitly with `add_data_dependency`.
 * does not detect changes in remote dependencies fetched from the network.
-* is not thread safe. It does not support multi-threading
-* will not detect if an aliased import is modified.
-* computes argument hash using pickled object strings, which does not always produce the same string for identical objects. Could use [compute_fixed_hash](https://github.com/QUVA-Lab/artemis/blob/84d3b1daf0de363cc823d99f978e2861ed400b5b/artemis/general/hashing.py#L25).
 * has no configurable cache size.
 * will memoize only the decorated functions.
 
@@ -201,20 +215,10 @@ Some of these failure modes can be reproduced using scripts in the [failure_mode
 
 ## TODO
 
-* add the module name of the functions in the code dependencies description.
-* filter out the lines added by the python debugger when using breakpoint before computing the hash of a function.
-* add the file modification date in the code dependencies and a test to skip checking functions hashes if the file modification date did not change.
-* add an option to save the full input arguments in the pickle file in order to be able to re-run a function directly from the pickled data
-* save the versions of module dependencies that in the lib/site-packages folder and add an option to remove function dependencies from packages under site-packages and use only module version to detect a dependency change, assuming the package in site-packages does not get modified. maybe use the file modification date for code under site-packages instead of functions ast
 * improve the detection of non-pure function so that it works when using a compiled third party module
-* allow the use of a different serialization library than pickle. It could be provided at module level to disc_memoize or as argument to the memoize decorator
-* add the possibility to provide a condition in the decorator to memoize or not
-* add a less intrusive alternative to the use of decorator by registering a function in a list of function names provided directly to disc_memoize
+* add a less intrusive alternative to the use of a decorator by registering a function in a list of function names provided directly to disk_memoize
 * implement an automatic memoization of function that are long to evaluate using similar criterion to IncPy (see references) to decide if a function should be memoize or not
-* make the tool thread-safe
-* see if we can detect compiled module loading and compiled module calling to add the compiled module as dependency.
 * publish module on [pypi.org](pypi.org)
-* see if we can make the hashing more deterministic using the method implemented in [charmonium.cache](https://pypi.org/project/charmonium.cache).
 
 ## Alternatives
 
@@ -226,3 +230,4 @@
 * [Artemis.fileman.disk_memoize](https://github.com/QUVA-Lab/artemis/blob/master/artemis/fileman/disk_memoize.py) It does not detect changes in the code or data dependencies. [pdf](http://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=59BEC4646686E70CFD2428EF9786B9D0?doi=10.1.1.224.164&rep=rep1&type=pdf)
 * [noWorkflow](http://gems-uff.github.io/noworkflow/). *noWorkflow: a Tool for Collecting, Analyzing, and Managing Provenance from Python Scripts* [pdf](https://par.nsf.gov/servlets/purl/10048452). Library that allows to track how data has been generated. It bears some similarity with the library as it also requires to keep track of dependencies.
 * [klepto](https://mmckerns.github.io/project/pathos/wiki/klepto.html). Allows caching of python function results to files or database archive. The detection of code change is not mentioned.
+* [exca](https://github.com/facebookresearch/exca). Developed by Facebook Research. The detection of code change is not mentioned.
{memodisk-0.0.6.dev0 → memodisk-0.1.1.dev0}/memodisk/__init__.py

@@ -4,29 +4,34 @@ from ._version import __version__
 
 __all__ = [
     "memoize",
+    "MemoizeMode",
+    "ExternalProcessMode",
+    "ArgumentHasher",
+    "ResultSerializer",
     "add_data_dependency",
     "DataLoaderWrapper",
     "get_globals_from_code",
     "set_cache_dir",
     "open_delay",
-    "get_function_qualified_name_from_frame",
-    "get_globals_from_code",
     "get_last_cache_loading",
     "reset_last_cache_loading",
-    "get_function_from_frame",
+    "load_cached_call_arguments",
     "hashing_func_map",
     "user_ignore_files",
     "__version__",
 ]
 
 from .memodisk import (
+    ArgumentHasher,
     DataLoaderWrapper,
+    ExternalProcessMode,
+    MemoizeMode,
+    ResultSerializer,
     add_data_dependency,
-    get_function_from_frame,
-    get_function_qualified_name_from_frame,
     get_globals_from_code,
     get_last_cache_loading,
     hashing_func_map,
+    load_cached_call_arguments,
     memoize,
     open_delay,
     reset_last_cache_loading,
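A quick usage sketch of the reworked public API exported here; the enum member spelling (`MemoizeMode.SAFE`) and the `load_cached_call_arguments` call are assumptions inferred from the exported names and the README text:

```python
from memodisk import MemoizeMode, load_cached_call_arguments, memoize, set_cache_dir

set_cache_dir("./__memodisk__")


@memoize(mode=MemoizeMode.SAFE, store_call_arguments=True)  # assumed member name
def double(x: int) -> int:
    return 2 * x


double(21)
# Replaying a cached call later (path is illustrative):
# args, kwargs = load_cached_call_arguments("__memodisk__/double_<hash>_dependencies.json")
```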
memodisk-0.1.1.dev0/memodisk/_version.py (new file)

@@ -0,0 +1,24 @@
+# file generated by vcs-versioning
+# don't change, don't track in version control
+from __future__ import annotations
+
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+
+version: str
+__version__: str
+__version_tuple__: tuple[int | str, ...]
+version_tuple: tuple[int | str, ...]
+commit_id: str | None
+__commit_id__: str | None
+
+__version__ = version = '0.1.1.dev0'
+__version_tuple__ = version_tuple = (0, 1, 1, 'dev0')
+
+__commit_id__ = commit_id = 'gd2df32186'