shrinkray 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
shrinkray/learning.py ADDED
@@ -0,0 +1,221 @@
1
+ import hashlib
2
+ import os
3
+ import random
4
+ import re
5
+ import site
6
+ import subprocess
7
+ import sys
8
+ from glob import glob
9
+ from typing import Awaitable, Callable
10
+
11
+ import trio
12
+ from attrs import define
13
+
14
+ from shrinkray.problem import BasicReductionProblem
15
+ from shrinkray.reducer import ShrinkRay
16
+ from shrinkray.work import WorkContext
17
+
18
# Matches one or more consecutive whitespace bytes.
WHITESPACE = re.compile(rb"\s+")


def whitespace_normalize(s):
    """Return ``s`` with each whitespace run collapsed to one space.

    Leading and trailing whitespace is removed from the result. ``s`` is
    a bytes object, since the pattern and the replacement are bytes.
    """
    collapsed = WHITESPACE.sub(b" ", s)
    return collapsed.strip()
23
+
24
+
25
def threshold_value(seed: bytes, test_case: bytes) -> float:
    """Map a ``(seed, test_case)`` pair to a deterministic pseudo-random float.

    The SHA-1 digest of ``seed + b":" + test_case`` seeds a fresh
    ``random.Random``, and that generator's first ``random()`` draw is
    returned. Equal inputs therefore always give the same value.
    """
    digest = hashlib.sha1(b":".join((seed, test_case))).digest()
    generator = random.Random(digest)
    return generator.random()
27
+
28
+
29
@define
class RandomInterestingnessTest:
    """Interestingness test that adds two cheap filters in front of a base test.

    A test case is rejected without consulting the base test when its
    ``threshold_value`` for ``seed`` exceeds ``threshold``, or when
    ``key_substring`` does not occur in its whitespace-normalized form.
    """

    # Combined with each test case to derive its pseudo-random value.
    seed: bytes
    # Test cases whose pseudo-random value is above this are rejected.
    threshold: float
    # Must appear in the whitespace-normalized test case.
    key_substring: bytes
    # Awaited only after both cheap filters have passed.
    base_interestingness: Callable[[bytes], Awaitable[bool]]

    async def __call__(self, test_case: bytes) -> bool:
        """Return whether ``test_case`` passes both filters and the base test."""
        # ``or`` short-circuits, so normalization only runs when the
        # threshold filter has let the test case through.
        rejected = (
            threshold_value(self.seed, test_case) > self.threshold
            or self.key_substring not in whitespace_normalize(test_case)
        )
        if rejected:
            # Still yield to the scheduler on the fast rejection path.
            await trio.lowlevel.checkpoint()
            return False
        return await self.base_interestingness(test_case)
45
+
46
+
47
class CouldNotLearn(Exception):
    """Exception type declared by this module; it carries no extra state."""
49
+
50
+
51
class CorpusExplorer:
    """Draws interesting test cases from a file corpus and shrinks them.

    Corpus files are read lazily: a file is only loaded, and checked with
    the interestingness test, when sampling selects it. Files that turn
    out to be interesting are kept in memory for later sampling; the
    others are dropped. Interestingness results are cached per test case.
    """

    def __init__(
        self,
        is_interesting: Callable[[bytes], Awaitable[bool]],
        corpus: list[str],
        parallelism=os.cpu_count(),
    ):
        """Set up the explorer.

        Args:
            is_interesting: Async predicate applied to candidate test cases.
            corpus: Paths of candidate files; empty files are ignored.
            parallelism: Maximum number of concurrent predicate calls and
                number of sampling workers. ``None`` is treated as 1.
        """
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to serial operation instead of failing below.
        if parallelism is None:
            parallelism = 1
        self.__is_interesting = is_interesting
        self.__is_interesting_cache = {}
        self.__corpus_files = [t for t in corpus if os.stat(t).st_size > 0]
        self.__corpus_data = []
        self.__limit = trio.CapacityLimiter(parallelism)
        # Placeholder stored in the cache while a result is being computed.
        self.__wait_for_me = object()
        self.__random = random.Random()
        self.__parallelism = parallelism

    async def is_interesting(self, tc):
        """Return the cached-or-computed interestingness of ``tc``.

        If another task is already evaluating ``tc``, this polls until
        that result is available. If the evaluation fails or is
        cancelled, the placeholder is removed so that waiters retry
        rather than polling forever, and the exception propagates.
        """
        cache = self.__is_interesting_cache
        pending = self.__wait_for_me

        waited = False
        while True:
            try:
                cached = cache[tc]
            except KeyError:
                # Nobody is (or is any longer) computing this one: do it here.
                break
            if cached is not pending:
                if not waited:
                    # Keep the cache-hit path a checkpoint as well.
                    await trio.lowlevel.checkpoint()
                return cached
            waited = True
            await trio.sleep(0.01)

        cache[tc] = pending
        try:
            async with self.__limit:
                result = await self.__is_interesting(tc)
        except BaseException:
            # Covers trio cancellation too. Without this, the placeholder
            # would stay in the cache and block every later caller.
            if cache.get(tc) is pending:
                del cache[tc]
            raise
        cache[tc] = result
        return result

    async def random_corpus_member(self):
        """Return one random interesting corpus member.

        Raises:
            IndexError: if sampling yields no interesting member.
        """
        return (await self.random_corpus_sample(1))[0]

    async def random_corpus_sample(self, max_elements=100):
        """Return up to ``max_elements`` distinct interesting corpus members.

        Members come either from files not yet examined (which are
        loaded and tested on the spot) or from data already known to be
        interesting. Fewer elements are returned if the corpus runs out.
        """
        result = []
        # Only the first ``max_from_data`` entries of the in-memory list
        # are still eligible; chosen entries are swapped to the end of
        # that prefix and the prefix shrinks, so nothing is picked twice.
        max_from_data = len(self.__corpus_data)

        async def run():
            nonlocal max_from_data
            while len(result) < max_elements and (
                max_from_data > 0 or self.__corpus_files
            ):
                i = self.__random.randrange(
                    0, len(self.__corpus_files) + max_from_data
                )
                if i < len(self.__corpus_files):
                    # Swap-remove the chosen file so it is examined once.
                    j = len(self.__corpus_files) - 1
                    self.__corpus_files[i], self.__corpus_files[j] = (
                        self.__corpus_files[j],
                        self.__corpus_files[i],
                    )
                    f = self.__corpus_files.pop()
                    with open(f, "rb") as infile:
                        data = infile.read()

                    if await self.is_interesting(data):
                        # Appended past the eligible prefix, so this call
                        # will not select it a second time.
                        self.__corpus_data.append(data)
                        result.append(data)
                else:
                    i -= len(self.__corpus_files)
                    j = max_from_data - 1
                    self.__corpus_data[i], self.__corpus_data[j] = (
                        self.__corpus_data[j],
                        self.__corpus_data[i],
                    )
                    result.append(self.__corpus_data[j])
                    max_from_data -= 1
                    await trio.lowlevel.checkpoint()

        if self.__parallelism > 1:
            async with trio.open_nursery() as nursery:
                for _ in range(self.__parallelism):
                    nursery.start_soon(run)
        else:
            await run()
        # Concurrent workers can overshoot by a few elements.
        return result[:max_elements]

    async def random_reduction_problem(self):
        """Build a random ``(initial, interestingness test)`` pair.

        ``initial`` is a random interesting corpus member. The returned
        ``RandomInterestingnessTest`` is constructed so that ``initial``
        passes its threshold and key-substring filters.

        Raises:
            CouldNotLearn: if the corpus has no interesting member.
        """
        while True:
            sample = await self.random_corpus_sample(400)
            if not sample:
                # With nothing to choose from, this loop could never end.
                raise CouldNotLearn("corpus contains no interesting test case")

            initial = self.__random.choice(sample)
            normalized = whitespace_normalize(initial)

            if not normalized:
                continue
            while True:
                start = self.__random.randrange(0, len(normalized))
                if not WHITESPACE.match(normalized[start : start + 1]):
                    break
            while True:
                end = self.__random.randint(
                    start, min(start + 20, len(normalized) - 1)
                )
                if not WHITESPACE.match(normalized[end : end + 1]):
                    break
            # NOTE(review): the slice excludes ``end`` itself, so the
            # substring may be empty or end in a space -- confirm intent.
            substring = normalized[start:end]
            while True:
                seed = str(self.__random.randint(0, 10**6)).encode("ascii")
                value = threshold_value(seed=seed, test_case=initial)
                if value >= 0.5:
                    # threshold lies in [value, 1.0), so ``initial`` is
                    # never rejected by the threshold filter.
                    threshold = self.__random.random() * (1.0 - value) + value
                    break

            return initial, RandomInterestingnessTest(
                seed=seed,
                threshold=threshold,
                key_substring=substring,
                base_interestingness=self.is_interesting,
            )

    async def full_shrink(self, initial, condition) -> bytes:
        """Run a ``ShrinkRay`` reducer on ``initial`` and return the result.

        Args:
            initial: Starting test case.
            condition: Interestingness test handed to the reduction problem.

        Returns:
            The problem's ``current_test_case`` after the reducer has run.
        """
        problem: BasicReductionProblem[bytes] = BasicReductionProblem(
            initial=initial,
            is_interesting=condition,
            work=WorkContext(parallelism=self.__parallelism, random=self.__random),
        )

        reducer = ShrinkRay(
            target=problem,
        )

        await reducer.run()

        return problem.current_test_case

    async def all_shrinks(self, sample, condition):
        """Shrink every element of ``sample`` concurrently.

        Returns:
            The set of distinct results from ``full_shrink``.
        """
        shrinks = set()

        async def run_and_add(c, condition):
            shrinks.add(await self.full_shrink(c, condition))

        async with trio.open_nursery() as nursery:
            for d in sample:
                nursery.start_soon(run_and_add, d, condition)

        return shrinks
195
+
196
+
197
def python_files() -> list[str]:
    """List every ``*.py`` file found under the site-packages directories.

    Each directory reported by ``site.getsitepackages()`` is searched
    recursively, and matches are returned in directory order.
    """
    found: list[str] = []
    for package_dir in site.getsitepackages():
        pattern = os.path.join(package_dir, "**", "*.py")
        found.extend(glob(pattern, recursive=True))
    return found
203
+
204
+
205
# Directory holding this module; ROOT is two levels above it.
_MODULE_DIR = os.path.dirname(__file__)
ROOT = os.path.abspath(os.path.join(_MODULE_DIR, "..", ".."))

# Helper script that ``is_python`` runs in a subprocess.
IS_PYTHON_SCRIPT = os.path.join(ROOT, "scripts", "ispython.py")

# Import-time sanity check; the message is ROOT to show where we looked.
# Like any assert, it is skipped when Python runs with -O.
assert os.path.exists(IS_PYTHON_SCRIPT), ROOT
210
+
211
+
212
async def is_python(data):
    """Return whether the ``ispython.py`` helper script accepts ``data``.

    The script is run with the current interpreter and receives ``data``
    on stdin; its output is discarded. The result is True exactly when
    the process exits with status 0.
    """
    completed = await trio.run_process(
        [sys.executable, IS_PYTHON_SCRIPT],
        stdin=data,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        # A non-zero exit is an answer here, not an error.
        check=False,
    )
    return completed.returncode == 0
File without changes