weco 0.2.14__tar.gz → 0.2.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {weco-0.2.14 → weco-0.2.15}/PKG-INFO +1 -1
- {weco-0.2.14 → weco-0.2.15}/examples/cuda/evaluate.py +21 -17
- {weco-0.2.14 → weco-0.2.15}/examples/hello-kernel-world/evaluate.py +8 -9
- {weco-0.2.14 → weco-0.2.15}/examples/metal/evaluate.py +2 -2
- {weco-0.2.14 → weco-0.2.15}/examples/triton/evaluate.py +11 -8
- {weco-0.2.14 → weco-0.2.15}/pyproject.toml +1 -1
- {weco-0.2.14 → weco-0.2.15}/weco/__init__.py +1 -1
- {weco-0.2.14 → weco-0.2.15}/weco/cli.py +7 -9
- {weco-0.2.14 → weco-0.2.15}/weco.egg-info/PKG-INFO +1 -1
- {weco-0.2.14 → weco-0.2.15}/.github/workflows/lint.yml +0 -0
- {weco-0.2.14 → weco-0.2.15}/.github/workflows/release.yml +0 -0
- {weco-0.2.14 → weco-0.2.15}/.gitignore +0 -0
- {weco-0.2.14 → weco-0.2.15}/.repomixignore +0 -0
- {weco-0.2.14 → weco-0.2.15}/LICENSE +0 -0
- {weco-0.2.14 → weco-0.2.15}/README.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/cuda/README.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/cuda/guide.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/cuda/optimize.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/hello-kernel-world/optimize.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/metal/README.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/metal/examples.rst +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/metal/optimize.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/prompt/README.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/prompt/eval.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/prompt/optimize.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/prompt/prompt_guide.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/spaceship-titanic/README.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/spaceship-titanic/baseline.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/spaceship-titanic/evaluate.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/spaceship-titanic/optimize.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/spaceship-titanic/requirements-test.txt +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/spaceship-titanic/utils.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/triton/README.md +0 -0
- {weco-0.2.14 → weco-0.2.15}/examples/triton/optimize.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/setup.cfg +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco/api.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco/auth.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco/panels.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco/utils.py +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco.egg-info/SOURCES.txt +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco.egg-info/dependency_links.txt +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco.egg-info/entry_points.txt +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco.egg-info/requires.txt +0 -0
- {weco-0.2.14 → weco-0.2.15}/weco.egg-info/top_level.txt +0 -0
examples/cuda/evaluate.py
@@ -1,4 +1,3 @@
-import time
 import sys
 import os
 import pathlib
@@ -78,22 +77,27 @@ def get_inputs(batch_size, seq_len, n_embd, device):
     return torch.randn(batch_size, seq_len, n_embd, device=device, dtype=torch.float32)
 
 
+@torch.no_grad()
 def bench(f, inputs, n_warmup, n_rep):
-    ...  # 15 removed lines (previous time.time()-based bench body; content not preserved)
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    # warmup
+    for _ in range(n_warmup):
+        f(inputs)  # noqa
+    torch.cuda.synchronize()
+
+    # benchmark
+    t_avg_ms = 0.0
+    for _ in range(n_rep):
+        # time the forward pass
+        start_event.record()
+        f(inputs)
+        end_event.record()
+        # wait for all computations to complete
+        torch.cuda.synchronize()
+        t_avg_ms += start_event.elapsed_time(end_event)
+    return t_avg_ms / n_rep
 
 
 if __name__ == "__main__":
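The CUDA example switches from wall-clock timing to CUDA events. Kernel launches are asynchronous, so `time.time()` around `f(inputs)` mostly measures launch overhead; `torch.cuda.Event` timestamps on the GPU stream itself. Below is a minimal standalone sketch of the pattern the new `bench()` uses; the function name and default arguments are illustrative, not part of the package:

```python
import torch

@torch.no_grad()
def bench_cuda_events(f, inputs, n_warmup=10, n_rep=100):
    """Return the average latency of f(inputs) in milliseconds."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Warmup: trigger lazy initialization, autotuning, and cache population
    for _ in range(n_warmup):
        f(inputs)
    torch.cuda.synchronize()

    total_ms = 0.0
    for _ in range(n_rep):
        start.record()                # enqueue a start timestamp on the stream
        f(inputs)
        end.record()                  # enqueue an end timestamp after the kernels
        torch.cuda.synchronize()      # block until both events have occurred
        total_ms += start.elapsed_time(end)  # GPU-side elapsed time, in ms
    return total_ms / n_rep
```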
examples/cuda/evaluate.py (continued; the removed and added comment lines below appear to differ only in whitespace)
@@ -113,7 +117,7 @@ if __name__ == "__main__":
     seq_len = 256
     n_embd = 768
     n_head = 8
-    # turn off dropout to measure correctness
+    # turn off dropout to measure correctness
     attn_pdrop = 0.0
     resid_pdrop = 0.0
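Whatever the whitespace change, the comment marks a real requirement: a correctness check against a reference implementation needs a deterministic forward pass, which is why the example zeroes `attn_pdrop` and `resid_pdrop`. A tiny sketch of the same idea in isolation:

```python
import torch
import torch.nn as nn

x = torch.randn(2, 8)
drop = nn.Dropout(p=0.0)  # p=0 turns the dropout layer into a no-op
# With dropout disabled, repeated forward passes agree exactly,
# so outputs can be compared bitwise against a baseline.
assert torch.equal(drop(x), drop(x))
```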
examples/hello-kernel-world/evaluate.py
@@ -62,20 +62,19 @@ def get_inputs(B, N, device):
 
 @torch.no_grad()
 def bench(f, inputs, n_warmup, n_rep):
-
+    device_type = inputs.device.type
+
+    # warm up
     for _ in range(n_warmup):
         f(inputs)  # noqa
+    if device_type == "cuda":
+        torch.cuda.synchronize()
+    elif device_type == "mps":
+        torch.mps.synchronize()
 
-    #
-    device_type = inputs.device.type
+    # benchmark
     t_avg = 0.0
     for _ in range(n_rep):
-        # Clear cache before timing
-        if device_type == "cuda":
-            torch.cuda.empty_cache()
-        elif device_type == "mps":
-            torch.mps.empty_cache()
-
         # time forward pass
         start_time = time.time()
         f(inputs)
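This benchmark keeps wall-clock timing but now synchronizes the device after warmup, and drops the per-iteration `empty_cache()` calls, which put allocator work inside the timed loop. On backends PyTorch drives asynchronously, a wall clock is only meaningful between synchronization points. A sketch of the per-backend fence; `sync` and `timed_ms` are hypothetical helper names, not functions in the package:

```python
import time
import torch

def sync(device_type: str) -> None:
    # Block until all queued kernels on the device have finished.
    # CPU execution is synchronous, so no fence is needed there.
    if device_type == "cuda":
        torch.cuda.synchronize()
    elif device_type == "mps":
        torch.mps.synchronize()

def timed_ms(f, x):
    sync(x.device.type)   # don't start the clock with work still in flight
    t0 = time.time()
    f(x)
    sync(x.device.type)   # make sure the work finished before stopping the clock
    return (time.time() - t0) * 1e3
```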
examples/metal/evaluate.py
@@ -55,15 +55,15 @@ def load_module_from_path(module_path: str, add_to_sys_modules: bool = False):
 # Benchmark
 ########################################################
 def get_inputs(batch_size, img_height, img_width, img_channels):
-    # MLX doesn't use device parameter like PyTorch, as it automatically uses Metal
     return mx.random.normal(shape=(batch_size, img_height, img_width, img_channels), dtype=mx.float32)
 
 
 def bench(f, inputs, n_warmup, n_rep):
-    #
+    # warm up
     for _ in range(n_warmup):
         result = f(inputs)
         mx.eval(result)  # Force computation due to lazy evaluation
+    mx.synchronize()  # Wait for all computations to complete
 
     t_avg = 0.0
     for _ in range(n_rep):
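MLX evaluates lazily: operations only record nodes in a graph, and nothing runs until forced. `mx.eval(result)` materializes the warmup outputs, and the added `mx.synchronize()` drains the Metal stream before the timed section starts. A minimal sketch of why both calls matter; the shapes are illustrative:

```python
import time
import mlx.core as mx

x = mx.random.normal(shape=(1024, 1024))

t0 = time.time()
y = x @ x            # lazy: only records the matmul in the graph
graph_ms = (time.time() - t0) * 1e3

mx.eval(y)           # forces the computation to actually run
mx.synchronize()     # waits until the GPU stream has drained
total_ms = (time.time() - t0) * 1e3
print(f"graph build: {graph_ms:.3f} ms, evaluated: {total_ms:.3f} ms")
```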
examples/triton/evaluate.py
@@ -1,4 +1,3 @@
-import time
 import sys
 import pathlib
 import importlib
@@ -76,20 +75,24 @@ def get_inputs(batch_size, seq_len, n_embd, device):
 
 @torch.no_grad()
 def bench(f, inputs, n_warmup, n_rep):
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
     # warmup
     for _ in range(n_warmup):
         f(inputs)  # noqa
+    torch.cuda.synchronize()
 
     # benchmark
-    ...
+    t_avg_ms = 0.0
     for _ in range(n_rep):
-    ...
-        start_time = time.time()
+        start_event.record()
         f(inputs)
-    ...  # 4 removed lines (content not preserved)
+        end_event.record()
+        # wait for all computations to complete
+        torch.cuda.synchronize()
+        t_avg_ms += start_event.elapsed_time(end_event)
+    return t_avg_ms / n_rep
 
 
 if __name__ == "__main__":
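The triton example gets the same event-based rewrite, which is why `import time` disappears from both files. The pitfall being removed is easy to reproduce: since launches return immediately, a clock stopped without a fence records almost none of the GPU work. A hedged sketch of the failure mode, for a CUDA machine:

```python
import time
import torch

x = torch.randn(4096, 4096, device="cuda")

# Misleading: the launch returns immediately, so this mostly times Python overhead
t0 = time.time()
y = x @ x
launch_ms = (time.time() - t0) * 1e3

# Honest wall-clock variant: fence before starting and after finishing
torch.cuda.synchronize()
t0 = time.time()
y = x @ x
torch.cuda.synchronize()
true_ms = (time.time() - t0) * 1e3
print(f"unsynchronized: {launch_ms:.3f} ms  synchronized: {true_ms:.3f} ms")
```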
pyproject.toml
@@ -10,7 +10,7 @@ authors = [
 ]
 description = "Documentation for `weco`, a CLI for using Weco AI's code optimizer."
 readme = "README.md"
-version = "0.2.14"
+version = "0.2.15"
 license = {text = "MIT"}
 requires-python = ">=3.8"
 dependencies = ["requests", "rich"]
weco/cli.py
@@ -265,14 +265,14 @@ def main() -> None:
         "debug_prob": 0.5,
         "max_debug_depth": max(1, math.ceil(0.1 * steps)),
     }
+    # API request timeout
+    timeout = 800
     # Read additional instructions
     additional_instructions = read_additional_instructions(additional_instructions=args.additional_instructions)
     # Read source code path
     source_fp = pathlib.Path(args.source)
     # Read source code content
     source_code = read_from_path(fp=source_fp, is_json=False)
-    # API request timeout
-    timeout = 800
 
     # --- Panel Initialization ---
     summary_panel = SummaryPanel(
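The 800-second request timeout is now defined alongside the other request parameters, before any file I/O; the value itself is unchanged. `weco/api.py` is untouched in this release, so as a loose illustration only (hypothetical endpoint and payload, not weco's actual API), a timeout like this is typically threaded into `requests` (a declared dependency) as:

```python
import requests

timeout = 800  # seconds; a single optimization step can take a long time

resp = requests.post(
    "https://api.example.com/v1/suggest",  # hypothetical endpoint
    json={"source": "...", "steps": 10},   # hypothetical payload
    timeout=timeout,  # connect/read timeout; raises requests.exceptions.Timeout
)
resp.raise_for_status()
```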
weco/cli.py (continued)
@@ -310,9 +310,8 @@ def main() -> None:
     runs_dir = pathlib.Path(args.log_dir) / session_id
     runs_dir.mkdir(parents=True, exist_ok=True)
 
-    #
-
-    write_to_path(fp=runs_copy_source_fp, content=source_code)
+    # Write the initial code string to the logs
+    write_to_path(fp=runs_dir / f"step_0{source_fp.suffix}", content=session_response["code"])
 
     # Write the initial code string to the source file path (if not preserving)
     if not args.preserve_source:
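This logging refactor replaces the separate source copy (`runs_copy_source_fp`) with a numbered scheme inside the run directory: the baseline the session returns is saved as step 0, keeping the source file's extension. A small sketch of the naming convention; the paths here are illustrative:

```python
import pathlib

source_fp = pathlib.Path("optimize.py")            # the file being optimized
runs_dir = pathlib.Path(".weco_runs") / "session"  # hypothetical log directory
runs_dir.mkdir(parents=True, exist_ok=True)

# Baseline is step 0; later candidates would follow the same pattern.
baseline_fp = runs_dir / f"step_0{source_fp.suffix}"  # .weco_runs/session/step_0.py
baseline_fp.write_text("# baseline code returned by the session\n")
```

The same `step_0` path is reconstructed in the final hunk below when no better solution is found, which is what lets the separate `runs_copy_source_fp` variable be dropped.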
weco/cli.py (continued)
@@ -380,7 +379,8 @@ def main() -> None:
         transition_delay=0.1,
     )
 
-
+    # Starting from step 1 to steps (inclusive) because the baseline solution is step 0, so we want to optimize for steps worth of steps
+    for step in range(1, steps + 1):
         # Re-read instructions from the original source (file path or string) BEFORE each suggest call
         current_additional_instructions = read_additional_instructions(
             additional_instructions=args.additional_instructions
@@ -553,9 +553,7 @@ def main() -> None:
         best_solution_score = None
 
     if best_solution_code is None or best_solution_score is None:
-        best_solution_content = (
-            f"# Weco could not find a better solution\n\n{read_from_path(fp=runs_copy_source_fp, is_json=False)}"
-        )
+        best_solution_content = f"# Weco could not find a better solution\n\n{read_from_path(fp=runs_dir / f'step_0{source_fp.suffix}', is_json=False)}"
     else:
         # Format score for the comment
         best_score_str = (