glim_ai 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. checksums.yaml +7 -0
  2. data/Gemfile +25 -0
  3. data/Gemfile.lock +49 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +125 -0
  6. data/Rakefile +31 -0
  7. data/examples/autocode/autocode.rb +166 -0
  8. data/examples/autocode/solargraph_test.rb +59 -0
  9. data/examples/autocode/templates/changed_files_now_evaluate_output.erb +29 -0
  10. data/examples/autocode/templates/task.erb +16 -0
  11. data/examples/calc/calc.rb +50 -0
  12. data/examples/code_competition/code_competition.rb +78 -0
  13. data/examples/code_competition/output/python_claude-2.rb +33 -0
  14. data/examples/code_competition/output/python_claude-instant-1.rb +18 -0
  15. data/examples/code_competition/output/python_gpt-3.5-turbo-16k.rb +69 -0
  16. data/examples/code_competition/output/python_gpt-3.5-turbo.rb +43 -0
  17. data/examples/code_competition/output/python_gpt-4.rb +34 -0
  18. data/examples/code_competition/output/ruby_claude-2.rb +22 -0
  19. data/examples/code_competition/output/ruby_claude-instant-1.rb +20 -0
  20. data/examples/code_competition/output/ruby_gpt-3.5-turbo-16k.rb +27 -0
  21. data/examples/code_competition/output/ruby_gpt-3.5-turbo.rb +30 -0
  22. data/examples/code_competition/output/ruby_gpt-4.rb +31 -0
  23. data/examples/code_competition/output/ruby_human.rb +41 -0
  24. data/examples/code_competition/templates/analyze_code.erb +33 -0
  25. data/examples/code_competition/templates/write_code.erb +26 -0
  26. data/examples/glim_demo/ask_all.rb +35 -0
  27. data/examples/glim_demo/templates/rate_all.erb +24 -0
  28. data/examples/improve_prompt/improve_prompt.rb +62 -0
  29. data/examples/improve_prompt/templates/stashed/prompt_attempt_explicit_steps.erb +15 -0
  30. data/examples/improve_prompt/templates/stashed/prompt_attempt_explicit_steps_user_message.erb +15 -0
  31. data/examples/improve_prompt/templates/stashed/prompt_attempt_initial.erb +8 -0
  32. data/examples/improve_prompt/templates/stashed/prompt_attempt_nothing.erb +19 -0
  33. data/examples/improve_prompt/templates/try_code_first.erb +13 -0
  34. data/examples/improve_prompt/templates/try_code_first_system.erb +22 -0
  35. data/examples/old/econ/discounting.rb +27 -0
  36. data/examples/old/econ/templates/discounting.erb +10 -0
  37. data/examples/old/generate_glim_code/generate_glim_code.rb +34 -0
  38. data/examples/old/generate_glim_code/templates/generate_glim_code.erb +17 -0
  39. data/examples/old/generate_glim_code/templates/improve_code.erb +27 -0
  40. data/examples/old/glim_dev_tools/ask_code_question.rb +38 -0
  41. data/examples/old/glim_dev_tools/templates/ask_code_question.erb +12 -0
  42. data/examples/old/glim_dev_tools/templates/write_globals_test.erb +28 -0
  43. data/examples/old/glim_dev_tools/write_globals_test.rb +20 -0
  44. data/examples/old/linguistics/nine.rb +0 -0
  45. data/examples/old/rewrite_code/input/hello.py +1 -0
  46. data/examples/old/rewrite_code/input/subdir/hello.py +1 -0
  47. data/examples/old/rewrite_code/input/world.py +1 -0
  48. data/examples/old/rewrite_code/rewrite_code.rb +18 -0
  49. data/examples/old/rewrite_code/templates/rewrite_code.erb +32 -0
  50. data/examples/window_check/data.rb +1260 -0
  51. data/examples/window_check/fruits.rb +118 -0
  52. data/examples/window_check/tools.rb +56 -0
  53. data/examples/window_check/window_check.rb +214 -0
  54. data/glim_generated_tests/make_special_code_with_fixed_length_test.rb +44 -0
  55. data/glim_generated_tests/old-20230831120513-make_special_code_with_fixed_length_test.rb +1 -0
  56. data/glim_generated_tests/old-20230831121222-make_special_code_with_fixed_length_test.rb +55 -0
  57. data/glim_generated_tests/old-20230831124501-make_special_code_with_fixed_length_test.rb +33 -0
  58. data/glim_generated_tests/test/make_special_code_with_fixed_length_test.rb +58 -0
  59. data/lib/anthropic_request_details.rb +37 -0
  60. data/lib/anthropic_response.rb +101 -0
  61. data/lib/chat_request_details.rb +140 -0
  62. data/lib/chat_response.rb +303 -0
  63. data/lib/glim_ai/version.rb +5 -0
  64. data/lib/glim_ai.rb +8 -0
  65. data/lib/glim_ai_callable.rb +151 -0
  66. data/lib/glim_context.rb +62 -0
  67. data/lib/glim_helpers.rb +54 -0
  68. data/lib/glim_request.rb +266 -0
  69. data/lib/glim_response.rb +155 -0
  70. data/lib/globals.rb +255 -0
  71. data/lib/html_templates/chat_request.erb +86 -0
  72. data/sample.env +9 -0
  73. metadata +131 -0
data/examples/code_competition/output/python_claude-2.rb
@@ -0,0 +1,33 @@
+ ### library (python)
+ # no lib needed
+
+ ### problem specific code
+ import asyncio
+
+ async def Q(msg, fut):
+     await asyncio.sleep(0.1) # pretend this is an API call
+     fut.set_result(msg)
+     return fut
+
+ async def f(m1, m2, m3):
+     return [await m1, await m2, await m3]
+
+ async def g(m1, m2):
+     return m1[0] + m2[1]
+
+ prompt1 = asyncio.Future()
+ prompt1.set_result('code')
+
+ m1 = Q('analyzing code 1', asyncio.Future())
+ m2 = Q('analyzing code 2', asyncio.Future())
+ m3 = Q('analyzing code 3', asyncio.Future())
+
+ prompt2 = asyncio.gather(f(m1, m2, m3))
+
+ m1 = Q('summary 1', asyncio.Future())
+ m2 = Q('summary 2', asyncio.Future())
+
+ prompt3 = g(m1, m2)
+
+ result = asyncio.gather(prompt3)
+ print(asyncio.run(result))
data/examples/code_competition/output/python_claude-instant-1.rb
@@ -0,0 +1,18 @@
+ '''
+ ### library (python)
+ # no lib needed
+
+ ### problem specific code
+ import asyncio
+
+ async def prompt2_analyze_code():
+     tasks = [Q(m1,"prompt1_write_code"), Q(m2,"prompt1_write_code"), Q(m3,"prompt1_write_code")]
+     await asyncio.gather(*tasks)
+
+ async def prompt3_summarize():
+     tasks = [Q(m1, await prompt2_analyze_code()), Q(m2, await prompt2_analyze_code())]
+     await asyncio.gather(*tasks)
+
+ asyncio.run(prompt3_summarize())
+ result = Q(m1, await prompt3_summarize())
+ '''
data/examples/code_competition/output/python_gpt-3.5-turbo-16k.rb
@@ -0,0 +1,69 @@
+ ### library ( python )
+
+ ```python
+ import threading
+
+ class APICall:
+     def __init__(self, func, args):
+         self.func = func
+         self.args = args
+         self.result = None
+         self.finished = False
+         self.lock = threading.Lock()
+
+     def finish(self, result):
+         with self.lock:
+             self.result = result
+             self.finished = True
+             self.lock.notify_all()
+
+ class Q:
+     def __init__(self, module, prompt):
+         self.module = module
+         self.prompt = prompt
+
+     def __call__(self, *args):
+         api_call = APICall(self.module.Q, (self.prompt,) + args)
+         threading.Thread(target=self.module.Q, args=((self.prompt,) + args, api_call.finish)).start()
+         return api_call
+
+ ### problem specific code
+
+ def f(*args):
+     print("API call f with args:", args)
+     # Perform the API call using external library or code
+     return "Some result from f"
+
+ def g(*args):
+     print("API call g with args:", args)
+     # Perform the API call using external library or code
+     return "Some result from g"
+
+ def prompt1_write_code(arg):
+     print("Prompt 1:", arg)
+     # Perform some operation using external library or code
+
+ def prompt2_analyze_code(arg):
+     print("Prompt 2:", arg)
+     # Perform some operation using external library or code
+
+ def prompt3_summarize(arg):
+     print("Prompt 3:", arg)
+     # Perform some operation using external library or code
+
+ m1 = None  # Placeholder for module 1, replace with actual module
+ m2 = None  # Placeholder for module 2, replace with actual module
+ m3 = None  # Placeholder for module 3, replace with actual module
+
+ def main():
+     prompt2_analyze_code = f(Q(m1, prompt1_write_code), Q(m2, prompt1_write_code), Q(m3, prompt1_write_code))
+     prompt3_summarize = g(Q(m1, prompt2_analyze_code), Q(m2, prompt2_analyze_code))
+     result = Q(m1, prompt3_summarize)
+
+     print("Result:", result.result)  # Wait for the result, if needed
+
+ if __name__ == '__main__':
+     main()
+ ```
+
+ Note: Replace the `print` statements and function bodies with actual implementation according to your requirements.
data/examples/code_competition/output/python_gpt-3.5-turbo.rb
@@ -0,0 +1,43 @@
+ ### library ( python )
+ ```python
+ import threading
+
+ class Q:
+     def __init__(self, caller, func):
+         self.caller = caller
+         self.func = func
+         self.result = None
+         self.thread = None
+         self.lock = threading.Lock()
+
+     def __call__(self, *args, **kwargs):
+         if self.result is None:
+             with self.lock:
+                 if self.result is None:
+                     self.thread = threading.Thread(target=self.execute, args=args, kwargs=kwargs)
+                     self.thread.start()
+                     self.thread.join()
+         return self.result
+
+     def execute(self, *args, **kwargs):
+         self.result = self.func(*args, **kwargs)
+
+ def f(*args, **kwargs):
+     return sum(args)
+
+ def g(*args, **kwargs):
+     return sum(args)
+ ```
+
+ ### problem specific code
+ ```python
+ m1 = "John"
+ m2 = "Jane"
+ m3 = "David"
+
+ prompt1_write_code = "Write code"
+
+ prompt2_analyze_code = f(Q(m1, prompt1_write_code), Q(m2, prompt1_write_code), Q(m3, prompt1_write_code))
+ prompt3_summarize = g(Q(m1, prompt2_analyze_code), Q(m2, prompt2_analyze_code))
+ result = Q(m1, prompt3_summarize)
+ ```
data/examples/code_competition/output/python_gpt-4.rb
@@ -0,0 +1,34 @@
+ '''
+ ### library ( python )
+
+ import asyncio
+ import concurrent.futures
+
+ class AsyncAPI:
+     def __init__(self):
+         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
+
+     def Queue(self, function, *args):
+         loop = asyncio.get_event_loop()
+         return loop.run_in_executor(self.executor, function, *args)
+
+ asyncApi = AsyncAPI()
+
+ ### problem specific code
+
+ def Q(model, task):
+     ## write the actual code for api call
+     pass
+
+ def f(*results):
+     ## write the actual code for analyze code
+     pass
+
+ def g(*results):
+     ## write the actual code for summarize
+     pass
+
+ prompt2_analyze_code = f( asyncApi.Queue(Q,m1,prompt1_write_code), asyncApi.Queue(Q,m2,prompt1_write_code), asyncApi.Queue(Q,m3,prompt1_write_code) )
+ prompt3_summarize = g( asyncApi.Queue(Q,m1,prompt2_analyze_code), asyncApi.Queue(Q,m2,prompt2_analyze_code) )
+ result = asyncApi.Queue(Q, m1, prompt3_summarize)
+ '''
data/examples/code_competition/output/ruby_claude-2.rb
@@ -0,0 +1,22 @@
+ ### library (ruby)
+ # no lib needed
+
+ ### problem specific code
+
+ m1 = Q.async(:prompt1_write_code)
+ m2 = Q.async(:prompt1_write_code)
+ m3 = Q.async(:prompt1_write_code)
+
+ m1.wait
+ m2.wait
+ m3.wait
+
+ prompt2_analyze_code = f.async(m1, m2, m3)
+
+ m1.wait
+ m2.wait
+ m3.wait
+
+ prompt3_summarize = g.async(m1, m2)
+
+ result = prompt3_summarize.wait
data/examples/code_competition/output/ruby_claude-instant-1.rb
@@ -0,0 +1,20 @@
+ ### library ( ruby )
+ # no lib needed
+
+ ### problem specific code
+ require 'concurrent'
+
+ executor = Concurrent::CachedThreadPool.new
+
+ prompt2_analyze_code = -> do
+   f(executor.post { Q(m1, prompt1_write_code) },
+     executor.post { Q(m2, prompt1_write_code) },
+     executor.post { Q(m3, prompt1_write_code) })
+ end
+
+ prompt3_summarize = -> do
+   g(executor.post { prompt2_analyze_code.call },
+     executor.post { prompt2_analyze_code.call })
+ end
+
+ result = executor.post { prompt3_summarize.call }.await
data/examples/code_competition/output/ruby_gpt-3.5-turbo-16k.rb
@@ -0,0 +1,27 @@
+ ### library (ruby)
+
+ No library is needed for this problem.
+
+ ### problem specific code
+
+ ```ruby
+ def Q(module_name, statement)
+   # Code to make API call and return the result
+   # (Assuming implementation for making API call is already done)
+ end
+
+ def f(*statements)
+   # Code to process multiple statements asynchronously
+ end
+
+ def g(*statements)
+   # Code to process multiple statements asynchronously
+ end
+
+ # Example usage
+ prompt2_analyze_code = f(Q(:m1, Q(:prompt1_write_code)), Q(:m2, Q(:prompt1_write_code)), Q(:m3, Q(:prompt1_write_code)))
+ prompt3_summarize = g(Q(:m1, prompt2_analyze_code), Q(:m2, prompt2_analyze_code))
+ result = Q(:m1, prompt3_summarize)
+ ```
+
+ Note: The implementation of making API calls and processing statements asynchronously is not provided as it is specific to the API being used and the requirements of processing the statements.
data/examples/code_competition/output/ruby_gpt-3.5-turbo.rb
@@ -0,0 +1,30 @@
+ ### library (ruby)
+ ```ruby
+ module Q
+   def self.call(api, prompt)
+     # logic for making an API call
+   end
+ end
+ ```
+
+ ### problem specific code
+ ```ruby
+ m1 = "m1"
+ m2 = "m2"
+ m3 = "m3"
+
+ prompt1_write_code = "prompt1_write_code"
+
+ prompt2_analyze_code = Q.call(m1, prompt1_write_code)
+ Q.call(m2, prompt1_write_code)
+ Q.call(m3, prompt1_write_code)
+
+ prompt3_summarize = Q.call(m1, prompt2_analyze_code)
+ Q.call(m2, prompt2_analyze_code)
+
+ result = Q.call(m1, prompt3_summarize)
+ ```
+
+ In the above code, I have created a module `Q` which defines a `call` method to make the API call. The `call` method takes two parameters - the API name and the prompt. This allows us to easily make API calls without waiting for the answer, unless the answer is needed to proceed.
+
+ The `problem specific code` section shows how the `Q` module can be used to solve the problem mentioned in the example. The API calls are made in the desired sequence, and the result is obtained by making the necessary API calls in the desired order.
data/examples/code_competition/output/ruby_gpt-4.rb
@@ -0,0 +1,31 @@
+ ```ruby
+ ### library ( ruby )
+
+ require 'concurrent'
+
+ class Q
+   def initialize(m, action)
+     @future = Concurrent::Future.execute do
+       m.send(action)
+     end
+   end
+
+   def result
+     @future.value
+   end
+ end
+
+ ### problem specific code
+
+ def f(*actions)
+   actions.map(&:result)
+ end
+
+ def g(*actions)
+   actions.map(&:result)
+ end
+
+ prompt2_analyze_code = f( Q.new(m1, :prompt1_write_code), Q.new(m2, :prompt1_write_code), Q.new(m3, :prompt1_write_code))
+ prompt3_summarize = g( Q.new(m1, :prompt2_analyze_code), Q.new(m2, :prompt2_analyze_code))
+ result = Q.new(m1, :prompt3_summarize).result
+ ```
data/examples/code_competition/output/ruby_human.rb
@@ -0,0 +1,41 @@
+
+ # library code - can be reused for other applications
+ class Future
+   def initialize(&block)
+     @thread = Thread.new(&block)
+   end
+
+   def value
+     @thread.value
+   end
+ end
+
+ ####################
+ # code specific to problem
+
+ def Q(m, p)
+   Future.new do
+     # Define the asynchronous operation here
+   end
+ end
+
+ p1 = # define p1
+ m1 = # define m1
+ m2 = # define m2
+
+ # Create futures for Q(m1, p1) and Q(m2, p1), will be evaluated in parallel
+ q1_p1 = Q(m1, p1)
+ q2_p1 = Q(m2, p1)
+
+ # Retrieve values and calculate p2; this blocks until both values are there
+ p2 = f(q1_p1.value, q2_p1.value)
+
+ # Create futures for Q(m1, p2) and Q(m2, p2), will be evaluated in parallel
+ q1_p2 = Q(m1, p2)
+ q2_p2 = Q(m2, p2)
+
+ # Retrieve values and calculate p3, blocks until both values are there
+ p3 = g(q1_p2.value, q2_p2.value)
+
+ # Calculate the final result
+ result = Q(m1, p3).value # blocks until value is there
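The human answer leaves `Q`'s body, `f`, `g`, `p1`, `m1`, and `m2` as placeholders, so the file does not run as written. A hypothetical set of stand-ins (not part of the package) that would make the Future-based flow executable:

```ruby
# Hypothetical stand-ins; real definitions would wrap actual LLM API calls.
def f(a, b)
  "analysis of: #{a} / #{b}"
end

def g(a, b)
  "summary of: #{a} / #{b}"
end

def Q(m, p)
  Future.new do   # reuses the Future class from the answer above
    sleep 0.1     # stand-in for a slow API call
    "#{m} answered: #{p}"
  end
end

p1 = "write code"
m1 = "model-1"
m2 = "model-2"
```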
data/examples/code_competition/templates/analyze_code.erb
@@ -0,0 +1,33 @@
+ I gave the following programming task to some LLMs and a human, in different programming languages:
+ '''
+ <%= text %>
+ '''
+ Below are the answers:
+ <% for l, m in code.keys %>
+ ---
+ Language: <%= l %>
+ Author: <%= m %>
+ '''
+ <%= code[[l,m]] %>
+ '''
+ <% end %>
+ ---
+ Write a brief critique of each answer. Carefully check whether the provided code will work and whether it maximizes parallelism.
+ Rate the elegance of the problem-specific code; disregard the library code when rating elegance.
+ Then, analyze which language works better for this, and which LLMs did best.
+ Finally: Can you draw any conclusions about connections between the LLMs and the languages?
+ For example, does it seem like a particular LLM prioritizes patterns that are more common in one language than another?
+ At the end of your answer, after a "---" separator, write a json array with the various ratings for each answer, on a scale of 0..1, like this:
+ ---
+ [
+ <% for l, m in code.keys %>
+ {
+ lang: <%=l%>,
+ model: <%=m%>,
+ elegance: 0.42, # reason for elegance score
+ parallelism: 0.42, # reason for parallelism score
+ correctness: 0.42, # reason for correctness score
+ instructions_conformity: 0.42 }, # reason for this score; it should measure how well the answer conforms to the instructions
+ <% end %>
+ ]
+ (Replace 0.42 with your rating for each answer)
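Since the template asks for the ratings as a json array after a final "---" separator, a consumer could split the completion and parse the tail. A hypothetical post-processing sketch (not in the package), assuming the model emits strictly valid JSON; note the example block above uses unquoted keys, which `JSON.parse` would reject:

```ruby
require 'json'

# Split the prose critique from the ratings block on the last "---" line,
# then parse the ratings. `completion` is assumed to hold the model's answer.
def parse_ratings(completion)
  _critique, _sep, ratings_json = completion.rpartition("\n---\n")
  JSON.parse(ratings_json)
end

# parse_ratings(completion).each do |r|
#   puts "#{r['lang']}/#{r['model']}: elegance=#{r['elegance']}, " \
#        "parallelism=#{r['parallelism']}, correctness=#{r['correctness']}"
# end
```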
data/examples/code_competition/templates/write_code.erb
@@ -0,0 +1,26 @@
+
+ I am working on a <%= language %> library for developing software that frequently requires use of external APIs.
+ It can take a while for the external API calls to finish, so we want to make it easy for the developer
+ to send API calls without waiting for the answer, unless the answer is needed to proceed.
+
+ For example:
+ '''
+ prompt2_analyze_code = f( Q(m1,prompt1_write_code), Q(m2,prompt1_write_code), Q(m3,prompt1_write_code))
+ prompt3_summarize = g( Q(m1,prompt2_analyze_code), Q(m2,prompt2_analyze_code))
+ result = Q(m1,prompt3_summarize)
+ '''
+ The API calls are in function Q. Q, f, g do not have side effects we need to worry about.
+
+ Write a <%= language %> library for this, and then show how it can be used for the example above.
+ If no library is needed, or one already exists, then no need to write one - just write "# no lib needed"
+
+ Structure your code like this:
+ '''
+ ### library ( <%= language %> )
+ (library code if needed, or # no lib needed)
+
+ ### problem specific code
+ (elegant code that solves the problem above)
+ '''
+ Your goal is to make the (non-library) code as short, readable, and elegant as possible.
+ Do not explain your code -- only respond with the code itself.
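For reference, here is one shape of answer this template is soliciting, sketched with the concurrent-ruby gem. The sketch is not part of the package; `call_api`, `m1`..`m3`, and `prompt1_write_code` are hypothetical stand-ins, and the competition outputs above show what the models actually produced.

```ruby
### library ( ruby )
require 'concurrent'  # concurrent-ruby gem

# Q fires the call on a background thread and returns a future immediately;
# nothing blocks until someone reads .value.
def Q(model, prompt)
  Concurrent::Promises.future { call_api(model, prompt) }  # call_api is a stand-in
end

# f and g block on .value only when a result is consumed, so every future
# passed in has been running in parallel since it was created.
def f(*futures)
  futures.map(&:value).join("\n")
end

def g(*futures)
  futures.map(&:value).join("\n")
end

### problem specific code
prompt2_analyze_code = f(Q(m1, prompt1_write_code), Q(m2, prompt1_write_code), Q(m3, prompt1_write_code))
prompt3_summarize    = g(Q(m1, prompt2_analyze_code), Q(m2, prompt2_analyze_code))
result               = Q(m1, prompt3_summarize).value
```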
data/examples/glim_demo/ask_all.rb
@@ -0,0 +1,35 @@
+ require_relative '../../lib/globals'
+ glim = GlimContext.new(log_name: "ask_all")
+ # the two lines above are the only ones you need to add to your own code
+
+ # in this example, we want to compare the answers of these different models
+ models = ["claude-instant-1", "gpt-3.5-turbo"]
+
+ # we will ask this question to each model
+ question = "If, in some cataclysm, all of scientific knowledge were to be destroyed, and only one sentence passed on to the next generation of creatures, what statement would contain the most information in the fewest words?"
+
+ responses = {}
+ for model in models
+   # construct a request that will be sent to the LLM
+   request = glim.request(llm_name: model)
+   request.prompt = question
+   # reading request.response sends the request to the model specified in the request
+   responses[model] = request.response
+ end
+
+ # now we can rate and summarize the answers
+
+ # construct a request using an erb template. The template is in the templates directory
+ # and is called "rate_all.erb". We pass the question and the hash with all of the
+ # answers to the template.
+ request = glim.request_from_template("rate_all", question:, answers: responses)
+
+ # the request now contains a prompt that is based on the template and the
+ # arguments that we passed to the template (question and answers)
+ puts request.inspect
+
+ # send the request and print the completion it generated
+ response = request.response
+ puts response.completion
+
+
data/examples/glim_demo/templates/rate_all.erb
@@ -0,0 +1,24 @@
+ <%
+   # This is the template for rendering the prompt for rating all answers to a question.
+   req.llm_name = 'gpt-3.5-turbo'
+ %>
+ <%= prompt_output_files %>
+
+ ---
+
+ Below are different answers to the following question:
+ "<%= question # this will insert the question passed in to the erb template
+ %>"
+
+ <%
+   # iterate over the answers we want to rate so that they will all be in the prompt
+   answers.each_pair do |model, answer|
+ %>
+ <%= model %>'s response: <%= answer.completion %>
+ <% end %>
+
+ <%
+   # now we will instruct the LLM to rate each of the answers:
+ %>
+ Write a brief critique of each answer.
+ Then, generate a string in json format providing, for each respondent, a rating for that respondent's response on a scale of 0 to 1.
data/examples/improve_prompt/improve_prompt.rb
@@ -0,0 +1,62 @@
+ require_relative '../../lib/globals'
+
+ glim = GlimContext.new
+
+ testcases = [
+   # [ "gen_two_files", "Generate two files named f1 and f2, which each contain the word `hello` and nothing else" ],
+   # [ "gen_word_list", "Generate a file named `word_list` which contains the first 5 words from the NATO phonetic alphabet, each in its own line." ],
+   # [ "write_code", "Write a program in ruby called 'count_lines.rb' which reads a file and prints the number of lines in it. Also generate a file for testing your code."],
+   [ "gen_word_list_subdir", "Generate a file named `word_list` which contains the first 5 words from the NATO phonetic alphabet, each in its own line. It should go into a subdirectory called 'fun_words'" ]
+ ]
+
+ llm_names = ["gpt-3.5-turbo", "claude-instant-1"]
+
+ responses = {}
+
+ Dir.glob(File.join(__dir__, "templates/try_*.erb")) do |try_path|
+   try = File.basename(try_path, '.erb')
+   responses[try] = {}
+   for test_name, test_prompt in testcases
+     responses[try][test_name] = {}
+     for llm_name in llm_names
+       # puts "LLM = #{llm_name}, testing #{try} with prompt #{test_prompt}"
+       req = glim.request_from_template(try, test_prompt: test_prompt)
+       req.llm_name = llm_name
+       req.temperature = 0.0
+       responses[try][test_name][llm_name] = req.response
+     end
+   end
+ end
+
+ extracted_info = {}
+ for try in responses.keys
+   extracted_info[try] = {}
+   for test_name in responses[try].keys
+     extracted_info[try][test_name] = {}
+     baseline_extracted_info = nil
+     baseline_completion = nil
+     for llm_name in llm_names # we want them in this order because the first one is the gold standard
+       completion = responses[try][test_name][llm_name].completion
+       extracted_by_llm = extract_and_save_files(completion)
+       if !baseline_extracted_info
+         baseline_extracted_info = extracted_by_llm
+         baseline_completion = completion
+         next
+       end
+       info = ""
+       if baseline_extracted_info[0] != extracted_by_llm[0]
+         info += "\n\nExtracted info_text differs:\n#{extracted_by_llm[0]}."
+         info += "\nBaseline was:\n#{baseline_extracted_info[0]}."
+       end
+       # if baseline_extracted_info[1] != extracted_by_llm[1]
+       #   info += "\n\nExtracted files differ: #{JSON.pretty_generate(extracted_by_llm[1])}."
+       #   info += "\nBaseline was: #{JSON.pretty_generate(baseline_extracted_info[1])}."
+       # end
+       if !info.empty?
+         puts "\n\n#{try} on test case #{test_name} with #{llm_name}:"
+         puts info
+         puts "\nCompletion was: \n#{completion}"
+       end
+     end
+   end
+ end
data/examples/improve_prompt/templates/stashed/prompt_attempt_explicit_steps.erb
@@ -0,0 +1,15 @@
+ <% req.replace_initial_system_message <<~SYSTEM
+
+ ALWAYS, when asked to generate one or more files, include such files in your response as follows:
+ 1. Write the following file separator line:
+ ```
+ 2. Write a line containing # followed by the relative pathname of the file, for example:
+ # File: relative_path_to_file/filename.suffix
+ 3. Write out the content of the file, line by line
+ 4. Write the following file separator line again:
+ ```
+
+ SYSTEM
+ %>
+
+ <%= test_prompt %>
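This template spells out the file-separator protocol that improve_prompt.rb exercises through `extract_and_save_files` (defined in `lib/globals.rb`, which is not shown in this diff; its call site suggests it returns an `[info_text, files]` pair). A hypothetical extractor for the format above, shown only to make the protocol concrete:

```ruby
require 'fileutils'

FENCE = /\A`{3}\s*\z/  # a file-separator line: exactly three backticks

# Hypothetical parser: returns [info_text, files], where info_text is the
# prose outside the separators and files maps each pathname to its content.
def extract_files(completion)
  info_text = +""
  files = {}
  in_file = false
  path = nil
  body = +""
  completion.each_line do |line|
    if FENCE.match?(line.chomp)
      files[path] = body if in_file && path
      in_file = !in_file
      path = nil
      body = +""
    elsif in_file && path.nil? && line.chomp =~ /\A# File: (.+)\z/
      path = Regexp.last_match(1).strip
    elsif in_file
      body << line
    else
      info_text << line
    end
  end
  [info_text, files]
end

# Usage sketch, assuming `completion` holds a model response in this format:
# extract_files(completion).last.each do |rel_path, content|
#   FileUtils.mkdir_p(File.dirname(rel_path))  # e.g. creates 'fun_words'
#   File.write(rel_path, content)
# end
```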
data/examples/improve_prompt/templates/stashed/prompt_attempt_explicit_steps_user_message.erb
@@ -0,0 +1,15 @@
+
+
+ SYSTEM MESSAGE: ALWAYS, when asked to generate one or more files, include such files in your response as follows:
+ 1. Write the following file separator line:
+ ```
+ 2. Write a line containing # followed by the relative pathname of the file, for example:
+ # File: relative_path_to_file/filename.suffix
+ 3. Write out the content of the file, line by line
+ 4. Write the following file separator line again:
+ ```
+
+ ---
+
+
+ <%= test_prompt %>
data/examples/improve_prompt/templates/stashed/prompt_attempt_initial.erb
@@ -0,0 +1,8 @@
+ <% req.replace_initial_system_message <<~SYSTEM
+
+ #{prompt_output_files}
+
+ SYSTEM
+ %>
+
+ <%= test_prompt %>
data/examples/improve_prompt/templates/stashed/prompt_attempt_nothing.erb
@@ -0,0 +1,19 @@
+
+
+ SYSTEM MESSAGE: ALWAYS, when asked to generate one or more files, include such files in your response as follows:
+ 1. Write the following file separator line:
+ ```
+ 2. Write a line containing # followed by the relative pathname of the file, for example:
+ # File: relative_path_to_file/filename.suffix
+ 3. Write out the content of the file, line by line
+ 4. Write the following file separator line again:
+ ```
+
+ ---
+
+
+ <%= test_prompt %>
+
+
+
+ <%= test_prompt %>
data/examples/improve_prompt/templates/try_code_first.erb
@@ -0,0 +1,13 @@
+
+ SYSTEM MESSAGE: ALWAYS, when asked to generate source code or other text files, use the following format:
+ <file pathname="path_to_file/hello.rb">
+ puts "Hello from Line 1"
+ puts "hello from Line 2"
+ </file>
+ So, the example above shows how you would include a file called "hello.rb" that belongs in the subdirectory "path_to_file" of the current directory.
+ The file would contain two "puts" statements.
+ Use this for all text files you generate, not just source code.
+
+ ---
+
+ <%= test_prompt %>
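The `<file pathname="...">` convention used by this template (and by the system-message variant that follows) implies an even smaller parser; a hypothetical sketch:

```ruby
# Hypothetical parser for the <file pathname="..."> format above; returns a
# hash mapping each pathname to its content.
def extract_tagged_files(completion)
  completion.scan(%r{<file pathname="([^"]+)">\n(.*?)\n?</file>}m).to_h
end

# extract_tagged_files(completion)["path_to_file/hello.rb"]
# # => "puts \"Hello from Line 1\"\nputs \"hello from Line 2\""
```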
data/examples/improve_prompt/templates/try_code_first_system.erb
@@ -0,0 +1,22 @@
+ <% req.replace_initial_system_message <<~SYSTEM
+
+ When asked to generate source code or other text files, use the following format:
+ <file pathname="path_to_file/hello.rb">
+ puts "Hello from Line 1"
+ puts "hello from Line 2"
+ </file>
+ So, the example above shows how you would include a file called "hello.rb" that belongs in the subdirectory "path_to_file" of the current directory.
+ The file would contain two "puts" statements.
+ Use this for all text files you generate, not just source code.
+
+ SYSTEM
+ %>
+
+
+ <%= test_prompt %>
+
+
+
+ ---
+
+ <%= test_prompt %>