cl-magic 0.3.9 → 1.2.0

@@ -0,0 +1,169 @@
+
+require 'json'
+require 'uri'
+require 'pp'
+require 'digest'
+require 'date'
+
+require 'tty-progressbar'
+require 'concurrent'
+
+require 'cl/magic/common/ai_text_splitter.rb'
+
+
+class AIPrompt
+  API_COMPLETIONS_PATH = "/openai/v1/chat/completions"
+  API_EMBEDDINGS_PATH = "/openai/v1/embeddings"
+  MAX_THREADS = 10 # set to 1 to debug without concurrency
+
+  def initialize(logger, cache_dir, max_chunk_size=10000, temperature=1)
+    @cache_dir = cache_dir
+    @logger = logger
+    @max_chunk_size = max_chunk_size
+    @temperature = temperature
+    @ai_text_splitter = AITextSplitter.new(@max_chunk_size, @logger)
+    @thread_pool = Concurrent::ThreadPoolExecutor.new(
+      min_threads: 0,
+      max_threads: MAX_THREADS,
+      max_queue: 0,
+      fallback_policy: :caller_runs
+    )
+  end
+
+  def gen_embeddings(input)
+    data = {
+      model: "text-embedding-ada-002",
+      input: input,
+    }
+    response = post_open_ai(API_EMBEDDINGS_PATH, data.to_json)
+    return response["data"][0]["embedding"]
+  end
+
+  def prompt(raw_data, prompt, split_as_markdown=false, separator)
+
+    # split
+    split_data = @ai_text_splitter.split(raw_data, split_as_markdown, separator)
+
+    # summarize
+    responses = summarize_split_text(split_data, prompt, split_as_markdown)
+
+    # map and return
+    return responses.collect do |json|
+      json["choices"].map {|c| c["message"]["content"]}.join("\n")
+    end
+  end
+
+  def clear_cache()
+    Dir.glob(File.join(get_cache_path, '*.json')).each do |file|
+      File.delete(file)
+    end
+  end
+
+  private
+
+  def do_concurently
+    if MAX_THREADS > 1
+      @thread_pool.post do
+        yield
+      end
+    else
+      yield
+    end
+  end
+
+  def wait_concurrently
+    if MAX_THREADS > 1
+      @thread_pool.shutdown
+      @thread_pool.wait_for_termination
+    end
+  end
+
+  def munge_prompt(text, prompt)
+    final_prompt = "#{prompt}"
+
+    if text.length > @max_chunk_size
+      half = text.length / 2
+      final_prompt = "#{prompt}. Summarize it and keep it under #{half} characters"
+    end
+
+    return final_prompt
+  end
+
+  def summarize_split_text(split_text, prompt, split_as_markdown)
+
+    bar = TTY::ProgressBar.new("processing #{split_text.count} chunks [:bar]", total: split_text.count)
+
+    json_responses = []
+    split_text.each do |text|
+      do_concurently do
+        final_prompt = munge_prompt(text, prompt)
+        messages = [
+          { role: "user", content: final_prompt },
+          { role: "user", content: text }
+        ]
+        json_responses << post_open_ai(API_COMPLETIONS_PATH, {
+          messages: messages
+        }.to_json)
+        bar.advance
+      end
+    end
+
+    # wait
+    wait_concurrently
+    return json_responses
+  end
+
+  def post_open_ai(endpoint, data)
+    # url
+    api_url = ENV["OPENAPI_URL"]
+    final_url = URI.join(api_url, endpoint)
+
+    # data
+    sanitized_data = data.gsub("'", "")
+
+    # post
+    api_key = ENV["OPENAPI_KEY"]
+    cmd = """
+      curl -s -X POST \
+        '#{final_url}' \
+        -H 'Content-Type: application/json' \
+        -H 'Authorization: Bearer #{api_key}' \
+        -d '#{sanitized_data}'
+    """
+    response_text = `#{cmd}`
+    begin
+      timestamp = DateTime.now.strftime("%Y%m%d%H%M%S")
+      response_hash = JSON.parse(response_text)
+
+      # completions
+      raise if endpoint == API_COMPLETIONS_PATH and not response_hash.key?("choices")
+
+      # cache
+      save_to_cache(sanitized_data, timestamp, "request")
+      save_to_cache(response_text, timestamp, "response")
+
+      # response
+      return response_hash
+    rescue => e
+      #@logger.error e
+      @logger.error response_text
+      exit
+    end
+  end
+
+  def get_cache_path
+    cache_path = File.join(@cache_dir, ".open_ai_cache")
+    Dir.mkdir(cache_path) if !File.directory?(cache_path)
+    return cache_path
+  end
+
+  def save_to_cache(json_string, timestamp, postfix)
+    unless @cache_dir.nil?
+      current_datetime = DateTime.now.strftime("%Y%m%d%H%M%S")
+      filepath = File.join(get_cache_path, "#{current_datetime}_#{postfix}.json")
+      File.open(filepath, "w") do |file|
+        file.write(JSON.pretty_generate(JSON.parse(json_string)))
+      end
+    end
+  end
+end
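For orientation, a minimal usage sketch of the new AIPrompt class follows. The require path is an assumption (only the text-splitter path appears in this diff), the class reads OPENAPI_URL and OPENAPI_KEY from the environment, and the endpoint, key, and cache directory shown here are placeholders.

```ruby
require 'logger'
require 'cl/magic/common/ai_prompt' # assumed path; not shown in this diff

ENV['OPENAPI_URL'] ||= 'https://api.openai.com' # placeholder endpoint
ENV['OPENAPI_KEY'] ||= 'sk-...'                 # placeholder key

logger = Logger.new($stderr)
ai = AIPrompt.new(logger, '/tmp/cl-magic-cache') # placeholder cache dir

# Long input is split, each chunk is summarized on the thread pool, and the
# per-chunk completions come back as an array of strings.
summaries = ai.prompt(File.read('CHANGELOG.md'), 'Summarize the following text', false, "\n\n")
puts summaries.join("\n\n")

# Embeddings are returned as a plain array of floats.
puts ai.gen_embeddings('hello world').length
```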
@@ -0,0 +1,78 @@
+require 'baran'
+
+class AITextSplitter
+
+  def initialize(max_chunk_size, logger)
+    @max_chunk_size = max_chunk_size
+    @cut_off = (@max_chunk_size + (@max_chunk_size * 0.1)).floor
+    @logger = logger
+  end
+
+  def split(data, split_as_markdown=false, separator)
+    return markdown_to_array(data) if split_as_markdown
+    return basic_split_then_reduce(data, separator)
+  end
+
+  private
+
+  #
+  # Separator variants
+  #
+
+  # basic splitter; would lose context when splits got too small
+  def basic_splitter(data, separator)
+    separator = "\n\n" if separator.nil? or separator.empty?
+    splitter = Baran::CharacterTextSplitter.new(chunk_size: @max_chunk_size, chunk_overlap: 64, separator: separator)
+    chunks = splitter.chunks(data).collect {|c| c[:text]}
+    return reduce_to_max_size(chunks)
+  end
+
+  # Preferred: provides even better context by insisting on splits near max_chunk_size
+  def basic_split_then_reduce(data, separator)
+    chunks = basic_splitter(data, separator)
+    return reduce_to_max_size(chunks)
+  end
+
+  # User can hint at split points; it didn't work great
+  def recursive_splitter(data, separator)
+    separator = ([separator] + ["\n\n"]).compact
+    splitter = Baran::RecursiveCharacterTextSplitter.new(
+      chunk_size: @max_chunk_size, chunk_overlap: 64,
+      separators: separator
+    )
+    chunks = splitter.chunks(data).collect {|c| c[:text]}
+    return reduce_to_max_size(chunks)
+  end
+
+  #
+  # Markdown
+  #
+
+  def markdown_to_array(data)
+    splitter = Baran::MarkdownSplitter.new()
+    return splitter.chunks(data).collect {|c| c[:text]}
+  end
+
+  #
+  # Splitting is done by separator and the LLM can respond
+  # with content of any length. Reduce the chunks by
+  # combining smaller responses up to @max_chunk_size.
+  #
+
+  def reduce_to_max_size(chunks)
+    combined = []
+    i = 0
+    while i < chunks.length
+      c = chunks[i]
+      n = chunks[i + 1]
+      unless n.nil? or (c.length + n.length) > @cut_off
+        combined << [c, n].join("\n")
+        i += 2
+      else
+        combined << c
+        i += 1
+      end
+    end
+    combined
+  end
+end
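A minimal sketch of the splitter on its own follows, assuming the baran gem is installed and using the require path referenced from the AIPrompt file above; the sample text and chunk size are illustrative only.

```ruby
require 'logger'
require 'cl/magic/common/ai_text_splitter.rb'

splitter = AITextSplitter.new(500, Logger.new($stderr))

# Separator-based split; adjacent small chunks are re-combined up to ~110% of max_chunk_size.
text = (["lorem ipsum " * 10] * 8).join("\n\n")
chunks = splitter.split(text, false, "\n\n")
chunks.each_with_index { |c, i| puts "chunk #{i}: #{c.length} chars" }

# Markdown mode delegates to Baran::MarkdownSplitter instead.
puts splitter.split("# Title\n\nbody\n\n## Section\n\nmore", true, nil).count
```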
@@ -17,7 +17,7 @@ def add_help(opts)
 end
 
 def ask_and_store_option(options, key, question)
-  if options[key].nil?
+  if options[key].nil? or options[key].empty?
     options[key] = TTY::Prompt.new.ask(question)
   end
 end
@@ -0,0 +1,41 @@
+class Elastic
+  def initialize(elastic_url)
+    @elastic_url = elastic_url
+  end
+
+  def query_by_id(ids)
+    url = "/_search"
+    verb = "POST"
+    data = {
+      query: {
+        terms: {
+          _id: ids
+        }
+      }
+    }
+    sanitized_data = data.to_json
+    return post(url, verb, data)
+  end
+
+  def post(url, verb, data)
+    final_url = "#{@elastic_url}/#{url}"
+
+    # sanitize
+    sanitized_data = data.to_json
+    ["'", "’"].each { |c| sanitized_data.gsub!(c, "\#{c}") }
+
+    # post
+    cmd = """
+      curl -s -X#{verb} \
+        #{final_url} \
+        -H 'Content-Type: application/json' \
+        -d '#{sanitized_data}'
+    """
+    return `#{cmd}`
+  end
+
+  def create_index(elastic_index, body)
+    url = "#{elastic_index}"
+    return post(url, "PUT", body)
+  end
+end
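A minimal sketch of the Elastic wrapper, assuming an Elasticsearch cluster at a placeholder URL and placeholder index and document ids; note that post serializes the body itself, so create_index takes a plain hash rather than a JSON string.

```ruby
require 'json'

es = Elastic.new('http://localhost:9200') # placeholder cluster URL

# Create an index, then fetch a handful of documents by id.
es.create_index('tickets', { settings: { number_of_shards: 1 } })

response = es.query_by_id(['PROJ-1', 'PROJ-2'])
hits = JSON.parse(response).dig('hits', 'hits') || []
hits.each { |h| puts h['_id'] }
```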
@@ -1,12 +1,70 @@
-require 'byebug'
+require 'tty-progressbar'
+require 'tty-spinner'
+require 'concurrent'
 
 class Jira
 
+  MAX_THREADS = 20 # set to 1 to debug without concurrency
+
   def initialize(base_uri, username, token, break_at_one_page=false)
     @base_uri = base_uri.chomp("/")
     @username = username
     @token = token
     @break_at_one_page = break_at_one_page
+
+    @thread_pool = Concurrent::ThreadPoolExecutor.new(
+      min_threads: 0,
+      max_threads: MAX_THREADS,
+      max_queue: 0,
+      fallback_policy: :caller_runs
+    )
+  end
+
+  #
+  # Formatter
+  #
+
+  def self.jira_to_markdown(issue)
+
+    md = []
+    md << ""
+    md << "# #{issue['key']}"
+    md << "project: #{issue['fields']['project']['key']}"
+    md << "created: #{issue['fields']['created']}"
+    md << "updated: #{issue['fields']['updated']}"
+    md << "status: #{issue['fields']['status']['statusCategory']['name']}" unless issue['fields']["status"].nil?
+    md << "priority: #{issue['fields']['priority']['name']}"
+    md << "labels: #{issue['fields']['labels'].join(',')}"
+    md << "issue_type: #{issue['fields']['issuetype']['name']}" unless issue['fields']["issuetype"].nil?
+    md << "assignee: #{issue['fields']['assignee']['displayName']}" unless issue['fields']["assignee"].nil?
+    md << ""
+    md << "## Summary"
+    md << "#{issue['fields']['summary']}"
+    md << ""
+    md << ""
+    issue_md = md.join("\n")
+
+    comments = []
+    issue["comments"].each_with_index do |comment, i|
+      c_md = []
+      c_md << "### Comment - #{comment["author"]["displayName"]} "
+      c_md << ""
+      c_md << "created: #{comment["created"]}"
+
+      # nest markdown deeper
+      comment["body"].split("\n").each do |line|
+        c_md << if line.start_with?("#")
+          "####{line}"
+        else
+          line
+        end
+      end
+
+      c_md << ""
+      comments << [comment["id"], c_md.join("\n")]
+    end
+
+    return issue_md, comments
   end
 
   #
@@ -16,10 +74,12 @@ class Jira
   def get_epic_ids(project, epic_wildcard)
     jql_query = "project = \"#{project}\" AND issuetype = Epic AND text ~ \"#{epic_wildcard}\""
     results = run_jql_query(jql_query)
-    return results.select{|h| h['fields']['summary'].start_with? epic_wildcard}.map {|h| h['id']}
+    epics = results.select{|h| h['fields']['summary'].start_with? epic_wildcard}
+    epic_ids = epics.map {|h| h['id']}
+    return epic_ids, epics
   end
 
-  def get_issues(project, epic_ids)
+  def get_issues_by_epic_ids(project, epic_ids)
     jql_query = "project = \"#{project}\" AND parentEpic IN (#{epic_ids.join(',')})"
     return run_jql_query(jql_query)
   end
@@ -32,6 +92,14 @@ class Jira
     end
   end
 
+  def get_issue_comments(issue_key)
+    uri = URI.parse("#{@base_uri}/rest/api/2/issue/#{issue_key}/comment")
+    jira_get(uri) do |response|
+      result = JSON.parse(response.body)
+      return result["comments"]
+    end
+  end
+
   #
   # Helpers: GET & POST
   #
@@ -49,7 +117,11 @@ class Jira
     if response.code == '200'
       yield response
     else
-      raise "Jira query failed with HTTP status code #{response.code}"
+      raise """
+        Jira query failed with HTTP status code #{response.code}
+
+        #{response.body}
+      """
     end
   end
 
@@ -68,7 +140,13 @@ class Jira
     if response.code == '200'
       yield response
     else
-      raise "Jira query failed with HTTP status code #{response.code}"
+      raise """
+        Jira query failed with HTTP status code #{response.code}
+
+        BODY: #{body.to_json}
+
+        RESPONSE: #{response.body}
+      """
     end
   end
 
@@ -77,6 +155,9 @@ class Jira
   #
 
   def run_jql_query(jql)
+    spinner = TTY::Spinner.new("[:spinner] fetching ...", format: :pulse_2)
+    spinner.auto_spin # Automatic animation with default interval
+
     start_at = 0
     max_results = 50
     total_results = nil
@@ -110,53 +191,100 @@ class Jira
           start_at += max_results # else next page
         end
       end
-
-      print '.' # loop
     end
+    spinner.stop("#{all_results.count} issues")
     all_results.map {|h| h}
   end
-end
 
-#
-# Collect status changelogs
-#
-# Given an array of jira issue hashes
-# * fetch the change log
-# * filter down to status changes
-# * add it to the issue hash as ["status_changelogs"]
-#
-
-def collect_status_changelogs(jira, issues, options)
-  final_issue_hashes = []
-
-  issues.each do |issue|
-    issue_key = issue["key"]
-    issue["status_changelogs"] = []
-
-    # fetch change log
-    print '.'
-    changelogs = jira.get_issue_status_changelog(issue_key)
-
-    changelogs.each do |change_log|
-
-      # all items that are status changes
-      status_logs = change_log["items"].select {|i| i["field"]=="status"}
-      status_logs = status_logs.collect do |status_log|
-        {
-          "key": issue_key,
-          "created": change_log["created"],
-          "toString": status_log["toString"],
-          "fromString": status_log["fromString"]
-        }
+  def collect_comments(jira, issues)
+    final_issue_hashes = []
+    bar = TTY::ProgressBar.new("fetching [:bar]", total: issues.count)
+
+    issues.each do |issue|
+      do_concurently do
+        issue_key = issue["key"]
+        issue["comments"] = []
+
+        # fetch comments
+        comments = get_issue_comments(issue_key)
+        issue["comments"] = comments
+        final_issue_hashes << issue # save
+        bar.advance
      end
+    end
+
+    # wait
+    wait_concurrently
+    return final_issue_hashes
+  end
+
+  #
+  # Collect status changelogs
+  #
+  # Given an array of jira issue hashes
+  # * fetch the change log
+  # * filter down to status changes
+  # * add it to the issue hash as ["status_changelogs"]
+  #
+
+  def collect_status_changelogs(jira, issues)
+    final_issue_hashes = []
+    bar = TTY::ProgressBar.new("fetching [:bar]", total: issues.count)
+
+    issues.each do |issue|
+      do_concurently do
+        issue_key = issue["key"]
+        issue["status_changelogs"] = []
+
+        # fetch change log
+        changelogs = get_issue_status_changelog(issue_key)
+
+        changelogs.each do |change_log|
+
+          # all items that are status changes
+          status_logs = change_log["items"].select {|i| i["field"]=="status"}
+          status_logs = status_logs.collect do |status_log|
+            {
+              "key": issue_key,
+              "created": change_log["created"],
+              "toString": status_log["toString"],
+              "fromString": status_log["fromString"]
+            }
+          end
 
-      # append them to issue
-      status_logs.each do |status_log|
-        issue["status_changelogs"] << status_log
-      end if status_logs.count > 0
+          # append them to issue
+          status_logs.each do |status_log|
+            issue["status_changelogs"] << status_log
+          end
+        end
+
+        final_issue_hashes << issue # save
+        bar.advance
+      end
     end
 
-    final_issue_hashes << issue # save
+    # wait
+    wait_concurrently
+    return final_issue_hashes
   end
-  return final_issue_hashes
+
+  private
+
+  def do_concurently
+    if MAX_THREADS > 1
+      @thread_pool.post do
+        yield
+      end
+    else
+      yield
+    end
+  end
+
+  def wait_concurrently
+    if MAX_THREADS > 1
+      @thread_pool.shutdown
+      @thread_pool.wait_for_termination
+    end
+  end
+
 end
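Taken together, the new Jira helpers support a fetch-then-format pipeline. A minimal sketch of how they might be wired up follows; the require path for the Jira class is not shown in this diff, the site, credentials, and project key are placeholders, and it assumes the existing get_issue_status_changelog helper (unchanged here) is still available.

```ruby
# Placeholders: site, user, token, and project key are illustrative only.
jira = Jira.new('https://example.atlassian.net', 'me@example.com', ENV['JIRA_TOKEN'])

# Epics whose summary starts with the wildcard, then their child issues.
epic_ids, _epics = jira.get_epic_ids('PROJ', '2024-Q1')
issues = jira.get_issues_by_epic_ids('PROJ', epic_ids)

# Comments and status changelogs are fetched concurrently on the thread pool.
issues = jira.collect_comments(jira, issues)
issues = jira.collect_status_changelogs(jira, issues)

issues.each do |issue|
  issue_md, comments = Jira.jira_to_markdown(issue)
  puts issue_md
  comments.each { |_id, comment_md| puts comment_md }
end
```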
@@ -0,0 +1,78 @@
+
+class Milvus
+  def initialize(host, port)
+    @host = host
+    @port = port
+  end
+
+  def search(collection_name, embedding)
+    final_url = "http://#{@host}:#{@port}/v1/vector/search"
+    data = {
+      collectionName: collection_name,
+      vector: embedding,
+      outputFields: ["id", "name", "doc_key", "distance"],
+    }
+
+    # post
+    sanitized_data = data.to_json
+    cmd = """
+      curl -s \
+        '#{final_url}' \
+        -X 'POST' \
+        -H 'accept: application/json' \
+        -H 'Content-Type: application/json' \
+        -d '#{sanitized_data}'
+    """
+    return `#{cmd}`
+  end
+
+  def create_collection(collection_name)
+    final_url = "http://#{@host}:#{@port}/v1/vector/collections/create"
+    data = {
+      dbName: "default",
+      collectionName: collection_name,
+      dimension: 1536,
+      metricType: "L2",
+      primaryField: "id",
+      vectorField: "vector"
+    }
+
+    # post
+    sanitized_data = data.to_json
+    cmd = """
+      curl -s \
+        '#{final_url}' \
+        -X 'POST' \
+        -H 'accept: application/json' \
+        -H 'Content-Type: application/json' \
+        -d '#{sanitized_data}'
+    """
+    return `#{cmd}`
+  end
+
+  def post_to_collection(collection_name, doc_key, embedding)
+    final_url = "http://#{@host}:#{@port}/v1/vector/insert"
+    data = {
+      collectionName: collection_name,
+      data: {
+        doc_key: doc_key,
+        vector: embedding
+      }
+    }
+
+    # post
+    sanitized_data = data.to_json
+    cmd = """
+      curl -s \
+        '#{final_url}' \
+        -X POST \
+        -H 'accept: application/json' \
+        -H 'Content-Type: application/json' \
+        -d '#{sanitized_data}'
+    """
+    response = `#{cmd}`
+    data = JSON.parse(response)
+    raise "Error: #{data.to_json}\n\nData #{sanitized_data}" if data.has_key?("message")
+    return data.to_json
+  end
+end
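A minimal sketch of the embedding round trip this Milvus wrapper enables, assuming a Milvus instance whose RESTful v1 API is reachable at the host and port given (placeholders here), and a collection name chosen by the caller; the random vector stands in for AIPrompt#gen_embeddings output.

```ruby
require 'json'

milvus = Milvus.new('localhost', 19530) # placeholder host/port

# The collection is created with dimension 1536, matching text-embedding-ada-002.
milvus.create_collection('docs')

embedding = Array.new(1536) { rand } # stand-in for AIPrompt#gen_embeddings output
milvus.post_to_collection('docs', 'PROJ-1', embedding)

puts milvus.search('docs', embedding)
```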
@@ -0,0 +1,29 @@
+
+class HelpPrinter
+
+  def initialize(logger)
+    @logger = logger
+  end
+
+  def print_dk_help_line(key, help)
+    if $stdout.isatty
+      if help.nil?
+        @logger.puts("#{key.ljust(15, ' ')} ???no help???")
+      else
+        key = key.ljust(15, ' ')
+        help_parts = help.split(";")
+
+        # first line
+        @logger.puts(key, help_parts.shift)
+
+        # following lines
+        padding = "".ljust(15, ' ')
+        help_parts.each do |p|
+          @logger.puts(padding, p)
+        end
+        @logger.puts("") if help.end_with?(";")
+      end
+    end
+  end
+
+end
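One note on HelpPrinter: it writes via @logger.puts, so it expects an IO-like object rather than a stdlib Logger. A minimal sketch with placeholder command names and help text:

```ruby
# Any object responding to #puts works; $stdout is the simplest choice.
printer = HelpPrinter.new($stdout)

printer.print_dk_help_line('summarize', 'Summarize Jira epics;supports --project and --epic;')
printer.print_dk_help_line('mystery', nil) # falls back to the ???no help??? marker
```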