cl-magic 0.3.9 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,169 @@
1
+
2
+ require 'json'
3
+ require 'uri'
4
+ require 'pp'
5
+ require 'digest'
6
+ require 'date'
7
+
8
+ require 'tty-progressbar'
9
+ require 'concurrent'
10
+
11
+ require 'cl/magic/common/ai_text_splitter.rb'
12
+
13
+
14
class AIPrompt
  API_COMPLETIONS_PATH = "/openai/v1/chat/completions"
  API_EMBEDDINGS_PATH = "/openai/v1/embeddings"
  MAX_THREADS = 10 # set to 1 to debug without concurrency

  # logger: object responding to #error.
  # cache_dir: directory for request/response caching (nil disables caching).
  # max_chunk_size: maximum characters per chunk sent to the API.
  # temperature: stored but currently not sent in requests — TODO confirm intent.
  def initialize(logger, cache_dir, max_chunk_size=10000, temperature=1)
    @cache_dir = cache_dir
    @logger = logger
    @max_chunk_size = max_chunk_size
    @temperature = temperature
    @ai_text_splitter = AITextSplitter.new(@max_chunk_size, @logger)
    # guards state shared between pool threads (see summarize_split_text)
    @mutex = Mutex.new
    @thread_pool = Concurrent::ThreadPoolExecutor.new(
      min_threads: 0,
      max_threads: MAX_THREADS,
      max_queue: 0,
      fallback_policy: :caller_runs
    )
  end

  # Returns the embedding vector (Array of Floats) for +input+.
  def gen_embeddings(input)
    data = {
      model: "text-embedding-ada-002",
      input: input,
    }
    response = post_open_ai(API_EMBEDDINGS_PATH, data.to_json)
    return response["data"][0]["embedding"]
  end

  # Splits +raw_data+, sends each chunk with +prompt+ to the completions API,
  # and returns an Array of response content strings (one per chunk).
  def prompt(raw_data, prompt, split_as_markdown=false, separator)

    # split
    split_data = @ai_text_splitter.split(raw_data, split_as_markdown, separator)

    # summarize
    responses = summarize_split_text(split_data, prompt, split_as_markdown)

    # map and return
    return responses.collect do |json|
      json["choices"].map {|c| c["message"]["content"]}.join("\n")
    end
  end

  # Deletes all cached request/response JSON files.
  def clear_cache()
    Dir.glob(File.join(get_cache_path, '*.json')).each do |file|
      File.delete(file)
    end
  end

  private

  # Runs the block on the thread pool, or inline when MAX_THREADS == 1
  # (easier debugging without concurrency).
  def do_concurently
    if MAX_THREADS > 1
      @thread_pool.post do
        yield
      end
    else
      yield
    end
  end

  # Shuts the pool down and blocks until all queued work has finished.
  def wait_concurrently
    if MAX_THREADS > 1
      @thread_pool.shutdown
      @thread_pool.wait_for_termination
    end
  end

  # When a chunk exceeds @max_chunk_size, ask the model to keep its answer
  # under half the chunk's length.
  def munge_prompt(text, prompt)
    final_prompt = "#{prompt}"

    if text.length > @max_chunk_size
      half = text.length / 2
      final_prompt = "#{prompt}. Summarize it and keep it under #{half} characters"
    end

    return final_prompt
  end

  # Posts each chunk to the completions endpoint (concurrently) and returns
  # the raw parsed JSON responses.
  def summarize_split_text(split_text, prompt, split_as_markdown)

    bar = TTY::ProgressBar.new("processing #{split_text.count} chunks [:bar]", total: split_text.count)

    json_responses = []
    split_text.each do |text|
      do_concurently do
        final_prompt = munge_prompt(text, prompt)
        messages = [
          { role: "user", content: final_prompt },
          { role: "user", content: text }
        ]
        response = post_open_ai(API_COMPLETIONS_PATH, {
          messages: messages
        }.to_json)
        # Array#<< and the progress bar are not thread-safe; serialize access.
        @mutex.synchronize do
          json_responses << response
          bar.advance
        end
      end
    end

    # wait
    wait_concurrently
    return json_responses
  end

  # POSTs the JSON string +data+ to ENV["OPENAPI_URL"] + endpoint via curl,
  # caches request and response under the same timestamp, and returns the
  # parsed response Hash. Logs the raw response and exits(1) on failure.
  def post_open_ai(endpoint, data)
    # url
    api_url = ENV["OPENAPI_URL"]
    final_url = URI.join(api_url, endpoint)

    # data: single quotes would terminate the shell string below.
    # NOTE(review): stripping quotes mutates the payload; proper shell
    # escaping (or Net::HTTP) would be safer.
    sanitized_data = data.gsub("'", "")

    # post
    api_key = ENV["OPENAPI_KEY"]
    cmd = """
      curl -s -X POST \
        '#{final_url}' \
        -H 'Content-Type: application/json' \
        -H 'Authorization: Bearer #{api_key}' \
        -d '#{sanitized_data}'
    """
    response_text = `#{cmd}`
    begin
      timestamp = DateTime.now.strftime("%Y%m%d%H%M%S")
      response_hash = JSON.parse(response_text)

      # completions responses must contain "choices"
      raise "completions response missing 'choices'" if endpoint == API_COMPLETIONS_PATH && !response_hash.key?("choices")

      # cache request and response with the SAME timestamp so they pair up
      save_to_cache(sanitized_data, timestamp, "request")
      save_to_cache(response_text, timestamp, "response")

      # response
      return response_hash
    rescue => e
      @logger.error response_text
      exit 1 # non-zero: this is a failure path
    end
  end

  # Returns (creating if needed) the cache directory path.
  def get_cache_path
    cache_path = File.join(@cache_dir, ".open_ai_cache")
    Dir.mkdir(cache_path) if !File.directory?(cache_path)
    return cache_path
  end

  # Writes pretty-printed JSON to the cache, keyed by the caller-supplied
  # +timestamp+ (was regenerating the timestamp here, so a request and its
  # response could land under different prefixes).
  def save_to_cache(json_string, timestamp, postfix)
    unless @cache_dir.nil?
      filepath = File.join(get_cache_path, "#{timestamp}_#{postfix}.json")
      File.open(filepath, "w") do |file|
        file.write(JSON.pretty_generate(JSON.parse(json_string)))
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
+ require 'baran'
2
+
3
class AITextSplitter

  # max_chunk_size: target characters per chunk. Combined chunks may run up
  # to 110% of this (@cut_off). logger is stored for callers/debugging.
  def initialize(max_chunk_size, logger)
    @max_chunk_size = max_chunk_size
    # allow combined chunks to exceed the target by 10%
    @cut_off = (@max_chunk_size + (@max_chunk_size * 0.1)).floor
    @logger = logger
  end

  # Splits +data+ into an Array of chunk strings. When +split_as_markdown+
  # is true, split on markdown structure; otherwise split on +separator+
  # (default blank lines) and recombine chunks near the max size.
  def split(data, split_as_markdown=false, separator)
    return markdown_to_array(data) if split_as_markdown
    return basic_split_then_reduce(data, separator)
  end

  private

  #
  # Separator Variants
  #

  # basic splitter, would lose context when splits got too small
  def basic_splitter(data, separator)
    separator = "\n\n" if separator.nil? || separator.empty?
    splitter = Baran::CharacterTextSplitter.new(chunk_size: @max_chunk_size, chunk_overlap: 64, separator: separator)
    chunks = splitter.chunks(data).collect {|c| c[:text]}
    return reduce_to_max_size(chunks)
  end

  # Preferred: provides even better context by insisting on splits near max_chunk_size
  def basic_split_then_reduce(data, separator)
    chunks = basic_splitter(data, separator)
    # second pass can combine pairs produced by the first reduction
    return reduce_to_max_size(chunks)
  end

  # User can hint at split points; it didn't work great
  def recursive_splitter(data, separator)
    separator = ([separator] + ["\n\n"]).compact
    splitter = Baran::RecursiveCharacterTextSplitter.new(
      chunk_size: @max_chunk_size, chunk_overlap: 64,
      separators: separator
    )
    chunks = splitter.chunks(data).collect {|c| c[:text]}
    return reduce_to_max_size(chunks)
  end

  #
  # Markdown
  #

  def markdown_to_array(data)
    splitter = Baran::MarkdownSplitter.new()
    return splitter.chunks(data).collect {|c| c[:text]}
  end

  #
  # Splitting is done by separator and the LLM can respond
  # with content of any length. Reduce the chunks by combining
  # adjacent pairs whose total length fits under @cut_off.
  #

  def reduce_to_max_size(chunks)
    combined = []
    i = 0
    while i < chunks.length
      c = chunks[i]
      n = chunks[i + 1]
      # combine the pair when a next chunk exists and the pair fits
      if n && (c.length + n.length) <= @cut_off
        combined << [c, n].join("\n")
        i += 2
      else
        combined << c
        i += 1
      end
    end
    combined
  end
end
@@ -17,7 +17,7 @@ def add_help(opts)
17
17
  end
18
18
 
19
19
  def ask_and_store_option(options, key, question)
20
- if options[key].nil?
20
+ if options[key].nil? or options[key].empty?
21
21
  options[key] = TTY::Prompt.new.ask(question)
22
22
  end
23
23
  end
@@ -0,0 +1,41 @@
1
class Elastic
  # elastic_url: base URL of the Elasticsearch instance,
  # e.g. "http://localhost:9200".
  def initialize(elastic_url)
    @elastic_url = elastic_url
  end

  # Fetches documents whose _id is in +ids+ (terms query on _id).
  # Returns the raw response body string from post.
  def query_by_id(ids)
    url = "/_search"
    verb = "POST"
    data = {
      query: {
        terms: {
          _id: ids
        }
      }
    }
    # (removed dead local: post serializes +data+ itself)
    return post(url, verb, data)
  end

  # Serializes +data+ (a Hash) to JSON and sends it to Elasticsearch via
  # curl. Returns the raw response body string.
  # NOTE(review): building a shell command from data is injection-prone;
  # prefer Net::HTTP long-term.
  def post(url, verb, data)
    final_url = "#{@elastic_url}/#{url}"

    # sanitize: escape single quotes for the single-quoted -d argument below
    # using the standard shell idiom '\'' (close, escaped quote, reopen).
    # (was gsub!(c, "\#{c}") which inserted the literal text "#{c}")
    sanitized_data = data.to_json
    sanitized_data = sanitized_data.gsub("'", %q('\\''))

    # post
    cmd = """
      curl -s -X#{verb} \
        #{final_url} \
        -H 'Content-Type: application/json' \
        -d '#{sanitized_data}'
    """
    return `#{cmd}`
  end

  # Creates an index named +elastic_index+ with the given +body+ settings.
  def create_index(elastic_index, body)
    url = "#{elastic_index}"
    return post(url, "PUT", body)
  end
end
@@ -1,12 +1,70 @@
1
- require 'byebug'
1
+ require 'tty-progressbar'
2
+ require 'tty-spinner'
3
+ require 'concurrent'
2
4
 
3
5
  class Jira
4
6
 
7
+ MAX_THREADS = 20 # set to 1 to debug without concurrency
8
+
5
9
  def initialize(base_uri, username, token, break_at_one_page=false)
6
10
  @base_uri = base_uri.chomp("/")
7
11
  @username = username
8
12
  @token = token
9
13
  @break_at_one_page = break_at_one_page
14
+
15
+ @thread_pool = Concurrent::ThreadPoolExecutor.new(
16
+ min_threads: 0,
17
+ max_threads: MAX_THREADS,
18
+ max_queue: 0,
19
+ fallback_policy: :caller_runs
20
+ )
21
+ end
22
+
23
+ #
24
+ # Formatter
25
+ #
26
+
27
+ def self.jira_to_markdown(issue)
28
+
29
+ md = []
30
+ md << ""
31
+ md << "# #{issue['key']}"
32
+ md << "project: #{issue['fields']['project']['key']}"
33
+ md << "created: #{issue['fields']['created']}"
34
+ md << "updated: #{issue['fields']['updated']}"
35
+ md << "status: #{issue['fields']['status']['statusCategory']['name']}" unless issue['fields']["status"].nil?
36
+ md << "priority: #{issue['fields']['priority']['name']}"
37
+ md << "labels: #{issue['fields']['labels'].join(',')}"
38
+ md << "issue_type: #{issue['fields']['issuetype']['name']}" unless issue['fields']["issuetype"].nil?
39
+ md << "assignee: #{issue['fields']['assignee']['displayName']}" unless issue['fields']["assignee"].nil?
40
+ md << ""
41
+ md << "## Summary"
42
+ md << "#{issue['fields']['summary']}"
43
+ md << ""
44
+ md << ""
45
+ issue_md = md.join("\n")
46
+
47
+ comments = []
48
+ issue["comments"].each_with_index do |comment, i|
49
+ c_md = []
50
+ c_md << "### Comment - #{comment["author"]["displayName"]} "
51
+ c_md << ""
52
+ c_md << "created: #{comment["created"]}"
53
+
54
+ # nest markdown deeper
55
+ comment["body"].split("\n").each do |line|
56
+ c_md << if line.start_with?("#")
57
+ "####{line}"
58
+ else
59
+ line
60
+ end
61
+ end
62
+
63
+ c_md << ""
64
+ comments << [comment["id"], c_md.join("\n")]
65
+ end
66
+
67
+ return issue_md, comments
10
68
  end
11
69
 
12
70
  #
@@ -16,10 +74,12 @@ class Jira
16
74
  def get_epic_ids(project, epic_wildcard)
17
75
  jql_query = "project = \"#{project}\" AND issuetype = Epic AND text ~ \"#{epic_wildcard}\""
18
76
  results = run_jql_query(jql_query)
19
- return results.select{|h| h['fields']['summary'].start_with? epic_wildcard}.map {|h| h['id']}
77
+ epics = results.select{|h| h['fields']['summary'].start_with? epic_wildcard}
78
+ epic_ids = epics.map {|h| h['id']}
79
+ return epic_ids, epics
20
80
  end
21
81
 
22
- def get_issues(project, epic_ids)
82
+ def get_issues_by_epic_ids(project, epic_ids)
23
83
  jql_query = "project = \"#{project}\" AND parentEpic IN (#{epic_ids.join(',')})"
24
84
  return run_jql_query(jql_query)
25
85
  end
@@ -32,6 +92,14 @@ class Jira
32
92
  end
33
93
  end
34
94
 
95
+ def get_issue_comments(issue_key)
96
+ uri = URI.parse("#{@base_uri}/rest/api/2/issue/#{issue_key}/comment")
97
+ jira_get(uri) do |response|
98
+ result = JSON.parse(response.body)
99
+ return result["comments"]
100
+ end
101
+ end
102
+
35
103
  #
36
104
  # Helpers: GET & POST
37
105
  #
@@ -49,7 +117,11 @@ class Jira
49
117
  if response.code == '200'
50
118
  yield response
51
119
  else
52
- raise "Jira query failed with HTTP status code #{response.code}"
120
+ raise """
121
+ Jira query failed with HTTP status code #{response.code}
122
+
123
+ #{response.body}
124
+ """
53
125
  end
54
126
  end
55
127
 
@@ -68,7 +140,13 @@ class Jira
68
140
  if response.code == '200'
69
141
  yield response
70
142
  else
71
- raise "Jira query failed with HTTP status code #{response.code}"
143
+ raise """
144
+ Jira query failed with HTTP status code #{response.code}
145
+
146
+ BODY: #{body.to_json}
147
+
148
+ RESPONSE: #{response.body}
149
+ """
72
150
  end
73
151
  end
74
152
 
@@ -77,6 +155,9 @@ class Jira
77
155
  #
78
156
 
79
157
  def run_jql_query(jql)
158
+ spinner = TTY::Spinner.new("[:spinner] fetching ...", format: :pulse_2)
159
+ spinner.auto_spin # Automatic animation with default interval
160
+
80
161
  start_at = 0
81
162
  max_results = 50
82
163
  total_results = nil
@@ -110,53 +191,100 @@ class Jira
110
191
  start_at += max_results # else next page
111
192
  end
112
193
  end
113
-
114
- print '.' # loop
115
194
  end
195
+ spinner.stop("#{all_results.count} issues")
116
196
  all_results.map {|h| h}
117
197
  end
118
- end
119
198
 
120
- #
121
- # Collect status changelogs
122
- #
123
- # Given a array of jira issue hashes
124
- # * fetch the change log
125
- # * filter down to status changes
126
- # * add it to the issue hash as ["status_changelogs"]
127
- #
128
-
129
- def collect_status_changelogs(jira, issues, options)
130
- final_issue_hashes = []
131
-
132
- issues.each do |issue|
133
- issue_key = issue["key"]
134
- issue["status_changelogs"] = []
135
-
136
- # fetch change log
137
- print '.'
138
- changelogs = jira.get_issue_status_changelog(issue_key)
139
-
140
- changelogs.each do |change_log|
141
-
142
- # all items that are status changes
143
- status_logs = change_log["items"].select {|i| i["field"]=="status"}
144
- status_logs = status_logs.collect do |status_log|
145
- {
146
- "key": issue_key,
147
- "created": change_log["created"],
148
- "toString": status_log["toString"],
149
- "fromString": status_log["fromString"]
150
- }
199
+ def collect_comments(jira, issues)
200
+ final_issue_hashes = []
201
+ bar = TTY::ProgressBar.new("fetching [:bar]", total: issues.count)
202
+
203
+ issues.each do |issue|
204
+ do_concurently do
205
+ issue_key = issue["key"]
206
+ issue["comments"] = []
207
+
208
+ # fetch change log
209
+ comments = get_issue_comments(issue_key)
210
+ issue["comments"] = comments
211
+ final_issue_hashes << issue # save
212
+ bar.advance
151
213
  end
214
+ end
215
+
216
+ # wait
217
+ wait_concurrently
218
+ return final_issue_hashes
219
+ end
220
+
221
+ #
222
+ # Collect status changelogs
223
+ #
224
+ # Given an array of jira issue hashes
225
+ # * fetch the change log
226
+ # * filter down to status changes
227
+ # * add it to the issue hash as ["status_changelogs"]
228
+ #
229
+
230
+ def collect_status_changelogs(jira, issues)
231
+ final_issue_hashes = []
232
+ bar = TTY::ProgressBar.new("fetching [:bar]", total: issues.count)
233
+
234
+ issues.each do |issue|
235
+ do_concurently do
236
+ issue_key = issue["key"]
237
+ issue["status_changelogs"] = []
238
+
239
+ # fetch change log
240
+ changelogs = get_issue_status_changelog(issue_key)
241
+
242
+ changelogs.each do |change_log|
243
+
244
+ # all items that are status changes
245
+ status_logs = change_log["items"].select {|i| i["field"]=="status"}
246
+ status_logs = status_logs.collect do |status_log|
247
+ {
248
+ "key": issue_key,
249
+ "created": change_log["created"],
250
+ "toString": status_log["toString"],
251
+ "fromString": status_log["fromString"]
252
+ }
253
+ end
152
254
 
153
- # append them to issue
154
- status_logs.each do |status_log|
155
- issue["status_changelogs"] << status_log
156
- end if status_logs.count > 0
255
+ # append them to issue
256
+ status_logs.each do |status_log|
257
+ issue["status_changelogs"] << status_log
258
+ end
259
+ end
260
+
261
+ final_issue_hashes << issue # save
262
+ bar.advance
263
+ end
157
264
  end
158
265
 
159
- final_issue_hashes << issue # save
266
+ # wait
267
+ wait_concurrently
268
+ return final_issue_hashes
160
269
  end
161
- return final_issue_hashes
270
+
271
+ private
272
+
273
  # Runs the given block on the thread pool when concurrency is enabled
  # (MAX_THREADS > 1); otherwise runs it inline for easier debugging.
  # (name typo "concurently" is the established interface — do not rename)
  def do_concurently
    if MAX_THREADS > 1
      @thread_pool.post do
        yield
      end
    else
      yield
    end
  end
282
+
283
  # Shuts the pool down and blocks until all queued work has finished.
  # NOTE(review): after shutdown the pool cannot accept new work, so the
  # concurrent helpers on this instance are effectively single-use — confirm
  # callers never reuse the instance for a second batch.
  def wait_concurrently
    if MAX_THREADS > 1
      @thread_pool.shutdown
      @thread_pool.wait_for_termination
    end
  end
289
+
162
290
  end
@@ -0,0 +1,78 @@
1
+
2
class Milvus
  # Minimal client for the Milvus v1 REST vector API, shelling out to curl.
  # host/port locate the Milvus HTTP endpoint.
  def initialize(host, port)
    @host = host
    @port = port
  end

  # Runs a vector similarity search against +collection_name+ using
  # +embedding+ as the query vector. Returns the raw JSON response String.
  # NOTE(review): the JSON payload is interpolated into a single-quoted
  # shell string without escaping — a single quote in the data would break
  # the command; prefer Net::HTTP long-term.
  def search(collection_name, embedding)
    final_url = "http://#{@host}:#{@port}/v1/vector/search"
    data = {
      collectionName: collection_name,
      vector: embedding,
      outputFields: ["id", "name", "doc_key", "distance"],
    }

    # post
    sanitized_data = data.to_json
    cmd = """
      curl -s \
        '#{final_url}' \
        -X 'POST' \
        -H 'accept: application/json' \
        -H 'Content-Type: application/json' \
        -d '#{sanitized_data}'
    """
    return `#{cmd}`
  end

  # Creates a 1536-dimension collection with L2 distance.
  # (1536 presumably matches the text-embedding-ada-002 vectors produced
  # elsewhere in this gem — TODO confirm.)
  def create_collection(collection_name)
    final_url = "http://#{@host}:#{@port}/v1/vector/collections/create"
    data = {
      dbName: "default",
      collectionName: collection_name,
      dimension: 1536,
      metricType: "L2",
      primaryField: "id",
      vectorField: "vector"
    }

    # post
    sanitized_data = data.to_json
    cmd = """
      curl -s \
        '#{final_url}' \
        -X 'POST' \
        -H 'accept: application/json' \
        -H 'Content-Type: application/json' \
        -d '#{sanitized_data}'
    """
    return `#{cmd}`
  end

  # Inserts one embedding row keyed by +doc_key+ into +collection_name+.
  # Raises when the response contains a "message" key (Milvus error shape);
  # otherwise returns the parsed response re-serialized to JSON.
  def post_to_collection(collection_name, doc_key, embedding)
    final_url = "http://#{@host}:#{@port}/v1/vector/insert"
    data = {
      collectionName: collection_name,
      data: {
        doc_key: doc_key,
        vector: embedding
      }
    }

    # post
    sanitized_data = data.to_json
    cmd = """
      curl -s \
        '#{final_url}' \
        -X POST \
        -H 'accept: application/json' \
        -H 'Content-Type: application/json' \
        -d '#{sanitized_data}'
    """
    response = `#{cmd}`
    # reuses the +data+ local for the parsed response from here on
    data = JSON.parse(response)
    raise "Error: #{data.to_json}\n\nData #{sanitized_data}" if data.has_key?("message")
    return data.to_json
  end
end
@@ -0,0 +1,29 @@
1
+
2
class HelpPrinter

  # logger: object responding to #puts.
  def initialize(logger)
    @logger = logger
  end

  # Prints one help entry: the key left-padded to a 15-character column,
  # followed by the help text. Help is split on ";" and each part is printed
  # on its own padded line; a trailing ";" requests a blank separator line.
  # Does nothing when stdout is not a TTY.
  # NOTE(review): @logger.puts is called with TWO arguments — if the logger
  # behaves like Kernel#puts, key and text land on separate lines, defeating
  # the column padding; confirm the logger joins its arguments.
  def print_dk_help_line(key, help)
    if $stdout.isatty
      if help.nil?
        @logger.puts("#{key.ljust(15, ' ')} ???no help???")
      else
        key = key.ljust(15, ' ')
        help_parts = help.split(";")

        # first line
        @logger.puts(key, help_parts.shift)

        # following lines
        padding = "".ljust(15, ' ')
        help_parts.each do |p|
          @logger.puts(padding, p)
        end
        @logger.puts("") if help.end_with?(";")
      end
    end
  end

end