lex-dataset 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/legion/extensions/dataset/client.rb +17 -0
- data/lib/legion/extensions/dataset/helpers/import_export.rb +49 -0
- data/lib/legion/extensions/dataset/runners/dataset.rb +93 -0
- data/lib/legion/extensions/dataset/runners/experiment.rb +80 -0
- data/lib/legion/extensions/dataset/version.rb +9 -0
- data/lib/legion/extensions/dataset.rb +15 -0
- metadata +48 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 5e63278e11eb728b818a7519bc0a33fb713088d3d149100d4398414126e6ced8
|
|
4
|
+
data.tar.gz: a119bcde6cec1c5f253a46284ead75f5fe672f0cd6a75e7334a6d7b325e61255
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: f97ed63b5cc90b64a8773186bdc7d2df964a8ba334c5827dff51653db83030c9e19f89fbd07b7537995fe23677cd22792241bcc53244bcd19bd0e7705c9c6caa
|
|
7
|
+
data.tar.gz: 71209de12c040ce5c4a09436cf8902fc517352b30f59072a9a0dcfd7db031d9279b2e108c2188d6a108d40c611d68306f67d1503d6aa38ee623d046b93a0d56b
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module Extensions
    module Dataset
      # Thin facade that mixes the dataset and experiment runner modules
      # into a single object bound to one database handle.
      class Client
        include Runners::Dataset
        include Runners::Experiment

        # @param db [Object, nil] database handle consumed by the runner
        #   mixins via their private +db+ reader
        # @param opts [Hash] extra options, retained but not interpreted here
        def initialize(db: nil, **opts)
          @db, @opts = db, opts
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'csv'
require 'json'

module Legion
  module Extensions
    module Dataset
      module Helpers
        # File import/export helpers for dataset rows.
        #
        # A "row" is a Hash with the symbol keys :input, :expected_output
        # and :metadata (the latter two may be nil).
        module ImportExport
          module_function

          # Reads a CSV file with a header row into an array of row hashes.
          #
          # @param path [String] path to a CSV file whose headers include
          #   input, expected_output and metadata
          # @return [Array<Hash>] rows keyed by :input, :expected_output, :metadata
          def import_csv(path)
            rows = []
            CSV.foreach(path, headers: true, header_converters: :symbol) do |row|
              rows << { input: row[:input], expected_output: row[:expected_output], metadata: row[:metadata] }
            end
            rows
          end

          # Parses a JSON file expected to hold a top-level array of rows.
          # Any non-array top-level value yields an empty array rather than
          # surprising callers with a Hash or scalar.
          #
          # @return [Array<Hash>] parsed rows with symbolized keys
          def import_json(path)
            data = ::JSON.parse(File.read(path), symbolize_names: true)
            data.is_a?(Array) ? data : []
          end

          # Parses a JSON Lines file: one JSON object per line.
          # Streams the file instead of slurping it, and skips blank lines
          # (including a trailing newline), which previously raised a
          # JSON::ParserError on JSON.parse("").
          #
          # @return [Array<Hash>] parsed rows with symbolized keys
          def import_jsonl(path)
            File.foreach(path).filter_map do |line|
              stripped = line.strip
              ::JSON.parse(stripped, symbolize_names: true) unless stripped.empty?
            end
          end

          # Writes rows to +path+ as CSV with a header row.
          def export_csv(rows, path)
            CSV.open(path, 'w', headers: %w[input expected_output metadata], write_headers: true) do |csv|
              rows.each { |row| csv << [row[:input], row[:expected_output], row[:metadata]] }
            end
          end

          # Writes rows to +path+ as a pretty-printed JSON array.
          def export_json(rows, path)
            File.write(path, ::JSON.pretty_generate(rows))
          end

          # Writes rows to +path+ as JSON Lines (one compact object per line).
          def export_jsonl(rows, path)
            File.open(path, 'w') do |f|
              rows.each { |row| f.puts(::JSON.generate(row)) }
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'openssl'
require 'json'

module Legion
  module Extensions
    module Dataset
      module Runners
        # Runner methods for storing and retrieving versioned datasets.
        # Relies on a Sequel-style handle returned by the private +db+
        # reader and on the tables :datasets, :dataset_versions and
        # :dataset_rows (schema defined elsewhere — not visible here).
        module Dataset
          # Inserts a new dataset record and its first version built from
          # +rows+ (hashes with :input / :expected_output / :metadata).
          # @return [Hash] creation summary
          def create_dataset(name:, description: nil, rows: [], **)
            ds_id = db[:datasets].insert(name: name, description: description, created_at: Time.now.utc)
            create_version(ds_id, rows)
            # NOTE(review): version is hard-coded to 1. Correct for a freshly
            # inserted dataset row, but create_version computes the real
            # number — consider returning that instead. TODO confirm the
            # :datasets table has no uniqueness constraint on name.
            { created: true, name: name, version: 1, row_count: rows.size }
          end

          # Loads rows from a file (csv / jsonl / anything-else-as-json)
          # and creates a dataset from them.
          def import_dataset(name:, path:, format: 'json', description: nil, **)
            rows = case format.to_s
                   when 'csv' then Helpers::ImportExport.import_csv(path)
                   when 'jsonl' then Helpers::ImportExport.import_jsonl(path)
                   else Helpers::ImportExport.import_json(path)
                   end
            create_dataset(name: name, description: description, rows: rows)
          end

          # Writes the rows of a dataset version (latest when +version+ is
          # nil) to +path+ in the requested format.
          # Missing datasets export zero rows rather than raising — get_rows
          # swallows the lookup error.
          def export_dataset(name:, path:, format: 'json', version: nil, **)
            rows = get_rows(name, version)
            case format.to_s
            when 'csv' then Helpers::ImportExport.export_csv(rows, path)
            when 'jsonl' then Helpers::ImportExport.export_jsonl(rows, path)
            else Helpers::ImportExport.export_json(rows, path)
            end
            { exported: true, path: path, row_count: rows.size }
          end

          # Lists every dataset with its latest version number and row count.
          # Runs one version query per dataset (N+1) — acceptable for small
          # catalogs; revisit if dataset counts grow.
          def list_datasets(**)
            db[:datasets].all.map do |dataset|
              latest = db[:dataset_versions].where(dataset_id: dataset[:id]).order(Sequel.desc(:version)).first
              { name: dataset[:name], description: dataset[:description],
                latest_version: latest ? latest[:version] : nil,
                row_count: latest ? latest[:row_count] : 0 }
            end
          end

          # Fetches one dataset version (latest when +version+ is nil) and
          # its rows ordered by :row_index.
          # @return [Hash] dataset payload, or { error: ... } when the
          #   dataset or version does not exist
          def get_dataset(name:, version: nil, **)
            ds = db[:datasets].where(name: name).first
            return { error: 'not_found' } unless ds

            ver = if version
                    db[:dataset_versions].where(dataset_id: ds[:id], version: version).first
                  else
                    db[:dataset_versions].where(dataset_id: ds[:id]).order(Sequel.desc(:version)).first
                  end
            return { error: 'version_not_found' } unless ver

            rows = db[:dataset_rows].where(version_id: ver[:id]).order(:row_index).all
            { name: name, version: ver[:version], version_id: ver[:id], row_count: ver[:row_count],
              rows: rows.map { |r| { row_index: r[:row_index], input: r[:input], expected_output: r[:expected_output] } } }
          end

          private

          # Inserts a new :dataset_versions row (next sequential version for
          # this dataset) and one :dataset_rows row per entry.
          # @return [Object] the new version's primary key
          def create_version(dataset_id, rows)
            # NOTE(review): the content hash digests Ruby's inspect output
            # (rows.to_s), which is not a canonical serialization — hash
            # stability across Ruby versions is not guaranteed. Consider
            # JSON.generate for a stable hash (would change stored values).
            hash = OpenSSL::Digest.new('SHA256').hexdigest(rows.to_s)
            ver_num = (db[:dataset_versions].where(dataset_id: dataset_id).max(:version) || 0) + 1
            ver_id = db[:dataset_versions].insert(
              dataset_id: dataset_id, version: ver_num, row_count: rows.size,
              content_hash: hash, created_at: Time.now.utc
            )
            # One insert per row; not wrapped in a transaction here — TODO
            # confirm callers run this inside db.transaction for atomicity.
            rows.each_with_index do |row, idx|
              db[:dataset_rows].insert(
                version_id: ver_id, row_index: idx,
                input: row[:input].to_s,
                expected_output: row[:expected_output]&.to_s,
                metadata: row[:metadata]&.to_s
              )
            end
            ver_id
          end

          # Returns the rows of a dataset version, or [] when the dataset
          # (or version) lookup fails.
          def get_rows(name, version)
            result = get_dataset(name: name, version: version)
            result[:rows] || []
          end

          # Database handle injected by the including class (see Client).
          def db
            @db
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'json'

module Legion
  module Extensions
    module Dataset
      module Runners
        # Runner methods that execute a task over a stored dataset, score
        # each output with evaluators, and persist results to the
        # :experiments and :experiment_results tables.
        module Experiment
          # Runs +task_callable+ over every row of the named dataset,
          # scoring each output with +evaluators+ (objects responding to
          # #evaluate(input:, output:, expected:) and returning a Hash with
          # a :passed key — presumably; contract not visible here).
          #
          # @return [Hash] { experiment_id:, name:, summary: } on success,
          #   or { error: ... } when the dataset lookup fails
          def run_experiment(name:, dataset_name:, task_callable:, dataset_version: nil, evaluators: [], **)
            ds = get_dataset(name: dataset_name, version: dataset_version)
            return { error: ds[:error] } if ds[:error]

            exp_id = db[:experiments].insert(
              name: name, dataset_version_id: ds[:version_id],
              eval_config: ::JSON.dump(evaluators.map { |e| e.respond_to?(:name) ? e.name : e.to_s }),
              status: 'running', created_at: Time.now.utc
            )

            results = ds[:rows].map do |row|
              # NOTE(review): wall-clock Time.now is used for latency; a
              # monotonic clock would be immune to system clock changes.
              start_time = Time.now
              output = task_callable.call(row[:input])
              latency = ((Time.now - start_time) * 1000).round

              scores = evaluators.map do |evaluator|
                evaluator.evaluate(input: row[:input], output: output, expected: row[:expected_output])
              end

              # A row with no evaluators counts as passed.
              passed = scores.empty? || scores.all? { |s| s[:passed] }
              db[:experiment_results].insert(
                experiment_id: exp_id, row_index: row[:row_index],
                output: output.to_s, eval_scores: ::JSON.dump(scores),
                latency_ms: latency, passed: passed
              )
              { row_index: row[:row_index], passed: passed, latency_ms: latency }
            end

            # NOTE(review): if task_callable or an evaluator raises, the
            # exception propagates and the experiment row is left with
            # status 'running' forever — consider a rescue that marks it
            # 'failed' before re-raising.
            summary = build_summary(results)
            db[:experiments].where(id: exp_id).update(
              status: 'completed', summary: ::JSON.dump(summary), completed_at: Time.now.utc
            )
            { experiment_id: exp_id, name: name, summary: summary }
          end

          # Compares two experiments row-by-row, reporting regressions
          # (passed in exp1, failed in exp2) and improvements (the reverse).
          #
          # @return [Hash] comparison summary, or { error: ... } when either
          #   experiment name is unknown
          def compare_experiments(exp1_name:, exp2_name:, **)
            r1 = load_experiment_results(exp1_name)
            r2 = load_experiment_results(exp2_name)
            return { error: 'experiments_not_found' } unless r1 && r2

            # Pairs results positionally after ordering by row_index;
            # assumes both experiments ran on the same dataset version so
            # positions line up — TODO confirm, otherwise pair by row_index.
            pairs = r1.zip(r2).select { |a, b| a && b }
            regressions = pairs.select { |a, b| a[:passed] && !b[:passed] }.map { |a, _| a[:row_index] }
            improvements = pairs.select { |a, b| !a[:passed] && b[:passed] }.map { |_, b| b[:row_index] }

            { exp1: exp1_name, exp2: exp2_name, rows_compared: pairs.size,
              regressions: regressions, improvements: improvements,
              regression_count: regressions.size, improvement_count: improvements.size }
          end

          private

          # Aggregates per-row results into totals and mean latency
          # (integer division before rounding; 0 for an empty run).
          def build_summary(results)
            {
              total: results.size,
              passed: results.count { |r| r[:passed] },
              failed: results.count { |r| !r[:passed] },
              avg_latency_ms: results.empty? ? 0 : (results.sum { |r| r[:latency_ms] } / results.size).round
            }
          end

          # Loads an experiment's result rows ordered by :row_index, or nil
          # when no experiment with that name exists. When names are reused,
          # this picks whichever row the database returns first.
          def load_experiment_results(name)
            exp = db[:experiments].where(name: name).first
            return nil unless exp

            db[:experiment_results].where(experiment_id: exp[:id]).order(:row_index).all
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Entry point for the lex-dataset extension.
# Load order matters: the runner modules must exist before client.rb,
# which includes them into Client.
require_relative 'dataset/version'
require_relative 'dataset/helpers/import_export'
require_relative 'dataset/runners/dataset'
require_relative 'dataset/runners/experiment'
require_relative 'dataset/client'

module Legion
  module Extensions
    module Dataset
      # Hooks into the LegionIO extension framework when it is loaded;
      # harmless no-op when the gem is used standalone.
      extend Legion::Extensions::Core if defined?(Legion::Extensions::Core)
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: lex-dataset
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Matthew Iverson
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies: []
|
|
12
|
+
description: Provides versioned dataset storage with import/export (CSV/JSON/JSONL),
|
|
13
|
+
experiment runner with evaluator integration, and regression detection.
|
|
14
|
+
email:
|
|
15
|
+
- matt@iverson.io
|
|
16
|
+
executables: []
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- lib/legion/extensions/dataset.rb
|
|
21
|
+
- lib/legion/extensions/dataset/client.rb
|
|
22
|
+
- lib/legion/extensions/dataset/helpers/import_export.rb
|
|
23
|
+
- lib/legion/extensions/dataset/runners/dataset.rb
|
|
24
|
+
- lib/legion/extensions/dataset/runners/experiment.rb
|
|
25
|
+
- lib/legion/extensions/dataset/version.rb
|
|
26
|
+
homepage: https://github.com/LegionIO/lex-dataset
|
|
27
|
+
licenses:
|
|
28
|
+
- MIT
|
|
29
|
+
metadata:
|
|
30
|
+
rubygems_mfa_required: 'true'
|
|
31
|
+
rdoc_options: []
|
|
32
|
+
require_paths:
|
|
33
|
+
- lib
|
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
35
|
+
requirements:
|
|
36
|
+
- - ">="
|
|
37
|
+
- !ruby/object:Gem::Version
|
|
38
|
+
version: '3.4'
|
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
40
|
+
requirements:
|
|
41
|
+
- - ">="
|
|
42
|
+
- !ruby/object:Gem::Version
|
|
43
|
+
version: '0'
|
|
44
|
+
requirements: []
|
|
45
|
+
rubygems_version: 3.6.9
|
|
46
|
+
specification_version: 4
|
|
47
|
+
summary: Versioned dataset management for LegionIO
|
|
48
|
+
test_files: []
|