lex-dataset 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 5e63278e11eb728b818a7519bc0a33fb713088d3d149100d4398414126e6ced8
4
+ data.tar.gz: a119bcde6cec1c5f253a46284ead75f5fe672f0cd6a75e7334a6d7b325e61255
5
+ SHA512:
6
+ metadata.gz: f97ed63b5cc90b64a8773186bdc7d2df964a8ba334c5827dff51653db83030c9e19f89fbd07b7537995fe23677cd22792241bcc53244bcd19bd0e7705c9c6caa
7
+ data.tar.gz: 71209de12c040ce5c4a09436cf8902fc517352b30f59072a9a0dcfd7db031d9279b2e108c2188d6a108d40c611d68306f67d1503d6aa38ee623d046b93a0d56b
# frozen_string_literal: true

module Legion
  module Extensions
    module Dataset
      # Entry point for the dataset extension. Mixes in the dataset and
      # experiment runner APIs and holds the database handle (@db) that the
      # runner modules read through their private #db accessor.
      class Client
        include Runners::Dataset
        include Runners::Experiment

        # @param db [Object, nil] database handle — presumably a Sequel
        #   connection given the dataset-style table access in the runners;
        #   verify against callers.
        # @param opts [Hash] extra options, stored but not read here.
        def initialize(db: nil, **opts)
          @db = db
          @opts = opts
        end
      end
    end
  end
end
# frozen_string_literal: true

require 'csv'
require 'json'

module Legion
  module Extensions
    module Dataset
      module Helpers
        # File-based import/export of dataset rows in CSV, JSON and JSONL
        # form. Rows are arrays of hashes keyed by :input, :expected_output
        # and :metadata. All methods are module functions.
        module ImportExport
          module_function

          # Reads a CSV with a header row; headers are symbolized, so the
          # file is expected to carry input/expected_output/metadata columns.
          # @param path [String] CSV file path
          # @return [Array<Hash>]
          def import_csv(path)
            rows = []
            CSV.foreach(path, headers: true, header_converters: :symbol) do |row|
              rows << { input: row[:input], expected_output: row[:expected_output], metadata: row[:metadata] }
            end
            rows
          end

          # Parses a whole-file JSON document with symbolized keys.
          # Non-array payloads yield [] rather than raising.
          def import_json(path)
            data = ::JSON.parse(File.read(path), symbolize_names: true)
            data.is_a?(Array) ? data : []
          end

          # Parses one JSON document per line (JSON Lines). Blank lines are
          # skipped — previously a blank/spacer line raised JSON::ParserError
          # because JSON.parse was called on the empty string.
          def import_jsonl(path)
            File.readlines(path).filter_map do |line|
              stripped = line.strip
              ::JSON.parse(stripped, symbolize_names: true) unless stripped.empty?
            end
          end

          # Writes rows as CSV with a fixed input/expected_output/metadata
          # header row.
          def export_csv(rows, path)
            CSV.open(path, 'w', headers: %w[input expected_output metadata], write_headers: true) do |csv|
              rows.each { |row| csv << [row[:input], row[:expected_output], row[:metadata]] }
            end
          end

          # Writes rows as a single pretty-printed JSON array.
          def export_json(rows, path)
            File.write(path, ::JSON.pretty_generate(rows))
          end

          # Writes one compact JSON document per row (JSON Lines).
          def export_jsonl(rows, path)
            File.open(path, 'w') do |f|
              rows.each { |row| f.puts(::JSON.generate(row)) }
            end
          end
        end
      end
    end
  end
end
# frozen_string_literal: true

require 'openssl'
require 'json'

module Legion
  module Extensions
    module Dataset
      module Runners
        # Versioned dataset storage over Sequel-style tables
        # (:datasets, :dataset_versions, :dataset_rows). Expects the
        # including class to supply @db (see Client).
        module Dataset
          # Creates a dataset, or appends a new version when a dataset with
          # the same name already exists, and reports the version actually
          # created. (Previously every call inserted a duplicate :datasets
          # row and returned a hard-coded version: 1, so create_version's
          # max(:version)+1 logic could never produce a second version.)
          # @return [Hash] { created:, name:, version:, row_count: }
          def create_dataset(name:, description: nil, rows: [], **)
            existing = db[:datasets].where(name: name).first
            ds_id = if existing
                      existing[:id]
                    else
                      db[:datasets].insert(name: name, description: description, created_at: Time.now.utc)
                    end
            version = create_version(ds_id, rows)
            { created: true, name: name, version: version, row_count: rows.size }
          end

          # Loads rows from a file ('csv' or 'jsonl'; anything else is
          # treated as whole-file JSON) and stores them as a dataset version.
          def import_dataset(name:, path:, format: 'json', description: nil, **)
            rows = case format.to_s
                   when 'csv' then Helpers::ImportExport.import_csv(path)
                   when 'jsonl' then Helpers::ImportExport.import_jsonl(path)
                   else Helpers::ImportExport.import_json(path)
                   end
            create_dataset(name: name, description: description, rows: rows)
          end

          # Writes a dataset version's rows to disk in the requested format
          # (latest version when version: is nil; unknown datasets export 0
          # rows because get_rows returns []).
          def export_dataset(name:, path:, format: 'json', version: nil, **)
            rows = get_rows(name, version)
            case format.to_s
            when 'csv' then Helpers::ImportExport.export_csv(rows, path)
            when 'jsonl' then Helpers::ImportExport.export_jsonl(rows, path)
            else Helpers::ImportExport.export_json(rows, path)
            end
            { exported: true, path: path, row_count: rows.size }
          end

          # Lists every dataset with its latest version number and row count.
          def list_datasets(**)
            db[:datasets].all.map do |dataset|
              latest = db[:dataset_versions].where(dataset_id: dataset[:id]).order(Sequel.desc(:version)).first
              { name: dataset[:name], description: dataset[:description],
                latest_version: latest ? latest[:version] : nil,
                row_count: latest ? latest[:row_count] : 0 }
            end
          end

          # Fetches a dataset's rows for a given version (latest when nil).
          # Returns { error: 'not_found' } / { error: 'version_not_found' }
          # instead of raising.
          def get_dataset(name:, version: nil, **)
            ds = db[:datasets].where(name: name).first
            return { error: 'not_found' } unless ds

            ver = if version
                    db[:dataset_versions].where(dataset_id: ds[:id], version: version).first
                  else
                    db[:dataset_versions].where(dataset_id: ds[:id]).order(Sequel.desc(:version)).first
                  end
            return { error: 'version_not_found' } unless ver

            rows = db[:dataset_rows].where(version_id: ver[:id]).order(:row_index).all
            { name: name, version: ver[:version], version_id: ver[:id], row_count: ver[:row_count],
              rows: rows.map { |r| { row_index: r[:row_index], input: r[:input], expected_output: r[:expected_output] } } }
          end

          private

          # Inserts the next version row plus its dataset rows and returns
          # the new version NUMBER. The content hash is computed over
          # rows.to_s, so it is order-sensitive by design.
          def create_version(dataset_id, rows)
            content_hash = OpenSSL::Digest.new('SHA256').hexdigest(rows.to_s)
            ver_num = (db[:dataset_versions].where(dataset_id: dataset_id).max(:version) || 0) + 1
            ver_id = db[:dataset_versions].insert(
              dataset_id: dataset_id, version: ver_num, row_count: rows.size,
              content_hash: content_hash, created_at: Time.now.utc
            )
            rows.each_with_index do |row, idx|
              db[:dataset_rows].insert(
                version_id: ver_id, row_index: idx,
                input: row[:input].to_s,
                expected_output: row[:expected_output]&.to_s,
                metadata: row[:metadata]&.to_s
              )
            end
            ver_num
          end

          # Convenience: rows for a named dataset/version, [] when missing.
          def get_rows(name, version)
            result = get_dataset(name: name, version: version)
            result[:rows] || []
          end

          # Database handle injected by the including Client.
          def db
            @db
          end
        end
      end
    end
  end
end
# frozen_string_literal: true

require 'json'

module Legion
  module Extensions
    module Dataset
      module Runners
        # Runs a callable over every row of a dataset version, scores the
        # output with the supplied evaluators, persists per-row results, and
        # compares two runs for regressions/improvements. Relies on the
        # including class for #db and on Runners::Dataset for #get_dataset.
        module Experiment
          # @param task_callable [#call] invoked with each row's :input
          # @param evaluators [Array<#evaluate>] each evaluate call is
          #   expected to return a hash containing :passed
          # @return [Hash] { experiment_id:, name:, summary: } or { error: }
          def run_experiment(name:, dataset_name:, task_callable:, dataset_version: nil, evaluators: [], **)
            ds = get_dataset(name: dataset_name, version: dataset_version)
            return { error: ds[:error] } if ds[:error]

            exp_id = db[:experiments].insert(
              name: name, dataset_version_id: ds[:version_id],
              eval_config: ::JSON.dump(evaluators.map { |e| e.respond_to?(:name) ? e.name : e.to_s }),
              status: 'running', created_at: Time.now.utc
            )

            results = ds[:rows].map do |row|
              # Monotonic clock for durations: wall-clock Time.now can jump
              # (NTP adjustments) and produce negative or inflated latencies.
              started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
              output = task_callable.call(row[:input])
              latency = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started) * 1000).round

              scores = evaluators.map do |evaluator|
                evaluator.evaluate(input: row[:input], output: output, expected: row[:expected_output])
              end

              # No evaluators configured means there is nothing to fail on.
              passed = scores.empty? || scores.all? { |s| s[:passed] }
              db[:experiment_results].insert(
                experiment_id: exp_id, row_index: row[:row_index],
                output: output.to_s, eval_scores: ::JSON.dump(scores),
                latency_ms: latency, passed: passed
              )
              { row_index: row[:row_index], passed: passed, latency_ms: latency }
            end

            summary = build_summary(results)
            db[:experiments].where(id: exp_id).update(
              status: 'completed', summary: ::JSON.dump(summary), completed_at: Time.now.utc
            )
            { experiment_id: exp_id, name: name, summary: summary }
          end

          # Compares two named experiments row by row, listing rows that
          # regressed (passed -> failed) and improved (failed -> passed).
          def compare_experiments(exp1_name:, exp2_name:, **)
            r1 = load_experiment_results(exp1_name)
            r2 = load_experiment_results(exp2_name)
            return { error: 'experiments_not_found' } unless r1 && r2

            # Pair results by row_index rather than by array position: a
            # positional zip would misalign every later row whenever one run
            # is missing a row, producing bogus regressions/improvements.
            second_by_index = r2.to_h { |r| [r[:row_index], r] }
            pairs = r1.filter_map do |a|
              b = second_by_index[a[:row_index]]
              [a, b] if b
            end
            regressions = pairs.select { |a, b| a[:passed] && !b[:passed] }.map { |a, _| a[:row_index] }
            improvements = pairs.select { |a, b| !a[:passed] && b[:passed] }.map { |_, b| b[:row_index] }

            { exp1: exp1_name, exp2: exp2_name, rows_compared: pairs.size,
              regressions: regressions, improvements: improvements,
              regression_count: regressions.size, improvement_count: improvements.size }
          end

          private

          # Aggregates pass/fail counts and mean (integer) latency for a run.
          def build_summary(results)
            {
              total: results.size,
              passed: results.count { |r| r[:passed] },
              failed: results.count { |r| !r[:passed] },
              avg_latency_ms: results.empty? ? 0 : (results.sum { |r| r[:latency_ms] } / results.size).round
            }
          end

          # Ordered result rows for a named experiment, or nil when unknown.
          def load_experiment_results(name)
            exp = db[:experiments].where(name: name).first
            return nil unless exp

            db[:experiment_results].where(experiment_id: exp[:id]).order(:row_index).all
          end
        end
      end
    end
  end
end
# frozen_string_literal: true

module Legion
  module Extensions
    module Dataset
      # Current gem version, isolated in its own file so the gemspec can
      # load it without pulling in the rest of the extension.
      VERSION = '0.1.0'
    end
  end
end
# frozen_string_literal: true

require_relative 'dataset/version'
require_relative 'dataset/helpers/import_export'
require_relative 'dataset/runners/dataset'
require_relative 'dataset/runners/experiment'
require_relative 'dataset/client'

module Legion
  module Extensions
    # Top-level namespace for the lex-dataset extension. When the Legion
    # extension core is loaded, this module registers with it; otherwise it
    # stays a plain namespace so the files remain usable standalone.
    module Dataset
      extend Legion::Extensions::Core if defined?(Legion::Extensions::Core)
    end
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: lex-dataset
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Matthew Iverson
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: Provides versioned dataset storage with import/export (CSV/JSON/JSONL),
13
+ experiment runner with evaluator integration, and regression detection.
14
+ email:
15
+ - matt@iverson.io
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/legion/extensions/dataset.rb
21
+ - lib/legion/extensions/dataset/client.rb
22
+ - lib/legion/extensions/dataset/helpers/import_export.rb
23
+ - lib/legion/extensions/dataset/runners/dataset.rb
24
+ - lib/legion/extensions/dataset/runners/experiment.rb
25
+ - lib/legion/extensions/dataset/version.rb
26
+ homepage: https://github.com/LegionIO/lex-dataset
27
+ licenses:
28
+ - MIT
29
+ metadata:
30
+ rubygems_mfa_required: 'true'
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '3.4'
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubygems_version: 3.6.9
46
+ specification_version: 4
47
+ summary: Versioned dataset management for LegionIO
48
+ test_files: []