lex-dataset 0.1.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e63278e11eb728b818a7519bc0a33fb713088d3d149100d4398414126e6ced8
4
- data.tar.gz: a119bcde6cec1c5f253a46284ead75f5fe672f0cd6a75e7334a6d7b325e61255
3
+ metadata.gz: 8ae51a149983082edf65b6715a2471b4f058594ccb322d4019b5a5745fb77dd4
4
+ data.tar.gz: 9d59265e25ef0f84351bcafbd6a748c6477cbc349c3ad1660cee6faef89a2813
5
5
  SHA512:
6
- metadata.gz: f97ed63b5cc90b64a8773186bdc7d2df964a8ba334c5827dff51653db83030c9e19f89fbd07b7537995fe23677cd22792241bcc53244bcd19bd0e7705c9c6caa
7
- data.tar.gz: 71209de12c040ce5c4a09436cf8902fc517352b30f59072a9a0dcfd7db031d9279b2e108c2188d6a108d40c611d68306f67d1503d6aa38ee623d046b93a0d56b
6
+ metadata.gz: e863c250713b07392d4c63a74424fc6cc13d583a08a51d12f78d6285aa9158a1bcbe5ba2c26ef7e6ed6ced3de2de18a070ef0062ed1ad3f2f0dd2bafcdd10468
7
+ data.tar.gz: 6a2343d550f3fdefaee648b4d4bd5d771d04296e99adab52e04dda1997f39db985dd4fec7d9d09b48e1dfbcb46249f2351edde2806135286621bd496f0bd71da
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # lex-dataset
2
+
3
+ Versioned dataset management for LegionIO. Provides immutable versioned dataset storage with CSV, JSON, and JSONL import/export and content-hash deduplication.
4
+
5
+ ## Overview
6
+
7
+ `lex-dataset` stores named datasets with full version history. Each version is content-hashed — submitting the same rows twice results in no new version. Datasets consist of input/expected-output row pairs suitable for LLM evaluation workflows.
8
+
9
+ ## Installation
10
+
11
+ ```ruby
12
+ gem 'lex-dataset'
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ require 'legion/extensions/dataset'
19
+
20
+ client = Legion::Extensions::Dataset::Client.new
21
+
22
+ # Create a dataset with inline rows
23
+ client.create_dataset(
24
+ name: 'qa-pairs-v1',
25
+ description: 'Question-answer evaluation set',
26
+ rows: [
27
+ { input: 'What is BGP?', expected_output: 'Border Gateway Protocol' },
28
+ { input: 'What is OSPF?', expected_output: 'Open Shortest Path First' }
29
+ ]
30
+ )
31
+ # => { created: true, name: 'qa-pairs-v1', version: 1, row_count: 2 }
32
+
33
+ # Import from file
34
+ client.import_dataset(name: 'qa-from-file', path: '/data/qa.jsonl', format: 'jsonl')
35
+
36
+ # Export a specific version
37
+ client.export_dataset(name: 'qa-pairs-v1', path: '/tmp/export.json', format: 'json')
38
+
39
+ # Retrieve rows
40
+ client.get_dataset(name: 'qa-pairs-v1')
41
+ # => { name: 'qa-pairs-v1', version: 1, row_count: 2, rows: [...] }
42
+
43
+ # List all datasets
44
+ client.list_datasets
45
+ ```
46
+
47
+ ## Supported Formats
48
+
49
+ | Format | Description |
50
+ |--------|-------------|
51
+ | `json` | Array of row objects (default) |
52
+ | `jsonl` | One JSON object per line |
53
+ | `csv` | Header row + data rows |
54
+
55
+ ## Related Repos
56
+
57
+ - `lex-eval` — uses datasets as input for LLM evaluation runs
58
+ - `lex-prompt` — versioned prompt templates consumed alongside datasets in evaluation workflows
59
+ - `legion-data` — underlying Sequel database connection (SQLite/PostgreSQL/MySQL)
60
+
61
+ ## Development
62
+
63
+ ```bash
64
+ bundle install
65
+ bundle exec rspec
66
+ bundle exec rubocop
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
@@ -6,6 +6,7 @@ module Legion
6
6
  class Client
7
7
  include Runners::Dataset
8
8
  include Runners::Experiment
9
+ include Runners::Sampling
9
10
 
10
11
  def initialize(db: nil, **opts)
11
12
  @db = db
@@ -58,8 +58,94 @@ module Legion
58
58
  rows: rows.map { |r| { row_index: r[:row_index], input: r[:input], expected_output: r[:expected_output] } } }
59
59
  end
60
60
 
61
# Generates dataset rows with the LLM and stores them as a new dataset.
#
# @param name [String] name passed through to +create_dataset+
# @param description [String] used both as the LLM prompt description and
#   as the stored dataset description
# @param count [Integer] number of test cases requested from the LLM
# @param schema [Hash, nil] optional schema guidance embedded in the prompt
# @param model [String, nil] optional model override forwarded to the LLM call
# @return [Hash] the +create_dataset+ result merged with +generated: true+,
#   or a hash carrying an +:error+ key when the LLM is unavailable or did
#   not produce usable rows
def generate_dataset(name:, description:, count: 10, schema: nil, model: nil, **)
  unless llm_available?
    return { error: 'legion-llm is not available' }
  end

  generated = call_llm_for_rows(description: description, count: count, schema: schema, model: model)
  # call_llm_for_rows signals failure with an error hash instead of an array of rows.
  llm_failed = generated.is_a?(Hash) && generated[:error]
  return generated if llm_failed

  create_dataset(name: name, description: description, rows: generated).merge(generated: true)
end
70
+
61
71
  private
62
72
 
73
# Reports whether the optional LLM integration can be used right now.
#
# @return [Boolean, nil] nil when +Legion::LLM+ is not defined, false when
#   it does not expose +started?+, otherwise whatever +Legion::LLM.started?+
#   returns
def llm_available?
  return unless defined?(Legion::LLM)
  return false unless Legion::LLM.respond_to?(:started?)

  Legion::LLM.started?
end
76
+
77
# Asks the LLM for dataset rows, retrying once with a corrective prompt when
# the first reply cannot be parsed.
#
# @param description [String] description embedded in the prompt
# @param count [Integer] number of rows requested
# @param schema [Hash, nil] optional schema guidance for the prompt
# @param model [String, nil] model override; omitted from the call when nil
# @return [Array<Hash>, Hash] parsed rows, or an +:error+ hash when both
#   attempts fail to parse
def call_llm_for_rows(description:, count:, schema:, model:)
  base_prompt = build_generate_prompt(description: description, count: count, schema: schema)
  call_opts = {}
  call_opts[:model] = model if model

  attempt = ->(text) { parse_llm_rows(invoke_llm(prompt: text, **call_opts)) }

  parsed_rows = attempt.call(base_prompt)
  # Exactly one retry, with an explicit reminder appended to the original prompt.
  if parsed_rows.nil?
    parsed_rows = attempt.call(
      "#{base_prompt}\n\nIMPORTANT: Your previous response was not valid JSON. Return ONLY a valid JSON array."
    )
  end

  parsed_rows || { error: 'LLM did not return valid JSON after retry' }
end
92
+
93
# Sends the prompt to the LLM and returns the reply as bare JSON text.
#
# Uses +Legion::LLM.structured+ (with the row schema) when the LLM module
# exposes it, otherwise falls back to +Legion::LLM.chat+.
#
# @param prompt [String] full prompt text
# @param llm_opts [Hash] extra keyword options forwarded to the LLM call
# @return [String] reply text, stripped of surrounding whitespace and of a
#   leading/trailing Markdown code fence; empty when the reply had no content
def invoke_llm(prompt:, **llm_opts)
  result = if Legion::LLM.respond_to?(:structured)
             Legion::LLM.structured(
               message: prompt,
               schema: generate_schema,
               **llm_opts
             )
           else
             Legion::LLM.chat(message: prompt, **llm_opts)
           end

  payload = result.respond_to?(:content) ? result.content : result
  text = case payload
         when Hash, Array
           # Already-decoded data must be re-encoded as JSON: #to_s would yield
           # Ruby inspect output that the downstream JSON parser always rejects.
           ::JSON.generate(payload)
         else
           # nil content becomes '' (an unparseable reply) instead of raising on #strip.
           payload.to_s
         end

  text.strip.sub(/\A```(?:json)?\n?/, '').sub(/\n?```\z/, '')
end
106
+
107
# Parses the LLM reply into dataset rows.
#
# @param content [String, nil] JSON text expected to hold an array of objects
# @return [Array<Hash>, nil] rows shaped as
#   +{ input: String, expected_output: String or nil }+, or nil when the
#   content is missing, is not valid JSON, is not an array, or contains
#   elements that are not JSON objects
def parse_llm_rows(content)
  parsed = ::JSON.parse(content.to_s)
  # Reject anything but an array of objects so the caller can retry instead
  # of crashing on a scalar or nested-array element.
  return nil unless parsed.is_a?(Array) && parsed.all?(Hash)

  parsed.map do |item|
    # JSON.parse yields string keys; read them directly rather than
    # symbolizing every key the LLM happened to emit.
    { input: item['input'].to_s, expected_output: item['expected_output']&.to_s }
  end
rescue ::JSON::ParserError
  nil
end
118
+
119
# Assembles the instruction prompt sent to the LLM.
#
# @param description [String] free-text description of the desired test cases
# @param count [Integer] number of test cases to request
# @param schema [Hash, nil] when present, rendered as a fenced JSON block of
#   schema guidance
# @return [String] newline-joined prompt text
def build_generate_prompt(description:, count:, schema:)
  intro = [
    "You are a test case generator. Generate exactly #{count} test cases as a JSON array.",
    'Each test case must have "input" and "expected_output" fields.',
    '',
    "Description: #{description}"
  ]

  guidance = []
  if schema
    guidance = [
      '',
      'Schema guidance for inputs and outputs:',
      "```json\n#{::JSON.generate(schema)}\n```"
    ]
  end

  closing = ['', 'Respond ONLY with a valid JSON array, no other text.']

  (intro + guidance + closing).join("\n")
end
134
+
135
# JSON-schema-style description of the expected LLM reply: an array of
# objects, each with string +input+ and +expected_output+ fields, both required.
#
# @return [Hash] a freshly built schema hash on every call
def generate_schema
  row_properties = {
    input: { type: 'string' },
    expected_output: { type: 'string' }
  }
  row_schema = {
    type: 'object',
    properties: row_properties,
    required: ['input', 'expected_output']
  }

  { type: 'array', items: row_schema }
end
148
+
63
149
  def create_version(dataset_id, rows)
64
150
  hash = OpenSSL::Digest.new('SHA256').hexdigest(rows.to_s)
65
151
  ver_num = (db[:dataset_versions].where(dataset_id: dataset_id).max(:version) || 0) + 1
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Dataset
      module Runners
        # Builds datasets by sampling stored traces.
        #
        # The including object must provide +create_dataset+ and, for the
        # +:legion_data+ source, a +db+ accessor.
        # NOTE(review): +db+ is presumably a Sequel database handle — confirm against the host class.
        module Sampling
          # Samples traces and stores their inputs as a new dataset.
          #
          # @param dataset_name [String] name passed to +create_dataset+
          # @param source [Symbol] trace source; only +:legion_data+ is supported
          # @param filters [Hash] optional +:span_kind+, +:status+ and
          #   +:time_range+ (seconds back from now) filters
          # @param sample_size [Integer, nil] maximum number of traces; nil keeps all
          # @param strategy [Symbol, String] +:recent+, +:random+, +:error_biased+
          #   or +:stratified+; any other value behaves like +:recent+
          # @return [Object] whatever +create_dataset+ returns
          # @raise [ArgumentError] when +source+ is not a known trace source
          def sample_from_traces(dataset_name:, source: :legion_data, filters: {},
                                 sample_size: nil, strategy: :recent, **)
            traces = fetch_traces(source, filters)
            sampled = apply_strategy(traces, strategy, sample_size)
            # Traces carry no expected output; the span kind is kept as row metadata.
            rows = sampled.map { |t| { input: t[:input], expected_output: nil, metadata: t[:span_kind] } }
            create_dataset(name: dataset_name, rows: rows)
          end

          private

          # Dispatches to the loader for the requested trace source.
          def fetch_traces(source, filters)
            case source
            when :legion_data then fetch_from_db(filters)
            else raise ArgumentError, "unknown trace source: #{source}"
            end
          end

          # Loads traces from the +traces+ table, newest first, applying the
          # optional filters.
          def fetch_from_db(filters)
            query = db[:traces]
            query = query.where(span_kind: filters[:span_kind]) if filters[:span_kind]
            query = query.where(status: filters[:status]) if filters[:status]
            if filters[:time_range]
              # Time - Numeric subtracts seconds, so :time_range is a look-back window in seconds.
              cutoff = Time.now.utc - filters[:time_range]
              query = query.where { created_at >= cutoff }
            end
            query.order(Sequel.desc(:created_at)).all
          end

          # Picks the sampling routine; unknown strategies fall back to :recent.
          def apply_strategy(traces, strategy, sample_size)
            case strategy.to_sym
            when :random then sample_random(traces, sample_size)
            when :error_biased then sample_error_biased(traces, sample_size)
            when :stratified then sample_stratified(traces, sample_size)
            else sample_recent(traces, sample_size)
            end
          end

          # First +size+ traces in their incoming order (all when size is nil).
          def sample_recent(traces, size)
            size ? traces.first(size) : traces
          end

          # Random subset of +size+ traces (a full shuffle when size is nil).
          def sample_random(traces, size)
            size ? traces.sample(size) : traces.shuffle
          end

          # Aims for half errors / half non-errors. When one side has too few
          # traces, the remainder is backfilled from the leftovers so the
          # result still reaches +size+ whenever enough traces exist.
          def sample_error_biased(traces, size)
            return traces unless size

            errors, successes = traces.partition { |t| t[:status] == 'error' }
            error_quota = size / 2
            success_quota = size - error_quota

            preferred = errors.first(error_quota) + successes.first(success_quota)
            leftovers = errors.drop(error_quota) + successes.drop(success_quota)
            (preferred + leftovers).first(size)
          end

          # Takes an equal share from every span_kind group, then backfills
          # from the groups' leftovers when the equal shares fall short of +size+
          # (uneven division or undersized groups).
          def sample_stratified(traces, size)
            return traces unless size

            groups = traces.group_by { |t| t[:span_kind] }.values
            per_group = [size / [groups.size, 1].max, 1].max

            preferred = groups.flat_map { |g| g.first(per_group) }
            leftovers = groups.flat_map { |g| g.drop(per_group) }
            (preferred + leftovers).first(size)
          end
        end
      end
    end
  end
end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Dataset
6
- VERSION = '0.1.0'
6
+ VERSION = '0.2.2'
7
7
  end
8
8
  end
9
9
  end
@@ -4,6 +4,7 @@ require_relative 'dataset/version'
4
4
  require_relative 'dataset/helpers/import_export'
5
5
  require_relative 'dataset/runners/dataset'
6
6
  require_relative 'dataset/runners/experiment'
7
+ require_relative 'dataset/runners/sampling'
7
8
  require_relative 'dataset/client'
8
9
 
9
10
  module Legion
metadata CHANGED
@@ -1,14 +1,112 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-dataset
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson
8
8
  bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
- dependencies: []
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: legion-cache
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 1.3.11
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 1.3.11
26
+ - !ruby/object:Gem::Dependency
27
+ name: legion-crypt
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.4.9
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 1.4.9
40
+ - !ruby/object:Gem::Dependency
41
+ name: legion-data
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 1.4.17
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.17
54
+ - !ruby/object:Gem::Dependency
55
+ name: legion-json
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.2.1
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: 1.2.1
68
+ - !ruby/object:Gem::Dependency
69
+ name: legion-logging
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 1.3.2
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: 1.3.2
82
+ - !ruby/object:Gem::Dependency
83
+ name: legion-settings
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: 1.3.14
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: 1.3.14
96
+ - !ruby/object:Gem::Dependency
97
+ name: legion-transport
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 1.3.9
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: 1.3.9
12
110
  description: Provides versioned dataset storage with import/export (CSV/JSON/JSONL),
13
111
  experiment runner with evaluator integration, and regression detection.
14
112
  email:
@@ -17,11 +115,13 @@ executables: []
17
115
  extensions: []
18
116
  extra_rdoc_files: []
19
117
  files:
118
+ - README.md
20
119
  - lib/legion/extensions/dataset.rb
21
120
  - lib/legion/extensions/dataset/client.rb
22
121
  - lib/legion/extensions/dataset/helpers/import_export.rb
23
122
  - lib/legion/extensions/dataset/runners/dataset.rb
24
123
  - lib/legion/extensions/dataset/runners/experiment.rb
124
+ - lib/legion/extensions/dataset/runners/sampling.rb
25
125
  - lib/legion/extensions/dataset/version.rb
26
126
  homepage: https://github.com/LegionIO/lex-dataset
27
127
  licenses: