lex-dataset 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5e63278e11eb728b818a7519bc0a33fb713088d3d149100d4398414126e6ced8
4
- data.tar.gz: a119bcde6cec1c5f253a46284ead75f5fe672f0cd6a75e7334a6d7b325e61255
3
+ metadata.gz: 736a22b87a807e24ded11c873e9a8d20dc754a7aabff14e54d33f6b889df2a1c
4
+ data.tar.gz: 702e7e82a1d51e996938034c04354bb467019bf99e116097b3a4da02e618e132
5
5
  SHA512:
6
- metadata.gz: f97ed63b5cc90b64a8773186bdc7d2df964a8ba334c5827dff51653db83030c9e19f89fbd07b7537995fe23677cd22792241bcc53244bcd19bd0e7705c9c6caa
7
- data.tar.gz: 71209de12c040ce5c4a09436cf8902fc517352b30f59072a9a0dcfd7db031d9279b2e108c2188d6a108d40c611d68306f67d1503d6aa38ee623d046b93a0d56b
6
+ metadata.gz: c74a390f975e215614623f06e9e2cb9a4af33137329e05d82719fc12a1fe71715079a448f18ef64ae9967492f8cc11a873fb1f670d320575b823568cb7902012
7
+ data.tar.gz: dbb2cedfd0146ab4c66230761f21ec01bcf9284a01fd699753184ccbf18d2e139620046e167f0792ec5ff3eaa171c31b53f7b0599a7161cb932ebc1f6fb4588d
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # lex-dataset
2
+
3
+ Versioned dataset management for LegionIO. Provides immutable versioned dataset storage with CSV, JSON, and JSONL import/export and content-hash deduplication.
4
+
5
+ ## Overview
6
+
7
+ `lex-dataset` stores named datasets with full version history. Each version is content-hashed — submitting the same rows twice results in no new version. Datasets consist of input/expected-output row pairs suitable for LLM evaluation workflows.
8
+
9
+ ## Installation
10
+
11
+ ```ruby
12
+ gem 'lex-dataset'
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```ruby
18
+ require 'legion/extensions/dataset'
19
+
20
+ client = Legion::Extensions::Dataset::Client.new
21
+
22
+ # Create a dataset with inline rows
23
+ client.create_dataset(
24
+ name: 'qa-pairs-v1',
25
+ description: 'Question-answer evaluation set',
26
+ rows: [
27
+ { input: 'What is BGP?', expected_output: 'Border Gateway Protocol' },
28
+ { input: 'What is OSPF?', expected_output: 'Open Shortest Path First' }
29
+ ]
30
+ )
31
+ # => { created: true, name: 'qa-pairs-v1', version: 1, row_count: 2 }
32
+
33
+ # Import from file
34
+ client.import_dataset(name: 'qa-from-file', path: '/data/qa.jsonl', format: 'jsonl')
35
+
36
+ # Export a specific version
37
+ client.export_dataset(name: 'qa-pairs-v1', path: '/tmp/export.json', format: 'json')
38
+
39
+ # Retrieve rows
40
+ client.get_dataset(name: 'qa-pairs-v1')
41
+ # => { name: 'qa-pairs-v1', version: 1, row_count: 2, rows: [...] }
42
+
43
+ # List all datasets
44
+ client.list_datasets
45
+ ```
46
+
47
+ ## Supported Formats
48
+
49
+ | Format | Description |
50
+ |--------|-------------|
51
+ | `json` | Array of row objects (default) |
52
+ | `jsonl` | One JSON object per line |
53
+ | `csv` | Header row + data rows |
54
+
55
+ ## Related Repos
56
+
57
+ - `lex-eval` — uses datasets as input for LLM evaluation runs
58
+ - `lex-prompt` — versioned prompt templates consumed alongside datasets in evaluation workflows
59
+ - `legion-data` — underlying Sequel database connection (SQLite/PostgreSQL/MySQL)
60
+
61
+ ## Development
62
+
63
+ ```bash
64
+ bundle install
65
+ bundle exec rspec
66
+ bundle exec rubocop
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
@@ -6,6 +6,7 @@ module Legion
6
6
  class Client
7
7
  include Runners::Dataset
8
8
  include Runners::Experiment
9
+ include Runners::Sampling
9
10
 
10
11
  def initialize(db: nil, **opts)
11
12
  @db = db
@@ -58,8 +58,94 @@ module Legion
58
58
  rows: rows.map { |r| { row_index: r[:row_index], input: r[:input], expected_output: r[:expected_output] } } }
59
59
  end
60
60
 
61
# Asks the LLM for test-case rows and stores them as a dataset.
#
# @param name dataset name, passed through to create_dataset
# @param description text used in the generation prompt and passed to create_dataset
# @param count [Integer] number of test cases requested from the LLM (default 10)
# @param schema optional schema guidance embedded in the prompt
# @param model optional model override forwarded to the LLM call
# @return [Hash] the create_dataset result merged with generated: true, or an
#   { error: ... } hash when the LLM is unavailable or row generation failed
def generate_dataset(name:, description:, count: 10, schema: nil, model: nil, **)
  if llm_available?
    generated = call_llm_for_rows(description: description, count: count, schema: schema, model: model)
    # call_llm_for_rows reports failure as a Hash carrying :error; hand it back untouched.
    failed = generated.is_a?(Hash) && generated[:error]
    return generated if failed

    create_dataset(name: name, description: description, rows: generated).merge(generated: true)
  else
    { error: 'legion-llm is not available' }
  end
end
70
+
61
71
  private
62
72
 
73
# Reports whether Legion::LLM is loaded and says it has been started.
#
# @return [nil] when the Legion::LLM constant is not defined
# @return [false] when Legion::LLM does not respond to started?
# @return the value of Legion::LLM.started? otherwise
def llm_available?
  return unless defined?(Legion::LLM)

  llm = Legion::LLM
  llm.respond_to?(:started?) && llm.started?
end
76
+
77
# Requests rows from the LLM, retrying once with a stricter prompt when the
# first response cannot be parsed into rows.
#
# @param description text describing the test cases to generate
# @param count number of test cases to request
# @param schema optional schema guidance for the prompt (may be nil)
# @param model optional model override; omitted from the LLM call when nil/false
# @return [Array<Hash>] parsed rows on success
# @return [Hash] { error: ... } when both attempts fail to parse
def call_llm_for_rows(description:, count:, schema:, model:)
  base_prompt = build_generate_prompt(description: description, count: count, schema: schema)
  extra = model ? { model: model } : {}

  first_attempt = parse_llm_rows(invoke_llm(prompt: base_prompt, **extra))
  return first_attempt unless first_attempt.nil?

  # Second and final attempt: same prompt plus an explicit reminder about the output format.
  stricter_prompt = "#{base_prompt}\n\nIMPORTANT: Your previous response was not valid JSON. Return ONLY a valid JSON array."
  second_attempt = parse_llm_rows(invoke_llm(prompt: stricter_prompt, **extra))

  second_attempt || { error: 'LLM did not return valid JSON after retry' }
end
92
+
93
# Sends the prompt to Legion::LLM and returns the response text with any
# surrounding Markdown code fence removed.
#
# Uses Legion::LLM.structured (with generate_schema) when the LLM module
# responds to it, otherwise falls back to Legion::LLM.chat.
#
# @param prompt [String] prompt text sent as the message
# @param llm_opts extra keyword options forwarded to the LLM call (e.g. model:)
# @return [String] stripped response text; an empty string when the response
#   carries no content
def invoke_llm(prompt:, **llm_opts)
  result = if Legion::LLM.respond_to?(:structured)
             Legion::LLM.structured(message: prompt, schema: generate_schema, **llm_opts)
           else
             Legion::LLM.chat(message: prompt, **llm_opts)
           end

  content = result.respond_to?(:content) ? result.content : result
  # to_s keeps a nil content from raising NoMethodError on strip; the resulting
  # empty string fails JSON parsing downstream, which triggers the caller's retry.
  content.to_s.strip.sub(/\A```(?:json)?\n?/, '').sub(/\n?```\z/, '')
end
106
+
107
# Parses LLM output into dataset rows.
#
# The content must be a JSON array whose items are all JSON objects. Each
# object is reduced to its "input" and "expected_output" fields; input is
# always stringified (a missing input becomes ''), expected_output is
# stringified only when present.
#
# @param content [String, nil] raw JSON text returned by the LLM
# @return [Array<Hash>] rows shaped as { input:, expected_output: }
# @return [nil] when the content is not valid JSON, is not a String, is not an
#   array, or contains items that are not objects
def parse_llm_rows(content)
  parsed = ::JSON.parse(content)
  # Non-object items (strings, numbers, nested arrays) cannot be turned into
  # rows; treat the whole response as invalid instead of raising NoMethodError.
  return nil unless parsed.is_a?(Array) && parsed.all?(Hash)

  parsed.map do |item|
    fields = item.transform_keys(&:to_sym)
    { input: fields[:input].to_s, expected_output: fields[:expected_output]&.to_s }
  end
rescue ::JSON::ParserError, TypeError
  # TypeError covers non-String content such as nil.
  nil
end
118
+
119
# Assembles the newline-joined prompt that asks the LLM for test cases.
#
# @param description text interpolated into the "Description:" line
# @param count number interpolated into the "Generate exactly N" line
# @param schema optional object serialized as JSON inside a fenced block;
#   the schema section is omitted when this is nil/false
# @return [String] the complete prompt
def build_generate_prompt(description:, count:, schema:)
  schema_section =
    if schema
      ['', 'Schema guidance for inputs and outputs:', "```json\n#{::JSON.generate(schema)}\n```"]
    else
      []
    end

  [
    "You are a test case generator. Generate exactly #{count} test cases as a JSON array.",
    'Each test case must have "input" and "expected_output" fields.',
    '',
    "Description: #{description}",
    *schema_section,
    '',
    'Respond ONLY with a valid JSON array, no other text.'
  ].join("\n")
end
134
+
135
# JSON-schema-style description of the expected LLM output: an array of
# objects, each requiring string "input" and "expected_output" fields.
# A fresh Hash is built on every call.
#
# @return [Hash]
def generate_schema
  row_properties = { input: { type: 'string' }, expected_output: { type: 'string' } }
  row_schema = { type: 'object', properties: row_properties, required: ['input', 'expected_output'] }

  { type: 'array', items: row_schema }
end
148
+
63
149
  def create_version(dataset_id, rows)
64
150
  hash = OpenSSL::Digest.new('SHA256').hexdigest(rows.to_s)
65
151
  ver_num = (db[:dataset_versions].where(dataset_id: dataset_id).max(:version) || 0) + 1
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
module Legion
  module Extensions
    module Dataset
      module Runners
        # Builds datasets by sampling stored traces. Relies on the including
        # object to provide `db` and `create_dataset`.
        module Sampling
          # Samples traces and stores them as a dataset whose rows carry the
          # trace input, a nil expected_output and the trace span_kind as metadata.
          #
          # @param dataset_name name passed to create_dataset
          # @param source [Symbol, String] trace source; only legion_data is supported
          # @param filters [Hash] optional :span_kind, :status and :time_range filters
          # @param sample_size [Integer, nil] maximum number of rows; nil keeps every trace
          # @param strategy [Symbol, String] :recent, :random, :error_biased or
          #   :stratified; any other value behaves like :recent
          # @return whatever create_dataset returns
          # @raise [ArgumentError] when the source is not recognised
          def sample_from_traces(dataset_name:, source: :legion_data, filters: {},
                                 sample_size: nil, strategy: :recent, **)
            traces = fetch_traces(source, filters)
            sampled = apply_strategy(traces, strategy, sample_size)
            rows = sampled.map { |t| { input: t[:input], expected_output: nil, metadata: t[:span_kind] } }
            create_dataset(name: dataset_name, rows: rows)
          end

          private

          # Dispatches to the loader for the given source. The source is compared
          # by its string form so 'legion_data' and :legion_data are equivalent,
          # matching how strategy names are normalised.
          def fetch_traces(source, filters)
            case source.to_s
            when 'legion_data' then fetch_from_db(filters)
            else raise ArgumentError, "unknown trace source: #{source}"
            end
          end

          # Loads rows from the :traces table, newest first by created_at,
          # applying whichever of the supported filters are present.
          def fetch_from_db(filters)
            query = db[:traces]
            query = query.where(span_kind: filters[:span_kind]) if filters[:span_kind]
            query = query.where(status: filters[:status]) if filters[:status]
            if filters[:time_range]
              # NOTE(review): time_range is subtracted from a Time, so it is
              # presumably a number of seconds - confirm with callers.
              cutoff = Time.now.utc - filters[:time_range]
              query = query.where { created_at >= cutoff }
            end
            query.order(Sequel.desc(:created_at)).all
          end

          # Picks the sampling routine for the strategy; unknown strategies
          # fall back to the "recent" behaviour.
          def apply_strategy(traces, strategy, sample_size)
            case strategy.to_sym
            when :random then sample_random(traces, sample_size)
            when :error_biased then sample_error_biased(traces, sample_size)
            when :stratified then sample_stratified(traces, sample_size)
            else sample_recent(traces, sample_size)
            end
          end

          # First `size` traces in the given order, or all of them when size is nil.
          def sample_recent(traces, size)
            size ? traces.first(size) : traces
          end

          # Random selection of `size` traces, or a shuffled copy when size is nil.
          def sample_random(traces, size)
            size ? traces.sample(size) : traces.shuffle
          end

          # Aims for half errors (status == 'error') and half other traces,
          # errors listed first. When one side cannot fill its share the other
          # side backfills, so the result holds `size` rows whenever enough
          # traces exist.
          def sample_error_biased(traces, size)
            return traces unless size

            errors, successes = traces.partition { |t| t[:status] == 'error' }
            picked_errors = errors.first(size / 2)
            # Successes take their half plus whatever the errors could not fill.
            picked_successes = successes.first(size - picked_errors.size)
            shortfall = size - picked_errors.size - picked_successes.size
            # Too few successes: extend the error share instead.
            picked_errors = errors.first(picked_errors.size + shortfall) if shortfall.positive?
            picked_errors + picked_successes
          end

          # Takes an equal share from each span_kind group (at least one per
          # group), then tops the sample up round-robin from the remaining
          # traces so the result holds `size` rows whenever enough traces exist.
          def sample_stratified(traces, size)
            return traces unless size

            groups = traces.group_by { |t| t[:span_kind] }.values
            per_group = [size / [groups.size, 1].max, 1].max
            picked = groups.flat_map { |g| g.first(per_group) }.first(size)
            return picked if picked.size >= size

            # Equal shares fell short (uneven division or small groups):
            # draw one extra trace per group in turn until the sample is full.
            remaining = groups.map { |g| g.drop(per_group) }
            depth = remaining.map(&:size).max.to_i
            leftovers = (0...depth).flat_map do |i|
              remaining.select { |g| i < g.size }.map { |g| g[i] }
            end
            picked + leftovers.first(size - picked.size)
          end
        end
      end
    end
  end
end
@@ -3,7 +3,7 @@
3
3
  module Legion
4
4
  module Extensions
5
5
  module Dataset
6
- VERSION = '0.1.0'
6
+ VERSION = '0.2.1'
7
7
  end
8
8
  end
9
9
  end
@@ -4,6 +4,7 @@ require_relative 'dataset/version'
4
4
  require_relative 'dataset/helpers/import_export'
5
5
  require_relative 'dataset/runners/dataset'
6
6
  require_relative 'dataset/runners/experiment'
7
+ require_relative 'dataset/runners/sampling'
7
8
  require_relative 'dataset/client'
8
9
 
9
10
  module Legion
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lex-dataset
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Matthew Iverson
@@ -17,11 +17,13 @@ executables: []
17
17
  extensions: []
18
18
  extra_rdoc_files: []
19
19
  files:
20
+ - README.md
20
21
  - lib/legion/extensions/dataset.rb
21
22
  - lib/legion/extensions/dataset/client.rb
22
23
  - lib/legion/extensions/dataset/helpers/import_export.rb
23
24
  - lib/legion/extensions/dataset/runners/dataset.rb
24
25
  - lib/legion/extensions/dataset/runners/experiment.rb
26
+ - lib/legion/extensions/dataset/runners/sampling.rb
25
27
  - lib/legion/extensions/dataset/version.rb
26
28
  homepage: https://github.com/LegionIO/lex-dataset
27
29
  licenses: