lex-dataset 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +71 -0
- data/lib/legion/extensions/dataset/client.rb +1 -0
- data/lib/legion/extensions/dataset/runners/dataset.rb +86 -0
- data/lib/legion/extensions/dataset/runners/sampling.rb +72 -0
- data/lib/legion/extensions/dataset/version.rb +1 -1
- data/lib/legion/extensions/dataset.rb +1 -0
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 736a22b87a807e24ded11c873e9a8d20dc754a7aabff14e54d33f6b889df2a1c
|
|
4
|
+
data.tar.gz: 702e7e82a1d51e996938034c04354bb467019bf99e116097b3a4da02e618e132
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c74a390f975e215614623f06e9e2cb9a4af33137329e05d82719fc12a1fe71715079a448f18ef64ae9967492f8cc11a873fb1f670d320575b823568cb7902012
|
|
7
|
+
data.tar.gz: dbb2cedfd0146ab4c66230761f21ec01bcf9284a01fd699753184ccbf18d2e139620046e167f0792ec5ff3eaa171c31b53f7b0599a7161cb932ebc1f6fb4588d
|
data/README.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# lex-dataset
|
|
2
|
+
|
|
3
|
+
Versioned dataset management for LegionIO. Provides immutable versioned dataset storage with CSV, JSON, and JSONL import/export and content-hash deduplication.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
`lex-dataset` stores named datasets with full version history. Each version is content-hashed — submitting the same rows twice results in no new version. Datasets consist of input/expected-output row pairs suitable for LLM evaluation workflows.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```ruby
|
|
12
|
+
gem 'lex-dataset'
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```ruby
|
|
18
|
+
require 'legion/extensions/dataset'
|
|
19
|
+
|
|
20
|
+
client = Legion::Extensions::Dataset::Client.new
|
|
21
|
+
|
|
22
|
+
# Create a dataset with inline rows
|
|
23
|
+
client.create_dataset(
|
|
24
|
+
name: 'qa-pairs-v1',
|
|
25
|
+
description: 'Question-answer evaluation set',
|
|
26
|
+
rows: [
|
|
27
|
+
{ input: 'What is BGP?', expected_output: 'Border Gateway Protocol' },
|
|
28
|
+
{ input: 'What is OSPF?', expected_output: 'Open Shortest Path First' }
|
|
29
|
+
]
|
|
30
|
+
)
|
|
31
|
+
# => { created: true, name: 'qa-pairs-v1', version: 1, row_count: 2 }
|
|
32
|
+
|
|
33
|
+
# Import from file
|
|
34
|
+
client.import_dataset(name: 'qa-from-file', path: '/data/qa.jsonl', format: 'jsonl')
|
|
35
|
+
|
|
36
|
+
# Export a specific version
|
|
37
|
+
client.export_dataset(name: 'qa-pairs-v1', path: '/tmp/export.json', format: 'json')
|
|
38
|
+
|
|
39
|
+
# Retrieve rows
|
|
40
|
+
client.get_dataset(name: 'qa-pairs-v1')
|
|
41
|
+
# => { name: 'qa-pairs-v1', version: 1, row_count: 2, rows: [...] }
|
|
42
|
+
|
|
43
|
+
# List all datasets
|
|
44
|
+
client.list_datasets
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Supported Formats
|
|
48
|
+
|
|
49
|
+
| Format | Description |
|
|
50
|
+
|--------|-------------|
|
|
51
|
+
| `json` | Array of row objects (default) |
|
|
52
|
+
| `jsonl` | One JSON object per line |
|
|
53
|
+
| `csv` | Header row + data rows |
|
|
54
|
+
|
|
55
|
+
## Related Repos
|
|
56
|
+
|
|
57
|
+
- `lex-eval` — uses datasets as input for LLM evaluation runs
|
|
58
|
+
- `lex-prompt` — versioned prompt templates consumed alongside datasets in evaluation workflows
|
|
59
|
+
- `legion-data` — underlying Sequel database connection (SQLite/PostgreSQL/MySQL)
|
|
60
|
+
|
|
61
|
+
## Development
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
bundle install
|
|
65
|
+
bundle exec rspec
|
|
66
|
+
bundle exec rubocop
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
MIT
|
|
@@ -58,8 +58,94 @@ module Legion
|
|
|
58
58
|
rows: rows.map { |r| { row_index: r[:row_index], input: r[:input], expected_output: r[:expected_output] } } }
|
|
59
59
|
end
|
|
60
60
|
|
|
61
|
+
def generate_dataset(name:, description:, count: 10, schema: nil, model: nil, **)
|
|
62
|
+
return { error: 'legion-llm is not available' } unless llm_available?
|
|
63
|
+
|
|
64
|
+
rows = call_llm_for_rows(description: description, count: count, schema: schema, model: model)
|
|
65
|
+
return rows if rows.is_a?(Hash) && rows[:error]
|
|
66
|
+
|
|
67
|
+
result = create_dataset(name: name, description: description, rows: rows)
|
|
68
|
+
result.merge(generated: true)
|
|
69
|
+
end
|
|
70
|
+
|
|
61
71
|
private
|
|
62
72
|
|
|
73
|
+
def llm_available?
|
|
74
|
+
defined?(Legion::LLM) && Legion::LLM.respond_to?(:started?) && Legion::LLM.started?
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def call_llm_for_rows(description:, count:, schema:, model:)
|
|
78
|
+
prompt = build_generate_prompt(description: description, count: count, schema: schema)
|
|
79
|
+
llm_opts = model ? { model: model } : {}
|
|
80
|
+
|
|
81
|
+
response = invoke_llm(prompt: prompt, **llm_opts)
|
|
82
|
+
rows = parse_llm_rows(response)
|
|
83
|
+
|
|
84
|
+
if rows.nil?
|
|
85
|
+
retry_prompt = "#{prompt}\n\nIMPORTANT: Your previous response was not valid JSON. Return ONLY a valid JSON array."
|
|
86
|
+
response = invoke_llm(prompt: retry_prompt, **llm_opts)
|
|
87
|
+
rows = parse_llm_rows(response)
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
rows || { error: 'LLM did not return valid JSON after retry' }
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def invoke_llm(prompt:, **llm_opts)
|
|
94
|
+
result = if Legion::LLM.respond_to?(:structured)
|
|
95
|
+
Legion::LLM.structured(
|
|
96
|
+
message: prompt,
|
|
97
|
+
schema: generate_schema,
|
|
98
|
+
**llm_opts
|
|
99
|
+
)
|
|
100
|
+
else
|
|
101
|
+
Legion::LLM.chat(message: prompt, **llm_opts)
|
|
102
|
+
end
|
|
103
|
+
content = result.respond_to?(:content) ? result.content : result.to_s
|
|
104
|
+
content.strip.sub(/\A```(?:json)?\n?/, '').sub(/\n?```\z/, '')
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def parse_llm_rows(content)
|
|
108
|
+
parsed = ::JSON.parse(content)
|
|
109
|
+
return nil unless parsed.is_a?(Array)
|
|
110
|
+
|
|
111
|
+
parsed.map do |item|
|
|
112
|
+
h = item.transform_keys(&:to_sym)
|
|
113
|
+
{ input: h[:input].to_s, expected_output: h[:expected_output]&.to_s }
|
|
114
|
+
end
|
|
115
|
+
rescue ::JSON::ParserError
|
|
116
|
+
nil
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def build_generate_prompt(description:, count:, schema:)
|
|
120
|
+
lines = []
|
|
121
|
+
lines << "You are a test case generator. Generate exactly #{count} test cases as a JSON array."
|
|
122
|
+
lines << 'Each test case must have "input" and "expected_output" fields.'
|
|
123
|
+
lines << ''
|
|
124
|
+
lines << "Description: #{description}"
|
|
125
|
+
if schema
|
|
126
|
+
lines << ''
|
|
127
|
+
lines << 'Schema guidance for inputs and outputs:'
|
|
128
|
+
lines << "```json\n#{::JSON.generate(schema)}\n```"
|
|
129
|
+
end
|
|
130
|
+
lines << ''
|
|
131
|
+
lines << 'Respond ONLY with a valid JSON array, no other text.'
|
|
132
|
+
lines.join("\n")
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def generate_schema
|
|
136
|
+
{
|
|
137
|
+
type: 'array',
|
|
138
|
+
items: {
|
|
139
|
+
type: 'object',
|
|
140
|
+
properties: {
|
|
141
|
+
input: { type: 'string' },
|
|
142
|
+
expected_output: { type: 'string' }
|
|
143
|
+
},
|
|
144
|
+
required: %w[input expected_output]
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
end
|
|
148
|
+
|
|
63
149
|
def create_version(dataset_id, rows)
|
|
64
150
|
hash = OpenSSL::Digest.new('SHA256').hexdigest(rows.to_s)
|
|
65
151
|
ver_num = (db[:dataset_versions].where(dataset_id: dataset_id).max(:version) || 0) + 1
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Legion
|
|
4
|
+
module Extensions
|
|
5
|
+
module Dataset
|
|
6
|
+
module Runners
|
|
7
|
+
module Sampling
|
|
8
|
+
def sample_from_traces(dataset_name:, source: :legion_data, filters: {},
|
|
9
|
+
sample_size: nil, strategy: :recent, **)
|
|
10
|
+
traces = fetch_traces(source, filters)
|
|
11
|
+
sampled = apply_strategy(traces, strategy, sample_size)
|
|
12
|
+
rows = sampled.map { |t| { input: t[:input], expected_output: nil, metadata: t[:span_kind] } }
|
|
13
|
+
create_dataset(name: dataset_name, rows: rows)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
private
|
|
17
|
+
|
|
18
|
+
def fetch_traces(source, filters)
|
|
19
|
+
case source
|
|
20
|
+
when :legion_data then fetch_from_db(filters)
|
|
21
|
+
else raise ArgumentError, "unknown trace source: #{source}"
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
def fetch_from_db(filters)
|
|
26
|
+
query = db[:traces]
|
|
27
|
+
query = query.where(span_kind: filters[:span_kind]) if filters[:span_kind]
|
|
28
|
+
query = query.where(status: filters[:status]) if filters[:status]
|
|
29
|
+
if filters[:time_range]
|
|
30
|
+
cutoff = Time.now.utc - filters[:time_range]
|
|
31
|
+
query = query.where { created_at >= cutoff }
|
|
32
|
+
end
|
|
33
|
+
query.order(Sequel.desc(:created_at)).all
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def apply_strategy(traces, strategy, sample_size)
|
|
37
|
+
case strategy.to_sym
|
|
38
|
+
when :random then sample_random(traces, sample_size)
|
|
39
|
+
when :error_biased then sample_error_biased(traces, sample_size)
|
|
40
|
+
when :stratified then sample_stratified(traces, sample_size)
|
|
41
|
+
else sample_recent(traces, sample_size)
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def sample_recent(traces, size)
|
|
46
|
+
size ? traces.first(size) : traces
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def sample_random(traces, size)
|
|
50
|
+
size ? traces.sample(size) : traces.shuffle
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def sample_error_biased(traces, size)
|
|
54
|
+
errors, successes = traces.partition { |t| t[:status] == 'error' }
|
|
55
|
+
return traces unless size
|
|
56
|
+
|
|
57
|
+
half = size / 2
|
|
58
|
+
(errors.first(half) + successes.first(size - half)).first(size)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def sample_stratified(traces, size)
|
|
62
|
+
groups = traces.group_by { |t| t[:span_kind] }
|
|
63
|
+
return traces unless size
|
|
64
|
+
|
|
65
|
+
per_group = [size / [groups.size, 1].max, 1].max
|
|
66
|
+
groups.values.flat_map { |g| g.first(per_group) }.first(size)
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
@@ -4,6 +4,7 @@ require_relative 'dataset/version'
|
|
|
4
4
|
require_relative 'dataset/helpers/import_export'
|
|
5
5
|
require_relative 'dataset/runners/dataset'
|
|
6
6
|
require_relative 'dataset/runners/experiment'
|
|
7
|
+
require_relative 'dataset/runners/sampling'
|
|
7
8
|
require_relative 'dataset/client'
|
|
8
9
|
|
|
9
10
|
module Legion
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: lex-dataset
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.2.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Matthew Iverson
|
|
@@ -17,11 +17,13 @@ executables: []
|
|
|
17
17
|
extensions: []
|
|
18
18
|
extra_rdoc_files: []
|
|
19
19
|
files:
|
|
20
|
+
- README.md
|
|
20
21
|
- lib/legion/extensions/dataset.rb
|
|
21
22
|
- lib/legion/extensions/dataset/client.rb
|
|
22
23
|
- lib/legion/extensions/dataset/helpers/import_export.rb
|
|
23
24
|
- lib/legion/extensions/dataset/runners/dataset.rb
|
|
24
25
|
- lib/legion/extensions/dataset/runners/experiment.rb
|
|
26
|
+
- lib/legion/extensions/dataset/runners/sampling.rb
|
|
25
27
|
- lib/legion/extensions/dataset/version.rb
|
|
26
28
|
homepage: https://github.com/LegionIO/lex-dataset
|
|
27
29
|
licenses:
|