structify 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/CLAUDE.md +27 -0
- data/Gemfile.lock +7 -1
- data/README.md +279 -144
- data/lib/structify/model.rb +290 -58
- data/lib/structify/schema_serializer.rb +165 -0
- data/lib/structify/version.rb +1 -1
- data/lib/structify.rb +67 -4
- data/structify.gemspec +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 56ca4a2c78aa18b382aa54c4ba88ef246bd2014f895db59fb877ecfd5cb12edf
|
4
|
+
data.tar.gz: 7cb8abbf4ebc23b68a7c19492d4a5fa648d89bfea4dd49a632418a3a962be1cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 16e43c43971e51405759fd3cdf1cd12759ffef115313c144938d5dea704583e04fff57dfd388682127321b71c527131ed3d6edade964db6bb55e9b1c43900744
|
7
|
+
data.tar.gz: 9de0c840ee85e0ac8722b518125461672006624d70d2d7317e5df9db4832a6bd454e030c2790fab8d6e408d4e694039c4a50f471bf4a45e3a523929d59c7473e
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
## [0.2.0] - 2025-03-12
|
6
|
+
|
7
|
+
### Added
|
8
|
+
|
9
|
+
- New `thinking` mode option to automatically add chain of thought reasoning to LLM schemas
|
10
|
+
- When enabled, adds a `chain_of_thought` field as the first property in the generated schema
|
11
|
+
|
12
|
+
## [0.1.0] - Initial Release
|
13
|
+
|
14
|
+
- Initial release of Structify
|
data/CLAUDE.md
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# CLAUDE.md - Guidelines for Structify
|
2
|
+
|
3
|
+
## Commands
|
4
|
+
- Build: `bundle exec rake build`
|
5
|
+
- Install: `bundle exec rake install`
|
6
|
+
- Test all: `bundle exec rake spec`
|
7
|
+
- Test single file: `bundle exec rspec spec/path/to/file_spec.rb`
|
8
|
+
- Test specific example: `bundle exec rspec spec/path/to/file_spec.rb:LINE_NUMBER`
|
9
|
+
- Lint: `bundle exec rubocop`
|
10
|
+
|
11
|
+
## Code Style
|
12
|
+
- Use `# frozen_string_literal: true` at the top of all Ruby files
|
13
|
+
- Follow Ruby naming conventions (snake_case for methods/variables, CamelCase for classes)
|
14
|
+
- Include YARD documentation for classes and methods
|
15
|
+
- Group similar methods together
|
16
|
+
- Include descriptive RSpec tests for all functionality
|
17
|
+
- Keep methods short and focused on a single responsibility
|
18
|
+
- Use specific error classes for error handling
|
19
|
+
- Prefer explicit requires over auto-loading
|
20
|
+
- Follow ActiveSupport::Concern patterns for modules
|
21
|
+
- Keep DSL simple and intuitive for end users
|
22
|
+
|
23
|
+
## Structure
|
24
|
+
- Put core functionality in lib/structify/
|
25
|
+
- Keep implementation details private when possible
|
26
|
+
- Follow semantic versioning guidelines
|
27
|
+
- Ensure proper test coverage for all public APIs
|
data/Gemfile.lock
CHANGED
@@ -2,7 +2,7 @@ PATH
|
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
4
|
structify (0.1.0)
|
5
|
-
activesupport (
|
5
|
+
activesupport (~> 7.1)
|
6
6
|
attr_json (~> 2.1)
|
7
7
|
|
8
8
|
GEM
|
@@ -43,6 +43,8 @@ GEM
|
|
43
43
|
mutex_m
|
44
44
|
securerandom (>= 0.3)
|
45
45
|
tzinfo (~> 2.0)
|
46
|
+
addressable (2.8.7)
|
47
|
+
public_suffix (>= 2.0.2, < 7.0)
|
46
48
|
ast (2.4.2)
|
47
49
|
attr_json (2.5.0)
|
48
50
|
activerecord (>= 6.0.0, < 8.1)
|
@@ -68,6 +70,8 @@ GEM
|
|
68
70
|
rdoc (>= 4.0.0)
|
69
71
|
reline (>= 0.4.2)
|
70
72
|
json (2.9.1)
|
73
|
+
json-schema (4.3.1)
|
74
|
+
addressable (>= 2.8)
|
71
75
|
language_server-protocol (3.17.0.4)
|
72
76
|
logger (1.6.5)
|
73
77
|
loofah (2.24.0)
|
@@ -87,6 +91,7 @@ GEM
|
|
87
91
|
psych (5.2.3)
|
88
92
|
date
|
89
93
|
stringio
|
94
|
+
public_suffix (6.0.1)
|
90
95
|
racc (1.8.1)
|
91
96
|
rack (3.1.9)
|
92
97
|
rack-session (2.1.0)
|
@@ -182,6 +187,7 @@ PLATFORMS
|
|
182
187
|
DEPENDENCIES
|
183
188
|
activerecord (~> 7.1.0)
|
184
189
|
debug (>= 1.0.0)
|
190
|
+
json-schema (~> 4.1)
|
185
191
|
rake (~> 13.0)
|
186
192
|
rspec (~> 3.12)
|
187
193
|
rspec-rails (~> 6.1)
|
data/README.md
CHANGED
@@ -2,220 +2,355 @@
|
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/structify.svg)](https://badge.fury.io/rb/structify)
|
4
4
|
|
5
|
-
|
5
|
+
A Ruby gem for extracting structured data from content using LLMs in Rails applications
|
6
6
|
|
7
|
-
##
|
7
|
+
## What is Structify?
|
8
8
|
|
9
|
-
|
10
|
-
- ๐ Built-in versioning for schema evolution
|
11
|
-
- ๐ Support for custom assistant prompts
|
12
|
-
- ๐๏ธ JSON Schema generation for LLM validation
|
13
|
-
- ๐ Seamless Rails/ActiveRecord integration
|
14
|
-
- 💾 Automatic JSON attribute handling
|
9
|
+
Structify helps you extract structured data from unstructured content in your Rails apps:
|
15
10
|
|
16
|
-
|
11
|
+
- **Define extraction schemas** directly in your ActiveRecord models
|
12
|
+
- **Generate JSON schemas** to use with OpenAI, Anthropic, or other LLM providers
|
13
|
+
- **Store and validate** extracted data in your models
|
14
|
+
- **Access structured data** through typed model attributes
|
17
15
|
|
18
|
-
|
16
|
+
## Use Cases
|
17
|
+
|
18
|
+
- Extract metadata, topics, and sentiment from articles or blog posts
|
19
|
+
- Pull structured information from user-generated content
|
20
|
+
- Organize unstructured feedback or reviews into categorized data
|
21
|
+
- Convert emails or messages into actionable, structured formats
|
22
|
+
- Extract entities and relationships from documents
|
19
23
|
|
20
24
|
```ruby
|
21
|
-
|
25
|
+
# 1. Define extraction schema in your model
|
26
|
+
class Article < ApplicationRecord
|
27
|
+
include Structify::Model
|
28
|
+
|
29
|
+
schema_definition do
|
30
|
+
field :title, :string
|
31
|
+
field :summary, :text
|
32
|
+
field :category, :string, enum: ["tech", "business", "science"]
|
33
|
+
field :topics, :array, items: { type: "string" }
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# 2. Get schema for your LLM API
|
38
|
+
schema = Article.json_schema
|
39
|
+
|
40
|
+
# 3. Store LLM response in your model
|
41
|
+
article = Article.find(123)
|
42
|
+
article.update(llm_response)
|
43
|
+
|
44
|
+
# 4. Access extracted data
|
45
|
+
article.title # => "AI Advances in 2023"
|
46
|
+
article.summary # => "Recent developments in artificial intelligence..."
|
47
|
+
article.topics # => ["machine learning", "neural networks", "computer vision"]
|
22
48
|
```
|
23
49
|
|
24
|
-
|
50
|
+
## Install
|
25
51
|
|
52
|
+
```ruby
|
53
|
+
# Add to Gemfile
|
54
|
+
gem 'structify'
|
55
|
+
```
|
56
|
+
|
57
|
+
Then:
|
26
58
|
```bash
|
27
|
-
|
59
|
+
bundle install
|
28
60
|
```
|
29
61
|
|
30
|
-
|
62
|
+
## Database Setup
|
31
63
|
|
32
|
-
|
33
|
-
|
64
|
+
Add a JSON column to store extracted data:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
add_column :articles, :extracted_data, :jsonb # PostgreSQL
|
68
|
+
# or
|
69
|
+
add_column :articles, :extracted_data, :json # MySQL
|
34
70
|
```
|
35
71
|
|
36
72
|
## Usage
|
37
73
|
|
38
|
-
###
|
39
|
-
|
40
|
-
Here's a simple example of using Structify in a Rails model:
|
74
|
+
### Define Your Schema
|
41
75
|
|
42
76
|
```ruby
|
43
77
|
class Article < ApplicationRecord
|
44
78
|
include Structify::Model
|
45
79
|
|
46
80
|
schema_definition do
|
47
|
-
title "Article Extraction"
|
48
|
-
description "Extract key information from articles"
|
49
81
|
version 1
|
50
|
-
|
51
|
-
|
52
|
-
llm_model "gpt-4"
|
53
|
-
|
82
|
+
title "Article Extraction"
|
83
|
+
|
54
84
|
field :title, :string, required: true
|
55
|
-
field :summary, :text
|
85
|
+
field :summary, :text
|
56
86
|
field :category, :string, enum: ["tech", "business", "science"]
|
87
|
+
field :topics, :array, items: { type: "string" }
|
88
|
+
field :metadata, :object, properties: {
|
89
|
+
"author" => { type: "string" },
|
90
|
+
"published_at" => { type: "string" }
|
91
|
+
}
|
57
92
|
end
|
58
93
|
end
|
59
94
|
```
|
60
95
|
|
61
|
-
###
|
96
|
+
### Get Schema for LLM API
|
62
97
|
|
63
|
-
|
98
|
+
Structify generates the JSON schema that you'll need to send to your LLM provider:
|
64
99
|
|
65
100
|
```ruby
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
schema_definition do
|
70
|
-
version 2 # Increment this when making breaking changes
|
71
|
-
title "Email Thread Extraction"
|
72
|
-
description "Extracts key information from email threads"
|
101
|
+
# Get JSON Schema to send to OpenAI, Anthropic, etc.
|
102
|
+
schema = Article.json_schema
|
103
|
+
```
|
73
104
|
|
74
|
-
|
75
|
-
You are an assistant that extracts concise metadata from email threads.
|
76
|
-
Focus on producing a clear summary, action items, and sentiment analysis.
|
77
|
-
If there are multiple participants, include their roles in the conversation.
|
78
|
-
PROMPT
|
105
|
+
### Integration with LLM Services
|
79
106
|
|
80
|
-
|
107
|
+
You need to implement the actual LLM integration. Here's how you can integrate with popular services:
|
81
108
|
|
82
|
-
|
83
|
-
field :subject, :string,
|
84
|
-
required: true,
|
85
|
-
description: "The main topic or subject of the email thread"
|
109
|
+
#### OpenAI Integration Example
|
86
110
|
|
87
|
-
|
88
|
-
|
89
|
-
description: "A concise summary of the entire thread"
|
111
|
+
```ruby
|
112
|
+
require "openai"
|
90
113
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
114
|
+
class OpenAiExtractor
|
115
|
+
def initialize(api_key = ENV["OPENAI_API_KEY"])
|
116
|
+
@client = OpenAI::Client.new(access_token: api_key)
|
117
|
+
end
|
118
|
+
|
119
|
+
def extract(content, model_class)
|
120
|
+
# Get schema from Structify model
|
121
|
+
schema = model_class.json_schema
|
122
|
+
|
123
|
+
# Call OpenAI with structured outputs
|
124
|
+
response = @client.chat(
|
125
|
+
parameters: {
|
126
|
+
model: "gpt-4o",
|
127
|
+
response_format: { type: "json_object", schema: schema },
|
128
|
+
messages: [
|
129
|
+
{ role: "system", content: "Extract structured information from the provided content." },
|
130
|
+
{ role: "user", content: content }
|
131
|
+
]
|
132
|
+
}
|
133
|
+
)
|
134
|
+
|
135
|
+
# Parse and return the structured data
|
136
|
+
JSON.parse(response.dig("choices", 0, "message", "content"), symbolize_names: true)
|
137
|
+
end
|
138
|
+
end
|
95
139
|
|
96
|
-
|
97
|
-
|
98
|
-
|
140
|
+
# Usage
|
141
|
+
extractor = OpenAiExtractor.new
|
142
|
+
article = Article.find(123)
|
143
|
+
extracted_data = extractor.extract(article.content, Article)
|
144
|
+
article.update(extracted_data)
|
145
|
+
```
|
99
146
|
|
100
|
-
|
101
|
-
field :participants, :json,
|
102
|
-
description: "List of participants and their roles"
|
147
|
+
#### Anthropic Integration Example
|
103
148
|
|
104
|
-
|
105
|
-
|
149
|
+
```ruby
|
150
|
+
require "anthropic"
|
106
151
|
|
107
|
-
|
108
|
-
|
152
|
+
class AnthropicExtractor
|
153
|
+
def initialize(api_key = ENV["ANTHROPIC_API_KEY"])
|
154
|
+
@client = Anthropic::Client.new(api_key: api_key)
|
155
|
+
end
|
156
|
+
|
157
|
+
def extract(content, model_class)
|
158
|
+
# Get schema from Structify model
|
159
|
+
schema = model_class.json_schema
|
160
|
+
|
161
|
+
# Call Claude with tool use
|
162
|
+
response = @client.messages.create(
|
163
|
+
model: "claude-3-opus-20240229",
|
164
|
+
max_tokens: 1000,
|
165
|
+
system: "Extract structured data based on the provided schema.",
|
166
|
+
messages: [{ role: "user", content: content }],
|
167
|
+
tools: [{
|
168
|
+
type: "function",
|
169
|
+
function: {
|
170
|
+
name: "extract_data",
|
171
|
+
description: "Extract structured data from content",
|
172
|
+
parameters: schema
|
173
|
+
}
|
174
|
+
}],
|
175
|
+
tool_choice: { type: "function", function: { name: "extract_data" } }
|
176
|
+
)
|
177
|
+
|
178
|
+
# Parse and return structured data
|
179
|
+
JSON.parse(response.content[0].tools[0].function.arguments, symbolize_names: true)
|
109
180
|
end
|
110
|
-
|
111
|
-
# You can still use regular ActiveRecord features
|
112
|
-
validates :subject, presence: true
|
113
|
-
validates :summary, length: { minimum: 10 }
|
114
181
|
end
|
115
182
|
```
|
116
183
|
|
117
|
-
###
|
118
|
-
|
119
|
-
Structify provides several helper methods to access schema information:
|
184
|
+
### Store & Access Extracted Data
|
120
185
|
|
121
186
|
```ruby
|
122
|
-
#
|
123
|
-
|
124
|
-
|
125
|
-
#
|
126
|
-
#
|
127
|
-
#
|
128
|
-
#
|
129
|
-
|
130
|
-
#
|
131
|
-
#
|
132
|
-
# summary: { type: "text" },
|
133
|
-
# sentiment: {
|
134
|
-
# type: "string",
|
135
|
-
# enum: ["positive", "neutral", "negative"]
|
136
|
-
# },
|
137
|
-
# # ...
|
138
|
-
# }
|
139
|
-
# }
|
140
|
-
# }
|
141
|
-
|
142
|
-
# Get the current version
|
143
|
-
EmailSummary.extraction_version # => 2
|
144
|
-
|
145
|
-
# Get the assistant prompt
|
146
|
-
EmailSummary.extraction_assistant_prompt
|
147
|
-
# => "You are an assistant that extracts concise metadata..."
|
148
|
-
|
149
|
-
# Get the LLM model
|
150
|
-
EmailSummary.extraction_llm_model # => "gpt-4"
|
187
|
+
# Store LLM response in your model
|
188
|
+
article.update(response)
|
189
|
+
|
190
|
+
# Access via model attributes
|
191
|
+
article.title # => "How AI is Changing Healthcare"
|
192
|
+
article.category # => "tech"
|
193
|
+
article.topics # => ["machine learning", "healthcare"]
|
194
|
+
|
195
|
+
# All data is in the JSON column
|
196
|
+
article.extracted_data # => The complete JSON
|
151
197
|
```
|
152
198
|
|
153
|
-
|
199
|
+
## Field Types
|
154
200
|
|
155
|
-
Structify
|
201
|
+
Structify supports all standard JSON Schema types:
|
156
202
|
|
157
203
|
```ruby
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
participants: [
|
165
|
-
{ name: "Alice", role: "presenter" },
|
166
|
-
{ name: "Bob", role: "reviewer" }
|
167
|
-
]
|
168
|
-
)
|
169
|
-
|
170
|
-
# Access fields directly
|
171
|
-
summary.subject # => "Project Update"
|
172
|
-
summary.sentiment # => "positive"
|
173
|
-
summary.participants # => [{ name: "Alice", ... }]
|
174
|
-
|
175
|
-
# Validate enum values
|
176
|
-
summary.sentiment = "invalid"
|
177
|
-
summary.valid? # => false
|
204
|
+
field :name, :string # String values
|
205
|
+
field :count, :integer # Integer values
|
206
|
+
field :price, :number # Numeric values (float/int)
|
207
|
+
field :active, :boolean # Boolean values
|
208
|
+
field :metadata, :object # JSON objects
|
209
|
+
field :tags, :array # Arrays
|
178
210
|
```
|
179
211
|
|
180
|
-
##
|
212
|
+
## Field Options
|
213
|
+
|
214
|
+
```ruby
|
215
|
+
# Required fields
|
216
|
+
field :title, :string, required: true
|
217
|
+
|
218
|
+
# Enum values
|
219
|
+
field :status, :string, enum: ["draft", "published", "archived"]
|
220
|
+
|
221
|
+
# Array constraints
|
222
|
+
field :tags, :array,
|
223
|
+
items: { type: "string" },
|
224
|
+
min_items: 1,
|
225
|
+
max_items: 5,
|
226
|
+
unique_items: true
|
227
|
+
|
228
|
+
# Nested objects
|
229
|
+
field :author, :object, properties: {
|
230
|
+
"name" => { type: "string", required: true },
|
231
|
+
"email" => { type: "string" }
|
232
|
+
}
|
233
|
+
```
|
234
|
+
|
235
|
+
## Chain of Thought Mode
|
181
236
|
|
182
|
-
|
237
|
+
Structify supports a "thinking" mode that automatically requests chain of thought reasoning from the LLM:
|
183
238
|
|
184
239
|
```ruby
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
end
|
240
|
+
schema_definition do
|
241
|
+
version 1
|
242
|
+
thinking true # Enable chain of thought reasoning
|
243
|
+
|
244
|
+
field :title, :string, required: true
|
245
|
+
# other fields...
|
192
246
|
end
|
193
247
|
```
|
194
248
|
|
195
|
-
|
249
|
+
Chain of thought (COT) reasoning is beneficial because it:
|
250
|
+
- Adds more context to the extraction process
|
251
|
+
- Helps the LLM think through problems more systematically
|
252
|
+
- Improves accuracy for complex extractions
|
253
|
+
- Makes the reasoning process transparent and explainable
|
254
|
+
- Reduces hallucinations by forcing step-by-step thinking
|
196
255
|
|
197
|
-
|
256
|
+
This is especially useful when:
|
257
|
+
- Answers need more detailed information
|
258
|
+
- Questions require multi-step reasoning
|
259
|
+
- Extractions involve complex decision-making
|
260
|
+
- You need to understand how the LLM reached its conclusions
|
198
261
|
|
199
|
-
|
262
|
+
For best results, include instructions for COT in your base system prompt:
|
200
263
|
|
201
|
-
|
264
|
+
```ruby
|
265
|
+
system_prompt = "Extract structured data from the content.
|
266
|
+
For each field, think step by step before determining the value."
|
267
|
+
```
|
202
268
|
|
203
|
-
|
204
|
-
2. Create your feature branch (`git checkout -b feature/my-new-feature`)
|
205
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
206
|
-
4. Push to the branch (`git push origin feature/my-new-feature`)
|
207
|
-
5. Create a new Pull Request
|
269
|
+
You can generate effective chain of thought prompts using tools like the [Claude Prompt Designer](https://console.anthropic.com/dashboard).
|
208
270
|
|
209
|
-
|
271
|
+
## Schema Versioning and Field Lifecycle
|
210
272
|
|
211
|
-
|
273
|
+
Structify provides a simple field lifecycle management system using a `versions` parameter:
|
212
274
|
|
213
|
-
|
275
|
+
```ruby
|
276
|
+
schema_definition do
|
277
|
+
version 3
|
278
|
+
|
279
|
+
# Fields for specific version ranges
|
280
|
+
field :title, :string # Available in all versions (default behavior)
|
281
|
+
field :legacy, :string, versions: 1...3 # Only in versions 1-2 (removed in v3)
|
282
|
+
field :summary, :text, versions: 2 # Added in version 2 onwards
|
283
|
+
field :content, :text, versions: 2.. # Added in version 2 onwards (endless range)
|
284
|
+
field :temp_field, :string, versions: 2..3 # Only in versions 2-3
|
285
|
+
field :special, :string, versions: [1, 3, 5] # Only in versions 1, 3, and 5
|
286
|
+
end
|
287
|
+
```
|
214
288
|
|
215
|
-
|
289
|
+
### Version Range Syntax
|
216
290
|
|
217
|
-
|
291
|
+
Structify supports several ways to specify which versions a field is available in:
|
218
292
|
|
219
|
-
|
293
|
+
| Syntax | Example | Meaning |
|
294
|
+
|--------|---------|---------|
|
295
|
+
| No version specified | `field :title, :string` | Available in all versions (default) |
|
296
|
+
| Single integer | `versions: 2` | Available from version 2 onwards |
|
297
|
+
| Range (inclusive) | `versions: 1..3` | Available in versions 1, 2, and 3 |
|
298
|
+
| Range (exclusive) | `versions: 1...3` | Available in versions 1 and 2 (not 3) |
|
299
|
+
| Endless range | `versions: 2..` | Available from version 2 onwards |
|
300
|
+
| Array | `versions: [1, 4, 7]` | Only available in versions 1, 4, and 7 |
|
301
|
+
|
302
|
+
### Handling Records with Different Versions
|
220
303
|
|
304
|
+
```ruby
|
305
|
+
# Create a record with version 1 schema
|
306
|
+
article_v1 = Article.create(title: "Original Article")
|
307
|
+
|
308
|
+
# Access with version 3 schema
|
309
|
+
article_v3 = Article.find(article_v1.id)
|
310
|
+
|
311
|
+
# Fields from v1 are still accessible
|
312
|
+
article_v3.title # => "Original Article"
|
313
|
+
|
314
|
+
# Fields not in v1 raise errors
|
315
|
+
article_v3.summary # => VersionRangeError: Field 'summary' is not available in version 1.
|
316
|
+
# This field is only available in versions: 2 to 999.
|
317
|
+
|
318
|
+
# Check version compatibility
|
319
|
+
article_v3.version_compatible_with?(3) # => false
|
320
|
+
article_v3.version_compatible_with?(1) # => true
|
321
|
+
|
322
|
+
# Upgrade record to version 3
|
323
|
+
article_v3.summary = "Added in v3"
|
324
|
+
article_v3.save! # Record version is automatically updated to 3
|
221
325
|
```
|
326
|
+
|
327
|
+
|
328
|
+
## Understanding Structify's Role
|
329
|
+
|
330
|
+
Structify is designed as a **bridge** between your Rails models and LLM extraction services:
|
331
|
+
|
332
|
+
### What Structify Does For You
|
333
|
+
|
334
|
+
- ✅ **Define extraction schemas** directly in your ActiveRecord models
|
335
|
+
- ✅ **Generate compatible JSON schemas** for OpenAI, Anthropic, and other LLM providers
|
336
|
+
- ✅ **Store and validate** extracted data against your schema
|
337
|
+
- ✅ **Provide typed access** to extracted fields through your models
|
338
|
+
- ✅ **Handle schema versioning** and backward compatibility
|
339
|
+
- ✅ **Support chain of thought reasoning** with the thinking mode option
|
340
|
+
|
341
|
+
### What You Need To Implement
|
342
|
+
|
343
|
+
- 🔧 **API integration** with your chosen LLM provider (see examples above)
|
344
|
+
- 🔧 **Processing logic** for when and how to extract data
|
345
|
+
- 🔧 **Authentication** and API key management
|
346
|
+
- 🔧 **Error handling and retries** for API calls
|
347
|
+
|
348
|
+
This separation of concerns allows you to:
|
349
|
+
1. Use any LLM provider and model you prefer
|
350
|
+
2. Implement extraction logic specific to your application
|
351
|
+
3. Handle API access in a way that fits your application architecture
|
352
|
+
4. Change LLM providers without changing your data model
|
353
|
+
|
354
|
+
## License
|
355
|
+
|
356
|
+
[MIT License](https://opensource.org/licenses/MIT)
|
data/lib/structify/model.rb
CHANGED
@@ -3,10 +3,11 @@
|
|
3
3
|
require "active_support/concern"
|
4
4
|
require "active_support/core_ext/class/attribute"
|
5
5
|
require "attr_json"
|
6
|
+
require_relative "schema_serializer"
|
6
7
|
|
7
8
|
module Structify
|
8
9
|
# The Model module provides a DSL for defining LLM extraction schemas in your Rails models.
|
9
|
-
# It allows you to define fields, versioning, and
|
10
|
+
# It allows you to define fields, versioning, and validation for LLM-based data extraction.
|
10
11
|
#
|
11
12
|
# @example
|
12
13
|
# class Article < ApplicationRecord
|
@@ -16,8 +17,6 @@ module Structify
|
|
16
17
|
# title "Article Extraction"
|
17
18
|
# description "Extract article metadata"
|
18
19
|
# version 1
|
19
|
-
# assistant_prompt "Extract the following fields from the article"
|
20
|
-
# llm_model "gpt-4"
|
21
20
|
#
|
22
21
|
# field :title, :string, required: true
|
23
22
|
# field :summary, :text, description: "A brief summary of the article"
|
@@ -34,6 +33,30 @@ module Structify
|
|
34
33
|
# Store all extracted data in the extracted_data JSON column
|
35
34
|
attr_json_config(default_container_attribute: :extracted_data)
|
36
35
|
end
|
36
|
+
|
37
|
+
# Instance methods
|
38
|
+
def version_compatible_with?(required_version)
|
39
|
+
record_version = self.extracted_data && self.extracted_data["version"] ?
|
40
|
+
self.extracted_data["version"] : 1
|
41
|
+
record_version >= required_version
|
42
|
+
end
|
43
|
+
|
44
|
+
# Check if a version is within a given range/array of versions
|
45
|
+
# This is used in field accessors to check version compatibility
|
46
|
+
#
|
47
|
+
# @param version [Integer] The version to check
|
48
|
+
# @param range [Range, Array, Integer] The range, array, or single version to check against
|
49
|
+
# @return [Boolean] Whether the version is within the range
|
50
|
+
def version_in_range?(version, range)
|
51
|
+
case range
|
52
|
+
when Range
|
53
|
+
range.cover?(version)
|
54
|
+
when Array
|
55
|
+
range.include?(version)
|
56
|
+
else
|
57
|
+
version == range
|
58
|
+
end
|
59
|
+
end
|
37
60
|
|
38
61
|
# Class methods added to the including class
|
39
62
|
module ClassMethods
|
@@ -60,19 +83,6 @@ module Structify
|
|
60
83
|
schema_builder&.version_number
|
61
84
|
end
|
62
85
|
|
63
|
-
# Get the assistant prompt
|
64
|
-
#
|
65
|
-
# @return [String] The assistant prompt
|
66
|
-
def extraction_assistant_prompt
|
67
|
-
schema_builder&.assistant_prompt_str
|
68
|
-
end
|
69
|
-
|
70
|
-
# Get the LLM model name
|
71
|
-
#
|
72
|
-
# @return [String] The model name
|
73
|
-
def extraction_llm_model
|
74
|
-
schema_builder&.model_name
|
75
|
-
end
|
76
86
|
end
|
77
87
|
end
|
78
88
|
|
@@ -82,11 +92,9 @@ module Structify
|
|
82
92
|
# @return [Array<Hash>] The field definitions
|
83
93
|
# @return [String] The schema title
|
84
94
|
# @return [String] The schema description
|
85
|
-
# @return [String] The assistant prompt
|
86
|
-
# @return [String] The LLM model name
|
87
95
|
# @return [Integer] The schema version
|
88
|
-
|
89
|
-
|
96
|
+
# @return [Boolean] Whether thinking mode is enabled
|
97
|
+
attr_reader :model, :fields, :title_str, :description_str, :version_number, :thinking_enabled
|
90
98
|
|
91
99
|
# Initialize a new SchemaBuilder
|
92
100
|
#
|
@@ -94,9 +102,17 @@ module Structify
|
|
94
102
|
def initialize(model)
|
95
103
|
@model = model
|
96
104
|
@fields = []
|
97
|
-
@assistant_prompt_str = nil
|
98
|
-
@model_name = nil
|
99
105
|
@version_number = 1
|
106
|
+
@thinking_enabled = false
|
107
|
+
end
|
108
|
+
|
109
|
+
# Enable or disable thinking mode
|
110
|
+
# When enabled, the LLM will be asked to provide chain of thought reasoning
|
111
|
+
#
|
112
|
+
# @param enabled [Boolean] Whether to enable thinking mode
|
113
|
+
# @return [void]
|
114
|
+
def thinking(enabled)
|
115
|
+
@thinking_enabled = enabled
|
100
116
|
end
|
101
117
|
|
102
118
|
# Set the schema title
|
@@ -121,24 +137,15 @@ module Structify
|
|
121
137
|
# @return [void]
|
122
138
|
def version(num)
|
123
139
|
@version_number = num
|
124
|
-
|
140
|
+
|
141
|
+
# Define version as an attr_json field so it's stored in extracted_data
|
142
|
+
model.attr_json :version, :integer, default: num
|
143
|
+
|
144
|
+
# Store mapping of fields to their introduction version
|
145
|
+
@fields_by_version ||= {}
|
146
|
+
@fields_by_version[num] ||= []
|
125
147
|
end
|
126
148
|
|
127
|
-
# Set the assistant prompt
|
128
|
-
#
|
129
|
-
# @param prompt [String] The prompt text
|
130
|
-
# @return [void]
|
131
|
-
def assistant_prompt(prompt)
|
132
|
-
@assistant_prompt_str = prompt.strip
|
133
|
-
end
|
134
|
-
|
135
|
-
# Set the LLM model name
|
136
|
-
#
|
137
|
-
# @param name [String] The model name
|
138
|
-
# @return [void]
|
139
|
-
def llm_model(name)
|
140
|
-
@model_name = name
|
141
|
-
end
|
142
149
|
|
143
150
|
# Define a field in the schema
|
144
151
|
#
|
@@ -147,40 +154,265 @@ module Structify
|
|
147
154
|
# @param required [Boolean] Whether the field is required
|
148
155
|
# @param description [String] The field description
|
149
156
|
# @param enum [Array] Possible values for the field
|
157
|
+
# @param items [Hash] For array type, defines the schema for array items
|
158
|
+
# @param properties [Hash] For object type, defines the properties of the object
|
159
|
+
# @param min_items [Integer] For array type, minimum number of items
|
160
|
+
# @param max_items [Integer] For array type, maximum number of items
|
161
|
+
# @param unique_items [Boolean] For array type, whether items must be unique
|
162
|
+
# @param versions [Range, Array, Integer] The versions this field is available in (default: all versions, i.e. 1..999)
|
150
163
|
# @return [void]
|
151
|
-
def field(name, type, required: false, description: nil, enum: nil
|
152
|
-
|
164
|
+
def field(name, type, required: false, description: nil, enum: nil,
|
165
|
+
items: nil, properties: nil, min_items: nil, max_items: nil,
|
166
|
+
unique_items: nil, versions: nil)
|
167
|
+
|
168
|
+
# Handle version information
|
169
|
+
version_range = if versions
|
170
|
+
# Use the versions parameter if provided
|
171
|
+
versions
|
172
|
+
else
|
173
|
+
# Default: field is available in all versions
|
174
|
+
1..999
|
175
|
+
end
|
176
|
+
|
177
|
+
# Check if the field is applicable for the current schema version
|
178
|
+
field_available = version_in_range?(@version_number, version_range)
|
179
|
+
|
180
|
+
# Skip defining the field in the schema if it's not applicable to the current version
|
181
|
+
unless field_available
|
182
|
+
# Still define an accessor that raises an appropriate error
|
183
|
+
define_version_range_accessor(name, version_range)
|
184
|
+
return
|
185
|
+
end
|
186
|
+
|
187
|
+
# Calculate a simple introduced_in for backward compatibility
|
188
|
+
effective_introduced_in = case version_range
|
189
|
+
when Range
|
190
|
+
version_range.begin
|
191
|
+
when Array
|
192
|
+
version_range.min
|
193
|
+
else
|
194
|
+
version_range
|
195
|
+
end
|
196
|
+
|
197
|
+
field_definition = {
|
153
198
|
name: name,
|
154
199
|
type: type,
|
155
200
|
required: required,
|
156
201
|
description: description,
|
157
|
-
|
202
|
+
version_range: version_range,
|
203
|
+
introduced_in: effective_introduced_in
|
158
204
|
}
|
205
|
+
|
206
|
+
# Add enum if provided
|
207
|
+
field_definition[:enum] = enum if enum
|
208
|
+
|
209
|
+
# Array specific properties
|
210
|
+
if type == :array
|
211
|
+
field_definition[:items] = items if items
|
212
|
+
field_definition[:min_items] = min_items if min_items
|
213
|
+
field_definition[:max_items] = max_items if max_items
|
214
|
+
field_definition[:unique_items] = unique_items if unique_items
|
215
|
+
end
|
216
|
+
|
217
|
+
# Object specific properties
|
218
|
+
if type == :object
|
219
|
+
field_definition[:properties] = properties if properties
|
220
|
+
end
|
221
|
+
|
222
|
+
fields << field_definition
|
223
|
+
|
224
|
+
# Track field by its version range
|
225
|
+
@fields_by_version ||= {}
|
226
|
+
@fields_by_version[effective_introduced_in] ||= []
|
227
|
+
@fields_by_version[effective_introduced_in] << name
|
159
228
|
|
229
|
+
# Map JSON Schema types to Ruby/AttrJson types
|
230
|
+
attr_type = case type
|
231
|
+
when :integer, :number
|
232
|
+
:integer
|
233
|
+
when :array
|
234
|
+
:json
|
235
|
+
when :object
|
236
|
+
:json
|
237
|
+
when :boolean
|
238
|
+
:boolean
|
239
|
+
else
|
240
|
+
type # string, text stay the same
|
241
|
+
end
|
242
|
+
|
243
|
+
# Define custom accessor that checks version compatibility
|
244
|
+
define_version_range_accessors(name, attr_type, version_range)
|
245
|
+
end
|
246
|
+
|
247
|
+
# Check if a version is within a given range/array of versions
|
248
|
+
#
|
249
|
+
# @param version [Integer] The version to check
|
250
|
+
# @param range [Range, Array, Integer] The range, array, or single version to check against
|
251
|
+
# @return [Boolean] Whether the version is within the range
|
252
|
+
def version_in_range?(version, range)
|
253
|
+
case range
|
254
|
+
when Range
|
255
|
+
# Handle endless ranges (Ruby 2.6+): 2.. means 2 and above
|
256
|
+
if range.end.nil?
|
257
|
+
version >= range.begin
|
258
|
+
else
|
259
|
+
range.cover?(version)
|
260
|
+
end
|
261
|
+
when Array
|
262
|
+
range.include?(version)
|
263
|
+
else
|
264
|
+
# A single integer means "this version and onwards"
|
265
|
+
version >= range
|
266
|
+
end
|
267
|
+
end
|
268
|
+
|
269
|
+
# Define the attr_json attribute for a versioned field and wrap its reader so
# that every read is checked against the field's version range.
#
# The generated reader takes the record version from extracted_data["version"]
# (falling back to 1), then:
# * raises Structify::RemovedFieldError when the range is bounded, ended before
#   the current schema version, and had already started at the record version;
# * raises Structify::VersionRangeError for any other out-of-range read;
# * emits an ActiveSupport deprecation warning when the read is allowed but the
#   range is bounded and started before the current schema version;
# * otherwise returns the value of the original attr_json reader.
#
# Open boundaries (endless or beginless ranges) are treated as "no limit"
# instead of being compared as nil, which used to raise NoMethodError.
#
# NOTE(review): the generated reader calls version_in_range? on the record
# itself, so the model is assumed to respond to it - confirm against the model.
#
# @param name [Symbol] The field name
# @param type [Symbol] The field type for attr_json
# @param version_range [Range, Array, Integer] The versions this field is available in
# @return [void]
def define_version_range_accessors(name, type, version_range)
  # Define the attr_json normally first
  model.attr_json name, type

  # Captured in a local so it can be interpolated into the generated source
  schema_version = @version_number

  # Then override the reader method to check versions
  model.class_eval <<-RUBY, __FILE__, __LINE__ + 1
    # Keep the attr_json reader reachable under an alias
    alias_method :_original_#{name}, :#{name}

    def #{name}
      # Records without stored data or without a version count as version 1
      record_version = (self.extracted_data && self.extracted_data["version"]) || 1

      # Parenthesized so an endless range literal is parsed unambiguously
      field_version_range = (#{version_range.inspect})
      ranged = field_version_range.is_a?(Range)
      first_version = ranged ? field_version_range.begin : nil
      last_version = ranged ? field_version_range.end : nil

      unless version_in_range?(record_version, field_version_range)
        # Removed field: a bounded range that ended before the current schema version
        if last_version && last_version < #{schema_version} &&
           (first_version.nil? || first_version <= record_version)
          raise Structify::RemovedFieldError.new("#{name}", last_version)
        end

        # Future fields and every other mismatch share the same error
        raise Structify::VersionRangeError.new("#{name}", record_version, field_version_range)
      end

      # Deprecation warning for readable fields with a bounded range.
      # NOTE(review): 999 looks like an open-ended sentinel upper bound - confirm.
      if last_version && last_version < 999 &&
         (first_version.nil? || first_version < #{schema_version}) &&
         field_version_range.cover?(record_version)
        ActiveSupport::Deprecation.warn(
          "Field '#{name}' is deprecated as of version #{schema_version} and will be removed in version \#{field_version_range.end}."
        )
      end

      # Call original method
      _original_#{name}
    end
  RUBY
end
|
337
|
+
|
338
|
+
# Define accessors for a field that is not part of the current schema version.
# Both the reader and the writer always raise:
# * Structify::RemovedFieldError when the field's range is bounded and ended
#   before the current schema version;
# * Structify::VersionRangeError in every other case (future ranges, arrays,
#   single integers).
#
# The choice only depends on values known when the accessor is defined, so it
# is made here in plain Ruby. Only the chosen constructor call is interpolated
# into the generated source, which keeps an endless range (whose end is nil)
# from producing invalid code.
#
# @param name [Symbol] The field name
# @param version_range [Range, Array, Integer] The versions this field is available in
# @return [void]
def define_version_range_accessor(name, version_range)
  schema_version = @version_number

  # A range with an open end can never have been removed
  removed =
    version_range.is_a?(Range) &&
    !version_range.end.nil? &&
    version_range.end < schema_version &&
    (version_range.begin.nil? || version_range.begin <= schema_version)

  error_source =
    if removed
      "Structify::RemovedFieldError.new(#{name.to_s.inspect}, #{version_range.end.inspect})"
    else
      # The range literal is parenthesized so endless ranges parse unambiguously
      "Structify::VersionRangeError.new(#{name.to_s.inspect}, #{schema_version.inspect}, (#{version_range.inspect}))"
    end

  model.class_eval <<-RUBY, __FILE__, __LINE__ + 1
    # Reader that always raises the error selected above
    def #{name}
      raise #{error_source}
    end

    # Writer that raises through the reader, so both report the same error
    def #{name}=(value)
      self.#{name}
    end
  RUBY
end
|
162
409
|
|
163
410
|
# Build the JSON schema for this builder by handing it to a SchemaSerializer.
#
# @return [Hash] The JSON schema
def to_json_schema
  SchemaSerializer.new(self).to_json_schema
end
|
185
417
|
end
|
186
418
|
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Structify
  # Serializes a schema builder's definition into a JSON-schema style hash.
  #
  # The builder is only read through version_number, fields, thinking_enabled,
  # title_str and description_str; each field is read as a Hash of options.
  class SchemaSerializer
    # @return [Structify::SchemaBuilder] The schema builder to serialize
    attr_reader :schema_builder

    # Initialize a new SchemaSerializer
    #
    # @param schema_builder [Structify::SchemaBuilder] The schema builder to serialize
    def initialize(schema_builder)
      @schema_builder = schema_builder
    end

    # Generate the JSON schema representation
    #
    # Only fields applicable to the builder's current version are included.
    # When thinking mode is enabled, a "chain_of_thought" string property is
    # placed before all field properties.
    #
    # @return [Hash] The JSON schema
    def to_json_schema
      fields = current_fields
      required_fields = fields.select { |f| f[:required] }.map { |f| f[:name].to_s }
      properties_hash = build_properties(fields)

      {
        name: schema_builder.title_str,
        description: schema_builder.description_str,
        parameters: {
          type: "object",
          required: required_fields,
          properties: properties_hash
        }
      }
    end

    private

    # Select the fields that apply to the builder's current schema version.
    #
    # @return [Array<Hash>] The applicable field definitions
    def current_fields
      current_version = schema_builder.version_number

      schema_builder.fields.select do |f|
        if f[:version_range]
          version_in_range?(current_version, f[:version_range])
        elsif f[:removed_in]
          # Legacy check for removed_in
          f[:removed_in] > current_version
        else
          true
        end
      end
    end

    # Build the properties hash, starting with chain_of_thought if enabled.
    #
    # @param fields [Array<Hash>] The applicable field definitions
    # @return [Hash] Property name (String) => property schema
    def build_properties(fields)
      properties = {}

      if schema_builder.thinking_enabled
        properties["chain_of_thought"] = {
          type: "string",
          description: "Explain your thought process step by step before determining the final values."
        }
      end

      fields.each { |f| properties[f[:name].to_s] = build_property(f) }
      properties
    end

    # Build the schema for a single field.
    #
    # @param field [Hash] The field definition
    # @return [Hash] The property schema
    def build_property(field)
      prop = { type: field[:type].to_s }
      prop[:description] = field[:description] if field[:description]
      prop[:enum] = field[:enum] if field[:enum]

      add_array_constraints(prop, field) if field[:type] == :array
      add_object_properties(prop, field) if field[:type] == :object && field[:properties]
      add_version_info(prop, field)
      add_deprecation_notice(prop, field)

      prop
    end

    # Copy the items schema and array constraints onto an array property.
    #
    # @param prop [Hash] The property schema being built (mutated)
    # @param field [Hash] The field definition
    # @return [void]
    def add_array_constraints(prop, field)
      prop[:items] = field[:items] if field[:items]
      prop[:minItems] = field[:min_items] if field[:min_items]
      prop[:maxItems] = field[:max_items] if field[:max_items]
      prop[:uniqueItems] = field[:unique_items] if field[:unique_items]
    end

    # Copy nested property definitions onto an object property, moving each
    # nested :required flag into the object's required list.
    #
    # @param prop [Hash] The property schema being built (mutated)
    # @param field [Hash] The field definition
    # @return [void]
    def add_object_properties(prop, field)
      prop[:properties] = {}
      required_props = []

      field[:properties].each do |prop_name, prop_def|
        # Work on a copy so the field definition itself is left untouched
        nested = prop_def.dup
        required_props << prop_name if nested.delete(:required)
        prop[:properties][prop_name] = nested
      end

      prop[:required] = required_props unless required_props.empty?
    end

    # Append version information to the description, but only when the
    # STRUCTIFY_SHOW_VERSION_INFO environment variable is set.
    #
    # @param prop [Hash] The property schema being built (mutated)
    # @param field [Hash] The field definition
    # @return [void]
    def add_version_info(prop, field)
      return unless ENV["STRUCTIFY_SHOW_VERSION_INFO"] && field[:version_range]

      version_info = "Available in versions: #{format_version_range(field[:version_range])}"
      prop[:description] = prop[:description] ? "#{prop[:description]} (#{version_info})" : version_info
    end

    # Legacy: prefix the description with a deprecation notice once the
    # current version has reached the field's deprecated_in version.
    #
    # @param prop [Hash] The property schema being built (mutated)
    # @param field [Hash] The field definition
    # @return [void]
    def add_deprecation_notice(prop, field)
      deprecated_in = field[:deprecated_in]
      return unless deprecated_in && deprecated_in <= schema_builder.version_number

      deprecation_note = "Deprecated in v#{deprecated_in}. "
      prop[:description] = prop[:description] ? "#{deprecation_note}#{prop[:description]}" : deprecation_note
    end

    # Check if a version is within a given range/array of versions
    #
    # Range#cover? already handles endless and beginless ranges, so a nil
    # boundary needs no special case.
    #
    # @param version [Integer] The version to check
    # @param range [Range, Array, Integer] The range, array, or single version to check against
    # @return [Boolean] Whether the version is within the range
    def version_in_range?(version, range)
      case range
      when Range
        range.cover?(version)
      when Array
        range.include?(version)
      else
        # A single integer means "this version and onwards"
        version >= range
      end
    end

    # Format a version range for display in property descriptions
    #
    # @param versions [Range, Array, Integer] The version range to format
    # @return [String] A human-readable version range
    def format_version_range(versions)
      case versions
      when Range
        return "#{versions.begin} and above" if versions.end.nil?

        "#{versions.begin} to #{versions.end}#{versions.exclude_end? ? ' (exclusive)' : ''}"
      when Array
        versions.join(", ")
      else
        "#{versions} and above" # Single integer means this version and onwards
      end
    end
  end
end
|
data/lib/structify/version.rb
CHANGED
data/lib/structify.rb
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require_relative "structify/version"
|
4
|
+
require_relative "structify/schema_serializer"
|
4
5
|
require_relative "structify/model"
|
5
6
|
|
6
7
|
# Structify is a DSL for defining extraction schemas for LLM-powered models.
|
7
8
|
# It provides a simple way to integrate with Rails models for LLM extraction,
|
8
|
-
#
|
9
|
+
# allowing for schema versioning and evolution.
|
9
10
|
#
|
10
11
|
# @example
|
11
12
|
# class Article < ApplicationRecord
|
@@ -15,8 +16,6 @@ require_relative "structify/model"
|
|
15
16
|
# title "Article Extraction"
|
16
17
|
# description "Extract article metadata"
|
17
18
|
# version 1
|
18
|
-
# assistant_prompt "Extract the following fields from the article"
|
19
|
-
# llm_model "gpt-4"
|
20
19
|
#
|
21
20
|
# field :title, :string, required: true
|
22
21
|
# field :summary, :text, description: "A brief summary of the article"
|
@@ -24,6 +23,70 @@ require_relative "structify/model"
|
|
24
23
|
# end
|
25
24
|
# end
|
26
25
|
module Structify
  # Root of the Structify exception hierarchy
  class Error < StandardError; end

  # Raised when a field is read on a record whose version predates the field
  class MissingFieldError < Error
    attr_reader :field_name, :record_version, :schema_version

    # @param field_name [Symbol, String] The field that was accessed
    # @param record_version [Integer] The version stored on the record
    # @param schema_version [Integer] The version reported as introducing the field
    def initialize(field_name, record_version, schema_version)
      @field_name = field_name
      @record_version = record_version
      @schema_version = schema_version

      sentences = [
        "Field '#{field_name}' does not exist in version #{record_version}.",
        "It was introduced in version #{schema_version}.",
        "To access this field, upgrade the record by setting new field values and saving."
      ]
      super(sentences.join(" "))
    end
  end

  # Raised when a field that the current schema version no longer has is accessed
  class RemovedFieldError < Error
    attr_reader :field_name, :removed_in_version

    # @param field_name [Symbol, String] The field that was accessed
    # @param removed_in_version [Integer] The version reported in the message
    def initialize(field_name, removed_in_version)
      @field_name = field_name
      @removed_in_version = removed_in_version

      sentences = [
        "Field '#{field_name}' has been removed in version #{removed_in_version}.",
        "This field is no longer available in the current schema."
      ]
      super(sentences.join(" "))
    end
  end

  # Raised when a field is accessed outside the versions it is declared for
  class VersionRangeError < Error
    attr_reader :field_name, :record_version, :valid_versions

    # @param field_name [Symbol, String] The field that was accessed
    # @param record_version [Integer] The version the access was checked against
    # @param valid_versions [Range, Array, Integer] The versions the field is available in
    def initialize(field_name, record_version, valid_versions)
      @field_name = field_name
      @record_version = record_version
      @valid_versions = valid_versions

      sentences = [
        "Field '#{field_name}' is not available in version #{record_version}.",
        "This field is only available in versions: #{format_versions(valid_versions)}."
      ]
      super(sentences.join(" "))
    end

    private

    # Render the valid versions as human-readable text for the message.
    #
    # @param versions [Range, Array, Integer] The versions to describe
    # @return [String] The description
    def format_versions(versions)
      case versions
      when Range
        return "#{versions.begin} and above" if versions.end.nil?

        suffix = versions.exclude_end? ? " (exclusive)" : ""
        "#{versions.begin} to #{versions.end}#{suffix}"
      when Array
        versions.join(", ")
      else
        # Single integer means this version and onwards
        "#{versions} and above"
      end
    end
  end
end
|
data/structify.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
# Specify which files should be added to the gem when it is released.
|
20
20
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
21
21
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
22
|
-
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
22
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) || f.end_with?('.gem') }
|
23
23
|
end
|
24
24
|
spec.bindir = "exe"
|
25
25
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: structify
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kieran Klaassen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-03-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -49,6 +49,8 @@ files:
|
|
49
49
|
- ".gitignore"
|
50
50
|
- ".rspec"
|
51
51
|
- ".travis.yml"
|
52
|
+
- CHANGELOG.md
|
53
|
+
- CLAUDE.md
|
52
54
|
- CODE_OF_CONDUCT.md
|
53
55
|
- Gemfile
|
54
56
|
- Gemfile.lock
|
@@ -59,6 +61,7 @@ files:
|
|
59
61
|
- bin/setup
|
60
62
|
- lib/structify.rb
|
61
63
|
- lib/structify/model.rb
|
64
|
+
- lib/structify/schema_serializer.rb
|
62
65
|
- lib/structify/version.rb
|
63
66
|
- structify.gemspec
|
64
67
|
homepage: https://github.com/kieranklaassen/structify
|