schotter_train_split 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '09a1822ddbf004ca7c56b1467ff0abf54b317ff3a4a9b0af92943c827d9ac568'
4
+ data.tar.gz: 95d4dff5c1941f5457584af58e06c1f91870a66372caf2a675ee54d7366d9e92
5
+ SHA512:
6
+ metadata.gz: d7db4d06a04d969bdd38ee5b354ed70e2a15f6b61d362ae962efe9bc751809831ddb4d7c970917147484bb064aaf634cc135426ee969462479b4933afe74332a
7
+ data.tar.gz: 66b4d9b2d374b5ef95077ff3be14f98e9d62bd99def0f7e15c7705cd0969632f501511c58cce36a1497bfb4e83d83b8b09bc241b4c914922026c10a637698ee0
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ # Licensing
2
+
3
+ This project is governed by multiple terms and policies:
4
+
5
+
6
+ ## Free Art License 1.1
7
+
8
+ <https://artlibre.org/licence/lal/en/>
9
+
10
+
11
+ ## Protein Database Terms of Service
12
+
13
+ <https://www.rcsb.org/pages/policies>
14
+
15
+
16
+ ## Anna's Archive Privacy Policy
17
+
18
+ <https://annas-archive.org/privacy>
19
+
20
+ SPDX-License-Identifier: CC0-1.0
data/README.md ADDED
@@ -0,0 +1,164 @@
1
+ # schotter_train_split
2
+
3
+ A DSL for defining scoped recalculation rules on ActiveRecord models. Mark records as stale via timestamp comparison, select them via scopes, and apply transforms in batches.
4
+
5
+ ## Installation
6
+
7
+ Add to your Gemfile:
8
+
9
+ ~~~ruby
10
+ gem "schotter_train_split", git: "https://github.com/matti/schotter-train-split.git", require: "schotter_split"
11
+ ~~~
12
+
13
+ ## Quick usage
14
+
15
+ ~~~ruby
16
+ require "schotter_split"
17
+
18
+ SchotterSplit::EntityRules.define(:video_domain_sync, model: Video) do
19
+ todos { Video.where.not(source_uri: [nil, ""]) }
20
+
21
+ transform :sync_domain,
22
+ target: :source_domain,
23
+ source_timestamp: :updated_at,
24
+ calculated_at: :source_fetched_at do |video|
25
+ URI.parse(video.source_uri).host.to_s.sub(/\Awww\./, "")
26
+ rescue URI::InvalidURIError
27
+ nil
28
+ end
29
+ end
30
+
31
+ result = SchotterSplit::EntityRules.run!(:video_domain_sync)
32
+ puts result.inspect
33
+ ~~~
34
+
35
+ ## Multiple transforms
36
+
37
+ A single rulebook can contain several transforms that share the same todo scope:
38
+
39
+ ~~~ruby
40
+ SchotterSplit::EntityRules.define(:dam_build_rules, model: DamBuild) do
41
+ todos { DamBuild.where("wood_logs < ?", 10) }
42
+
43
+ transform :recalculate_missing_wood,
44
+ target: :wood_missing,
45
+ source_timestamp: :updated_at,
46
+ calculated_at: :wood_checked_at do |dam|
47
+ [10 - dam.wood_logs.to_i, 0].max
48
+ end
49
+
50
+ transform :status_label,
51
+ target: :build_status,
52
+ source_timestamp: :updated_at,
53
+ calculated_at: :status_checked_at do |dam|
54
+ dam.wood_logs.to_i >= 10 ? "ready" : "needs_more"
55
+ end
56
+ end
57
+ ~~~
58
+
59
+ ## Rake task example
60
+
61
+ ~~~ruby
62
+ # lib/tasks/recalculate.rake
63
+ namespace :recalculate do
64
+ desc "Run domain sync for videos"
65
+ task videos: :environment do
66
+ result = SchotterSplit::EntityRules.run!(:video_domain_sync)
67
+ puts "Processed #{result[:transforms].map { |t| t[:updated] }.sum} records"
68
+ end
69
+ end
70
+ ~~~
71
+
72
+ ## API
73
+
74
+ ### `SchotterSplit::EntityRules.define(name, model:, &block)`
75
+
76
+ Registers a named rulebook. The block is evaluated on a `Rulebook` instance.
77
+
78
+ ### Rulebook DSL
79
+
80
+ - **`todos { SomeModel.some_scope }`** -- defines the base scope (must return an `ActiveRecord::Relation`)
81
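+ - **`transform(name, target:, source_timestamp: :updated_at, calculated_at: nil, stale_scope: nil, batch_size: 500, &block)`** -- defines a column transform. The block receives a record and returns the new value. Returning `nil` skips the record. Pass a callable as `stale_scope:` to replace the default staleness check; it receives the todo scope and must return an `ActiveRecord::Relation`.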
82
+
83
+ ### `SchotterSplit::EntityRules.run!(name, limit: nil)`
84
+
85
+ Executes all transforms in the named rulebook. Returns a hash with `:rulebook`, `:scope_count`, and `:transforms` (array of per-transform results including `:attempted`, `:updated`, `:errors`, `:error_messages`, `:seconds`).
86
+
87
+ ### `SchotterSplit::EntityRules.reset!`
88
+
89
+ Clears all registered rulebooks. Useful in tests.
90
+
91
+ ## Notes
92
+
93
+ - `transform` uses `update_columns`, which bypasses validations and callbacks for performance.
94
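+ - Staleness is determined by comparing `source_timestamp > calculated_at`, with a `NULL` `calculated_at` treated as the Unix epoch. Records where the target column is `nil` are always included. If either timestamp column is not configured or is missing from the table, only records with a `nil` target are selected.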
95
+ - Define rulebooks at boot time (e.g. in a Rails initializer). The registry is not thread-safe for writes.
96
+
97
+ ## Executable example
98
+
99
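+ Run this block from the repository root (it loads `lib/schotter_split` by relative path and needs the `activerecord` and `sqlite3` gems installed); it uses an in-memory SQLite database and raises if behavior is wrong.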
100
+
101
+ ~~~bash
102
+ ruby <<'RUBY'
103
+ require "active_record"
104
+ require "sqlite3"
105
+ require_relative "lib/schotter_split"
106
+
107
+ ActiveRecord::Base.establish_connection(adapter: "sqlite3", database: ":memory:")
108
+
109
+ ActiveRecord::Schema.define do
110
+ create_table :widgets, force: true do |t|
111
+ t.string :source_uri
112
+ t.string :source_domain
113
+ t.datetime :source_synced_at
114
+ t.timestamps null: false
115
+ end
116
+ end
117
+
118
+ class Widget < ActiveRecord::Base
119
+ scope :with_source, -> { where.not(source_uri: [nil, ""]) }
120
+ end
121
+
122
+ w = Widget.create!(source_uri: "https://www.example.com/path")
123
+
124
+ SchotterSplit::EntityRules.define(:widget_sync, model: Widget) do
125
+ todos { Widget.with_source }
126
+
127
+ transform :sync_domain,
128
+ target: :source_domain,
129
+ source_timestamp: :updated_at,
130
+ calculated_at: :source_synced_at do |row|
131
+ URI.parse(row.source_uri).host.sub(/\Awww\./, "")
132
+ end
133
+ end
134
+
135
+ r1 = SchotterSplit::EntityRules.run!(:widget_sync)
136
+ w.reload
137
+ raise "expected source_domain set" unless w.source_domain == "example.com"
138
+ raise "expected source_synced_at set" unless w.source_synced_at
139
+ raise "expected first update" unless r1[:transforms].first[:updated] == 1
140
+
141
+ sleep 1
142
+ w.update!(source_uri: "https://www.rubylang.org/en/")
143
+ r2 = SchotterSplit::EntityRules.run!(:widget_sync)
144
+ w.reload
145
+ raise "expected recalculation after source update" unless w.source_domain == "rubylang.org"
146
+ raise "expected stale update count = 1" unless r2[:transforms].first[:updated] == 1
147
+
148
+ puts "PASS: executable README example"
149
+ RUBY
150
+ ~~~
151
+
152
+ ## License
153
+
154
+ This project is governed by multiple terms and policies:
155
+
156
+ - [Free Art License 1.1](https://artlibre.org/licence/lal/en/)
157
+ - [Protein Database Terms of Service](https://www.rcsb.org/pages/policies)
158
+ - [Anna's Archive Privacy Policy](https://annas-archive.org/privacy)
159
+
160
+ SPDX: CC0-1.0. See `LICENSE` for details.
161
+
162
+ ## Author
163
+
164
+ Karl Amort -- [Bluesky](https://bsky.app/profile/amort.berlin) / [X](https://x.com/amortberlin)
@@ -0,0 +1,176 @@
1
module SchotterSplit
  # Registry and DSL for named "rulebooks".
  #
  # A rulebook pairs a base scope of records (the "todos") with one or more
  # transforms. Each transform recomputes a single target column for the
  # records it considers due and writes the result with +update_columns+.
  class EntityRules
    # Raised for DSL misuse: a transform without a block, or a scope that
    # does not resolve to an ActiveRecord::Relation.
    class Error < StandardError; end

    class << self
      # @return [Hash{Symbol => Rulebook}] all registered rulebooks, keyed by name
      def registry
        @registry ||= {}
      end

      # Builds a rulebook, evaluates the block on it (so +todos+ and
      # +transform+ can be called bare) and registers it under +name+,
      # replacing any rulebook already registered under that name.
      #
      # @param name [#to_sym] registry key
      # @param model [Class] model handed to the rulebook and its transforms
      # @return [Rulebook] the newly registered rulebook
      def define(name, model:, &block)
        key = name.to_sym
        rulebook = Rulebook.new(name: key, model: model)
        rulebook.instance_eval(&block) if block
        registry[key] = rulebook
        rulebook
      end

      # @param name [#to_sym]
      # @return [Rulebook]
      # @raise [KeyError] when no rulebook is registered under +name+
      def fetch(name)
        registry.fetch(name.to_sym)
      end

      # Runs every transform of the named rulebook.
      #
      # @param name [#to_sym]
      # @param limit [Integer, nil] see {Rulebook#run!}
      # @return [Hash] see {Rulebook#run!}
      # @raise [KeyError] when no rulebook is registered under +name+
      def run!(name, limit: nil)
        fetch(name).run!(limit: limit)
      end

      # Drops every registered rulebook.
      #
      # @return [Hash] the new, empty registry
      def reset!
        @registry = {}
      end
    end

    # A named collection of transforms that share one base scope.
    class Rulebook
      attr_reader :name, :model, :transforms

      # @param name [Symbol]
      # @param model [Class] must respond to +all+ if +todos+ is never called
      def initialize(name:, model:)
        @name = name
        @model = model
        @todo_scope_proc = -> { model.all }
        @transforms = []
      end

      # Sets the base scope. Accepts a callable, a block, or a relation
      # passed directly; a positional argument wins over a block.
      #
      # @param scope [#call, ActiveRecord::Relation, nil]
      # @return [self]
      def todos(scope = nil, &block)
        @todo_scope_proc = scope || block
        self
      end

      # Appends a transform to this rulebook.
      #
      # @param name [#to_sym] label reported in the run results
      # @param target [#to_sym] column that receives the computed value
      # @param source_timestamp [#to_sym, nil] column compared against +calculated_at+
      # @param calculated_at [#to_sym, nil] column stamped on every successful write
      # @param stale_scope [#call, nil] optional override of the staleness query;
      #   receives the base scope and must return an ActiveRecord::Relation
      # @param batch_size [#to_i] records loaded per batch (values below 1 become 1)
      # @yieldparam record the record to recompute
      # @yieldreturn the new target value; +nil+ skips the record
      # @return [self]
      # @raise [Error] when no block is given
      def transform(name, target:, source_timestamp: :updated_at, calculated_at: nil, stale_scope: nil, batch_size: 500, &block)
        raise Error, "transform requires a block" unless block

        @transforms << Transform.new(
          name: name.to_sym,
          model: model,
          target: target.to_sym,
          source_timestamp: source_timestamp&.to_sym,
          calculated_at: calculated_at&.to_sym,
          stale_scope: stale_scope,
          batch_size: batch_size.to_i,
          compute: block
        )
        self
      end

      # Resolves the base scope.
      #
      # A relation handed straight to +todos+ is used as-is; anything else
      # that responds to +call+ is invoked. The relation check comes first so
      # a relation is never mistaken for a callable.
      #
      # @return [ActiveRecord::Relation]
      # @raise [Error] when the result is not an ActiveRecord::Relation
      def todo_scope
        scope = @todo_scope_proc
        scope = scope.call if !scope.is_a?(ActiveRecord::Relation) && scope.respond_to?(:call)
        return scope if scope.is_a?(ActiveRecord::Relation)

        raise Error, "todos must resolve to an ActiveRecord::Relation"
      end

      # Applies every transform, in definition order, to the records it
      # considers due within the base scope.
      #
      # @param limit [Integer, nil] per-transform cap on the due relation;
      #   +nil+ and non-positive values mean "no cap"
      # @return [Hash] +:rulebook+, +:scope_count+ and +:transforms+ (one
      #   result hash per transform, see {Transform#apply!})
      def run!(limit: nil)
        base_scope = todo_scope
        cap = limit ? limit.to_i : 0

        results = {
          rulebook: name,
          scope_count: base_scope.count,
          transforms: []
        }

        transforms.each do |transform|
          due = transform.due_relation(base_scope)
          due = due.limit(cap) if cap.positive?
          results[:transforms] << transform.apply!(due)
        end

        results
      end
    end

    # Recomputes one target column for the records that are due.
    class Transform
      # Stand-in for a NULL +calculated_at+ in the staleness comparison.
      EPOCH = Time.at(0).utc.freeze

      attr_reader :name, :model, :target, :source_timestamp, :calculated_at, :stale_scope, :batch_size, :compute

      def initialize(name:, model:, target:, source_timestamp:, calculated_at:, stale_scope:, batch_size:, compute:)
        @name = name
        @model = model
        @target = target
        @source_timestamp = source_timestamp
        @calculated_at = calculated_at
        @stale_scope = stale_scope
        @batch_size = [batch_size, 1].max # never hand a batch size below 1 to find_in_batches
        @compute = compute
      end

      # Narrows +base_scope+ to the records this transform should process.
      #
      # With a callable +stale_scope+ the decision is delegated to it.
      # Otherwise a record is due when its target is NULL or, if both
      # timestamp columns exist, when +source_timestamp+ is later than
      # +calculated_at+ (NULL counting as the epoch).
      #
      # @param base_scope [ActiveRecord::Relation]
      # @return [ActiveRecord::Relation]
      # @raise [Error] when +stale_scope+ returns something other than a relation
      def due_relation(base_scope)
        if stale_scope.respond_to?(:call)
          relation = stale_scope.call(base_scope)
          return relation if relation.is_a?(ActiveRecord::Relation)

          raise Error, "stale_scope for #{name} must return an ActiveRecord::Relation"
        end

        missing_target = base_scope.where(target => nil)
        return missing_target unless timestamp_columns_available?

        conn = model.connection
        table = model.quoted_table_name
        source_col = conn.quote_column_name(source_timestamp)
        stamp_col = conn.quote_column_name(calculated_at)
        # Identifiers and the epoch literal are quoted by the adapter; no
        # caller-supplied values are interpolated into the SQL.
        stale_sql = "#{table}.#{source_col} > COALESCE(#{table}.#{stamp_col}, #{conn.quote(EPOCH)})"

        missing_target.or(base_scope.where(stale_sql))
      end

      # Computes and writes the target value for every record in +relation+.
      #
      # A +nil+ result skips the record. Any StandardError raised while
      # computing or writing is counted and recorded, and processing
      # continues with the next record. Writes go through +update_columns+.
      #
      # @param relation [#find_in_batches]
      # @return [Hash] +:transform+, +:attempted+, +:updated+, +:errors+,
      #   +:error_messages+ (+{ record_id:, error: }+ hashes) and +:seconds+
      def apply!(relation)
        attempted = 0
        updated = 0
        failures = []
        # Monotonic clock: the reported duration is unaffected by wall-clock adjustments.
        started = Process.clock_gettime(Process::CLOCK_MONOTONIC)

        relation.find_in_batches(batch_size: batch_size) do |batch|
          batch.each do |record|
            attempted += 1
            value = compute.call(record)
            next if value.nil?

            attrs = { target => value }
            attrs[calculated_at] = Time.now if calculated_at_column_available?
            record.update_columns(attrs)
            updated += 1
          rescue StandardError => e
            failures << { record_id: record.id, error: e.message }
          end
        end

        {
          transform: name,
          attempted: attempted,
          updated: updated,
          errors: failures.size,
          error_messages: failures,
          seconds: (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started).round(3)
        }
      end

      private

      # True when both timestamp columns are configured and present on the model.
      def timestamp_columns_available?
        return false unless source_timestamp && calculated_at

        model.column_names.include?(source_timestamp.to_s) && calculated_at_column_available?
      end

      # Truthy when +calculated_at+ is configured and present on the model.
      def calculated_at_column_available?
        calculated_at && model.column_names.include?(calculated_at.to_s)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module SchotterSplit
  # Version of the schotter_train_split gem. Frozen so that code holding a
  # reference to the constant cannot mutate the shared string in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,6 @@
1
+ require "active_record"
2
+ require_relative "schotter_split/version"
3
+ require_relative "schotter_split/entity_rules"
4
+
5
+ module SchotterSplit
6
+ end
metadata ADDED
@@ -0,0 +1,154 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: schotter_train_split
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Karl Amort
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: activerecord
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '7.0'
19
+ - - "<"
20
+ - !ruby/object:Gem::Version
21
+ version: '9.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: '7.0'
29
+ - - "<"
30
+ - !ruby/object:Gem::Version
31
+ version: '9.0'
32
+ - !ruby/object:Gem::Dependency
33
+ name: sqlite3
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: '1.6'
39
+ type: :development
40
+ prerelease: false
41
+ version_requirements: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: '1.6'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rake
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: '13.0'
53
+ type: :development
54
+ prerelease: false
55
+ version_requirements: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ version: '13.0'
60
+ - !ruby/object:Gem::Dependency
61
+ name: minitest
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: '5.0'
67
+ type: :development
68
+ prerelease: false
69
+ version_requirements: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: '5.0'
74
+ - !ruby/object:Gem::Dependency
75
+ name: rubocop
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '1.0'
81
+ type: :development
82
+ prerelease: false
83
+ version_requirements: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '1.0'
88
+ - !ruby/object:Gem::Dependency
89
+ name: simplecov
90
+ requirement: !ruby/object:Gem::Requirement
91
+ requirements:
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: '0.22'
95
+ type: :development
96
+ prerelease: false
97
+ version_requirements: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - ">="
100
+ - !ruby/object:Gem::Version
101
+ version: '0.22'
102
+ - !ruby/object:Gem::Dependency
103
+ name: mdl
104
+ requirement: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: '0.13'
109
+ type: :development
110
+ prerelease: false
111
+ version_requirements: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0.13'
116
+ description: Define todo scopes and recalculation transforms for ActiveRecord entities,
117
+ inspired by Rake rules semantics.
118
+ email:
119
+ - karl@amort.berlin
120
+ executables: []
121
+ extensions: []
122
+ extra_rdoc_files: []
123
+ files:
124
+ - LICENSE
125
+ - README.md
126
+ - lib/schotter_split.rb
127
+ - lib/schotter_split/entity_rules.rb
128
+ - lib/schotter_split/version.rb
129
+ homepage: https://github.com/matti/schotter-train-split
130
+ licenses:
131
+ - CC0-1.0
132
+ - Nonstandard
133
+ metadata:
134
+ homepage_uri: https://github.com/matti/schotter-train-split
135
+ source_code_uri: https://github.com/matti/schotter-train-split
136
+ rubygems_mfa_required: 'true'
137
+ rdoc_options: []
138
+ require_paths:
139
+ - lib
140
+ required_ruby_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '4.0'
145
+ required_rubygems_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '0'
150
+ requirements: []
151
+ rubygems_version: 4.0.8
152
+ specification_version: 4
153
+ summary: Rails entity-rule DSL for scope todos + timestamp-driven recalculation
154
+ test_files: []