time_bucket_stream 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
1
# frozen_string_literal: true

require "fileutils"

class TimeBucketStream
  # Claims completed time-bucket log files for downstream processing.
  #
  # Logs live under paths.logs until their bucket string sorts before the
  # claim-grace cutoff; a claim renames the file into paths.processing
  # while holding an exclusive, non-blocking flock on a per-log lock file.
  # The lock file stays open and locked for the lifetime of the returned
  # Claim object. Logs that remain incomplete (no trailing newline) past
  # stale_partial_after are routed to quarantine instead of being claimed.
  class Claimer
    # Seconds a bucket must age past "now" before its logs are claimable.
    DEFAULT_CLAIM_GRACE = 10
    # Seconds after which a log without a trailing newline is quarantined.
    DEFAULT_STALE_PARTIAL_AFTER = 600
    # Retention passed through to Quarantine#cleanup.
    DEFAULT_QUARANTINE_RETENTION = Quarantine::DEFAULT_RETENTION

    attr_reader :claim_grace, :paths, :quarantine_retention, :stale_partial_after

    # Coerces +value+ into a non-negative Integer number of seconds.
    # Raises ArgumentError with a uniform message for anything else.
    def self.normalize_claim_grace(value)
      seconds = Integer(value)
      return seconds if seconds >= 0

      # Negative values funnel into the rescue below for a single message.
      raise ArgumentError
    rescue ArgumentError, TypeError, RangeError
      raise ArgumentError, "claim_grace must be a non-negative number of seconds"
    end

    # Same coercion as normalize_claim_grace, for stale_partial_after.
    def self.normalize_stale_partial_after(value)
      seconds = Integer(value)
      return seconds if seconds >= 0

      raise ArgumentError
    rescue ArgumentError, TypeError, RangeError
      raise ArgumentError, "stale_partial_after must be a non-negative number of seconds"
    end

    # Delegates to Quarantine so both classes accept identical values.
    def self.normalize_quarantine_retention(value)
      Quarantine.normalize_retention(value)
    end

    # @param path base directory for the stream layout (see Paths)
    # @param clock Time-like object; may respond to #now or #call
    # @param claim_grace seconds a bucket must age before claiming
    # @param stale_partial_after seconds before partial logs are quarantined
    # @param quarantine_retention retention forwarded to Quarantine
    def initialize(path:, clock: Time, claim_grace: DEFAULT_CLAIM_GRACE, stale_partial_after: DEFAULT_STALE_PARTIAL_AFTER, quarantine_retention: DEFAULT_QUARANTINE_RETENTION)
      @paths = Paths.new(path: path)
      @clock = clock
      @claim_grace = self.class.normalize_claim_grace(claim_grace)
      @stale_partial_after = self.class.normalize_stale_partial_after(stale_partial_after)
      @quarantine_retention = self.class.normalize_quarantine_retention(quarantine_retention)

      ensure_directories
    end

    # Claims every completed log whose bucket sorts before +before+ (a
    # bucket string; defaults to now minus claim_grace). Also runs
    # quarantine cleanup and stale-partial quarantining as side effects.
    # Returns an Array of Claim objects; logs that cannot be claimed are
    # silently skipped (filter_map drops the nils).
    def claim_completed(before: claim_before_bucket)
      before = before.to_s

      ensure_directories
      cleanup_quarantine
      quarantine_stale_partials

      claimable_log_names(before).filter_map do |log_name|
        claim_log(log_name, before)
      end
    end

    # Deletes a previously claimed log, re-acquiring its lock first.
    # No-op (returns nil) when the log cannot be re-claimed.
    def delete_claimed(log_name)
      claim_processing(log_name)&.delete
    end

    # Releases a previously claimed log so it can be claimed again.
    def release_claimed(log_name)
      claim_processing(log_name)&.release
    end

    private

    # Union of logs already in processing/ and completed logs in logs/,
    # deduplicated and sorted for a deterministic claiming order.
    def claimable_log_names(before)
      (processing_log_names + completed_log_names(before)).uniq.sort
    end

    # Valid-looking .jsonl names under processing/; [] when unreadable.
    def processing_log_names
      Dir.children(paths.processing).grep(/\.jsonl\z/).select { |name| valid_log_name?(name) }
    rescue SystemCallError
      []
    end

    # .jsonl names under logs/ whose bucket sorts before the cutoff.
    def completed_log_names(before)
      Dir.children(paths.logs)
        .grep(/\.jsonl\z/)
        .select { |name| processable_log?(name, before) }
    rescue SystemCallError
      []
    end

    # Cutoff-aged logs that lack a trailing newline (write was cut short
    # or is still in flight).
    def partial_log_names(before)
      Dir.children(paths.logs)
        .grep(/\.jsonl\z/)
        .select { |name| partial_log?(name, before) }
    rescue SystemCallError
      []
    end

    # Claims one log: prefers an existing processing/ copy (crash
    # recovery) over renaming a fresh one out of logs/.
    def claim_log(log_name, before)
      return unless processable_log?(log_name, before)

      claim_processing(log_name) || claim_pending(log_name, before)
    end

    # Re-claims a log already sitting in processing/ (e.g. left behind by
    # a crashed consumer). Returns nil on any filesystem error.
    def claim_processing(log_name)
      return unless valid_log_name?(log_name)

      claim_path = paths.processing_log_for(log_name)

      with_claim_lock(log_name) do |lock_file|
        # Re-check under the lock: another process may have deleted or
        # truncated the file between listing and locking.
        next unless File.exist?(claim_path)
        next unless complete_file?(claim_path)

        build_claim(log_name, claim_path, lock_file)
      end
    rescue SystemCallError, IOError
      nil
    end

    # Moves logs that stayed partial past stale_partial_after into
    # quarantine so they stop lingering in logs/.
    def quarantine_stale_partials
      before = stale_partial_before_bucket

      partial_log_names(before).each do |log_name|
        quarantine_partial(log_name, before)
      end
    end

    # Quarantines a single stale partial log. Best-effort: any
    # filesystem error aborts quietly.
    def quarantine_partial(log_name, before)
      return unless partial_log?(log_name, before)

      source_path = File.join(paths.logs, log_name)

      with_claim_lock(log_name) do |lock_file|
        # Re-validate under the lock: the writer may have completed the
        # file (or another claimer moved it) in the meantime.
        next unless File.exist?(source_path)
        next unless partial_log?(log_name, before)

        Claim.new(name: log_name, path: source_path, lock_file: lock_file).quarantine(
          paths: paths,
          reason: "partial_trailing_line",
          metadata: {
            "stale_partial_after" => stale_partial_after,
            "before_bucket" => before
          },
          clock: @clock
        )
      end
    rescue SystemCallError, IOError
      nil
    end

    # Claims a completed log still in logs/ by renaming it into
    # processing/ under the per-log lock. Returns nil on any error.
    def claim_pending(log_name, before)
      return unless processable_log?(log_name, before)

      source_path = File.join(paths.logs, log_name)
      claim_path = paths.processing_log_for(log_name)

      with_claim_lock(log_name) do |lock_file|
        # All four checks repeat under the lock to close races with
        # concurrent claimers and the writer.
        next if File.exist?(claim_path)
        next unless File.exist?(source_path)
        next unless processable_log?(log_name, before)
        next unless complete_file?(source_path)

        File.rename(source_path, claim_path)
        build_claim(log_name, claim_path, lock_file)
      end
    rescue SystemCallError, IOError
      nil
    end

    def build_claim(log_name, path, lock_file)
      Claim.new(
        name: log_name,
        path: path,
        lock_file: lock_file
      )
    end

    # Runs the block while holding an exclusive non-blocking flock on the
    # log's lock file. When the block returns a truthy claim the lock
    # file is handed over (kept open and locked); otherwise it is
    # unlocked and closed. Returns nil when another process holds the
    # lock already.
    def with_claim_lock(log_name)
      FileUtils.mkdir_p(paths.claim_locks)

      file = File.open(paths.claim_lock_for(log_name), File::RDWR | File::CREAT, 0o600)
      # flock returns false under LOCK_NB when the lock is contended;
      # close_file returns nil, so this bails out with nil.
      return close_file(file) unless file.flock(File::LOCK_EX | File::LOCK_NB)

      claim = yield file
      return claim if claim

      unlock_file(file)
      close_file(file)
      nil
    rescue
      unlock_file(file)
      close_file(file)
      raise
    end

    # A log is "complete" when empty or ending in a newline: writers
    # append whole lines, so a missing trailing newline marks an
    # in-flight or truncated write.
    def complete_file?(path)
      size = File.size(path)
      return true if size.zero?

      File.open(path, File::RDONLY) do |file|
        file.seek(size - 1)
        file.read(1) == "\n"
      end
    rescue SystemCallError, IOError
      false
    end

    def ensure_directories
      FileUtils.mkdir_p(paths.logs)
      FileUtils.mkdir_p(paths.processing)
      FileUtils.mkdir_p(paths.claim_locks)
    end

    # Name parses to a bucket string that sorts before the cutoff.
    def processable_log?(log_name, before)
      bucket = log_bucket(log_name)
      bucket && bucket < before
    end

    # Processable but missing its trailing newline.
    def partial_log?(log_name, before)
      return false unless processable_log?(log_name, before)

      !complete_file?(File.join(paths.logs, log_name))
    end

    def valid_log_name?(log_name)
      LogName.valid?(log_name)
    end

    def log_bucket(log_name)
      LogName.bucket(log_name)
    end

    # Bucket string for "now minus claim_grace" in UTC.
    def claim_before_bucket
      (current_time.utc - claim_grace).strftime(Writer::BUCKET_FORMAT)
    end

    # Bucket string for "now minus stale_partial_after" in UTC.
    def stale_partial_before_bucket
      (current_time.utc - stale_partial_after).strftime(Writer::BUCKET_FORMAT)
    end

    # Supports both callable clocks (lambdas) and Time-like objects.
    def current_time
      @clock.respond_to?(:call) ? @clock.call : @clock.now
    end

    def cleanup_quarantine
      quarantine.cleanup
    end

    def quarantine
      @quarantine ||= Quarantine.new(
        path: paths.path,
        clock: @clock,
        retention: quarantine_retention
      )
    end

    # Best-effort unlock; errors are ignored (the lock dies with the fd).
    def unlock_file(file)
      file&.flock(File::LOCK_UN)
    rescue SystemCallError, IOError
      nil
    end

    # Best-effort close; errors are ignored.
    def close_file(file)
      file&.close
    rescue SystemCallError, IOError
      nil
    end
  end
end
@@ -0,0 +1,17 @@
1
# frozen_string_literal: true

require "json"

class TimeBucketStream
  module Codecs
    # Codec backed by Ruby's stdlib JSON library.
    class Json
      # Serializes +payload+ into a JSON string.
      def dump(payload)
        ::JSON.generate(payload)
      end

      # Parses a JSON string back into Ruby data.
      def load(payload)
        ::JSON.parse(payload)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
# frozen_string_literal: true

class TimeBucketStream
  module Codecs
    # Codec backed by the optional `oj` gem.
    class Oj
      # Requires oj lazily so the gem is only needed when this codec is
      # actually instantiated; raises a descriptive LoadError otherwise.
      def initialize
        begin
          require "oj"
        rescue LoadError
          raise LoadError, 'TimeBucketStream::Codecs::Oj requires `gem "oj"`'
        end
      end

      # Serializes +document+ in :compat mode (JSON-compatible output).
      def dump(document)
        ::Oj.dump(document, mode: :compat)
      end

      # Parses JSON text in :compat mode.
      def load(document)
        ::Oj.load(document, mode: :compat)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
# frozen_string_literal: true

class TimeBucketStream
  module Codecs
    module_function

    # Returns +codec+ unchanged when it quacks like a codec (responds to
    # both #dump and #load); raises ArgumentError otherwise.
    def validate!(codec)
      unless codec.respond_to?(:dump) && codec.respond_to?(:load)
        raise ArgumentError, "codec must respond to dump and load"
      end

      codec
    end
  end
end
@@ -0,0 +1,23 @@
1
# frozen_string_literal: true

class TimeBucketStream
  # Validates and parses bucketed log file names of the shape
  # <12-digit bucket>-<writer>-<seq>-<8 hex chars>.jsonl.
  module LogName
    PATTERN = /\A(?<bucket>\d{12})-[A-Za-z0-9_.-]+-\d+-[0-9a-f]{8}\.jsonl\z/

    module_function

    # True when +name+ is a bare file name (no directory components)
    # matching PATTERN. Broken encodings count as invalid.
    def valid?(name)
      candidate = name.to_s
      return false unless candidate == File.basename(candidate)

      PATTERN.match?(candidate)
    rescue ArgumentError, EncodingError
      false
    end

    # Extracts the 12-digit bucket prefix, or nil when +name+ does not
    # match PATTERN or has a broken encoding.
    def bucket(name)
      match = PATTERN.match(name.to_s)
      return unless match

      match[:bucket]
    rescue ArgumentError, EncodingError
      nil
    end
  end
end
@@ -0,0 +1,60 @@
1
# frozen_string_literal: true

class TimeBucketStream
  # Computes every directory and file path the stream uses, rooted at a
  # single expanded base directory. Purely computational: no filesystem
  # access happens here. Directory paths are memoized after first use.
  class Paths
    # Absolute, expanded base directory.
    attr_reader :path

    def initialize(path:)
      @path = normalize_path(path)
    end

    # Root directory (same as #path).
    def base
      path
    end

    # Directory holding active writer logs.
    def logs
      @logs ||= subdirectory("logs")
    end

    # Directory holding claimed logs being processed.
    def processing
      @processing ||= subdirectory("processing")
    end

    # Directory holding quarantined logs and their metadata sidecars.
    def quarantine
      @quarantine ||= subdirectory("quarantine")
    end

    # Directory holding per-log flock files.
    def claim_locks
      @claim_locks ||= subdirectory("claim_locks")
    end

    # Active log file for the given writer id.
    def log_for(writer_id)
      File.join(logs, "#{writer_id}.jsonl")
    end

    # Location a log occupies once claimed for processing.
    def processing_log_for(log_name)
      File.join(processing, log_name)
    end

    # Location a log occupies once quarantined.
    def quarantine_log_for(log_name)
      File.join(quarantine, log_name)
    end

    # JSON metadata sidecar for a quarantined log.
    def quarantine_metadata_for(log_name)
      File.join(quarantine, "#{log_name}.meta.json")
    end

    # Lock file guarding claims on the given log.
    def claim_lock_for(log_name)
      File.join(claim_locks, "#{log_name}.lock")
    end

    private

    # Joins a child directory name onto the base path.
    def subdirectory(name)
      File.join(base, name)
    end

    # Expands the base path, rejecting nil/blank input up front.
    def normalize_path(path)
      value = path.to_s
      raise ArgumentError, "path must not be blank" if value.strip.empty?

      File.expand_path(value)
    end
  end
end
@@ -0,0 +1,141 @@
1
# frozen_string_literal: true

require "fileutils"
require "json"
require "time"

class TimeBucketStream
  # Expires quarantined log files (and their .meta.json sidecars) once
  # they have aged past the configured retention window. A nil retention
  # means "keep forever" and turns #cleanup into a no-op.
  class Quarantine
    # Default retention: 7 days, in seconds.
    DEFAULT_RETENTION = 7 * 24 * 60 * 60
    # Matches quarantine-renamed logs; capture 1 is a 20-digit UTC
    # timestamp (YYYYMMDDHHMMSS + 6 fractional digits).
    TIMESTAMP_PATTERN = /\.q-(\d{20})-\d+-[0-9a-f]+\.jsonl\z/

    attr_reader :paths, :retention

    # Coerces +value+ to nil (retain forever) or a non-negative Integer
    # of seconds; anything else raises ArgumentError.
    def self.normalize_retention(value)
      return nil if value.nil?

      seconds = Integer(value)
      return seconds if seconds >= 0

      # Negative values funnel into the rescue below for a single message.
      raise ArgumentError
    rescue ArgumentError, TypeError, RangeError
      raise ArgumentError, "quarantine_retention must be nil or a non-negative number of seconds"
    end

    # @param path base directory for the stream layout (see Paths)
    # @param clock Time-like object; may respond to #now or #call
    # @param retention seconds to keep quarantined files, or nil
    def initialize(path:, clock: Time, retention: DEFAULT_RETENTION)
      @paths = Paths.new(path: path)
      @clock = clock
      @retention = self.class.normalize_retention(retention)
    end

    # Deletes quarantined logs (plus sidecars) older than the cutoff,
    # then sweeps expired orphan sidecars whose log is already gone.
    # Best-effort: filesystem/JSON errors abort the pass silently.
    def cleanup
      return unless retention

      cutoff = current_time.utc - retention

      quarantine_log_names.each do |log_name|
        delete_pair(log_name) if expired_log?(log_name, cutoff)
      end

      orphan_metadata_names.each do |metadata_name|
        delete_metadata(metadata_name) if expired_metadata?(metadata_name, cutoff)
      end
    rescue SystemCallError, IOError, JSON::ParserError
      nil
    end

    private

    # Quarantined log files, sorted for deterministic sweeping order.
    def quarantine_log_names
      children.select { |name| name.end_with?(".jsonl") }.sort
    end

    # Metadata sidecar files, sorted.
    def metadata_names
      children.select { |name| name.end_with?(".meta.json") }.sort
    end

    # Sidecars whose corresponding .jsonl no longer exists.
    def orphan_metadata_names
      metadata_names.reject do |metadata_name|
        File.exist?(quarantine_log_path(metadata_name.delete_suffix(".meta.json")))
      end
    end

    # Directory listing; [] when the quarantine dir is missing/unreadable.
    def children
      Dir.children(paths.quarantine)
    rescue SystemCallError
      []
    end

    def expired_log?(log_name, cutoff)
      quarantine_time_for(log_name) < cutoff
    end

    def expired_metadata?(metadata_name, cutoff)
      log_name = metadata_name.delete_suffix(".meta.json")
      quarantine_time_for(log_name, metadata_name: metadata_name) < cutoff
    end

    # Best guess of when the file was quarantined, in priority order:
    # sidecar "quarantined_at" field, timestamp embedded in the name,
    # log mtime, sidecar mtime, and finally "now" — which can never be
    # before the cutoff, so unknown-age files are kept.
    def quarantine_time_for(log_name, metadata_name: "#{log_name}.meta.json")
      metadata_time(metadata_path(metadata_name)) ||
        timestamp_from_name(log_name) ||
        file_time(quarantine_log_path(log_name)) ||
        file_time(metadata_path(metadata_name)) ||
        current_time.utc
    end

    # Reads "quarantined_at" from the sidecar; nil when absent, missing,
    # or unparseable.
    def metadata_time(path)
      return unless File.exist?(path)

      value = JSON.parse(File.read(path)).fetch("quarantined_at", nil)
      Time.parse(value).utc if value
    rescue SystemCallError, IOError, JSON::ParserError, TypeError, ArgumentError
      nil
    end

    # Parses the 20-digit timestamp baked into a quarantine file name.
    def timestamp_from_name(log_name)
      match = log_name.match(TIMESTAMP_PATTERN)
      parse_timestamp(match[1]) if match
    rescue ArgumentError
      nil
    end

    # Splits a 20-digit YYYYMMDDHHMMSSuuuuuu string into Time.utc parts
    # (the last 6 digits are passed as microseconds).
    def parse_timestamp(value)
      Time.utc(
        value[0, 4].to_i,
        value[4, 2].to_i,
        value[6, 2].to_i,
        value[8, 2].to_i,
        value[10, 2].to_i,
        value[12, 2].to_i,
        value[14, 6].to_i
      )
    end

    # File mtime in UTC; nil when the file is missing/unreadable.
    def file_time(path)
      File.mtime(path).utc
    rescue SystemCallError, IOError
      nil
    end

    # Removes a quarantined log together with its sidecar.
    def delete_pair(log_name)
      FileUtils.rm_f(quarantine_log_path(log_name))
      FileUtils.rm_f(metadata_path("#{log_name}.meta.json"))
    end

    def delete_metadata(metadata_name)
      FileUtils.rm_f(metadata_path(metadata_name))
    end

    def quarantine_log_path(log_name)
      paths.quarantine_log_for(log_name)
    end

    def metadata_path(metadata_name)
      File.join(paths.quarantine, metadata_name)
    end

    # Supports both callable clocks (lambdas) and Time-like objects.
    def current_time
      @clock.respond_to?(:call) ? @clock.call : @clock.now
    end
  end
end