time_bucket_stream 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lib/time_bucket_stream/stream.rb ADDED
@@ -0,0 +1,261 @@
+ # frozen_string_literal: true
+
+ class TimeBucketStream
+   DEFAULT_CLAIM_GRACE = Claimer::DEFAULT_CLAIM_GRACE
+   DEFAULT_STALE_PARTIAL_AFTER = Claimer::DEFAULT_STALE_PARTIAL_AFTER
+   DEFAULT_QUARANTINE_RETENTION = Claimer::DEFAULT_QUARANTINE_RETENTION
+   DEFAULT_MALFORMED_ENTRY = :quarantine
+   MALFORMED_ENTRY_MODES = %i[quarantine skip].freeze
+
+   attr_reader :claim_grace, :codec, :malformed_entry, :paths, :quarantine_retention, :stale_partial_after, :sync
+
+   def initialize(path:, sync: :flush, clock: Time, claim_grace: DEFAULT_CLAIM_GRACE, stale_partial_after: DEFAULT_STALE_PARTIAL_AFTER, quarantine_retention: DEFAULT_QUARANTINE_RETENTION, malformed_entry: DEFAULT_MALFORMED_ENTRY, codec: Codecs::Json.new)
+     @paths = Paths.new(path: path)
+     @sync = sync.respond_to?(:to_sym) ? sync.to_sym : sync
+     @clock = clock
+     @codec = Codecs.validate!(codec)
+     @claim_grace = Claimer.normalize_claim_grace(claim_grace)
+     @stale_partial_after = Claimer.normalize_stale_partial_after(stale_partial_after)
+     @quarantine_retention = Claimer.normalize_quarantine_retention(quarantine_retention)
+     @malformed_entry = normalize_malformed_entry(malformed_entry)
+     @claims_by_log_name = {}
+
+     writer
+     claimer
+   end
+
+   def append(payload)
+     writer.append(payload)
+   end
+
+   def read
+     writer.close_stale
+
+     Batch.new(
+       entries: active_claim_entries + new_claim_entries,
+       on_delete: method(:delete_entries),
+       on_release: method(:release_entries)
+     )
+   end
+
+   def drain
+     raise ArgumentError, "drain requires a block" unless block_given?
+
+     batch = read
+     completed = false
+
+     batch.each { |payload| yield payload }
+     batch.delete
+     completed = true
+
+     batch
+   ensure
+     batch.release if batch && !completed && !batch.finished?
+   end
+
+   def close
+     release_claims
+     writer.close
+   end
+
+   private
+
+   def delete_entries(ids)
+     log_names_from_entry_ids(ids).each { |log_name| delete_claim(log_name) }
+   end
+
+   def release_entries(ids)
+     log_names_from_entry_ids(ids).each { |log_name| release_claim(log_name) }
+   end
+
+   def active_claim_entries
+     @claims_by_log_name.values.flat_map do |claim|
+       entries = read_claim(claim)
+       next entries unless entries.empty?
+
+       @claims_by_log_name.delete(claim.name)
+       claim.release
+       []
+     end
+   end
+
+   def new_claim_entries
+     claimer.claim_completed.flat_map do |claim|
+       if @claims_by_log_name.key?(claim.name)
+         claim.release
+         next []
+       end
+
+       entries_from_claim(claim)
+     end
+   end
+
+   def entries_from_claim(claim)
+     entries = read_claim(claim)
+
+     if entries.empty?
+       claim.release
+     else
+       @claims_by_log_name[claim.name] = claim
+     end
+
+     entries
+   end
+
+   def read_claim(claim)
+     lines = claim.read_lines
+     return quarantine_claim(claim, reason: "empty_file", metadata: {"line_count" => 0}) if lines.empty? && empty_claim?(claim)
+     return [] if lines.empty?
+
+     entries = []
+     malformed_lines = []
+
+     lines.each_with_index do |line, index|
+       entry = parse_entry(line)
+       if entry
+         entries << [
+           encode_entry_id(claim.name, entry.fetch("id")),
+           entry.fetch("payload")
+         ]
+       else
+         malformed_lines << malformed_line_summary(line, index)
+       end
+     end
+
+     return entries if malformed_lines.empty?
+
+     if malformed_entry == :skip
+       return entries if entries.any?
+
+       return discard_fully_malformed_claim(claim)
+     end
+
+     quarantine_claim(
+       claim,
+       reason: "malformed_jsonl",
+       metadata: {
+         "line_count" => lines.length,
+         "malformed_lines" => malformed_lines
+       }
+     )
+   rescue SystemCallError, IOError
+     []
+   end
+
+   def parse_entry(line)
+     entry = codec.load(line)
+     return unless valid_entry?(entry)
+
+     entry
+   rescue
+     nil
+   end
+
+   def valid_entry?(entry)
+     entry.is_a?(Hash) &&
+       entry.fetch("id", nil).is_a?(Integer) &&
+       entry.fetch("id").positive? &&
+       entry.key?("payload")
+   end
+
+   def malformed_line_summary(line, index)
+     {
+       "line" => index + 1,
+       "bytes" => line.bytesize,
+       "sample" => line.byteslice(0, 200)
+     }
+   end
+
+   def discard_fully_malformed_claim(claim)
+     @claims_by_log_name.delete(claim.name)
+     claim.delete
+     []
+   rescue SystemCallError, IOError
+     claim.release
+     []
+   end
+
+   def normalize_malformed_entry(value)
+     mode = value.respond_to?(:to_sym) ? value.to_sym : value
+     return mode if MALFORMED_ENTRY_MODES.include?(mode)
+
+     raise ArgumentError, "malformed_entry must be :quarantine or :skip"
+   end
+
+   def encode_entry_id(log_name, entry_id)
+     "#{log_name}:#{entry_id}"
+   end
+
+   def log_names_from_entry_ids(ids)
+     Array(ids).filter_map { |id| decode_entry_id(id) }.uniq
+   end
+
+   def decode_entry_id(id)
+     value = id.to_s
+     separator = value.index(":")
+     return unless separator
+
+     log_name = value[0...separator]
+     log_name if valid_log_name?(log_name)
+   end
+
+   def valid_log_name?(log_name)
+     LogName.valid?(log_name)
+   end
+
+   def empty_claim?(claim)
+     File.zero?(claim.path)
+   rescue SystemCallError, IOError
+     false
+   end
+
+   def delete_claim(log_name)
+     claim = @claims_by_log_name.delete(log_name)
+     claim ? claim.delete : claimer.delete_claimed(log_name)
+   rescue SystemCallError, IOError
+     release_claim(log_name)
+   end
+
+   def release_claim(log_name)
+     claim = @claims_by_log_name.delete(log_name)
+     claim ? claim.release : claimer.release_claimed(log_name)
+   rescue SystemCallError, IOError
+     nil
+   end
+
+   def quarantine_claim(claim, reason:, metadata:)
+     @claims_by_log_name.delete(claim.name)
+     claim.quarantine(paths: paths, reason: reason, metadata: metadata, clock: @clock)
+     []
+   rescue SystemCallError, IOError
+     claim.release
+     []
+   end
+
+   def release_claims
+     claims = @claims_by_log_name.values
+     @claims_by_log_name = {}
+     claims.each(&:release)
+   rescue SystemCallError, IOError
+     nil
+   end
+
+   def writer
+     @writer ||= Writer.new(
+       path: paths.path,
+       sync: sync,
+       clock: @clock,
+       codec: codec
+     )
+   end
+
+   def claimer
+     @claimer ||= Claimer.new(
+       path: paths.path,
+       clock: @clock,
+       claim_grace: claim_grace,
+       stale_partial_after: stale_partial_after,
+       quarantine_retention: quarantine_retention
+     )
+   end
+ end
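
A minimal usage sketch of the public API added above. The "tmp/events" directory and the payload hash are illustrative assumptions, not values shipped with the gem:

require "time_bucket_stream"

# "tmp/events" is an assumed scratch directory for this example.
stream = TimeBucketStream.new(path: "tmp/events")

# append returns the entry id within the current bucket file.
stream.append({"event" => "signup", "user_id" => 42})

# drain yields each payload from completed buckets, deletes the batch on
# success, and (via the ensure clause above) releases the claims if the
# block raises, so another reader can retry them.
stream.drain { |payload| puts payload.inspect }

stream.close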
lib/time_bucket_stream/version.rb ADDED
@@ -0,0 +1,5 @@
+ # frozen_string_literal: true
+
+ class TimeBucketStream
+   VERSION = "0.1.0"
+ end
lib/time_bucket_stream/writer.rb ADDED
@@ -0,0 +1,173 @@
+ # frozen_string_literal: true
+
+ require "fileutils"
+ require "securerandom"
+ require "socket"
+
+ class TimeBucketStream
+   class Writer
+     BUCKET_FORMAT = "%Y%m%d%H%M"
+     SYNC_MODES = %i[none flush fsync].freeze
+
+     attr_reader :codec, :paths, :sync
+
+     def initialize(path:, sync: :flush, clock: Time, codec: Codecs::Json.new)
+       @paths = Paths.new(path: path)
+       @sync = sync.respond_to?(:to_sym) ? sync.to_sym : sync
+       @clock = clock
+       @codec = Codecs.validate!(codec)
+       @mutex = Mutex.new
+
+       validate_sync!
+       ensure_directories
+     end
+
+     def append(payload)
+       @mutex.synchronize do
+         prepare_writer
+         @sequence += 1
+
+         entry = {
+           "id" => @sequence,
+           "payload" => payload
+         }
+
+         write_line(encode_entry(entry))
+         entry.fetch("id")
+       end
+     end
+
+     def close
+       @mutex.synchronize do
+         close_writer
+       end
+     end
+
+     def close_stale
+       bucket = current_bucket
+
+       @mutex.synchronize do
+         close_writer if @writer_bucket && @writer_bucket < bucket
+       end
+     end
+
+     private
+
+     def validate_sync!
+       return if SYNC_MODES.include?(sync)
+
+       raise ArgumentError, "unsupported file sync mode: #{sync.inspect}"
+     end
+
+     def prepare_writer
+       bucket = current_bucket
+       return if writer_ready?(bucket)
+
+       close_writer
+       ensure_directories
+
+       @writer_pid = Process.pid
+       @writer_bucket = bucket
+       @writer_id = "#{bucket}-#{safe_hostname}-#{Process.pid}-#{SecureRandom.hex(4)}"
+       @sequence = 0
+       @log_path = paths.log_for(@writer_id)
+       @file = File.open(@log_path, File::WRONLY | File::CREAT | File::APPEND, 0o600)
+     end
+
+     def encode_entry(entry)
+       encoded = codec.dump(entry)
+       return encoded if encoded.is_a?(String) && !encoded.match?(/[\r\n]/)
+
+       raise ArgumentError, "codec must dump each entry to one line"
+     end
+
+     def writer_ready?(bucket)
+       active_writer? &&
+         @writer_bucket == bucket &&
+         log_file_exists?(@log_path)
+     end
+
+     def active_writer?
+       @writer_pid == Process.pid &&
+         @file &&
+         !@file.closed? &&
+         @log_path
+     end
+
+     def close_writer
+       close_file(@file)
+       @file = nil
+     end
+
+     def write_line(line)
+       data = "#{line}\n"
+
+       if sync == :none
+         syswrite_all(data)
+         return
+       end
+
+       write_all(data)
+       @file.flush
+       @file.fsync if sync == :fsync
+     end
+
+     def write_all(data)
+       bytes_written = 0
+
+       while bytes_written < data.bytesize
+         chunk = data.byteslice(bytes_written, data.bytesize - bytes_written)
+         written = @file.write(chunk)
+
+         raise IOError, "file write returned nil" unless written
+         raise IOError, "file write made no progress" unless written.positive?
+
+         bytes_written += written
+       end
+     end
+
+     def syswrite_all(data)
+       bytes_written = 0
+
+       while bytes_written < data.bytesize
+         chunk = data.byteslice(bytes_written, data.bytesize - bytes_written)
+         written = @file.syswrite(chunk)
+
+         raise IOError, "file syswrite made no progress" unless written.positive?
+
+         bytes_written += written
+       end
+     end
+
+     def ensure_directories
+       FileUtils.mkdir_p(paths.logs)
+     end
+
+     def current_bucket
+       current_time.utc.strftime(BUCKET_FORMAT)
+     end
+
+     def current_time
+       @clock.respond_to?(:call) ? @clock.call : @clock.now
+     end
+
+     def safe_hostname
+       value = Socket.gethostname.gsub(/[^A-Za-z0-9_.-]/, "_")
+       value.empty? ? "unknown-host" : value
+     rescue
+       "unknown-host"
+     end
+
+     def log_file_exists?(path)
+       File.exist?(path)
+     rescue SystemCallError
+       false
+     end
+
+     def close_file(file)
+       file&.close
+     rescue SystemCallError, IOError
+       nil
+     end
+   end
+ end
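
A short sketch of the minute-bucket naming that drives file rotation. Writer accepts any clock that responds to call or now (see current_time above); the fixed timestamp below is an arbitrary example value:

# An injected callable clock, frozen at an example instant.
clock = -> { Time.utc(2024, 5, 1, 9, 30, 15) }

# Same strftime pattern as Writer::BUCKET_FORMAT ("%Y%m%d%H%M").
bucket = clock.call.utc.strftime("%Y%m%d%H%M")
puts bucket # => "202405010930"

# Appends made while the clock is inside 09:30 share one log file; once the
# bucket string sorts higher (09:31 onward), close_stale closes the old file
# so a claimer can treat it as completed.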
lib/time_bucket_stream.rb ADDED
@@ -0,0 +1,14 @@
+ # frozen_string_literal: true
+
+ require_relative "time_bucket_stream/version"
+ require_relative "time_bucket_stream/codecs"
+ require_relative "time_bucket_stream/codecs/json"
+ require_relative "time_bucket_stream/codecs/oj"
+ require_relative "time_bucket_stream/paths"
+ require_relative "time_bucket_stream/log_name"
+ require_relative "time_bucket_stream/writer"
+ require_relative "time_bucket_stream/quarantine"
+ require_relative "time_bucket_stream/claim"
+ require_relative "time_bucket_stream/claimer"
+ require_relative "time_bucket_stream/batch"
+ require_relative "time_bucket_stream/stream"
sig/time_bucket_stream.rbs ADDED
@@ -0,0 +1,132 @@
+ class TimeBucketStream
+   VERSION: String
+   DEFAULT_CLAIM_GRACE: Integer
+   DEFAULT_MALFORMED_ENTRY: Symbol
+   DEFAULT_STALE_PARTIAL_AFTER: Integer
+   DEFAULT_QUARANTINE_RETENTION: Integer
+   MALFORMED_ENTRY_MODES: Array[Symbol]
+
+   attr_reader claim_grace: Integer
+   attr_reader codec: untyped
+   attr_reader malformed_entry: Symbol
+   attr_reader paths: Paths
+   attr_reader quarantine_retention: Integer?
+   attr_reader stale_partial_after: Integer
+   attr_reader sync: Symbol
+
+   def initialize: (path: String, ?sync: untyped, ?clock: untyped, ?claim_grace: untyped, ?stale_partial_after: untyped, ?quarantine_retention: untyped, ?malformed_entry: untyped, ?codec: untyped) -> void
+   def append: (untyped payload) -> Integer
+   def read: () -> Batch
+   def drain: () { (untyped payload) -> untyped } -> Batch
+   def close: () -> void
+
+   module Codecs
+     def self.validate!: (untyped codec) -> untyped
+
+     class Json
+       def dump: (untyped value) -> String
+       def load: (String value) -> untyped
+     end
+
+     class Oj
+       def initialize: () -> void
+       def dump: (untyped value) -> String
+       def load: (String value) -> untyped
+     end
+   end
+
+   class Batch
+     include Enumerable[untyped]
+
+     attr_reader entries: Array[[String, untyped]]
+
+     def initialize: (entries: Array[[String, untyped]], on_delete: untyped, on_release: untyped) -> void
+     def each: () { (untyped) -> void } -> Array[[String, untyped]]
+             | () -> Enumerator[untyped, void]
+     def size: () -> Integer
+     def length: () -> Integer
+     def empty?: () -> bool
+     def delete: () -> bool
+     def release: () -> bool
+     def finished?: () -> bool
+   end
+
+   class Writer
+     BUCKET_FORMAT: String
+     SYNC_MODES: Array[Symbol]
+
+     attr_reader codec: untyped
+     attr_reader paths: Paths
+     attr_reader sync: Symbol
+
+     def initialize: (path: String, ?sync: untyped, ?clock: untyped, ?codec: untyped) -> void
+     def append: (untyped payload) -> Integer
+     def close: () -> void
+     def close_stale: () -> void
+   end
+
+   class Quarantine
+     DEFAULT_RETENTION: Integer
+     TIMESTAMP_PATTERN: Regexp
+
+     attr_reader paths: Paths
+     attr_reader retention: Integer?
+
+     def self.normalize_retention: (untyped value) -> Integer?
+     def initialize: (path: String, ?clock: untyped, ?retention: untyped) -> void
+     def cleanup: () -> nil
+   end
+
+   module LogName
+     PATTERN: Regexp
+
+     def self.valid?: (untyped name) -> bool
+     def self.bucket: (untyped name) -> String?
+   end
+
+   class Claimer
+     DEFAULT_CLAIM_GRACE: Integer
+     DEFAULT_STALE_PARTIAL_AFTER: Integer
+     DEFAULT_QUARANTINE_RETENTION: Integer
+
+     attr_reader claim_grace: Integer
+     attr_reader paths: Paths
+     attr_reader quarantine_retention: Integer?
+     attr_reader stale_partial_after: Integer
+
+     def self.normalize_claim_grace: (untyped value) -> Integer
+     def self.normalize_stale_partial_after: (untyped value) -> Integer
+     def self.normalize_quarantine_retention: (untyped value) -> Integer?
+     def initialize: (path: String, ?clock: untyped, ?claim_grace: untyped, ?stale_partial_after: untyped, ?quarantine_retention: untyped) -> void
+     def claim_completed: (?before: String) -> Array[Claim]
+     def delete_claimed: (untyped log_name) -> untyped
+     def release_claimed: (untyped log_name) -> untyped
+   end
+
+   class Claim
+     attr_reader name: String
+     attr_reader path: String
+
+     def initialize: (name: String, path: String, lock_file: untyped) -> void
+     def read_lines: () -> Array[String]
+     def delete: () -> void
+     def quarantine: (paths: Paths, reason: untyped, ?metadata: untyped, ?clock: untyped) -> String?
+     def release: () -> void
+   end
+
+   class Paths
+     attr_reader path: String
+
+     def initialize: (path: String) -> void
+     def base: () -> String
+     def logs: () -> String
+     def processing: () -> String
+     def quarantine: () -> String
+     def claim_locks: () -> String
+     def log_for: (String writer_id) -> String
+     def processing_log_for: (String log_name) -> String
+     def quarantine_log_for: (String log_name) -> String
+     def quarantine_metadata_for: (String log_name) -> String
+     def claim_lock_for: (String log_name) -> String
+   end
+ end
metadata ADDED
@@ -0,0 +1,61 @@
+ --- !ruby/object:Gem::Specification
+ name: time_bucket_stream
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Lim Yu Kwang
+ bindir: exe
+ cert_chain: []
+ date: 1980-01-02 00:00:00.000000000 Z
+ dependencies: []
+ description: TimeBucketStream appends JSONL events into time-bucketed files and atomically
+   claims completed files for processing.
+ email:
+ - aaron.lim.yu.kwang@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".standard.yml"
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/time_bucket_stream.rb
+ - lib/time_bucket_stream/batch.rb
+ - lib/time_bucket_stream/claim.rb
+ - lib/time_bucket_stream/claimer.rb
+ - lib/time_bucket_stream/codecs.rb
+ - lib/time_bucket_stream/codecs/json.rb
+ - lib/time_bucket_stream/codecs/oj.rb
+ - lib/time_bucket_stream/log_name.rb
+ - lib/time_bucket_stream/paths.rb
+ - lib/time_bucket_stream/quarantine.rb
+ - lib/time_bucket_stream/stream.rb
+ - lib/time_bucket_stream/version.rb
+ - lib/time_bucket_stream/writer.rb
+ - sig/time_bucket_stream.rbs
+ homepage: https://github.com/aaron-lim/time_bucket_stream
+ licenses:
+ - MIT
+ metadata:
+   allowed_push_host: https://rubygems.org
+   source_code_uri: https://github.com/aaron-lim/time_bucket_stream
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 3.1.0
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.6.9
+ specification_version: 4
+ summary: Time-bucketed file streams for Ruby.
+ test_files: []