red-arrow-format 23.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +26 -0
  3. data/LICENSE.txt +202 -0
  4. data/NOTICE.txt +2 -0
  5. data/README.md +61 -0
  6. data/Rakefile +67 -0
  7. data/lib/arrow-format/array.rb +476 -0
  8. data/lib/arrow-format/bitmap.rb +44 -0
  9. data/lib/arrow-format/error.rb +34 -0
  10. data/lib/arrow-format/field.rb +33 -0
  11. data/lib/arrow-format/file-reader.rb +213 -0
  12. data/lib/arrow-format/org/apache/arrow/flatbuf/binary.rb +21 -0
  13. data/lib/arrow-format/org/apache/arrow/flatbuf/binary_view.rb +27 -0
  14. data/lib/arrow-format/org/apache/arrow/flatbuf/block.rb +38 -0
  15. data/lib/arrow-format/org/apache/arrow/flatbuf/body_compression.rb +47 -0
  16. data/lib/arrow-format/org/apache/arrow/flatbuf/body_compression_method.rb +31 -0
  17. data/lib/arrow-format/org/apache/arrow/flatbuf/bool.rb +20 -0
  18. data/lib/arrow-format/org/apache/arrow/flatbuf/buffer.rb +38 -0
  19. data/lib/arrow-format/org/apache/arrow/flatbuf/compression_type.rb +22 -0
  20. data/lib/arrow-format/org/apache/arrow/flatbuf/date.rb +36 -0
  21. data/lib/arrow-format/org/apache/arrow/flatbuf/date_unit.rb +22 -0
  22. data/lib/arrow-format/org/apache/arrow/flatbuf/decimal.rb +48 -0
  23. data/lib/arrow-format/org/apache/arrow/flatbuf/dictionary_batch.rb +50 -0
  24. data/lib/arrow-format/org/apache/arrow/flatbuf/dictionary_encoding.rb +64 -0
  25. data/lib/arrow-format/org/apache/arrow/flatbuf/dictionary_kind.rb +26 -0
  26. data/lib/arrow-format/org/apache/arrow/flatbuf/duration.rb +30 -0
  27. data/lib/arrow-format/org/apache/arrow/flatbuf/endianness.rb +24 -0
  28. data/lib/arrow-format/org/apache/arrow/flatbuf/feature.rb +46 -0
  29. data/lib/arrow-format/org/apache/arrow/flatbuf/field.rb +92 -0
  30. data/lib/arrow-format/org/apache/arrow/flatbuf/field_node.rb +43 -0
  31. data/lib/arrow-format/org/apache/arrow/flatbuf/fixed_size_binary.rb +27 -0
  32. data/lib/arrow-format/org/apache/arrow/flatbuf/fixed_size_list.rb +27 -0
  33. data/lib/arrow-format/org/apache/arrow/flatbuf/floating_point.rb +30 -0
  34. data/lib/arrow-format/org/apache/arrow/flatbuf/footer.rb +74 -0
  35. data/lib/arrow-format/org/apache/arrow/flatbuf/int.rb +33 -0
  36. data/lib/arrow-format/org/apache/arrow/flatbuf/interval.rb +30 -0
  37. data/lib/arrow-format/org/apache/arrow/flatbuf/interval_unit.rb +23 -0
  38. data/lib/arrow-format/org/apache/arrow/flatbuf/key_value.rb +36 -0
  39. data/lib/arrow-format/org/apache/arrow/flatbuf/large_binary.rb +22 -0
  40. data/lib/arrow-format/org/apache/arrow/flatbuf/large_list.rb +22 -0
  41. data/lib/arrow-format/org/apache/arrow/flatbuf/large_list_view.rb +22 -0
  42. data/lib/arrow-format/org/apache/arrow/flatbuf/large_utf8.rb +22 -0
  43. data/lib/arrow-format/org/apache/arrow/flatbuf/list.rb +20 -0
  44. data/lib/arrow-format/org/apache/arrow/flatbuf/list_view.rb +23 -0
  45. data/lib/arrow-format/org/apache/arrow/flatbuf/map.rb +52 -0
  46. data/lib/arrow-format/org/apache/arrow/flatbuf/message.rb +68 -0
  47. data/lib/arrow-format/org/apache/arrow/flatbuf/message_header.rb +39 -0
  48. data/lib/arrow-format/org/apache/arrow/flatbuf/metadata_version.rb +36 -0
  49. data/lib/arrow-format/org/apache/arrow/flatbuf/null.rb +21 -0
  50. data/lib/arrow-format/org/apache/arrow/flatbuf/precision.rb +23 -0
  51. data/lib/arrow-format/org/apache/arrow/flatbuf/record_batch.rb +93 -0
  52. data/lib/arrow-format/org/apache/arrow/flatbuf/run_end_encoded.rb +25 -0
  53. data/lib/arrow-format/org/apache/arrow/flatbuf/schema.rb +68 -0
  54. data/lib/arrow-format/org/apache/arrow/flatbuf/sparse_matrix_compressed_axis.rb +22 -0
  55. data/lib/arrow-format/org/apache/arrow/flatbuf/sparse_matrix_index_csx.rb +96 -0
  56. data/lib/arrow-format/org/apache/arrow/flatbuf/sparse_tensor.rb +92 -0
  57. data/lib/arrow-format/org/apache/arrow/flatbuf/sparse_tensor_index.rb +29 -0
  58. data/lib/arrow-format/org/apache/arrow/flatbuf/sparse_tensor_index_coo.rb +93 -0
  59. data/lib/arrow-format/org/apache/arrow/flatbuf/sparse_tensor_index_csf.rb +129 -0
  60. data/lib/arrow-format/org/apache/arrow/flatbuf/struct_.rb +23 -0
  61. data/lib/arrow-format/org/apache/arrow/flatbuf/tensor.rb +74 -0
  62. data/lib/arrow-format/org/apache/arrow/flatbuf/tensor_dim.rb +38 -0
  63. data/lib/arrow-format/org/apache/arrow/flatbuf/time.rb +51 -0
  64. data/lib/arrow-format/org/apache/arrow/flatbuf/time_unit.rb +24 -0
  65. data/lib/arrow-format/org/apache/arrow/flatbuf/timestamp.rb +152 -0
  66. data/lib/arrow-format/org/apache/arrow/flatbuf/type.rb +55 -0
  67. data/lib/arrow-format/org/apache/arrow/flatbuf/union.rb +44 -0
  68. data/lib/arrow-format/org/apache/arrow/flatbuf/union_mode.rb +22 -0
  69. data/lib/arrow-format/org/apache/arrow/flatbuf/utf8.rb +21 -0
  70. data/lib/arrow-format/org/apache/arrow/flatbuf/utf8view.rb +27 -0
  71. data/lib/arrow-format/readable.rb +271 -0
  72. data/lib/arrow-format/record-batch.rb +36 -0
  73. data/lib/arrow-format/schema.rb +24 -0
  74. data/lib/arrow-format/streaming-pull-reader.rb +243 -0
  75. data/lib/arrow-format/streaming-reader.rb +50 -0
  76. data/lib/arrow-format/type.rb +704 -0
  77. data/lib/arrow-format/version.rb +26 -0
  78. data/lib/arrow-format.rb +20 -0
  79. data/red-arrow-format.gemspec +57 -0
  80. metadata +137 -0
@@ -0,0 +1,243 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ require_relative "array"
19
+ require_relative "error"
20
+ require_relative "field"
21
+ require_relative "readable"
22
+ require_relative "record-batch"
23
+ require_relative "schema"
24
+ require_relative "type"
25
+
26
+ module ArrowFormat
27
# Incremental ("pull" style) parser for a sequence of framed messages.
#
# Each message is framed as:
#
#   1. a continuation token (signed 32-bit integer, must be -1)
#   2. the metadata length (signed 32-bit integer; 0 marks end-of-stream)
#   3. the metadata (decoded with Org::Apache::Arrow::Flatbuf::Message)
#   4. the body (its length is taken from the decoded metadata)
#
# Bytes are fed with #consume in chunks of any size. Every time a
# complete message (metadata + body) is available, the block given to
# #initialize is called with (message, body).
class MessagePullReader
  CONTINUATION_TYPE = :s32
  CONTINUATION_SIZE = IO::Buffer.size_of(CONTINUATION_TYPE)
  CONTINUATION_STRING = "\xFF\xFF\xFF\xFF".b.freeze
  CONTINUATION_INT32 = -1
  METADATA_LENGTH_TYPE = :s32
  METADATA_LENGTH_SIZE = IO::Buffer.size_of(METADATA_LENGTH_TYPE)

  # on_read: called as on_read.call(message, body) for each complete
  # message. body is an IO::Buffer, or nil when the body length is 0.
  def initialize(&on_read)
    @on_read = on_read
    # Bytes received so far that are too few to finish the current part.
    @buffer = IO::Buffer.new(0)
    @metadata_length = nil
    @body_length = nil
    @state = :initial
  end

  # Returns the full size in bytes of the part the reader is waiting
  # for in its current state, or 0 once end-of-stream was reached.
  # Bytes that are already buffered are not subtracted.
  def next_required_size
    case @state
    when :initial
      CONTINUATION_SIZE
    when :metadata_length
      METADATA_LENGTH_SIZE
    when :metadata
      @metadata_length
    when :body
      @body_length
    when :eos
      0
    end
  end

  # Returns true once the end-of-stream marker has been consumed.
  def eos?
    @state == :eos
  end

  # Feeds more bytes to the reader.
  #
  # chunk: an IO::Buffer of any size. It may contain a fraction of a
  #        part, exactly one part or many messages.
  #
  # Incomplete trailing bytes are copied and kept until the next call,
  # so the caller does not have to align chunks with part boundaries.
  # Does nothing after end-of-stream.
  #
  # Raises ReadError on an invalid continuation token or a negative
  # metadata/body length.
  def consume(chunk)
    return if eos?
    # An empty chunk can't complete anything: every non end-of-stream
    # state waits for at least one more byte.
    return if chunk.size.zero?

    target = @buffer.size.zero? ? chunk : join_pending(chunk)

    loop do
      next_size = next_required_size
      break if next_size.zero?

      if target.size < next_size
        keep_pending(target)
        return
      end

      case @state
      when :initial
        consume_initial(target)
      when :metadata_length
        consume_metadata_length(target)
      when :metadata
        consume_metadata(target)
      when :body
        consume_body(target)
      end
      break if target.size == next_size

      target = target.slice(next_size)
    end
  end

  private

  # Returns a new buffer that holds the pending bytes followed by chunk
  # and empties the pending buffer.
  #
  # A fresh buffer is used (instead of growing @buffer in place) so
  # that slices which were already passed to the callback are never
  # resized or overwritten afterwards.
  def join_pending(chunk)
    pending_size = @buffer.size
    joined = IO::Buffer.new(pending_size + chunk.size)
    joined.copy(@buffer, 0)
    # The offset is required: IO::Buffer#copy writes at offset 0 by
    # default, which would overwrite the pending bytes.
    joined.copy(chunk, pending_size)
    @buffer = IO::Buffer.new(0)
    joined
  end

  # Saves the not yet consumable bytes for the next #consume call.
  # They are copied because target may be (a slice of) the caller's
  # chunk.
  def keep_pending(target)
    pending = IO::Buffer.new(target.size)
    pending.copy(target)
    @buffer = pending
  end

  def consume_initial(target)
    continuation = target.get_value(CONTINUATION_TYPE, 0)
    unless continuation == CONTINUATION_INT32
      raise ReadError.new("Invalid continuation token: " +
                          continuation.inspect)
    end
    @state = :metadata_length
  end

  def consume_metadata_length(target)
    length = target.get_value(METADATA_LENGTH_TYPE, 0)
    if length < 0
      raise ReadError.new("Negative metadata length: " +
                          length.inspect)
    end
    if length == 0
      # A zero metadata length is the end-of-stream marker.
      @state = :eos
    else
      @metadata_length = length
      @state = :metadata
    end
  end

  def consume_metadata(target)
    metadata_buffer = target.slice(0, @metadata_length)
    @message = Org::Apache::Arrow::Flatbuf::Message.new(metadata_buffer)
    @body_length = @message.body_length
    if @body_length < 0
      raise ReadError.new("Negative body length: " +
                          @body_length.inspect)
    end
    @state = :body
    # No body bytes will follow: report the message right away because
    # the :body state would otherwise wait for 0 bytes.
    consume_body if @body_length.zero?
  end

  # Reports the current message with its body (nil for an empty body)
  # and waits for the next message.
  def consume_body(target=nil)
    body = target&.slice(0, @body_length)
    @on_read.call(@message, body)
    @state = :initial
  end
end
142
+
143
# Pull style reader that turns a stream of messages into record batches.
#
# Raw bytes are fed with #consume. The messages found by the internal
# MessagePullReader are expected in this order: one schema message,
# then one non delta dictionary batch per dictionary encoded top-level
# field, then any mix of dictionary batches and record batches. The
# block given to #initialize is called with each decoded record batch.
#
# Decoding itself (read_schema, read_record_batch) is provided by
# Readable.
class StreamingPullReader
  include Readable

  # The schema read from the schema message, nil until it was consumed.
  attr_reader :schema

  # on_read: called as on_read.call(record_batch) for each record batch.
  def initialize(&on_read)
    @on_read = on_read
    @message_pull_reader = MessagePullReader.new do |message, body|
      process_message(message, body)
    end
    @state = :schema
    @schema = nil
    @dictionaries = nil
    @dictionary_fields = nil
  end

  # See MessagePullReader#next_required_size.
  def next_required_size
    @message_pull_reader.next_required_size
  end

  # See MessagePullReader#eos?.
  def eos?
    @message_pull_reader.eos?
  end

  # See MessagePullReader#consume. Raises ReadError when a message is
  # not acceptable in the current state.
  def consume(chunk)
    @message_pull_reader.consume(chunk)
  end

  private

  # Dispatches a complete message according to the current state.
  def process_message(message, body)
    case @state
    when :schema
      process_schema_message(message, body)
    when :initial_dictionaries
      header = message.header
      unless header.is_a?(Org::Apache::Arrow::Flatbuf::DictionaryBatch)
        raise ReadError.new("Not a dictionary batch message: " +
                            header.inspect)
      end
      process_dictionary_batch_message(message, body)
      # Record batches are accepted once every dictionary is available.
      @state = :data if @dictionaries.size == @dictionary_fields.size
    when :data
      # Other kinds of message are ignored in this state.
      case message.header
      when Org::Apache::Arrow::Flatbuf::DictionaryBatch
        process_dictionary_batch_message(message, body)
      when Org::Apache::Arrow::Flatbuf::RecordBatch
        process_record_batch_message(message, body)
      end
    end
  end

  # Reads the schema and collects the top-level dictionary encoded
  # fields by dictionary ID.
  def process_schema_message(message, body)
    header = message.header
    unless header.is_a?(Org::Apache::Arrow::Flatbuf::Schema)
      raise ReadError.new("Not a schema message: " +
                          header.inspect)
    end

    @schema = read_schema(header)
    @dictionaries = {}
    @dictionary_fields = {}
    @schema.fields.each do |field|
      next unless field.type.is_a?(DictionaryType)
      @dictionary_fields[field.dictionary_id] = field
    end
    if @dictionaries.size < @dictionary_fields.size
      @state = :initial_dictionaries
    else
      @state = :data
    end
  end

  # Decodes a dictionary batch and stores it as a list of chunks per
  # dictionary ID: a non delta batch replaces the list, a delta batch
  # is appended to it.
  #
  # Raises ReadError (instead of failing with NoMethodError on nil)
  # when the batch refers to a dictionary ID that no field of the
  # schema uses or when a delta batch has no dictionary to extend.
  def process_dictionary_batch_message(message, body)
    header = message.header
    if @state == :initial_dictionaries && header.delta?
      raise ReadError.new("An initial dictionary batch message must be " +
                          "a non delta dictionary batch message: " +
                          header.inspect)
    end
    field = @dictionary_fields[header.id]
    if field.nil?
      raise ReadError.new("Unknown dictionary ID: " +
                          header.id.inspect)
    end
    chunks = @dictionaries[header.id]
    if header.delta? && chunks.nil?
      raise ReadError.new("A delta dictionary batch message must follow " +
                          "a non delta dictionary batch message: " +
                          header.inspect)
    end

    # A dictionary is decoded as a record batch that has only one column.
    value_type = field.type.value_type
    schema = Schema.new([Field.new("dummy", value_type, true, nil)])
    record_batch = read_record_batch(header.data, schema, body)
    dictionary = record_batch.columns[0]
    if header.delta?
      chunks << dictionary
    else
      @dictionaries[header.id] = [dictionary]
    end
  end

  # Returns the dictionary chunks read so far for the dictionary ID,
  # nil when there are none.
  def find_dictionary(id)
    @dictionaries[id]
  end

  # Decodes a record batch and passes it to the on_read block.
  def process_record_batch_message(message, body)
    header = message.header
    @on_read.call(read_record_batch(header, @schema, body))
  end
end
243
+ end
@@ -0,0 +1,50 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ require_relative "streaming-pull-reader"
19
+
20
+ module ArrowFormat
21
# Reads record batches from an IO like input with StreamingPullReader.
#
#   reader = StreamingReader.new(input)
#   reader.each do |record_batch|
#     # ...
#   end
#   reader.schema
class StreamingReader
  include Enumerable

  # The schema of the stream. nil until #each has read it.
  attr_reader :schema

  # input: an object that responds to read(length, outbuf) like IO.
  def initialize(input)
    @input = input
    @schema = nil
  end

  # Yields each record batch of the stream. Returns an Enumerator when
  # no block is given.
  #
  # Reading stops at the end-of-stream marker or when the input has no
  # more data.
  def each
    return to_enum(__method__) unless block_given?

    reader = StreamingPullReader.new do |record_batch|
      # Make the schema available before the first record batch is
      # yielded.
      @schema ||= reader.schema
      yield(record_batch)
    end

    buffer = "".b
    loop do
      next_size = reader.next_required_size
      break if next_size.zero?

      next_chunk = @input.read(next_size, buffer)
      break if next_chunk.nil?

      reader.consume(IO::Buffer.for(next_chunk))
      # Also pick the schema up here: a stream that has no record
      # batches never calls the block above.
      @schema ||= reader.schema
    end
  end
end
50
+ end