red-arrow 0.4.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -14,7 +14,31 @@
14
14
 
15
15
  module Arrow
16
16
  class Table
17
- def each_column(&block)
17
alias_method :initialize_raw, :initialize
# Creates a table from one of three argument shapes.
#
# @overload initialize(columns)
#   Used when the second argument is omitted and element +0+ of the
#   first argument is a Column. The schema is built from each
#   column's field.
# @overload initialize(raw_table)
#   Used when the second argument is omitted and element +0+ of the
#   first argument is not a Column. +raw_table+ must yield
#   +name, array+ pairs from +each+ (a Hash does); every pair becomes
#   a field named +name.to_s+ typed by +array.value_data_type+ and a
#   column wrapping +array+.
# @overload initialize(schema, columns)
#   Both arguments are handed to the original constructor unchanged.
def initialize(schema_or_raw_table_or_columns, columns=nil)
  source = schema_or_raw_table_or_columns
  # Explicit schema and columns: nothing has to be derived.
  return initialize_raw(source, columns) unless columns.nil?

  if source[0].is_a?(Column)
    columns = source
    fields = columns.collect(&:field)
  else
    fields = []
    columns = []
    source.each do |name, array|
      fields << Field.new(name.to_s, array.value_data_type)
      columns << Column.new(fields.last, array)
    end
  end
  initialize_raw(Schema.new(fields), columns)
end
40
+
41
+ def each_column
18
42
  return to_enum(__method__) unless block_given?
19
43
 
20
44
  n_columns.times do |i|
@@ -23,7 +47,289 @@ module Arrow
23
47
  end
24
48
 
25
49
  def columns
26
- each_column.to_a
50
+ @columns ||= each_column.to_a
51
+ end
52
+
53
# Yields the record batches read from this table through a
# TableBatchReader, stopping at the first falsy +read_next+ result.
#
# @yieldparam record_batch the value returned by +read_next+
# @return [Enumerator] when called without a block
def each_record_batch
  return to_enum(:each_record_batch) unless block_given?

  reader = TableBatchReader.new(self)
  record_batch = reader.read_next
  while record_batch
    yield(record_batch)
    record_batch = reader.read_next
  end
end
61
+
62
# Looks columns up by name.
#
# @overload [](name)
#   @param name [String, Symbol] column name
#   @return [Arrow::Column, nil] the first column with that name, or
#     +nil+ when there is none
#   @raise [ArgumentError] if +name+ is neither String nor Symbol
# @overload [](*names)
#   Called with zero or with two or more names.
#   @return [Arrow::Table] a new table holding only the named
#     columns, in the order the names were given
#   @raise [ArgumentError] if any name matches no column
def [](*args)
  if args.size == 1
    name = args[0]
    case name
    when String, Symbol
      find_column(name)
    else
      message = "#{self.class}\#[#{name.inspect}]: " +
        "Must be String or Symbol"
      raise ArgumentError, message
    end
  else
    new_columns = args.collect do |column_name|
      column = find_column(column_name)
      if column.nil?
        message = "Unknown column: <#{column_name.inspect}>: #{inspect}"
        raise ArgumentError, message
      end
      column
    end
    # The schema must describe exactly the selected columns; reusing
    # this table's full schema would leave schema and columns out of
    # sync whenever a subset (or a different order) is requested.
    new_fields = new_columns.collect(&:field)
    self.class.new(Schema.new(new_fields), new_columns)
  end
end
87
+
88
# Selects rows and returns them as a new table.
#
# Every slicer is first replaced by +slicer.evaluate+ when it responds
# to +evaluate+ (e.g. Arrow::Slicer::Condition) and must then be one of:
#
# * Integer: the single row at that index.
# * Range: the rows from +first+ to +last+ (+last+ is excluded for
#   an exclusive range).
# * [from, length]: +length+ rows starting at row +from+.
# * Arrow::BooleanArray: the rows whose element is truthy.
#
# When a block is given it is called with +Slicer.new(self)+; a
# +::Array+ result is appended element by element to the slicers, any
# other non-nil result is appended as one slicer, +nil+ is ignored.
#
# @return [Arrow::Table] the result of +slice_by_ranges+
# @raise [ArgumentError] if a slicer has an unsupported type
def slice(*slicers)
  if block_given?
    block_slicer = yield(Slicer.new(self))
    case block_slicer
    when nil
      # Ignore
    when ::Array
      slicers.concat(block_slicer)
    else
      slicers << block_slicer
    end
  end
  ranges = []
  slicers.each do |slicer|
    slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
    case slicer
    when Integer
      ranges << [slicer, slicer]
    when Range
      to = slicer.last
      to -= 1 if slicer.exclude_end?
      ranges << [slicer.first, to]
    when ::Array
      from = slicer[0]
      length = slicer[1]
      ranges << [from, from + length - 1]
    when BooleanArray
      ranges.concat(boolean_array_to_ranges(slicer))
    else
      # The Array form is [from, length], not [from, to]: see above.
      message = "slicer must be Integer, Range, [from, length] or " +
        "Arrow::BooleanArray, Arrow::Slicer::Condition: #{slicer.inspect}"
      raise ArgumentError, message
    end
  end
  slice_by_ranges(ranges)
end

# Converts each run of consecutive truthy elements of +boolean_array+
# into one inclusive [first_index, last_index] pair.
def boolean_array_to_ranges(boolean_array)
  ranges = []
  target_start = nil
  boolean_array.each_with_index do |is_target, i|
    if is_target
      target_start ||= i
    elsif target_start
      ranges << [target_start, i - 1]
      target_start = nil
    end
  end
  # A run that reaches the last element is closed here.
  ranges << [target_start, boolean_array.length - 1] if target_start
  ranges
end
private :boolean_array_to_ranges
146
+
147
# Builds a new table from this table's columns combined with +other+.
#
# * Hash: each key is converted with +to_s+. A truthy value is turned
#   into a column through +ensure_column+ and added under that name;
#   a falsy value marks the column of that name for removal.
# * Arrow::Table: all of its columns are added under their names.
#
# An added column whose name matches an existing column takes that
# column's position; the remaining added columns are appended in
# insertion order. Columns marked for removal are dropped.
#
# @param other [Hash, Arrow::Table]
# @return [Arrow::Table]
# @raise [ArgumentError] if +other+ is neither a Hash nor a Table
def merge(other)
  additions = {}
  removals = {}

  if other.is_a?(Hash)
    other.each do |name, value|
      key = name.to_s
      if value
        additions[key] = ensure_column(key, value)
      else
        removals[key] = true
      end
    end
  elsif other.is_a?(Table)
    other.columns.each {|column| additions[column.name] = column}
  else
    message = "merge target must be Hash or Arrow::Table: " +
      "<#{other.inspect}>: #{inspect}"
    raise ArgumentError, message
  end

  merged = []
  columns.each do |column|
    name = column.name
    # delete, not fetch: a replacement is consumed by the first
    # column carrying that name.
    replacement = additions.delete(name)
    if replacement
      merged << replacement
    elsif !removals.key?(name)
      merged << column
    end
  end
  merged.concat(additions.values)
  self.class.new(Schema.new(merged.collect(&:field)), merged)
end
194
+
195
alias_method :remove_column_raw, :remove_column
# Removes one column, addressed by name or by position, by delegating
# to the original +remove_column+ with a non-negative index.
#
# @param name_or_index [String, Symbol, Integer] a column name, or an
#   index where a negative value counts back from the last column
# @return the value of the original +remove_column+ (presumably a new
#   Arrow::Table -- not visible from here)
# @raise [KeyError] if no column has the given name
# @raise [IndexError] if the index is outside 0..n_columns - 1
def remove_column(name_or_index)
  index =
    case name_or_index
    when String, Symbol
      name = name_or_index.to_s
      position = columns.index {|column| column.name == name}
      if position.nil?
        message = "unknown column: #{name_or_index.inspect}: #{inspect}"
        raise KeyError, message
      end
      position
    else
      position = name_or_index
      position += n_columns if position < 0
      if position < 0 || position >= n_columns
        message = "out of index (0..#{n_columns - 1}): " +
          "#{name_or_index.inspect}: #{inspect}"
        raise IndexError, message
      end
      position
    end
  remove_column_raw(index)
end
216
+
217
# Builds a new table from a subset of this table's columns.
#
# Each selector is a String/Symbol (column name), a Range of column
# positions, or any other value usable as an index into +columns+
# (such as an Integer). When a block is given, it additionally filters
# the selected columns (or all columns when no selector is given).
#
# @yieldparam column [Arrow::Column]
# @return [Arrow::Table] a table made of the selected columns
# @return [Enumerator] when called with neither selectors nor a block
# @raise [KeyError] if a name matches no column
# @raise [IndexError] if an index or a range is out of bounds
def select_columns(*selectors, &block)
  if selectors.empty?
    return to_enum(__method__) unless block_given?
    selected_columns = columns.select(&block)
  else
    selected_columns = []
    selectors.each do |selector|
      case selector
      when String, Symbol
        column = find_column(selector)
        if column.nil?
          message = "unknown column: #{selector.inspect}: #{inspect}"
          raise KeyError, message
        end
        selected_columns << column
      when Range
        columns_in_range = columns[selector]
        # ::Array#[] returns nil for a range that starts past the
        # end; report it like an out-of-bounds index instead of
        # letting concat(nil) fail with a TypeError.
        if columns_in_range.nil?
          message = "out of index (0..#{n_columns - 1}): " +
            "#{selector.inspect}: #{inspect}"
          raise IndexError, message
        end
        selected_columns.concat(columns_in_range)
      else
        column = columns[selector]
        if column.nil?
          message = "out of index (0..#{n_columns - 1}): " +
            "#{selector.inspect}: #{inspect}"
          raise IndexError, message
        end
        selected_columns << column
      end
    end
    selected_columns = selected_columns.select(&block) if block_given?
  end
  self.class.new(selected_columns)
end
248
+
249
# Renders the table as text through TableFormatter.
#
# @param options [Hash] passed to +TableFormatter.new+ unchanged
# @return the result of +TableFormatter#format+
def to_s(options={})
  TableFormatter.new(self, options).format
end
253
+
254
# @return [String] the inherited +inspect+ text, a newline, then the
#   +to_s+ rendering of this object
def inspect
  inherited = super
  "#{inherited}\n#{to_s}"
end
257
+
258
# Reports column names as methods this object responds to, matching
# the dynamic column readers provided by +method_missing+.
#
# @return [Boolean] +true+ when +name+ matches a column, otherwise
#   whatever the inherited implementation returns
def respond_to_missing?(name, include_private)
  find_column(name) ? true : super
end
262
+
263
# Lets a column be read by calling its name as a method, provided the
# call carries no arguments; every other call falls through to the
# inherited +method_missing+.
#
# @return [Arrow::Column] the column named +name+
def method_missing(name, *args, &block)
  column = args.empty? ? find_column(name) : nil
  column || super
end
270
+
271
private
# Finds the first column whose name equals +name.to_s+.
#
# @param name [#to_s] column name
# @return [Arrow::Column, nil] +nil+ when no column matches
def find_column(name)
  wanted = name.to_s
  columns.find {|column| column.name == wanted}
end
278
+
279
# Builds a new table containing only the rows covered by +ranges+.
#
# Every column is rebuilt from slices of its existing chunks, so a
# range that spans several chunks yields several slices.
#
# @param ranges [::Array<(Integer, Integer)>] inclusive
#   [from, to] pairs of 0-based row indices. A pair with to < from
#   selects nothing, rows past the last row are ignored, and pairs
#   are emitted in the given order even when they are not ascending.
# @return [Arrow::Table] a table with this table's schema
def slice_by_ranges(ranges)
  sliced_columns = columns.collect do |column|
    arrays = column.data.each_chunk.to_a
    chunks = []
    index = 0        # position in arrays of the chunk under the cursor
    chunk_start = 0  # row index of that chunk's first element
    ranges.each do |from, to|
      next if to < from
      # The cursor only moves forward; rewind for a range that starts
      # before it so unsorted ranges still pick the right rows.
      if from < chunk_start
        index = 0
        chunk_start = 0
      end
      while index < arrays.length
        array = arrays[index]
        chunk_end = chunk_start + array.length # exclusive
        if chunk_end > from
          local_from = [from - chunk_start, 0].max
          local_to = [to + 1, chunk_end].min - chunk_start # exclusive
          if local_to > local_from
            chunks << array.slice(local_from, local_to - local_from)
          end
          # The range ends inside this chunk: stay on it, the next
          # range may start here as well.
          break if to < chunk_end
        end
        index += 1
        chunk_start = chunk_end
      end
    end
    Column.new(column.field, ChunkedArray.new(chunks))
  end

  self.class.new(schema, sliced_columns)
end
320
+
321
# Coerces +data+ into a column named +name+.
#
# @param name [String] field name used when a new column is built
# @param data [Arrow::Array, Arrow::Column] an Arrow::Array is wrapped
#   in a new column whose field is typed by +data.value_data_type+; a
#   column is returned unchanged
# @return [Arrow::Column]
# @raise [ArgumentError] if +data+ is neither of the accepted types
def ensure_column(name, data)
  if data.is_a?(Array)
    return Column.new(Field.new(name, data.value_data_type), data)
  end
  return data if data.is_a?(Column)

  message = "column must be Arrow::Array or Arrow::Column: " +
    "<#{name}>: <#{data.inspect}>: #{inspect}"
  raise ArgumentError, message
end
28
334
  end
29
335
  end
@@ -0,0 +1,69 @@
1
+ # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
module Arrow
  # Ruby-side conveniences for timestamp arrays: construction from a
  # unit plus values, and element access that returns Time objects.
  class TimestampArray
    class << self
      # Builds a timestamp array.
      #
      # @param unit the time unit handed to +TimestampDataType.new+
      # @param values the values handed to +TimestampArrayBuilder#build+
      # @return the result of +TimestampArrayBuilder#build+
      def new(unit, values)
        data_type = TimestampDataType.new(unit)
        builder = TimestampArrayBuilder.new(data_type)
        builder.build(values)
      end
    end

    # @param i [Integer] element index
    # @return [Time] the raw value at +i+ converted according to +unit+
    def get_value(i)
      to_time(get_raw_value(i))
    end

    # @return the time unit of this array's data type (memoized)
    def unit
      @unit ||= value_data_type_unit
    end

    private
    # Symbol form of the unit's nick, e.g. +:second+ (memoized).
    def unit_id
      @unit_id ||= unit.nick.to_sym
    end

    # Reads the unit from the data type when it exposes +unit+;
    # otherwise derives it from the "[s]", "[ms]" or "[us]" suffix of
    # the data type's name, defaulting to nanoseconds.
    def value_data_type_unit
      data_type = value_data_type
      if data_type.respond_to?(:unit)
        data_type.unit
      else
        data_type_name = data_type.to_s
        if data_type_name.end_with?("[s]")
          TimeUnit::SECOND
        elsif data_type_name.end_with?("[ms]")
          TimeUnit::MILLI
        elsif data_type_name.end_with?("[us]")
          TimeUnit::MICRO
        else
          TimeUnit::NANO
        end
      end
    end

    # Converts a raw integer timestamp, counted in this array's unit
    # since the epoch, into a Time.
    #
    # The second argument of Time.at is a number of *microseconds*, so
    # sub-second remainders are scaled to microseconds before the call.
    def to_time(raw_value)
      case unit_id
      when :second
        ::Time.at(raw_value)
      when :milli
        seconds, milliseconds = raw_value.divmod(1_000)
        ::Time.at(seconds, milliseconds * 1_000)
      when :micro
        ::Time.at(*raw_value.divmod(1_000_000))
      else
        seconds, nanoseconds = raw_value.divmod(1_000_000_000)
        # Rational keeps full nanosecond precision; dividing by a
        # Float would round large timestamps.
        ::Time.at(seconds, Rational(nanoseconds, 1_000))
      end
    end
  end
end
@@ -13,5 +13,5 @@
13
13
  # limitations under the License.
14
14
 
15
15
  module Arrow
16
- VERSION = "0.4.1"
16
+ VERSION = "0.8.0"
17
17
  end
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
38
38
  spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
39
39
  spec.files += [".yardopts"]
40
40
  spec.files += Dir.glob("lib/**/*.rb")
41
+ spec.files += Dir.glob("image/*.*")
41
42
  spec.files += Dir.glob("doc/text/*")
42
43
  spec.test_files += Dir.glob("test/**/*")
43
44
  spec.extensions = ["dependency-check/Rakefile"]
@@ -0,0 +1,4 @@
1
+ name,score
2
+ alice,10
3
+ bob,29
4
+ chris,-1
@@ -0,0 +1,3 @@
1
+ alice,10
2
+ bob,29
3
+ chris,-1
@@ -14,6 +14,9 @@
14
14
 
15
15
  require "arrow"
16
16
 
17
+ require "pathname"
17
18
  require "tempfile"
18
19
 
19
20
  require "test-unit"
21
+
22
+ require_relative "helper/fixture"
@@ -0,0 +1,25 @@
1
+ # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
module Helper
  # Path helpers for files stored in the "fixture" directory that sits
  # next to this file's directory.
  module Fixture
    # @return [Pathname] absolute path of "../fixture" resolved against
    #   the directory containing this file
    def fixture_dir
      Pathname.new(File.expand_path(File.join("..", "fixture"), __dir__))
    end

    # @param components [Array<String>] path components below the
    #   fixture directory
    # @return [Pathname] +fixture_dir+ joined with +components+
    def fixture_path(*components)
      base = fixture_dir
      base.join(*components)
    end
  end
end