red-arrow 0.4.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -14,7 +14,31 @@
14
14
 
15
15
  module Arrow
16
16
  class Table
17
- def each_column(&block)
17
+ alias_method :initialize_raw, :initialize
18
# Builds a table from one of three argument shapes:
#
# * +new(schema, columns)+ - passed straight to the raw constructor.
# * +new(columns)+ - the first element is a Column; the schema is
#   derived from each column's field.
# * +new(raw_table)+ - anything whose +each+ yields +name, array+
#   pairs; a Field and a Column are created for every pair.
#
# @param schema_or_raw_table_or_columns [Object] see the shapes above.
# @param columns [Object, nil] columns when a schema is given as the
#   first argument.
def initialize(schema_or_raw_table_or_columns, columns=nil)
  unless columns.nil?
    # Explicit (schema, columns) form: nothing to derive.
    return initialize_raw(schema_or_raw_table_or_columns, columns)
  end

  source = schema_or_raw_table_or_columns
  if source[0].is_a?(Column)
    columns = source
    fields = columns.collect(&:field)
  else
    fields = []
    columns = []
    source.each do |name, array|
      new_field = Field.new(name.to_s, array.value_data_type)
      fields << new_field
      columns << Column.new(new_field, array)
    end
  end
  initialize_raw(Schema.new(fields), columns)
end
40
+
41
+ def each_column
18
42
  return to_enum(__method__) unless block_given?
19
43
 
20
44
  n_columns.times do |i|
@@ -23,7 +47,289 @@ module Arrow
23
47
  end
24
48
 
25
49
  def columns
26
- each_column.to_a
50
+ @columns ||= each_column.to_a
51
+ end
52
+
53
# Yields each record batch produced by a TableBatchReader built over
# this table, stopping when the reader returns a falsy value.
#
# @yield [record_batch] every value returned by +read_next+.
# @return [Enumerator, nil] an enumerator when no block is given.
def each_record_batch
  return to_enum(__method__) unless block_given?

  batch_reader = TableBatchReader.new(self)
  batch = batch_reader.read_next
  while batch
    yield(batch)
    batch = batch_reader.read_next
  end
end
61
+
62
# Looks up columns by name.
#
# With exactly one String or Symbol argument, returns the column with
# that name, or nil when no column matches.
#
# With any other number of arguments, returns a new table made of the
# named columns, in the order requested. The new table is built from
# the selected columns alone so that its schema is derived from them;
# reusing this table's full schema would pair it with a different set
# of columns.
#
# @param args [Array<String, Symbol>] column names.
# @return [Arrow::Column, Arrow::Table, nil]
# @raise [ArgumentError] if the single argument is neither a String nor
#   a Symbol, or if any name in the multi-name form is unknown.
def [](*args)
  if args.size == 1
    case args[0]
    when String, Symbol
      find_column(args[0])
    else
      message = "#{self.class}\#[#{args[0].inspect}]: " +
        "Must be String or Symbol"
      raise ArgumentError, message
    end
  else
    new_columns = args.collect do |column_name|
      column = find_column(column_name)
      if column.nil?
        message = "Unknown column: <#{column_name.inspect}>: #{inspect}"
        raise ArgumentError, message
      end
      column
    end
    # Columns-only constructor: the schema comes from the selected
    # columns' fields, matching what select_columns does.
    self.class.new(new_columns)
  end
end
87
+
88
# Selects rows and returns them as a new table.
#
# Each slicer is turned into an inclusive +[from, to]+ row range:
#
# * Integer +i+ becomes +[i, i]+.
# * Range uses its first and last values; the last value is reduced by
#   one when the range excludes its end.
# * A two-element Array is read as +[from, length]+.
# * Arrow::BooleanArray contributes one range per run of truthy values.
# * Anything responding to +evaluate+ is evaluated first and the result
#   is handled by the rules above.
#
# When a block is given it is called with a Slicer for this table; a
# non-nil result is added to the slicers (an Array result is spliced
# in element by element).
#
# @param slicers [Array] row selectors as described above.
# @return [Arrow::Table] whatever slice_by_ranges builds.
# @raise [ArgumentError] for an unsupported slicer.
def slice(*slicers)
  if block_given?
    from_block = yield(Slicer.new(self))
    case from_block
    when nil
      # The block chose not to add a slicer.
    when ::Array
      slicers.concat(from_block)
    else
      slicers << from_block
    end
  end

  ranges = []
  slicers.each do |candidate|
    resolved = candidate.respond_to?(:evaluate) ? candidate.evaluate : candidate
    case resolved
    when Integer
      ranges << [resolved, resolved]
    when Range
      first_row = resolved.first
      last_row = resolved.last
      last_row -= 1 if resolved.exclude_end?
      ranges << [first_row, last_row]
    when ::Array
      first_row = resolved[0]
      ranges << [first_row, first_row + resolved[1] - 1]
    when BooleanArray
      # run_start holds the index where the current run of truthy
      # values began, or nil while outside a run.
      run_start = nil
      resolved.each_with_index do |selected, index|
        if selected
          run_start = index if run_start.nil?
        elsif !run_start.nil?
          ranges << [run_start, index - 1]
          run_start = nil
        end
      end
      ranges << [run_start, resolved.length - 1] unless run_start.nil?
    else
      message = "slicer must be Integer, Range, [from, to] or " +
        "Arrow::BooleanArray, Arrow::Slicer::Condition: #{resolved.inspect}"
      raise ArgumentError, message
    end
  end
  slice_by_ranges(ranges)
end
146
+
147
# Returns a new table with columns added, replaced or removed.
#
# * Hash argument: each key is converted to a String column name. A
#   truthy value is turned into a column via ensure_column and replaces
#   the column with that name, or is appended when no such column
#   exists. A nil or false value removes the column with that name.
# * Table argument: every column of the other table replaces the
#   same-named column here, or is appended.
#
# Existing columns keep their position; new columns go at the end in
# the order they were given.
#
# @param other [Hash, Arrow::Table]
# @return [Arrow::Table]
# @raise [ArgumentError] if +other+ is neither a Hash nor a Table.
def merge(other)
  replacements = {}
  dropped = {}

  case other
  when Hash
    other.each do |key, value|
      key = key.to_s
      if value
        replacements[key] = ensure_column(key, value)
      else
        dropped[key] = true
      end
    end
  when Table
    other.columns.each do |other_column|
      replacements[other_column.name] = other_column
    end
  else
    message = "merge target must be Hash or Arrow::Table: " +
      "<#{other.inspect}>: #{inspect}"
    raise ArgumentError, message
  end

  merged = []
  columns.each do |current|
    current_name = current.name
    # delete both fetches the replacement and marks it as consumed, so
    # only genuinely new columns remain in replacements afterwards.
    substitute = replacements.delete(current_name)
    if substitute
      merged << substitute
    elsif !dropped.key?(current_name)
      merged << current
    end
  end
  merged.concat(replacements.values)

  merged_fields = merged.collect(&:field)
  self.class.new(Schema.new(merged_fields), merged)
end
194
+
195
+ alias_method :remove_column_raw, :remove_column
196
# Removes one column, identified either by name or by position, by
# delegating to the aliased raw implementation with a resolved index.
#
# @param name_or_index [String, Symbol, Integer] a column name, or an
#   index where negative values count from the end.
# @return whatever remove_column_raw returns.
# @raise [KeyError] if no column has the given name.
# @raise [IndexError] if the index is outside the table's columns.
def remove_column(name_or_index)
  position =
    case name_or_index
    when String, Symbol
      wanted = name_or_index.to_s
      located = columns.index {|column| column.name == wanted}
      if located.nil?
        message = "unknown column: #{name_or_index.inspect}: #{inspect}"
        raise KeyError.new(message)
      end
      located
    else
      candidate = name_or_index
      candidate += n_columns if candidate < 0
      if candidate < 0 || candidate >= n_columns
        message = "out of index (0..#{n_columns - 1}): " +
          "#{name_or_index.inspect}: #{inspect}"
        raise IndexError.new(message)
      end
      candidate
    end
  remove_column_raw(position)
end
216
+
217
# Builds a new table from a subset of this table's columns.
#
# Without selectors the block decides which columns are kept; with no
# block either, an enumerator is returned. With selectors, each one
# picks columns in order: a String or Symbol by name, a Range by
# position span, and anything else as a single position. When a block
# is also given it filters the picked columns.
#
# @param selectors [Array<String, Symbol, Range, Integer>]
# @yield [column] optional filter.
# @return [Arrow::Table, Enumerator]
# @raise [KeyError] if a named column does not exist.
# @raise [IndexError] if a single position selects nothing.
def select_columns(*selectors, &block)
  if selectors.empty?
    return to_enum(__method__) unless block_given?
    picked = columns.select(&block)
  else
    picked = selectors.each_with_object([]) do |selector, found|
      case selector
      when String, Symbol
        named = find_column(selector)
        if named.nil?
          message = "unknown column: #{selector.inspect}: #{inspect}"
          raise KeyError.new(message)
        end
        found << named
      when Range
        found.concat(columns[selector])
      else
        positioned = columns[selector]
        if positioned.nil?
          message = "out of index (0..#{n_columns - 1}): " +
            "#{selector.inspect}: #{inspect}"
          raise IndexError.new(message)
        end
        found << positioned
      end
    end
    picked = picked.select(&block) if block_given?
  end
  self.class.new(picked)
end
248
+
249
# Renders the table by handing it, together with the options, to a
# TableFormatter and returning the formatter's output.
#
# @param options [Hash] passed through to TableFormatter unchanged.
# @return the result of TableFormatter#format.
def to_s(options={})
  TableFormatter.new(self, options).format
end
253
+
254
# Returns the inherited inspect text followed, on a new line, by the
# table rendered with to_s.
#
# @return [String]
def inspect
  inherited = super
  rendered = to_s
  "#{inherited}\n#{rendered}"
end
257
+
258
# Reports column names as callable so that respond_to? agrees with the
# column lookup done in method_missing; any other name is left to the
# inherited implementation.
#
# @param name [Symbol] the method name being queried.
# @param include_private [Boolean] forwarded to the inherited check.
# @return [Boolean]
def respond_to_missing?(name, include_private)
  find_column(name) ? true : super
end
262
+
263
# Lets a column be fetched by calling its name as a method without
# arguments. Calls with arguments, and names that match no column,
# fall through to the inherited handler.
#
# @param name [Symbol] the missing method name.
# @return [Arrow::Column] the column named +name+, when there is one.
def method_missing(name, *args, &block)
  return super unless args.empty?

  find_column(name) || super
end
270
+
271
+ private
272
# Returns the first column whose name equals +name+ converted to a
# String, or nil when there is none.
#
# @param name [String, Symbol, #to_s]
# @return [Arrow::Column, nil]
def find_column(name)
  wanted = name.to_s
  columns.detect { |candidate| candidate.name == wanted }
end
278
+
279
# Builds a new table that keeps only the rows covered by +ranges+.
#
# +ranges+ is a list of [from, to] pairs; both ends are inclusive, as
# the size of each range is computed as to - from + 1. Every column is
# processed independently: its chunks are walked with a forward-only
# cursor and the pieces that fall inside the ranges are collected into
# a new ChunkedArray. The result is built with this table's schema.
#
# NOTE(review): the cursor never moves backwards, so this appears to
# assume ranges are sorted ascending and do not overlap - confirm
# against the callers.
# NOTE(review): arrays.first is nil once every chunk has been consumed,
# so a range reaching past the last row would raise NoMethodError on
# nil - confirm whether callers guarantee in-bounds ranges.
+ def slice_by_ranges(ranges)
280
+ sliced_columns = columns.collect do |column|
281
+ chunks = []
282
# Remaining chunks of this column; consumed from the front.
+ arrays = column.data.each_chunk.to_a
283
# offset: row position of the cursor counted from the start of the
# column. offset_in_array: cursor position inside arrays.first.
+ offset = 0
284
+ offset_in_array = 0
285
+ ranges.each do |from, to|
286
+ range_size = to - from + 1
287
# One range may span several chunks, so keep taking pieces until the
# whole range has been covered.
+ while range_size > 0
288
# Drop chunks whose remaining rows all lie before +from+.
+ while offset + arrays.first.length - offset_in_array < from
289
+ offset += arrays.first.length - offset_in_array
290
+ arrays.shift
291
+ offset_in_array = 0
292
+ end
293
# Advance inside the current chunk up to +from+.
+ if offset < from
294
+ skipped_size = from - offset
295
+ offset += skipped_size
296
+ offset_in_array += skipped_size
297
+ end
298
+ array = arrays.first
299
+ array_length = array.length
300
+ rest_length = array_length - offset_in_array
301
+ if rest_length <= range_size
302
# The rest of this chunk belongs to the range: take it and move on
# to the next chunk.
# NOTE(review): the length passed here is array_length rather than
# rest_length; presumably slice clamps to the end of the array -
# confirm.
+ chunks << array.slice(offset_in_array, array_length)
303
+ offset += rest_length
304
+ range_size -= rest_length
305
+ offset_in_array = 0
306
+ arrays.shift
307
+ else
308
# The range ends inside this chunk: take only range_size rows and
# keep the chunk for the next range.
+ chunks << array.slice(offset_in_array, range_size)
309
+ offset += range_size
310
+ offset_in_array += range_size
311
+ range_size = 0
312
+ end
313
+ end
314
+ end
315
+ Column.new(column.field, ChunkedArray.new(chunks))
316
+ end
317
+
318
+ self.class.new(schema, sliced_columns)
319
+ end
320
+
321
# Normalizes +data+ into a column.
#
# An Array (Arrow::Array when this runs inside the Arrow namespace) is
# wrapped in a new Column whose Field carries +name+ and the array's
# value data type. A Column is returned untouched.
#
# @param name [String] field name used when a new column is built.
# @param data [Arrow::Array, Arrow::Column]
# @return [Arrow::Column]
# @raise [ArgumentError] for any other kind of +data+.
def ensure_column(name, data)
  if Array === data
    new_field = Field.new(name, data.value_data_type)
    Column.new(new_field, data)
  elsif Column === data
    data
  else
    message = "column must be Arrow::Array or Arrow::Column: " +
      "<#{name}>: <#{data.inspect}>: #{inspect}"
    raise ArgumentError, message
  end
end
28
334
  end
29
335
  end
@@ -0,0 +1,69 @@
1
+ # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
module Arrow
  # Ruby-side conveniences for timestamp arrays: construction from a
  # unit and plain values, and element access that returns Time.
  class TimestampArray
    class << self
      # Builds a timestamp array.
      #
      # @param unit the time unit handed to TimestampDataType.
      # @param values the values handed to the builder.
      # @return the array produced by TimestampArrayBuilder#build.
      def new(unit, values)
        data_type = TimestampDataType.new(unit)
        builder = TimestampArrayBuilder.new(data_type)
        builder.build(values)
      end
    end

    # @param i [Integer] element index.
    # @return [Time] the raw value at +i+ interpreted in this array's
    #   unit.
    def get_value(i)
      to_time(get_raw_value(i))
    end

    # @return the time unit of this array's data type (memoized).
    def unit
      @unit ||= value_data_type_unit
    end

    private
    # Symbol form of the unit's nick, used to dispatch in to_time.
    def unit_id
      @unit_id ||= unit.nick.to_sym
    end

    # Reads the unit from the data type when it exposes one; otherwise
    # derives it from the "[s]" / "[ms]" / "[us]" suffix of the data
    # type's string form, treating anything else as nanoseconds.
    def value_data_type_unit
      data_type = value_data_type
      if data_type.respond_to?(:unit)
        data_type.unit
      else
        data_type_name = data_type.to_s
        if data_type_name.end_with?("[s]")
          TimeUnit::SECOND
        elsif data_type_name.end_with?("[ms]")
          TimeUnit::MILLI
        elsif data_type_name.end_with?("[us]")
          TimeUnit::MICRO
        else
          TimeUnit::NANO
        end
      end
    end

    # Converts a raw integer count of this array's unit into a Time.
    #
    # The second argument of Time.at is a number of microseconds, so
    # sub-second remainders are converted to microseconds before being
    # passed: milliseconds are multiplied by 1000, and nanoseconds are
    # passed as an exact Rational so no precision is lost.
    #
    # @param raw_value [Integer]
    # @return [Time]
    def to_time(raw_value)
      case unit_id
      when :second
        Time.at(raw_value)
      when :milli
        seconds, milliseconds = raw_value.divmod(1_000)
        Time.at(seconds, milliseconds * 1_000)
      when :micro
        Time.at(*raw_value.divmod(1_000_000))
      else
        seconds, nanoseconds = raw_value.divmod(1_000_000_000)
        Time.at(seconds, Rational(nanoseconds, 1_000))
      end
    end
  end
end
@@ -13,5 +13,5 @@
13
13
  # limitations under the License.
14
14
 
15
15
  module Arrow
16
- VERSION = "0.4.1"
16
+ VERSION = "0.8.0"
17
17
  end
@@ -38,6 +38,7 @@ Gem::Specification.new do |spec|
38
38
  spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"]
39
39
  spec.files += [".yardopts"]
40
40
  spec.files += Dir.glob("lib/**/*.rb")
41
+ spec.files += Dir.glob("image/*.*")
41
42
  spec.files += Dir.glob("doc/text/*")
42
43
  spec.test_files += Dir.glob("test/**/*")
43
44
  spec.extensions = ["dependency-check/Rakefile"]
@@ -0,0 +1,4 @@
1
+ name,score
2
+ alice,10
3
+ bob,29
4
+ chris,-1
@@ -0,0 +1,3 @@
1
+ alice,10
2
+ bob,29
3
+ chris,-1
@@ -14,6 +14,9 @@
14
14
 
15
15
  require "arrow"
16
16
 
17
+ require "pathname"
17
18
  require "tempfile"
18
19
 
19
20
  require "test-unit"
21
+
22
+ require_relative "helper/fixture"
@@ -0,0 +1,25 @@
1
+ # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
module Helper
  # Path helpers for locating test fixture files.
  module Fixture
    # @return [Pathname] absolute path of the "fixture" directory that
    #   sits next to this file's directory.
    def fixture_dir
      helper_dir = Pathname.new(__dir__)
      helper_dir.join("..", "fixture").expand_path
    end

    # @param components [Array<String>] path segments below the
    #   fixture directory.
    # @return [Pathname] the fixture directory joined with the
    #   segments.
    def fixture_path(*components)
      base = fixture_dir
      base.join(*components)
    end
  end
end