red-arrow 0.8.1 → 0.8.2

Sign up to get free protection for your applications and to get access to all the features.
data/lib/arrow/loader.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@ module Arrow
35
35
  require "arrow/column"
36
36
  require "arrow/csv-loader"
37
37
  require "arrow/csv-reader"
38
+ require "arrow/data-type"
38
39
  require "arrow/date32-array"
39
40
  require "arrow/date32-array-builder"
40
41
  require "arrow/date64-array"
@@ -45,6 +46,8 @@ module Arrow
45
46
  require "arrow/slicer"
46
47
  require "arrow/table"
47
48
  require "arrow/table-formatter"
49
+ require "arrow/table-list-formatter"
50
+ require "arrow/table-table-formatter"
48
51
  require "arrow/table-loader"
49
52
  require "arrow/table-saver"
50
53
  require "arrow/tensor"
data/lib/arrow/slicer.rb CHANGED
@@ -110,26 +110,19 @@ module Arrow
110
110
 
111
111
  def evaluate
112
112
  data = @column.data
113
- if data.n_chunks == 1
114
- array = data.get_chunk(0)
115
- if array.is_a?(BooleanArray)
116
- array
117
- else
118
- array.cast(BooleanDataType.new)
119
- end
113
+
114
+ case @column.data_type
115
+ when BooleanDataType
116
+ data
120
117
  else
121
- raw_array = []
122
- data.each_chunk do |chunk|
123
- if chunk.is_a?(BooleanArray)
124
- boolean_array = chunk
125
- else
126
- boolean_array = chunk.cast(BooleanDataType.new)
127
- end
128
- boolean_array.each do |value|
129
- raw_array << value
118
+ if data.n_chunks == 1
119
+ data.get_chunk(0).cast(BooleanDataType.new)
120
+ else
121
+ arrays = data.each_chunk.collect do |chunk|
122
+ chunk.cast(BooleanDataType.new)
130
123
  end
124
+ ChunkedArray.new(arrays)
131
125
  end
132
- BooleanArray.new(raw_array)
133
126
  end
134
127
  end
135
128
 
@@ -141,6 +134,10 @@ module Arrow
141
134
  self == nil
142
135
  end
143
136
 
137
+ def valid?
138
+ self != nil
139
+ end
140
+
144
141
  def ==(value)
145
142
  EqualCondition.new(@column, value)
146
143
  end
@@ -165,6 +162,10 @@ module Arrow
165
162
  GreaterEqualCondition.new(@column, value)
166
163
  end
167
164
 
165
+ def in?(values)
166
+ InCondition.new(@column, values)
167
+ end
168
+
168
169
  def select(&block)
169
170
  SelectCondition.new(@column, block)
170
171
  end
@@ -245,8 +246,12 @@ module Arrow
245
246
  def evaluate
246
247
  case @value
247
248
  when nil
248
- raw_array = @column.collect do |value|
249
- not value.nil?
249
+ if @column.n_nulls.zero?
250
+ raw_array = [true] * @column.length
251
+ else
252
+ raw_array = @column.length.times.collect do |i|
253
+ @column.valid?(i)
254
+ end
250
255
  end
251
256
  BooleanArray.new(raw_array)
252
257
  else
@@ -350,6 +355,58 @@ module Arrow
350
355
  end
351
356
  end
352
357
 
358
+ class InCondition < Condition
359
+ def initialize(column, values)
360
+ @column = column
361
+ @values = values
362
+ end
363
+
364
+ def !@
365
+ NotInCondition.new(@column, @values)
366
+ end
367
+
368
+ def evaluate
369
+ values_index = {}
370
+ @values.each do |value|
371
+ values_index[value] = true
372
+ end
373
+ raw_array = @column.collect do |value|
374
+ if value.nil?
375
+ nil
376
+ else
377
+ values_index.key?(value)
378
+ end
379
+ end
380
+ BooleanArray.new(raw_array)
381
+ end
382
+ end
383
+
384
+ class NotInCondition < Condition
385
+ def initialize(column, values)
386
+ @column = column
387
+ @values = values
388
+ end
389
+
390
+ def !@
391
+ InCondition.new(@column, @values)
392
+ end
393
+
394
+ def evaluate
395
+ values_index = {}
396
+ @values.each do |value|
397
+ values_index[value] = true
398
+ end
399
+ raw_array = @column.collect do |value|
400
+ if value.nil?
401
+ nil
402
+ else
403
+ not values_index.key?(value)
404
+ end
405
+ end
406
+ BooleanArray.new(raw_array)
407
+ end
408
+ end
409
+
353
410
  class SelectCondition < Condition
354
411
  def initialize(column, block)
355
412
  @column = column
data/lib/arrow/table-formatter.rb CHANGED
@@ -12,8 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- require "time"
16
-
17
15
  module Arrow
18
16
  class TableFormatter
19
17
  def initialize(table, options={})
@@ -24,65 +22,41 @@ module Arrow
24
22
  def format
25
23
  text = ""
26
24
  columns = @table.columns
27
- columns.each do |column|
28
- text << "\t"
29
- text << format_column_name(column)
30
- end
31
- text << "\n"
25
+ format_header(text, columns)
32
26
 
33
27
  n_rows = @table.n_rows
34
28
  return text if n_rows.zero?
35
29
 
36
30
  border = @options[:border] || 10
37
31
  n_digits = (Math.log10(n_rows) + 1).truncate
38
- [border, n_rows].min.times do |i|
39
- format_row(text, columns, i, n_digits)
32
+ head_limit = [border, n_rows].min
33
+ head_column_values = columns.collect do |column|
34
+ column.each.take(head_limit)
40
35
  end
36
+ format_rows(text,
37
+ columns,
38
+ head_column_values.transpose,
39
+ n_digits,
40
+ 0)
41
41
  return text if n_rows <= border
42
42
 
43
- text << "...\n"
44
- [border, n_rows - border].max.upto(n_rows - 1) do |i|
45
- format_row(text, columns, i, n_digits)
43
+ tail_start = [border, n_rows - border].max
44
+ tail_limit = n_rows - tail_start
45
+ tail_column_values = columns.collect do |column|
46
+ column.reverse_each.take(tail_limit).reverse
46
47
  end
47
48
 
48
- text
49
- end
50
-
51
- private
52
- FLOAT_N_DIGITS = 10
53
- def format_column_name(column)
54
- case column.data_type
55
- when TimestampDataType
56
- "%*s" % [Time.now.iso8601.size, column.name]
57
- when FloatDataType, DoubleDataType
58
- "%*s" % [FLOAT_N_DIGITS, column.name]
59
- else
60
- column.name
49
+ if head_limit != tail_start
50
+ format_ellipsis(text)
61
51
  end
62
- end
63
52
 
64
- def format_row(text, columns, i, n_digits)
65
- text << ("%*d" % [n_digits, i])
66
- columns.each do |column|
67
- text << "\t"
68
- text << format_column_value(column, i)
69
- end
70
- text << "\n"
71
- text
72
- end
53
+ format_rows(text,
54
+ columns,
55
+ tail_column_values.transpose,
56
+ n_digits,
57
+ tail_start)
73
58
 
74
- def format_column_value(column, i)
75
- value = column[i]
76
- case value
77
- when Time
78
- value.iso8601
79
- when Float
80
- "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
81
- when Integer
82
- "%*d" % [column.name.size, value]
83
- else
84
- "%-*s" % [column.name.size, value.to_s]
85
- end
59
+ text
86
60
  end
87
61
  end
88
62
  end
data/lib/arrow/table-list-formatter.rb ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright 2018 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ module Arrow
16
+ class TableListFormatter < TableFormatter
17
+ private
18
+ def format_header(text, columns)
19
+ end
20
+
21
+ def format_rows(text, columns, rows, n_digits, start_offset)
22
+ rows.each_with_index do |row, nth_row|
23
+ text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n")
24
+ row.each_with_index do |column_value, nth_column|
25
+ column = columns[nth_column]
26
+ text << "#{column.name}: #{column_value}\n"
27
+ end
28
+ end
29
+ end
30
+
31
+ def format_ellipsis(text)
32
+ text << "...\n"
33
+ end
34
+ end
35
+ end
data/lib/arrow/table-table-formatter.rb ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ require "time"
16
+
17
+ module Arrow
18
+ class TableTableFormatter < TableFormatter
19
+ private
20
+ def format_header(text, columns)
21
+ columns.each do |column|
22
+ text << "\t"
23
+ text << format_column_name(column)
24
+ end
25
+ text << "\n"
26
+ end
27
+
28
+ FLOAT_N_DIGITS = 10
29
+ def format_column_name(column)
30
+ case column.data_type
31
+ when TimestampDataType
32
+ "%*s" % [Time.now.iso8601.size, column.name]
33
+ when FloatDataType, DoubleDataType
34
+ "%*s" % [FLOAT_N_DIGITS, column.name]
35
+ else
36
+ column.name
37
+ end
38
+ end
39
+
40
+ def format_rows(text, columns, rows, n_digits, start_offset)
41
+ rows.each_with_index do |row, nth_row|
42
+ text << ("%*d" % [n_digits, start_offset + nth_row])
43
+ row.each_with_index do |column_value, nth_column|
44
+ text << "\t"
45
+ column = columns[nth_column]
46
+ text << format_column_value(column, column_value)
47
+ end
48
+ text << "\n"
49
+ end
50
+ end
51
+
52
+ def format_column_value(column, value)
53
+ case value
54
+ when Time
55
+ value.iso8601
56
+ when Float
57
+ "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
58
+ when Integer
59
+ "%*d" % [column.name.size, value]
60
+ else
61
+ "%-*s" % [column.name.size, value.to_s]
62
+ end
63
+ end
64
+
65
+ def format_ellipsis(text)
66
+ text << "...\n"
67
+ end
68
+ end
69
+ end
data/lib/arrow/table.rb CHANGED
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ require "arrow/group"
15
16
  require "arrow/record-containable"
16
17
 
17
18
  module Arrow
@@ -61,24 +62,10 @@ module Arrow
61
62
  end
62
63
  end
63
64
 
64
- # TODO
65
- #
66
- # @return [Arrow::Column, Array<Arrow::Column>, nil]
67
- def [](*args)
68
- if args.size == 1
69
- find_column(args[0])
70
- else
71
- new_columns = args.collect do |column_name|
72
- column = find_column(column_name)
73
- if column.nil?
74
- message = "unknown column: <#{column_name.inspect}>: #{inspect}"
75
- raise ArgumentError, message
76
- end
77
- column
78
- end
79
- self.class.new(schema, new_columns)
80
- end
81
- end
65
+ alias_method :size, :n_rows
66
+ alias_method :length, :n_rows
67
+
68
+ alias_method :[], :find_column
82
69
 
83
70
  # TODO
84
71
  #
@@ -100,39 +87,32 @@ module Arrow
100
87
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
101
88
  case slicer
102
89
  when Integer
90
+ slicer += n_rows if slicer < 0
103
91
  ranges << [slicer, slicer]
104
92
  when Range
105
93
  from = slicer.first
106
94
  to = slicer.last
107
95
  to -= 1 if slicer.exclude_end?
96
+ from += n_rows if from < 0
97
+ to += n_rows if to < 0
108
98
  ranges << [from, to]
109
99
  when ::Array
110
100
  from = slicer[0]
101
+ from += n_rows if from < 0
111
102
  to = from + slicer[1] - 1
112
103
  ranges << [from, to]
113
- when BooleanArray
114
- in_target = false
115
- target_start = nil
116
- slicer.each_with_index do |is_target, i|
117
- if is_target
118
- unless in_target
119
- target_start = i
120
- in_target = true
121
- end
122
- else
123
- if in_target
124
- ranges << [target_start, i - 1]
125
- target_start = nil
126
- in_target = false
127
- end
128
- end
129
- end
130
- if in_target
131
- ranges << [target_start, slicer.length - 1]
104
+ when ChunkedArray
105
+ offset = 0
106
+ slicer.each_chunk do |array|
107
+ boolean_array_to_slice_ranges(array, offset, ranges)
108
+ offset += array.length
132
109
  end
110
+ when BooleanArray
111
+ boolean_array_to_slice_ranges(slicer, 0, ranges)
133
112
  else
134
- message = "slicer must be Integer, Range, [from, to] or " +
135
- "Arrow::BooleanArray, Arrow::Slicer::Condition: #{slicer.inspect}"
113
+ message = "slicer must be Integer, Range, [from, to], " +
114
+ "Arrow::ChunkedArray of Arrow::BooleanArray, " +
115
+ "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
136
116
  raise ArgumentError, message
137
117
  end
138
118
  end
@@ -209,6 +189,9 @@ module Arrow
209
189
  remove_column_raw(index)
210
190
  end
211
191
 
192
+ # TODO
193
+ #
194
+ # @return [Arrow::Table]
212
195
  def select_columns(*selectors, &block)
213
196
  if selectors.empty?
214
197
  return to_enum(__method__) unless block_given?
@@ -241,13 +224,30 @@ module Arrow
241
224
  self.class.new(selected_columns)
242
225
  end
243
226
 
227
+ def group(*keys)
228
+ Group.new(self, keys)
229
+ end
230
+
244
231
  def save(path, options={})
245
232
  saver = TableSaver.new(self, path, options)
246
233
  saver.save
247
234
  end
248
235
 
236
+ def pack
237
+ packed_columns = columns.collect do |column|
238
+ column.pack
239
+ end
240
+ self.class.new(schema, packed_columns)
241
+ end
242
+
249
243
  def to_s(options={})
250
- formatter = TableFormatter.new(self, options)
244
+ case options[:format]
245
+ when :list
246
+ formatter_class = TableListFormatter
247
+ else
248
+ formatter_class = TableTableFormatter
249
+ end
250
+ formatter = formatter_class.new(self, options)
251
251
  formatter.format
252
252
  end
253
253
 
@@ -269,6 +269,28 @@ module Arrow
269
269
  end
270
270
 
271
271
  private
272
+ def boolean_array_to_slice_ranges(array, offset, ranges)
273
+ in_target = false
274
+ target_start = nil
275
+ array.each_with_index do |is_target, i|
276
+ if is_target
277
+ unless in_target
278
+ target_start = offset + i
279
+ in_target = true
280
+ end
281
+ else
282
+ if in_target
283
+ ranges << [target_start, offset + i - 1]
284
+ target_start = nil
285
+ in_target = false
286
+ end
287
+ end
288
+ end
289
+ if in_target
290
+ ranges << [target_start, offset + array.length - 1]
291
+ end
292
+ end
293
+
272
294
  def slice_by_ranges(ranges)
273
295
  sliced_columns = columns.collect do |column|
274
296
  chunks = []