red-arrow 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/arrow/loader.rb CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2017 Kouhei Sutou <kou@clear-code.com>
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
2
  #
3
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
4
  # you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@ module Arrow
35
35
  require "arrow/column"
36
36
  require "arrow/csv-loader"
37
37
  require "arrow/csv-reader"
38
+ require "arrow/data-type"
38
39
  require "arrow/date32-array"
39
40
  require "arrow/date32-array-builder"
40
41
  require "arrow/date64-array"
@@ -45,6 +46,8 @@ module Arrow
45
46
  require "arrow/slicer"
46
47
  require "arrow/table"
47
48
  require "arrow/table-formatter"
49
+ require "arrow/table-list-formatter"
50
+ require "arrow/table-table-formatter"
48
51
  require "arrow/table-loader"
49
52
  require "arrow/table-saver"
50
53
  require "arrow/tensor"
data/lib/arrow/slicer.rb CHANGED
@@ -110,26 +110,19 @@ module Arrow
110
110
 
111
111
  def evaluate
112
112
  data = @column.data
113
- if data.n_chunks == 1
114
- array = data.get_chunk(0)
115
- if array.is_a?(BooleanArray)
116
- array
117
- else
118
- array.cast(BooleanDataType.new)
119
- end
113
+
114
+ case @column.data_type
115
+ when BooleanDataType
116
+ data
120
117
  else
121
- raw_array = []
122
- data.each_chunk do |chunk|
123
- if chunk.is_a?(BooleanArray)
124
- boolean_array = chunk
125
- else
126
- boolean_array = chunk.cast(BooleanDataType.new)
127
- end
128
- boolean_array.each do |value|
129
- raw_array << value
118
+ if data.n_chunks == 1
119
+ data.get_chunk(0).cast(BooleanDataType.new)
120
+ else
121
+ arrays = data.each_chunk.collect do |chunk|
122
+ chunk.cast(BooleanDataType.new)
130
123
  end
124
+ ChunkedArray.new(arrays)
131
125
  end
132
- BooleanArray.new(raw_array)
133
126
  end
134
127
  end
135
128
 
@@ -141,6 +134,10 @@ module Arrow
141
134
  self == nil
142
135
  end
143
136
 
137
+ def valid?
138
+ self != nil
139
+ end
140
+
144
141
  def ==(value)
145
142
  EqualCondition.new(@column, value)
146
143
  end
@@ -165,6 +162,10 @@ module Arrow
165
162
  GreaterEqualCondition.new(@column, value)
166
163
  end
167
164
 
165
+ def in?(values)
166
+ InCondition.new(@column, values)
167
+ end
168
+
168
169
  def select(&block)
169
170
  SelectCondition.new(@column, block)
170
171
  end
@@ -245,8 +246,12 @@ module Arrow
245
246
  def evaluate
246
247
  case @value
247
248
  when nil
248
- raw_array = @column.collect do |value|
249
- not value.nil?
249
+ if @column.n_nulls.zero?
250
+ raw_array = [true] * @column.length
251
+ else
252
+ raw_array = @column.length.times.collect do |i|
253
+ @column.valid?(i)
254
+ end
250
255
  end
251
256
  BooleanArray.new(raw_array)
252
257
  else
@@ -350,6 +355,58 @@ module Arrow
350
355
  end
351
356
  end
352
357
 
358
+ class InCondition < Condition
359
+ def initialize(column, values)
360
+ @column = column
361
+ @values = values
362
+ end
363
+
364
+ def !@
365
+ NotInCondition.new(@column, @values)
366
+ end
367
+
368
+ def evaluate
369
+ values_index = {}
370
+ @values.each do |value|
371
+ values_index[value] = true
372
+ end
373
+ raw_array = @column.collect do |value|
374
+ if value.nil?
375
+ nil
376
+ else
377
+ values_index.key?(value)
378
+ end
379
+ end
380
+ BooleanArray.new(raw_array)
381
+ end
382
+ end
383
+
384
+ class NotInCondition < Condition
385
+ def initialize(column, values)
386
+ @column = column
387
+ @values = values
388
+ end
389
+
390
+ def !@
391
+ InCondition.new(@column, @values)
392
+ end
393
+
394
+ def evaluate
395
+ values_index = {}
396
+ @values.each do |value|
397
+ values_index[value] = true
398
+ end
399
+ raw_array = @column.collect do |value|
400
+ if value.nil?
401
+ nil
402
+ else
403
+ not values_index.key?(value)
404
+ end
405
+ end
406
+ BooleanArray.new(raw_array)
407
+ end
408
+ end
409
+
353
410
  class SelectCondition < Condition
354
411
  def initialize(column, block)
355
412
  @column = column
@@ -12,8 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- require "time"
16
-
17
15
  module Arrow
18
16
  class TableFormatter
19
17
  def initialize(table, options={})
@@ -24,65 +22,41 @@ module Arrow
24
22
  def format
25
23
  text = ""
26
24
  columns = @table.columns
27
- columns.each do |column|
28
- text << "\t"
29
- text << format_column_name(column)
30
- end
31
- text << "\n"
25
+ format_header(text, columns)
32
26
 
33
27
  n_rows = @table.n_rows
34
28
  return text if n_rows.zero?
35
29
 
36
30
  border = @options[:border] || 10
37
31
  n_digits = (Math.log10(n_rows) + 1).truncate
38
- [border, n_rows].min.times do |i|
39
- format_row(text, columns, i, n_digits)
32
+ head_limit = [border, n_rows].min
33
+ head_column_values = columns.collect do |column|
34
+ column.each.take(head_limit)
40
35
  end
36
+ format_rows(text,
37
+ columns,
38
+ head_column_values.transpose,
39
+ n_digits,
40
+ 0)
41
41
  return text if n_rows <= border
42
42
 
43
- text << "...\n"
44
- [border, n_rows - border].max.upto(n_rows - 1) do |i|
45
- format_row(text, columns, i, n_digits)
43
+ tail_start = [border, n_rows - border].max
44
+ tail_limit = n_rows - tail_start
45
+ tail_column_values = columns.collect do |column|
46
+ column.reverse_each.take(tail_limit).reverse
46
47
  end
47
48
 
48
- text
49
- end
50
-
51
- private
52
- FLOAT_N_DIGITS = 10
53
- def format_column_name(column)
54
- case column.data_type
55
- when TimestampDataType
56
- "%*s" % [Time.now.iso8601.size, column.name]
57
- when FloatDataType, DoubleDataType
58
- "%*s" % [FLOAT_N_DIGITS, column.name]
59
- else
60
- column.name
49
+ if head_limit != tail_start
50
+ format_ellipsis(text)
61
51
  end
62
- end
63
52
 
64
- def format_row(text, columns, i, n_digits)
65
- text << ("%*d" % [n_digits, i])
66
- columns.each do |column|
67
- text << "\t"
68
- text << format_column_value(column, i)
69
- end
70
- text << "\n"
71
- text
72
- end
53
+ format_rows(text,
54
+ columns,
55
+ tail_column_values.transpose,
56
+ n_digits,
57
+ tail_start)
73
58
 
74
- def format_column_value(column, i)
75
- value = column[i]
76
- case value
77
- when Time
78
- value.iso8601
79
- when Float
80
- "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
81
- when Integer
82
- "%*d" % [column.name.size, value]
83
- else
84
- "%-*s" % [column.name.size, value.to_s]
85
- end
59
+ text
86
60
  end
87
61
  end
88
62
  end
@@ -0,0 +1,35 @@
1
+ # Copyright 2018 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ module Arrow
16
+ class TableListFormatter < TableFormatter
17
+ private
18
+ def format_header(text, columns)
19
+ end
20
+
21
+ def format_rows(text, columns, rows, n_digits, start_offset)
22
+ rows.each_with_index do |row, nth_row|
23
+ text << ("=" * 20 + " #{start_offset + nth_row} " + "=" * 20 + "\n")
24
+ row.each_with_index do |column_value, nth_column|
25
+ column = columns[nth_column]
26
+ text << "#{column.name}: #{column_value}\n"
27
+ end
28
+ end
29
+ end
30
+
31
+ def format_ellipsis(text)
32
+ text << "...\n"
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,69 @@
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ require "time"
16
+
17
+ module Arrow
18
+ class TableTableFormatter < TableFormatter
19
+ private
20
+ def format_header(text, columns)
21
+ columns.each do |column|
22
+ text << "\t"
23
+ text << format_column_name(column)
24
+ end
25
+ text << "\n"
26
+ end
27
+
28
+ FLOAT_N_DIGITS = 10
29
+ def format_column_name(column)
30
+ case column.data_type
31
+ when TimestampDataType
32
+ "%*s" % [Time.now.iso8601.size, column.name]
33
+ when FloatDataType, DoubleDataType
34
+ "%*s" % [FLOAT_N_DIGITS, column.name]
35
+ else
36
+ column.name
37
+ end
38
+ end
39
+
40
+ def format_rows(text, columns, rows, n_digits, start_offset)
41
+ rows.each_with_index do |row, nth_row|
42
+ text << ("%*d" % [n_digits, start_offset + nth_row])
43
+ row.each_with_index do |column_value, nth_column|
44
+ text << "\t"
45
+ column = columns[nth_column]
46
+ text << format_column_value(column, column_value)
47
+ end
48
+ text << "\n"
49
+ end
50
+ end
51
+
52
+ def format_column_value(column, value)
53
+ case value
54
+ when Time
55
+ value.iso8601
56
+ when Float
57
+ "%*f" % [[column.name.size, FLOAT_N_DIGITS].max, value]
58
+ when Integer
59
+ "%*d" % [column.name.size, value]
60
+ else
61
+ "%-*s" % [column.name.size, value.to_s]
62
+ end
63
+ end
64
+
65
+ def format_ellipsis(text)
66
+ text << "...\n"
67
+ end
68
+ end
69
+ end
data/lib/arrow/table.rb CHANGED
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ require "arrow/group"
15
16
  require "arrow/record-containable"
16
17
 
17
18
  module Arrow
@@ -61,24 +62,10 @@ module Arrow
61
62
  end
62
63
  end
63
64
 
64
- # TODO
65
- #
66
- # @return [Arrow::Column, Array<Arrow::Column>, nil]
67
- def [](*args)
68
- if args.size == 1
69
- find_column(args[0])
70
- else
71
- new_columns = args.collect do |column_name|
72
- column = find_column(column_name)
73
- if column.nil?
74
- message = "unknown column: <#{column_name.inspect}>: #{inspect}"
75
- raise ArgumentError, message
76
- end
77
- column
78
- end
79
- self.class.new(schema, new_columns)
80
- end
81
- end
65
+ alias_method :size, :n_rows
66
+ alias_method :length, :n_rows
67
+
68
+ alias_method :[], :find_column
82
69
 
83
70
  # TODO
84
71
  #
@@ -100,39 +87,32 @@ module Arrow
100
87
  slicer = slicer.evaluate if slicer.respond_to?(:evaluate)
101
88
  case slicer
102
89
  when Integer
90
+ slicer += n_rows if slicer < 0
103
91
  ranges << [slicer, slicer]
104
92
  when Range
105
93
  from = slicer.first
106
94
  to = slicer.last
107
95
  to -= 1 if slicer.exclude_end?
96
+ from += n_rows if from < 0
97
+ to += n_rows if to < 0
108
98
  ranges << [from, to]
109
99
  when ::Array
110
100
  from = slicer[0]
101
+ from += n_rows if from < 0
111
102
  to = from + slicer[1] - 1
112
103
  ranges << [from, to]
113
- when BooleanArray
114
- in_target = false
115
- target_start = nil
116
- slicer.each_with_index do |is_target, i|
117
- if is_target
118
- unless in_target
119
- target_start = i
120
- in_target = true
121
- end
122
- else
123
- if in_target
124
- ranges << [target_start, i - 1]
125
- target_start = nil
126
- in_target = false
127
- end
128
- end
129
- end
130
- if in_target
131
- ranges << [target_start, slicer.length - 1]
104
+ when ChunkedArray
105
+ offset = 0
106
+ slicer.each_chunk do |array|
107
+ boolean_array_to_slice_ranges(array, offset, ranges)
108
+ offset += array.length
132
109
  end
110
+ when BooleanArray
111
+ boolean_array_to_slice_ranges(slicer, 0, ranges)
133
112
  else
134
- message = "slicer must be Integer, Range, [from, to] or " +
135
- "Arrow::BooleanArray, Arrow::Slicer::Condition: #{slicer.inspect}"
113
+ message = "slicer must be Integer, Range, [from, to], " +
114
+ "Arrow::ChunkedArray of Arrow::BooleanArray, " +
115
+ "Arrow::BooleanArray or Arrow::Slicer::Condition: #{slicer.inspect}"
136
116
  raise ArgumentError, message
137
117
  end
138
118
  end
@@ -209,6 +189,9 @@ module Arrow
209
189
  remove_column_raw(index)
210
190
  end
211
191
 
192
+ # TODO
193
+ #
194
+ # @return [Arrow::Table]
212
195
  def select_columns(*selectors, &block)
213
196
  if selectors.empty?
214
197
  return to_enum(__method__) unless block_given?
@@ -241,13 +224,30 @@ module Arrow
241
224
  self.class.new(selected_columns)
242
225
  end
243
226
 
227
+ def group(*keys)
228
+ Group.new(self, keys)
229
+ end
230
+
244
231
  def save(path, options={})
245
232
  saver = TableSaver.new(self, path, options)
246
233
  saver.save
247
234
  end
248
235
 
236
+ def pack
237
+ packed_columns = columns.collect do |column|
238
+ column.pack
239
+ end
240
+ self.class.new(schema, packed_columns)
241
+ end
242
+
249
243
  def to_s(options={})
250
- formatter = TableFormatter.new(self, options)
244
+ case options[:format]
245
+ when :list
246
+ formatter_class = TableListFormatter
247
+ else
248
+ formatter_class = TableTableFormatter
249
+ end
250
+ formatter = formatter_class.new(self, options)
251
251
  formatter.format
252
252
  end
253
253
 
@@ -269,6 +269,28 @@ module Arrow
269
269
  end
270
270
 
271
271
  private
272
+ def boolean_array_to_slice_ranges(array, offset, ranges)
273
+ in_target = false
274
+ target_start = nil
275
+ array.each_with_index do |is_target, i|
276
+ if is_target
277
+ unless in_target
278
+ target_start = offset + i
279
+ in_target = true
280
+ end
281
+ else
282
+ if in_target
283
+ ranges << [target_start, offset + i - 1]
284
+ target_start = nil
285
+ in_target = false
286
+ end
287
+ end
288
+ end
289
+ if in_target
290
+ ranges << [target_start, offset + array.length - 1]
291
+ end
292
+ end
293
+
272
294
  def slice_by_ranges(ranges)
273
295
  sliced_columns = columns.collect do |column|
274
296
  chunks = []