red-arrow 0.11.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of red-arrow might be problematic. Click here for more details.

Files changed (64)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +4 -0
  4. data/lib/arrow/array-builder.rb +6 -2
  5. data/lib/arrow/array.rb +6 -2
  6. data/{test/test-csv-reader.rb → lib/arrow/compression-type.rb} +16 -13
  7. data/lib/arrow/csv-loader.rb +102 -2
  8. data/lib/arrow/csv-read-options.rb +25 -0
  9. data/lib/arrow/data-type.rb +135 -0
  10. data/lib/arrow/decimal128-array-builder.rb +64 -0
  11. data/lib/arrow/decimal128-data-type.rb +69 -0
  12. data/lib/arrow/dense-union-data-type.rb +90 -0
  13. data/lib/arrow/dictionary-data-type.rb +106 -0
  14. data/lib/arrow/field-containable.rb +35 -0
  15. data/lib/arrow/field.rb +92 -8
  16. data/lib/arrow/file-output-stream.rb +34 -0
  17. data/lib/arrow/list-array-builder.rb +96 -0
  18. data/lib/arrow/list-data-type.rb +68 -0
  19. data/lib/arrow/loader.rb +30 -5
  20. data/lib/arrow/{csv-reader.rb → path-extension.rb} +19 -28
  21. data/lib/arrow/record-batch-builder.rb +115 -0
  22. data/lib/arrow/record-batch.rb +25 -0
  23. data/lib/arrow/schema.rb +97 -0
  24. data/lib/arrow/sparse-union-data-type.rb +90 -0
  25. data/lib/arrow/struct-array-builder.rb +146 -0
  26. data/lib/arrow/struct-array.rb +34 -0
  27. data/lib/arrow/struct-data-type.rb +130 -0
  28. data/lib/arrow/struct.rb +68 -0
  29. data/lib/arrow/table-loader.rb +65 -25
  30. data/lib/arrow/table-saver.rb +73 -24
  31. data/lib/arrow/table.rb +11 -2
  32. data/lib/arrow/time32-data-type.rb +61 -0
  33. data/lib/arrow/time64-data-type.rb +61 -0
  34. data/lib/arrow/timestamp-data-type.rb +57 -0
  35. data/lib/arrow/version.rb +5 -7
  36. data/lib/arrow/writable.rb +22 -0
  37. data/red-arrow.gemspec +8 -4
  38. data/test/helper.rb +1 -2
  39. data/test/test-csv-loader.rb +27 -0
  40. data/test/test-data-type.rb +47 -0
  41. data/test/test-decimal128-array-builder.rb +95 -0
  42. data/test/test-decimal128-array.rb +38 -0
  43. data/test/test-decimal128-data-type.rb +31 -0
  44. data/test/test-dense-union-data-type.rb +41 -0
  45. data/test/test-dictionary-data-type.rb +40 -0
  46. data/test/test-feather.rb +34 -0
  47. data/test/test-field.rb +71 -0
  48. data/test/test-file-output-stream.rb +54 -0
  49. data/test/test-list-array-builder.rb +79 -0
  50. data/test/test-list-array.rb +32 -0
  51. data/test/test-list-data-type.rb +43 -0
  52. data/test/test-record-batch-builder.rb +116 -0
  53. data/test/test-record-batch.rb +82 -27
  54. data/test/test-schema.rb +104 -0
  55. data/test/test-sparse-union-data-type.rb +41 -0
  56. data/test/test-struct-array-builder.rb +180 -0
  57. data/test/test-struct-array.rb +60 -15
  58. data/test/test-struct-data-type.rb +112 -0
  59. data/test/test-struct.rb +81 -0
  60. data/test/test-table.rb +165 -29
  61. data/test/test-time32-data-type.rb +42 -0
  62. data/test/test-time64-data-type.rb +42 -0
  63. data/test/test-timestamp-data-type.rb +42 -0
  64. metadata +99 -10
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 42b3a66d39aaa8365dcbb141542f4e6d4b0a0a6b6848cd7a386bd8ec3e01585d
4
- data.tar.gz: c3b10778cb4cb2d1647d25e600120c38d470cf95f9cc14d6ffb9cb2bbf3fe50f
3
+ metadata.gz: 847a4994cc15fb50df335c7231c942d49392f1ec528b85647b2bbe6fb4e82f7b
4
+ data.tar.gz: 4337680e47dea67107a1fef863d66e936b9c5d20b2cb6698e879b774eada3c88
5
5
  SHA512:
6
- metadata.gz: 625c274985358822083c9be7082badd2581df26bbb0a428e620fc8a4c55bc569f624819064f4f690e5c31b1abf2265c605a48affe1724d0bf3aec3a5ed1a6fd5
7
- data.tar.gz: 9934efa27ff209c2fdfa4a47254d1368f06f18a7e7affa694fd94fc26f1ceb8aac7ff61c57228542a6a342c744474d16f42c5aebef95c5706cac370ccee1ba11
6
+ metadata.gz: c72b26d9b4f488c4d00184ea3243056c69a601fe59fba80a7423075b19034b0db7ad12dbbf5e210ea4b66607acd2ad7123a85057a27266b21665e143961bccb7
7
+ data.tar.gz: 1fb8c82007a25cac5e99d41f3bb879ed86ad1f9deab320f45470252513fada6c6f972f7ddb0ed3f38ca1dcdf7de3f5662a3fa28536a4981b9e5993dd5bc666a8
data/README.md CHANGED
@@ -39,7 +39,7 @@ Note that the Apache Arrow GLib packages are "unofficial". "Official" packages w
39
39
 
40
40
  Install Red Arrow after you install Apache Arrow GLib:
41
41
 
42
- ```text
42
+ ```console
43
43
  % gem install red-arrow
44
44
  ```
45
45
 
data/Rakefile CHANGED
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "rubygems"
21
21
  require "bundler/gem_helper"
22
+ require "yard"
22
23
 
23
24
  base_dir = File.join(__dir__)
24
25
 
@@ -37,3 +38,6 @@ task :test do
37
38
  end
38
39
 
39
40
  task default: :test
41
+
42
+ YARD::Rake::YardocTask.new do |task|
43
+ end
data/lib/arrow/array-builder.rb CHANGED
@@ -65,6 +65,12 @@ module Arrow
65
65
  end
66
66
 
67
67
  def build(values)
68
+ append(*values)
69
+ finish
70
+ end
71
+
72
+ # @since 0.12.0
73
+ def append(*values)
68
74
  value_convertable = respond_to?(:convert_to_arrow_value, true)
69
75
  start_index = 0
70
76
  current_index = 0
@@ -111,8 +117,6 @@ module Arrow
111
117
  append_nulls(current_index - start_index)
112
118
  end
113
119
  end
114
-
115
- finish
116
120
  end
117
121
 
118
122
  def append_nulls(n)
data/lib/arrow/array.rb CHANGED
@@ -20,11 +20,15 @@ module Arrow
20
20
  include Enumerable
21
21
 
22
22
  class << self
23
- def new(values)
23
+ def new(*args)
24
24
  builder_class_name = "#{name}Builder"
25
25
  if const_defined?(builder_class_name)
26
26
  builder_class = const_get(builder_class_name)
27
- builder_class.build(values)
27
+ if args.size == builder_class.method(:build).arity
28
+ builder_class.build(*args)
29
+ else
30
+ super
31
+ end
28
32
  else
29
33
  super
30
34
  end
data/{test/test-csv-reader.rb → lib/arrow/compression-type.rb} RENAMED
@@ -15,20 +15,23 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- class CSVReaderTest < Test::Unit::TestCase
19
- include Helper::Fixture
18
+ module Arrow
19
+ class CompressionType
20
+ EXTENSIONS = {}
21
+ values.each do |value|
22
+ case value
23
+ when UNCOMPRESSED
24
+ when GZIP
25
+ EXTENSIONS["gz"] = value
26
+ else
27
+ EXTENSIONS[value.nick] = value
28
+ end
29
+ end
20
30
 
21
- test("#read") do
22
- CSV.open(fixture_path("with-header.csv").to_s,
23
- headers: true,
24
- skip_lines: /^#/) do |csv|
25
- reader = Arrow::CSVReader.new(csv)
26
- assert_equal(<<-TABLE, reader.read.to_s)
27
- name score
28
- 0 alice 10
29
- 1 bob 29
30
- 2 chris -1
31
- TABLE
31
+ class << self
32
+ def resolve_extension(extension)
33
+ EXTENSIONS[extension.to_s]
34
+ end
32
35
  end
33
36
  end
34
37
  end
data/lib/arrow/csv-loader.rb CHANGED
@@ -30,6 +30,7 @@ module Arrow
30
30
  def initialize(path_or_data, **options)
31
31
  @path_or_data = path_or_data
32
32
  @options = options
33
+ @compression = @options.delete(:compression)
33
34
  end
34
35
 
35
36
  def load
@@ -60,11 +61,85 @@ module Arrow
60
61
  end
61
62
 
62
63
  def read_csv(csv)
63
- reader = CSVReader.new(csv)
64
- reader.read
64
+ values_set = []
65
+ csv.each do |row|
66
+ if row.is_a?(CSV::Row)
67
+ row = row.collect(&:last)
68
+ end
69
+ row.each_with_index do |value, i|
70
+ values = (values_set[i] ||= [])
71
+ values << value
72
+ end
73
+ end
74
+ return nil if values_set.empty?
75
+
76
+ arrays = values_set.collect.with_index do |values, i|
77
+ ArrayBuilder.build(values)
78
+ end
79
+ if csv.headers
80
+ names = csv.headers
81
+ else
82
+ names = arrays.size.times.collect(&:to_s)
83
+ end
84
+ raw_table = {}
85
+ names.each_with_index do |name, i|
86
+ raw_table[name] = arrays[i]
87
+ end
88
+ Table.new(raw_table)
89
+ end
90
+
91
+ def reader_options
92
+ options = CSVReadOptions.new
93
+ @options.each do |key, value|
94
+ case key
95
+ when :headers
96
+ if value
97
+ options.n_header_rows = 1
98
+ else
99
+ options.n_header_rows = 0
100
+ end
101
+ when :column_types
102
+ value.each do |name, type|
103
+ options.add_column_type(name, type)
104
+ end
105
+ when :schema
106
+ options.add_schema(value)
107
+ else
108
+ setter = "#{key}="
109
+ if options.respond_to?(setter)
110
+ options.__send__(setter, value)
111
+ else
112
+ return nil
113
+ end
114
+ end
115
+ end
116
+ options
117
+ end
118
+
119
+ def open_input(raw_input)
120
+ if @compression
121
+ codec = Codec.new(@compression)
122
+ CompressedInputStream.open(codec, raw_input) do |input|
123
+ yield(input)
124
+ end
125
+ else
126
+ yield(raw_input)
127
+ end
65
128
  end
66
129
 
67
130
  def load_from_path(path)
131
+ options = reader_options
132
+ if options
133
+ begin
134
+ MemoryMappedInputStream.open(path.to_s) do |raw_input|
135
+ open_input(raw_input) do |input|
136
+ return CSVReader.new(input, options).read
137
+ end
138
+ end
139
+ rescue Arrow::Error::Invalid
140
+ end
141
+ end
142
+
68
143
  options = update_csv_parse_options(@options, :open_csv, path)
69
144
  open_csv(path, **options) do |csv|
70
145
  read_csv(csv)
@@ -72,6 +147,18 @@ module Arrow
72
147
  end
73
148
 
74
149
  def load_data(data)
150
+ options = reader_options
151
+ if options
152
+ begin
153
+ BufferInputStream.open(Buffer.new(data)) do |raw_input|
154
+ open_input(raw_input) do |input|
155
+ return CSVReader.new(input, options).read
156
+ end
157
+ end
158
+ rescue Arrow::Error::Invalid
159
+ end
160
+ end
161
+
75
162
  options = update_csv_parse_options(@options, :parse_csv_data, data)
76
163
  parse_csv_data(data, **options) do |csv|
77
164
  read_csv(csv)
@@ -119,6 +206,11 @@ module Arrow
119
206
  end
120
207
  end
121
208
 
209
+ AVAILABLE_CSV_PARSE_OPTIONS = {}
210
+ CSV.instance_method(:initialize).parameters.each do |type, name|
211
+ AVAILABLE_CSV_PARSE_OPTIONS[name] = true if type == :key
212
+ end
213
+
122
214
  def update_csv_parse_options(options, create_csv, *args)
123
215
  if options.key?(:converters)
124
216
  new_options = options.dup
@@ -127,6 +219,14 @@ module Arrow
127
219
  new_options = options.merge(converters: converters)
128
220
  end
129
221
 
222
+ # TODO: Support :schema and :column_types
223
+
224
+ unless AVAILABLE_CSV_PARSE_OPTIONS.empty?
225
+ new_options.select! do |key, value|
226
+ AVAILABLE_CSV_PARSE_OPTIONS.key?(key)
227
+ end
228
+ end
229
+
130
230
  unless options.key?(:headers)
131
231
  __send__(create_csv, *args, **new_options) do |csv|
132
232
  new_options[:headers] = have_header?(csv)
data/lib/arrow/csv-read-options.rb ADDED
@@ -0,0 +1,25 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class CSVReadOptions
20
+ alias_method :add_column_type_raw, :add_column_type
21
+ def add_column_type(name, type)
22
+ add_column_type_raw(name, DataType.resolve(type))
23
+ end
24
+ end
25
+ end
data/lib/arrow/data-type.rb ADDED
@@ -0,0 +1,135 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class DataType
20
+ class << self
21
+ # Creates a new suitable {Arrow::DataType}.
22
+ #
23
+ # @overload resolve(data_type)
24
+ #
25
+ # Returns the given data type itself. This is convenient to
26
+ # use this method as {Arrow::DataType} converter.
27
+ #
28
+ # @param data_type [Arrow::DataType] The data type.
29
+ #
30
+ # @return [Arrow::DataType] The given data type itself.
31
+ #
32
+ # @overload resolve(name, *arguments)
33
+ #
34
+ # Creates a suitable data type from type name. For example,
35
+ # you can create {Arrow::BooleanDataType} from `:boolean`.
36
+ #
37
+ # @param name [String, Symbol] The type name of the data type.
38
+ #
39
+ # @param arguments [::Array] The additional information of the
40
+ # data type.
41
+ #
42
+ # For example, {Arrow::TimestampDataType} needs unit as
43
+ # additional information.
44
+ #
45
+ # @example Create a boolean data type
46
+ # Arrow::DataType.resolve(:boolean)
47
+ #
48
+ # @example Create a milliseconds unit timestamp data type
49
+ # Arrow::DataType.resolve(:timestamp, :milli)
50
+ #
51
+ # @overload resolve(description)
52
+ #
53
+ # Creates a suitable data type from data type description.
54
+ #
55
+ # Data type description is a raw `Hash`. Data type description
56
+ # must have `:type` value. `:type` is the type of the data type.
57
+ #
58
+ # If the type needs additional information, you need to
59
+ # specify it. See constructor document what information is
60
+ # needed. For example, {Arrow::ListDataType#initialize} needs
61
+ # `:field` value.
62
+ #
63
+ # @param description [Hash] The description of the data type.
64
+ #
65
+ # @option description [String, Symbol] :type The type name of
66
+ # the data type.
67
+ #
68
+ # @example Create a boolean data type
69
+ # Arrow::DataType.resolve(type: :boolean)
70
+ #
71
+ # @example Create a list data type
72
+ # Arrow::DataType.resolve(type: :list,
73
+ # field: {name: "visible", type: :boolean})
74
+ def resolve(data_type)
75
+ case data_type
76
+ when DataType
77
+ data_type
78
+ when String, Symbol
79
+ resolve_class(data_type).new
80
+ when ::Array
81
+ type, *arguments = data_type
82
+ resolve_class(type).new(*arguments)
83
+ when Hash
84
+ type = nil
85
+ description = {}
86
+ data_type.each do |key, value|
87
+ key = key.to_sym
88
+ case key
89
+ when :type
90
+ type = value
91
+ else
92
+ description[key] = value
93
+ end
94
+ end
95
+ if type.nil?
96
+ message =
97
+ "data type description must have :type value: #{data_type.inspect}"
98
+ raise ArgumentError, message
99
+ end
100
+ data_type_class = resolve_class(type)
101
+ if description.empty?
102
+ data_type_class.new
103
+ else
104
+ data_type_class.new(description)
105
+ end
106
+ else
107
+ message =
108
+ "data type must be " +
109
+ "Arrow::DataType, String, Symbol, [String, ...], [Symbol, ...] " +
110
+ "{type: String, ...} or {type: Symbol, ...}: #{data_type.inspect}"
111
+ raise ArgumentError, message
112
+ end
113
+ end
114
+
115
+ private
116
+ def resolve_class(data_type)
117
+ data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt")
118
+ data_type_class_name = "#{data_type_name}DataType"
119
+ unless Arrow.const_defined?(data_type_class_name)
120
+ available_types = []
121
+ Arrow.constants.each do |name|
122
+ if name.to_s.end_with?("DataType")
123
+ available_types << name.to_s.gsub(/DataType\z/, "").downcase.to_sym
124
+ end
125
+ end
126
+ message =
127
+ "unknown type: #{data_type.inspect}: " +
128
+ "available types: #{available_types.inspect}"
129
+ raise ArgumentError, message
130
+ end
131
+ Arrow.const_get(data_type_class_name)
132
+ end
133
+ end
134
+ end
135
+ end
data/lib/arrow/decimal128-array-builder.rb ADDED
@@ -0,0 +1,64 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ require "bigdecimal"
19
+
20
+ module Arrow
21
+ class Decimal128ArrayBuilder
22
+ class << self
23
+ def build(data_type, values)
24
+ builder = new(data_type)
25
+ builder.build(values)
26
+ end
27
+ end
28
+
29
+ alias_method :append_value_raw, :append_value
30
+ def append_value(value)
31
+ case value
32
+ when nil
33
+ return append_null
34
+ when String
35
+ value = Decimal128.new(value)
36
+ when Float
37
+ value = Decimal128.new(value.to_s)
38
+ when BigDecimal
39
+ value = Decimal128.new(value.to_s)
40
+ end
41
+ append_value_raw(value)
42
+ end
43
+
44
+ def append_values(values, is_valids=nil)
45
+ if is_valids
46
+ is_valids.each_with_index do |is_valid, i|
47
+ if is_valid
48
+ append_value(values[i])
49
+ else
50
+ append_null
51
+ end
52
+ end
53
+ else
54
+ values.each do |value|
55
+ if value.nil?
56
+ append_null
57
+ else
58
+ append_value(value)
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
64
+ end