red-arrow 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +4 -0
  4. data/lib/arrow/array-builder.rb +6 -2
  5. data/lib/arrow/array.rb +6 -2
  6. data/{test/test-csv-reader.rb → lib/arrow/compression-type.rb} +16 -13
  7. data/lib/arrow/csv-loader.rb +102 -2
  8. data/lib/arrow/csv-read-options.rb +25 -0
  9. data/lib/arrow/data-type.rb +135 -0
  10. data/lib/arrow/decimal128-array-builder.rb +64 -0
  11. data/lib/arrow/decimal128-data-type.rb +69 -0
  12. data/lib/arrow/dense-union-data-type.rb +90 -0
  13. data/lib/arrow/dictionary-data-type.rb +106 -0
  14. data/lib/arrow/field-containable.rb +35 -0
  15. data/lib/arrow/field.rb +92 -8
  16. data/lib/arrow/file-output-stream.rb +34 -0
  17. data/lib/arrow/list-array-builder.rb +96 -0
  18. data/lib/arrow/list-data-type.rb +68 -0
  19. data/lib/arrow/loader.rb +30 -5
  20. data/lib/arrow/{csv-reader.rb → path-extension.rb} +19 -28
  21. data/lib/arrow/record-batch-builder.rb +115 -0
  22. data/lib/arrow/record-batch.rb +25 -0
  23. data/lib/arrow/schema.rb +97 -0
  24. data/lib/arrow/sparse-union-data-type.rb +90 -0
  25. data/lib/arrow/struct-array-builder.rb +146 -0
  26. data/lib/arrow/struct-array.rb +34 -0
  27. data/lib/arrow/struct-data-type.rb +130 -0
  28. data/lib/arrow/struct.rb +68 -0
  29. data/lib/arrow/table-loader.rb +65 -25
  30. data/lib/arrow/table-saver.rb +73 -24
  31. data/lib/arrow/table.rb +11 -2
  32. data/lib/arrow/time32-data-type.rb +61 -0
  33. data/lib/arrow/time64-data-type.rb +61 -0
  34. data/lib/arrow/timestamp-data-type.rb +57 -0
  35. data/lib/arrow/version.rb +5 -7
  36. data/lib/arrow/writable.rb +22 -0
  37. data/red-arrow.gemspec +8 -4
  38. data/test/helper.rb +1 -2
  39. data/test/test-csv-loader.rb +27 -0
  40. data/test/test-data-type.rb +47 -0
  41. data/test/test-decimal128-array-builder.rb +95 -0
  42. data/test/test-decimal128-array.rb +38 -0
  43. data/test/test-decimal128-data-type.rb +31 -0
  44. data/test/test-dense-union-data-type.rb +41 -0
  45. data/test/test-dictionary-data-type.rb +40 -0
  46. data/test/test-feather.rb +34 -0
  47. data/test/test-field.rb +71 -0
  48. data/test/test-file-output-stream.rb +54 -0
  49. data/test/test-list-array-builder.rb +79 -0
  50. data/test/test-list-array.rb +32 -0
  51. data/test/test-list-data-type.rb +43 -0
  52. data/test/test-record-batch-builder.rb +116 -0
  53. data/test/test-record-batch.rb +82 -27
  54. data/test/test-schema.rb +104 -0
  55. data/test/test-sparse-union-data-type.rb +41 -0
  56. data/test/test-struct-array-builder.rb +180 -0
  57. data/test/test-struct-array.rb +60 -15
  58. data/test/test-struct-data-type.rb +112 -0
  59. data/test/test-struct.rb +81 -0
  60. data/test/test-table.rb +165 -29
  61. data/test/test-time32-data-type.rb +42 -0
  62. data/test/test-time64-data-type.rb +42 -0
  63. data/test/test-timestamp-data-type.rb +42 -0
  64. metadata +99 -10
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 42b3a66d39aaa8365dcbb141542f4e6d4b0a0a6b6848cd7a386bd8ec3e01585d
4
- data.tar.gz: c3b10778cb4cb2d1647d25e600120c38d470cf95f9cc14d6ffb9cb2bbf3fe50f
3
+ metadata.gz: 847a4994cc15fb50df335c7231c942d49392f1ec528b85647b2bbe6fb4e82f7b
4
+ data.tar.gz: 4337680e47dea67107a1fef863d66e936b9c5d20b2cb6698e879b774eada3c88
5
5
  SHA512:
6
- metadata.gz: 625c274985358822083c9be7082badd2581df26bbb0a428e620fc8a4c55bc569f624819064f4f690e5c31b1abf2265c605a48affe1724d0bf3aec3a5ed1a6fd5
7
- data.tar.gz: 9934efa27ff209c2fdfa4a47254d1368f06f18a7e7affa694fd94fc26f1ceb8aac7ff61c57228542a6a342c744474d16f42c5aebef95c5706cac370ccee1ba11
6
+ metadata.gz: c72b26d9b4f488c4d00184ea3243056c69a601fe59fba80a7423075b19034b0db7ad12dbbf5e210ea4b66607acd2ad7123a85057a27266b21665e143961bccb7
7
+ data.tar.gz: 1fb8c82007a25cac5e99d41f3bb879ed86ad1f9deab320f45470252513fada6c6f972f7ddb0ed3f38ca1dcdf7de3f5662a3fa28536a4981b9e5993dd5bc666a8
data/README.md CHANGED
@@ -39,7 +39,7 @@ Note that the Apache Arrow GLib packages are "unofficial". "Official" packages w
39
39
 
40
40
  Install Red Arrow after you install Apache Arrow GLib:
41
41
 
42
- ```text
42
+ ```console
43
43
  % gem install red-arrow
44
44
  ```
45
45
 
data/Rakefile CHANGED
@@ -19,6 +19,7 @@
19
19
 
20
20
  require "rubygems"
21
21
  require "bundler/gem_helper"
22
+ require "yard"
22
23
 
23
24
  base_dir = File.join(__dir__)
24
25
 
@@ -37,3 +38,6 @@ task :test do
37
38
  end
38
39
 
39
40
  task default: :test
41
+
42
+ YARD::Rake::YardocTask.new do |task|
43
+ end
@@ -65,6 +65,12 @@ module Arrow
65
65
  end
66
66
 
67
67
  def build(values)
68
+ append(*values)
69
+ finish
70
+ end
71
+
72
+ # @since 0.12.0
73
+ def append(*values)
68
74
  value_convertable = respond_to?(:convert_to_arrow_value, true)
69
75
  start_index = 0
70
76
  current_index = 0
@@ -111,8 +117,6 @@ module Arrow
111
117
  append_nulls(current_index - start_index)
112
118
  end
113
119
  end
114
-
115
- finish
116
120
  end
117
121
 
118
122
  def append_nulls(n)
@@ -20,11 +20,15 @@ module Arrow
20
20
  include Enumerable
21
21
 
22
22
  class << self
23
- def new(values)
23
+ def new(*args)
24
24
  builder_class_name = "#{name}Builder"
25
25
  if const_defined?(builder_class_name)
26
26
  builder_class = const_get(builder_class_name)
27
- builder_class.build(values)
27
+ if args.size == builder_class.method(:build).arity
28
+ builder_class.build(*args)
29
+ else
30
+ super
31
+ end
28
32
  else
29
33
  super
30
34
  end
@@ -15,20 +15,23 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- class CSVReaderTest < Test::Unit::TestCase
19
- include Helper::Fixture
18
+ module Arrow
19
+ class CompressionType
20
+ EXTENSIONS = {}
21
+ values.each do |value|
22
+ case value
23
+ when UNCOMPRESSED
24
+ when GZIP
25
+ EXTENSIONS["gz"] = value
26
+ else
27
+ EXTENSIONS[value.nick] = value
28
+ end
29
+ end
20
30
 
21
- test("#read") do
22
- CSV.open(fixture_path("with-header.csv").to_s,
23
- headers: true,
24
- skip_lines: /^#/) do |csv|
25
- reader = Arrow::CSVReader.new(csv)
26
- assert_equal(<<-TABLE, reader.read.to_s)
27
- name score
28
- 0 alice 10
29
- 1 bob 29
30
- 2 chris -1
31
- TABLE
31
+ class << self
32
+ def resolve_extension(extension)
33
+ EXTENSIONS[extension.to_s]
34
+ end
32
35
  end
33
36
  end
34
37
  end
@@ -30,6 +30,7 @@ module Arrow
30
30
  def initialize(path_or_data, **options)
31
31
  @path_or_data = path_or_data
32
32
  @options = options
33
+ @compression = @options.delete(:compression)
33
34
  end
34
35
 
35
36
  def load
@@ -60,11 +61,85 @@ module Arrow
60
61
  end
61
62
 
62
63
  def read_csv(csv)
63
- reader = CSVReader.new(csv)
64
- reader.read
64
+ values_set = []
65
+ csv.each do |row|
66
+ if row.is_a?(CSV::Row)
67
+ row = row.collect(&:last)
68
+ end
69
+ row.each_with_index do |value, i|
70
+ values = (values_set[i] ||= [])
71
+ values << value
72
+ end
73
+ end
74
+ return nil if values_set.empty?
75
+
76
+ arrays = values_set.collect.with_index do |values, i|
77
+ ArrayBuilder.build(values)
78
+ end
79
+ if csv.headers
80
+ names = csv.headers
81
+ else
82
+ names = arrays.size.times.collect(&:to_s)
83
+ end
84
+ raw_table = {}
85
+ names.each_with_index do |name, i|
86
+ raw_table[name] = arrays[i]
87
+ end
88
+ Table.new(raw_table)
89
+ end
90
+
91
+ def reader_options
92
+ options = CSVReadOptions.new
93
+ @options.each do |key, value|
94
+ case key
95
+ when :headers
96
+ if value
97
+ options.n_header_rows = 1
98
+ else
99
+ options.n_header_rows = 0
100
+ end
101
+ when :column_types
102
+ value.each do |name, type|
103
+ options.add_column_type(name, type)
104
+ end
105
+ when :schema
106
+ options.add_schema(value)
107
+ else
108
+ setter = "#{key}="
109
+ if options.respond_to?(setter)
110
+ options.__send__(setter, value)
111
+ else
112
+ return nil
113
+ end
114
+ end
115
+ end
116
+ options
117
+ end
118
+
119
+ def open_input(raw_input)
120
+ if @compression
121
+ codec = Codec.new(@compression)
122
+ CompressedInputStream.open(codec, raw_input) do |input|
123
+ yield(input)
124
+ end
125
+ else
126
+ yield(raw_input)
127
+ end
65
128
  end
66
129
 
67
130
  def load_from_path(path)
131
+ options = reader_options
132
+ if options
133
+ begin
134
+ MemoryMappedInputStream.open(path.to_s) do |raw_input|
135
+ open_input(raw_input) do |input|
136
+ return CSVReader.new(input, options).read
137
+ end
138
+ end
139
+ rescue Arrow::Error::Invalid
140
+ end
141
+ end
142
+
68
143
  options = update_csv_parse_options(@options, :open_csv, path)
69
144
  open_csv(path, **options) do |csv|
70
145
  read_csv(csv)
@@ -72,6 +147,18 @@ module Arrow
72
147
  end
73
148
 
74
149
  def load_data(data)
150
+ options = reader_options
151
+ if options
152
+ begin
153
+ BufferInputStream.open(Buffer.new(data)) do |raw_input|
154
+ open_input(raw_input) do |input|
155
+ return CSVReader.new(input, options).read
156
+ end
157
+ end
158
+ rescue Arrow::Error::Invalid
159
+ end
160
+ end
161
+
75
162
  options = update_csv_parse_options(@options, :parse_csv_data, data)
76
163
  parse_csv_data(data, **options) do |csv|
77
164
  read_csv(csv)
@@ -119,6 +206,11 @@ module Arrow
119
206
  end
120
207
  end
121
208
 
209
+ AVAILABLE_CSV_PARSE_OPTIONS = {}
210
+ CSV.instance_method(:initialize).parameters.each do |type, name|
211
+ AVAILABLE_CSV_PARSE_OPTIONS[name] = true if type == :key
212
+ end
213
+
122
214
  def update_csv_parse_options(options, create_csv, *args)
123
215
  if options.key?(:converters)
124
216
  new_options = options.dup
@@ -127,6 +219,14 @@ module Arrow
127
219
  new_options = options.merge(converters: converters)
128
220
  end
129
221
 
222
+ # TODO: Support :schema and :column_types
223
+
224
+ unless AVAILABLE_CSV_PARSE_OPTIONS.empty?
225
+ new_options.select! do |key, value|
226
+ AVAILABLE_CSV_PARSE_OPTIONS.key?(key)
227
+ end
228
+ end
229
+
130
230
  unless options.key?(:headers)
131
231
  __send__(create_csv, *args, **new_options) do |csv|
132
232
  new_options[:headers] = have_header?(csv)
@@ -0,0 +1,25 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class CSVReadOptions
20
+ alias_method :add_column_type_raw, :add_column_type
21
+ def add_column_type(name, type)
22
+ add_column_type_raw(name, DataType.resolve(type))
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,135 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ module Arrow
19
+ class DataType
20
+ class << self
21
+ # Creates a new suitable {Arrow::DataType}.
22
+ #
23
+ # @overload resolve(data_type)
24
+ #
25
+ # Returns the given data type itself. This is convenient to
26
+ # use this method as {Arrow::DataType} converter.
27
+ #
28
+ # @param data_type [Arrow::DataType] The data type.
29
+ #
30
+ # @return [Arrow::DataType] The given data type itself.
31
+ #
32
+ # @overload resolve(name, *arguments)
33
+ #
34
+ # Creates a suitable data type from type name. For example,
35
+ # you can create {Arrow::BooleanDataType} from `:boolean`.
36
+ #
37
+ # @param name [String, Symbol] The type name of the data type.
38
+ #
39
+ # @param arguments [::Array] The additional information of the
40
+ # data type.
41
+ #
42
+ # For example, {Arrow::TimestampDataType} needs unit as
43
+ # additional information.
44
+ #
45
+ # @example Create a boolean data type
46
+ # Arrow::DataType.resolve(:boolean)
47
+ #
48
+ # @example Create a milliseconds unit timestamp data type
49
+ # Arrow::DataType.resolve(:timestamp, :milli)
50
+ #
51
+ # @overload resolve(description)
52
+ #
53
+ # Creates a suitable data type from data type description.
54
+ #
55
+ # Data type description is a raw `Hash`. Data type description
56
+ # must have `:type` value. `:type` is the type of the data type.
57
+ #
58
+ # If the type needs additional information, you need to
59
+ # specify it. See constructor document what information is
60
+ # needed. For example, {Arrow::ListDataType#initialize} needs
61
+ # `:field` value.
62
+ #
63
+ # @param description [Hash] The description of the data type.
64
+ #
65
+ # @option description [String, Symbol] :type The type name of
66
+ # the data type.
67
+ #
68
+ # @example Create a boolean data type
69
+ # Arrow::DataType.resolve(type: :boolean)
70
+ #
71
+ # @example Create a list data type
72
+ # Arrow::DataType.resolve(type: :list,
73
+ # field: {name: "visible", type: :boolean})
74
+ def resolve(data_type)
75
+ case data_type
76
+ when DataType
77
+ data_type
78
+ when String, Symbol
79
+ resolve_class(data_type).new
80
+ when ::Array
81
+ type, *arguments = data_type
82
+ resolve_class(type).new(*arguments)
83
+ when Hash
84
+ type = nil
85
+ description = {}
86
+ data_type.each do |key, value|
87
+ key = key.to_sym
88
+ case key
89
+ when :type
90
+ type = value
91
+ else
92
+ description[key] = value
93
+ end
94
+ end
95
+ if type.nil?
96
+ message =
97
+ "data type description must have :type value: #{data_type.inspect}"
98
+ raise ArgumentError, message
99
+ end
100
+ data_type_class = resolve_class(type)
101
+ if description.empty?
102
+ data_type_class.new
103
+ else
104
+ data_type_class.new(description)
105
+ end
106
+ else
107
+ message =
108
+ "data type must be " +
109
+ "Arrow::DataType, String, Symbol, [String, ...], [Symbol, ...] " +
110
+ "{type: String, ...} or {type: Symbol, ...}: #{data_type.inspect}"
111
+ raise ArgumentError, message
112
+ end
113
+ end
114
+
115
+ private
116
+ def resolve_class(data_type)
117
+ data_type_name = data_type.to_s.capitalize.gsub(/\AUint/, "UInt")
118
+ data_type_class_name = "#{data_type_name}DataType"
119
+ unless Arrow.const_defined?(data_type_class_name)
120
+ available_types = []
121
+ Arrow.constants.each do |name|
122
+ if name.to_s.end_with?("DataType")
123
+ available_types << name.to_s.gsub(/DataType\z/, "").downcase.to_sym
124
+ end
125
+ end
126
+ message =
127
+ "unknown type: #{data_type.inspect}: " +
128
+ "available types: #{available_types.inspect}"
129
+ raise ArgumentError, message
130
+ end
131
+ Arrow.const_get(data_type_class_name)
132
+ end
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,64 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing,
12
+ # software distributed under the License is distributed on an
13
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
+ # KIND, either express or implied. See the License for the
15
+ # specific language governing permissions and limitations
16
+ # under the License.
17
+
18
+ require "bigdecimal"
19
+
20
+ module Arrow
21
+ class Decimal128ArrayBuilder
22
+ class << self
23
+ def build(data_type, values)
24
+ builder = new(data_type)
25
+ builder.build(values)
26
+ end
27
+ end
28
+
29
+ alias_method :append_value_raw, :append_value
30
+ def append_value(value)
31
+ case value
32
+ when nil
33
+ return append_null
34
+ when String
35
+ value = Decimal128.new(value)
36
+ when Float
37
+ value = Decimal128.new(value.to_s)
38
+ when BigDecimal
39
+ value = Decimal128.new(value.to_s)
40
+ end
41
+ append_value_raw(value)
42
+ end
43
+
44
+ def append_values(values, is_valids=nil)
45
+ if is_valids
46
+ is_valids.each_with_index do |is_valid, i|
47
+ if is_valid
48
+ append_value(values[i])
49
+ else
50
+ append_null
51
+ end
52
+ end
53
+ else
54
+ values.each do |value|
55
+ if value.nil?
56
+ append_null
57
+ else
58
+ append_value(value)
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
64
+ end