red-arrow 0.8.0 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: add6b81f8b6fe0d623d022d16b358b756bf0a559
4
- data.tar.gz: ffa5678bd352df22212c7dd3d3b6beea6491bbde
3
+ metadata.gz: 59cce19c00fbc436f48f29c2d58aca2685330a63
4
+ data.tar.gz: 280b1305e60c39d74d4fd101dc43387cd398e770
5
5
  SHA512:
6
- metadata.gz: da4959b8123e205d5fd16d17704ecc1e93d29659294575ac6c5797e7f578b8ce0c43710f344c909a70b0b61def35a5d1f9d6c1d1de59679d3ad6bb037df2671f
7
- data.tar.gz: 11c7f909073eaa609aa2402a629016a27ecac19a0ac85a54468fba5d5ea433b23c41cfa6179ff176363d17c61f86b2233812440c3bb26d32459a145d14d7d06a
6
+ metadata.gz: 3ecba86d0d061186def935c54a999f1c0adf1f109a1687eefd0375792a87f34333e7a46ee7f78fe51b01fbe294166491f300d9e46a8315e19ff47a1cb3f45102
7
+ data.tar.gz: 49d09794b0d007bf9e65675e133c45cd1a90a3eec748b9c747d2e7d78e4c21d682ba503c696cb61d128fec362fa138ec50b04ab780a0bf554557417ccaf02afd
@@ -0,0 +1,15 @@
1
+ # Development
2
+
3
+ ## Naming convention
4
+
5
+ ### Reader and Writer
6
+
7
+ Reader and Writer require an opened IO stream.
8
+
9
+ ### Loader and Saver
10
+
11
+ Loader and Saver require a path. They are convenience classes.
12
+
13
+ Loader opens the path and reads data by Reader.
14
+
15
+ Saver opens the path and writes data by Writer.
@@ -1,5 +1,19 @@
1
1
  # News
2
2
 
3
+ ## 0.8.1 - 2018-01-05
4
+
5
+ ### Improvements
6
+
7
+ * `Arrow::ArrayBuilder.build`: Added generic array build support.
8
+
9
+ * `Arrow::Table#save`: Added.
10
+
11
+ * `Arrow::Table.load`: Added.
12
+
13
+ * `Arrow::CSVLoader`: Added.
14
+
15
+ * `Arrow::CSVReader.read`: Removed.
16
+
3
17
  ## 0.8.0 - 2018-01-04
4
18
 
5
19
  ### Improvements
@@ -12,16 +12,58 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ require "date"
16
+
15
17
  module Arrow
16
18
  class ArrayBuilder
17
19
  class << self
18
20
  def build(values)
19
- builder = new
20
- builder.build(values)
21
+ if self != ArrayBuilder
22
+ builder = new
23
+ return builder.build(values)
24
+ end
25
+
26
+ builder_class = nil
27
+ values.each do |value|
28
+ case value
29
+ when nil
30
+ # Ignore
31
+ nil
32
+ when true, false
33
+ return BooleanArray.new(values)
34
+ when String
35
+ return StringArray.new(values)
36
+ when Float
37
+ return DoubleArray.new(values)
38
+ when Integer
39
+ if value.negative?
40
+ builder = IntArrayBuilder.new
41
+ return builder.build(values)
42
+ else
43
+ builder_class = UIntArrayBuilder
44
+ end
45
+ when Time
46
+ data_type = TimestampDataType.new(:nano)
47
+ builder = TimestampArrayBuilder.new(data_type)
48
+ return builder.build(values)
49
+ when DateTime
50
+ return Date64Array.new(values)
51
+ when Date
52
+ return Date32Array.new(values)
53
+ else
54
+ return StringArray.new(values)
55
+ end
56
+ end
57
+ if builder_class
58
+ builder_class.new.build(values)
59
+ else
60
+ Arrow::StringArray.new(values)
61
+ end
21
62
  end
22
63
  end
23
64
 
24
65
  def build(values)
66
+ value_convertable = respond_to?(:convert_to_arrow_value, true)
25
67
  if respond_to?(:append_values)
26
68
  start_index = 0
27
69
  current_index = 0
@@ -30,7 +72,13 @@ module Arrow
30
72
  if value.nil?
31
73
  if status == :value
32
74
  if start_index != current_index
33
- append_values(values[start_index...current_index])
75
+ target_values = values[start_index...current_index]
76
+ if value_convertable
77
+ target_values = target_values.collect do |v|
78
+ convert_to_arrow_value(v)
79
+ end
80
+ end
81
+ append_values(target_values)
34
82
  start_index = current_index
35
83
  end
36
84
  status = :null
@@ -47,10 +95,16 @@ module Arrow
47
95
  if start_index != current_index
48
96
  if status == :value
49
97
  if start_index == 0 and current_index == values.size
50
- append_values(values)
98
+ target_values = values
51
99
  else
52
- append_values(values[start_index...current_index])
100
+ target_values = values[start_index...current_index]
101
+ end
102
+ if value_convertable
103
+ target_values = target_values.collect do |v|
104
+ convert_to_arrow_value(v)
105
+ end
53
106
  end
107
+ append_values(target_values)
54
108
  else
55
109
  append_nulls(current_index - start_index)
56
110
  end
@@ -60,6 +114,7 @@ module Arrow
60
114
  if value.nil?
61
115
  append_null
62
116
  else
117
+ value = convert_to_arrow_value(value) if value_convertable
63
118
  append(value)
64
119
  end
65
120
  end
@@ -0,0 +1,206 @@
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ require "csv"
16
+ require "pathname"
17
+ require "time"
18
+
19
+ module Arrow
20
+ class CSVLoader
21
+ class << self
22
+ def load(path_or_data, **options)
23
+ new(path_or_data, **options).load
24
+ end
25
+ end
26
+
27
+ def initialize(path_or_data, **options)
28
+ @path_or_data = path_or_data
29
+ @options = options
30
+ end
31
+
32
+ def load
33
+ case @path_or_data
34
+ when Pathname
35
+ load_from_path(@path_or_data.to_path)
36
+ when /\A.+\.csv\z/i
37
+ load_from_path(@path_or_data)
38
+ else
39
+ load_data(@path_or_data)
40
+ end
41
+ end
42
+
43
+ private
44
+ def open_csv(path, **options)
45
+ CSV.open(path, **options) do |csv|
46
+ yield(csv)
47
+ end
48
+ end
49
+
50
+ def parse_csv_data(data, **options)
51
+ csv = CSV.new(data, **options)
52
+ begin
53
+ yield(csv)
54
+ ensure
55
+ csv.close
56
+ end
57
+ end
58
+
59
+ def read_csv(csv)
60
+ reader = CSVReader.new(csv)
61
+ reader.read
62
+ end
63
+
64
+ def load_from_path(path)
65
+ options = update_csv_parse_options(@options, :open_csv, path)
66
+ open_csv(path, **options) do |csv|
67
+ read_csv(csv)
68
+ end
69
+ end
70
+
71
+ def load_data(data)
72
+ options = update_csv_parse_options(@options, :parse_csv_data, data)
73
+ parse_csv_data(data, **options) do |csv|
74
+ read_csv(csv)
75
+ end
76
+ end
77
+
78
+ def selective_converter(target_index)
79
+ lambda do |field, field_info|
80
+ if target_index.nil? or field_info.index == target_index
81
+ yield(field)
82
+ else
83
+ field
84
+ end
85
+ end
86
+ end
87
+
88
+ BOOLEAN_CONVERTER = lambda do |field|
89
+ begin
90
+ encoded_field = field.encode(CSV::ConverterEncoding)
91
+ rescue EncodingError
92
+ field
93
+ else
94
+ case encoded_field
95
+ when "true"
96
+ true
97
+ when "false"
98
+ false
99
+ else
100
+ field
101
+ end
102
+ end
103
+ end
104
+
105
+ ISO8601_CONVERTER = lambda do |field|
106
+ begin
107
+ encoded_field = field.encode(CSV::ConverterEncoding)
108
+ rescue EncodingError
109
+ field
110
+ else
111
+ begin
112
+ Time.iso8601(encoded_field)
113
+ rescue ArgumentError
114
+ field
115
+ end
116
+ end
117
+ end
118
+
119
+ def update_csv_parse_options(options, create_csv, *args)
120
+ return options unless options.empty?
121
+
122
+ converters = [:all, BOOLEAN_CONVERTER, ISO8601_CONVERTER]
123
+ new_options = options.merge(converters: converters)
124
+ __send__(create_csv, *args, **new_options) do |csv|
125
+ new_options[:headers] = have_header?(csv)
126
+ end
127
+ __send__(create_csv, *args, **new_options) do |csv|
128
+ new_options[:converters] = detect_robust_converters(csv)
129
+ end
130
+ return new_options
131
+ end
132
+
133
+ def have_header?(csv)
134
+ row1 = csv.shift
135
+ return false if row1.nil?
136
+ return false if row1.any?(&:nil?)
137
+
138
+ row2 = csv.shift
139
+ return nil if row2.nil?
140
+ return true if row2.any?(&:nil?)
141
+
142
+ if row1.collect(&:class) != row2.collect(&:class)
143
+ return true
144
+ end
145
+
146
+ nil
147
+ end
148
+
149
+ def detect_robust_converters(csv)
150
+ column_types = []
151
+ csv.each do |row|
152
+ row.each_with_index do |(_name, value), i|
153
+ current_column_type = column_types[i]
154
+ next if current_column_type == :string
155
+
156
+ candidate_type = nil
157
+ case value
158
+ when nil
159
+ next
160
+ when "true", "false", true, false
161
+ candidate_type = :boolean
162
+ when Integer
163
+ candidate_type = :integer
164
+ when Float
165
+ candidate_type = :float
166
+ if current_column_type == :integer
167
+ column_types[i] = candidate_type
168
+ end
169
+ when Time
170
+ candidate_type = :time
171
+ when DateTime
172
+ candidate_type = :date_time
173
+ when Date
174
+ candidate_type = :date
175
+ else
176
+ candidate_type = :string
177
+ end
178
+
179
+ column_types[i] ||= candidate_type
180
+ if column_types[i] != candidate_type
181
+ column_types[i] = :string
182
+ end
183
+ end
184
+ end
185
+
186
+ converters = []
187
+ column_types.each_with_index do |type, i|
188
+ case type
189
+ when :boolean
190
+ converters << selective_converter(i, &BOOLEAN_CONVERTER)
191
+ when :integer
192
+ converters << selective_converter(i, &CSV::Converters[:integer])
193
+ when :float
194
+ converters << selective_converter(i, &CSV::Converters[:float])
195
+ when :time
196
+ converters << selective_converter(i, &ISO8601_CONVERTER)
197
+ when :date_time
198
+ converters << selective_converter(i, &CSV::Converters[:date_time])
199
+ when :date
200
+ converters << selective_converter(i, &CSV::Converters[:date])
201
+ end
202
+ end
203
+ converters
204
+ end
205
+ end
206
+ end
@@ -13,150 +13,39 @@
13
13
  # limitations under the License.
14
14
 
15
15
  require "csv"
16
- require "pathname"
17
- require "time"
18
16
 
19
17
  module Arrow
20
18
  class CSVReader
21
- class << self
22
- def read(csv, **options)
23
- case csv
24
- when Pathname
25
- path = csv.to_path
26
- options = update_csv_parse_options(options, :open_csv, path)
27
- open_csv(path, **options) do |_csv|
28
- read(_csv)
29
- end
30
- when /\A.+\.csv\z/i
31
- read(Pathname.new(csv), **options)
32
- when String
33
- options = update_csv_parse_options(options, :parse_csv_data, csv)
34
- parse_csv_data(csv, **options) do |_csv|
35
- read(_csv)
36
- end
37
- else
38
- new(csv).read
39
- end
40
- end
41
-
42
- private
43
- def open_csv(path, **options)
44
- CSV.open(path, **options) do |csv|
45
- yield(csv)
46
- end
47
- end
48
-
49
- def parse_csv_data(data, **options)
50
- csv = CSV.new(data, **options)
51
- begin
52
- yield(csv)
53
- ensure
54
- csv.close
55
- end
56
- end
57
-
58
- ISO8601_CONVERTER = lambda do |field|
59
- begin
60
- encoded_field = field.encode(CSV::ConverterEncoding)
61
- rescue EncodingError
62
- field
63
- else
64
- begin
65
- Time.iso8601(encoded_field)
66
- rescue ArgumentError
67
- field
68
- end
69
- end
70
- end
71
-
72
- def update_csv_parse_options(options, create_csv, *args)
73
- return options unless options.empty?
74
-
75
- new_options = options.merge(converters: [:all, ISO8601_CONVERTER])
76
- __send__(create_csv, *args, **new_options) do |csv|
77
- row1 = csv.shift
78
- if row1.nil?
79
- new_options[:headers] = false
80
- return new_options
81
- end
82
- if row1.any?(&:nil?)
83
- new_options[:headers] = false
84
- return new_options
85
- end
86
-
87
- row2 = csv.shift
88
- return new_options if row2.nil?
89
- if row2.any?(&:nil?)
90
- new_options[:headers] = true
91
- return new_options
92
- end
93
-
94
- if row1.collect(&:class) != row2.collect(&:class)
95
- new_options[:headers] = true
96
- return new_options
97
- end
98
-
99
- new_options
100
- end
101
- end
102
- end
103
-
104
19
  def initialize(csv)
105
20
  @csv = csv
106
21
  end
107
22
 
108
23
  def read
109
- builders = []
110
24
  values_set = []
111
25
  @csv.each do |row|
112
26
  if row.is_a?(CSV::Row)
113
27
  row = row.collect(&:last)
114
28
  end
115
29
  row.each_with_index do |value, i|
116
- builders[i] ||= create_builder(value)
117
30
  values = (values_set[i] ||= [])
118
- case value
119
- when Time
120
- value = value.to_i * (10 ** 9) + value.nsec
121
- end
122
31
  values << value
123
32
  end
124
33
  end
125
34
  return nil if values_set.empty?
126
35
 
127
36
  arrays = values_set.collect.with_index do |values, i|
128
- builders[i].build(values)
37
+ ArrayBuilder.build(values)
129
38
  end
130
39
  if @csv.headers
131
40
  names = @csv.headers
132
41
  else
133
- names = builders.size.times.collect(&:to_s)
134
- end
135
- fields = names.collect.with_index do |name, i|
136
- Arrow::Field.new(name, arrays[i].value_data_type)
137
- end
138
- schema = Schema.new(fields)
139
- columns = arrays.collect.with_index do |array, i|
140
- Column.new(fields[i], array)
42
+ names = arrays.size.times.collect(&:to_s)
141
43
  end
142
- Table.new(schema, columns)
143
- end
144
-
145
- private
146
- def create_builder(sample_value)
147
- case sample_value
148
- when Integer
149
- IntArrayBuilder.new
150
- when Float
151
- DoubleArrayBuilder.new
152
- when String
153
- StringArrayBuilder.new
154
- when Time
155
- data_type = TimestampDataType.new(:nano)
156
- TimestampArrayBuilder.new(data_type)
157
- else
158
- nil
44
+ raw_table = {}
45
+ names.each_with_index do |name, i|
46
+ raw_table[name] = arrays[i]
159
47
  end
48
+ Table.new(raw_table)
160
49
  end
161
50
  end
162
51
  end