red-arrow 0.8.0 → 0.8.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: add6b81f8b6fe0d623d022d16b358b756bf0a559
4
- data.tar.gz: ffa5678bd352df22212c7dd3d3b6beea6491bbde
3
+ metadata.gz: 59cce19c00fbc436f48f29c2d58aca2685330a63
4
+ data.tar.gz: 280b1305e60c39d74d4fd101dc43387cd398e770
5
5
  SHA512:
6
- metadata.gz: da4959b8123e205d5fd16d17704ecc1e93d29659294575ac6c5797e7f578b8ce0c43710f344c909a70b0b61def35a5d1f9d6c1d1de59679d3ad6bb037df2671f
7
- data.tar.gz: 11c7f909073eaa609aa2402a629016a27ecac19a0ac85a54468fba5d5ea433b23c41cfa6179ff176363d17c61f86b2233812440c3bb26d32459a145d14d7d06a
6
+ metadata.gz: 3ecba86d0d061186def935c54a999f1c0adf1f109a1687eefd0375792a87f34333e7a46ee7f78fe51b01fbe294166491f300d9e46a8315e19ff47a1cb3f45102
7
+ data.tar.gz: 49d09794b0d007bf9e65675e133c45cd1a90a3eec748b9c747d2e7d78e4c21d682ba503c696cb61d128fec362fa138ec50b04ab780a0bf554557417ccaf02afd
@@ -0,0 +1,15 @@
1
+ # Development
2
+
3
+ ## Naming convention
4
+
5
+ ### Reader and Writer
6
+
7
+ Reader and Writer require an opened IO stream.
8
+
9
+ ### Loader and Saver
10
+
11
+ Loader and Saver require a path. They are convenience classes.
12
+
13
+ Loader opens the path and reads data using Reader.
14
+
15
+ Saver opens the path and writes data using Writer.
@@ -1,5 +1,19 @@
1
1
  # News
2
2
 
3
+ ## 0.8.1 - 2018-01-05
4
+
5
+ ### Improvements
6
+
7
+ * `Arrow::ArrayBuilder.build`: Added generic array build support.
8
+
9
+ * `Arrow::Table#save`: Added.
10
+
11
+ * `Arrow::Table.load`: Added.
12
+
13
+ * `Arrow::CSVLoader`: Added.
14
+
15
+ * `Arrow::CSVReader.read`: Removed.
16
+
3
17
  ## 0.8.0 - 2018-01-04
4
18
 
5
19
  ### Improvements
@@ -12,16 +12,58 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ require "date"
16
+
15
17
  module Arrow
16
18
  class ArrayBuilder
17
19
  class << self
18
20
  def build(values)
19
- builder = new
20
- builder.build(values)
21
+ if self != ArrayBuilder
22
+ builder = new
23
+ return builder.build(values)
24
+ end
25
+
26
+ builder_class = nil
27
+ values.each do |value|
28
+ case value
29
+ when nil
30
+ # Ignore
31
+ nil
32
+ when true, false
33
+ return BooleanArray.new(values)
34
+ when String
35
+ return StringArray.new(values)
36
+ when Float
37
+ return DoubleArray.new(values)
38
+ when Integer
39
+ if value.negative?
40
+ builder = IntArrayBuilder.new
41
+ return builder.build(values)
42
+ else
43
+ builder_class = UIntArrayBuilder
44
+ end
45
+ when Time
46
+ data_type = TimestampDataType.new(:nano)
47
+ builder = TimestampArrayBuilder.new(data_type)
48
+ return builder.build(values)
49
+ when DateTime
50
+ return Date64Array.new(values)
51
+ when Date
52
+ return Date32Array.new(values)
53
+ else
54
+ return StringArray.new(values)
55
+ end
56
+ end
57
+ if builder_class
58
+ builder_class.new.build(values)
59
+ else
60
+ Arrow::StringArray.new(values)
61
+ end
21
62
  end
22
63
  end
23
64
 
24
65
  def build(values)
66
+ value_convertable = respond_to?(:convert_to_arrow_value, true)
25
67
  if respond_to?(:append_values)
26
68
  start_index = 0
27
69
  current_index = 0
@@ -30,7 +72,13 @@ module Arrow
30
72
  if value.nil?
31
73
  if status == :value
32
74
  if start_index != current_index
33
- append_values(values[start_index...current_index])
75
+ target_values = values[start_index...current_index]
76
+ if value_convertable
77
+ target_values = target_values.collect do |v|
78
+ convert_to_arrow_value(v)
79
+ end
80
+ end
81
+ append_values(target_values)
34
82
  start_index = current_index
35
83
  end
36
84
  status = :null
@@ -47,10 +95,16 @@ module Arrow
47
95
  if start_index != current_index
48
96
  if status == :value
49
97
  if start_index == 0 and current_index == values.size
50
- append_values(values)
98
+ target_values = values
51
99
  else
52
- append_values(values[start_index...current_index])
100
+ target_values = values[start_index...current_index]
101
+ end
102
+ if value_convertable
103
+ target_values = target_values.collect do |v|
104
+ convert_to_arrow_value(v)
105
+ end
53
106
  end
107
+ append_values(target_values)
54
108
  else
55
109
  append_nulls(current_index - start_index)
56
110
  end
@@ -60,6 +114,7 @@ module Arrow
60
114
  if value.nil?
61
115
  append_null
62
116
  else
117
+ value = convert_to_arrow_value(value) if value_convertable
63
118
  append(value)
64
119
  end
65
120
  end
@@ -0,0 +1,206 @@
1
+ # Copyright 2017-2018 Kouhei Sutou <kou@clear-code.com>
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ require "csv"
16
+ require "pathname"
17
+ require "time"
18
+
19
+ module Arrow
20
+ class CSVLoader
21
+ class << self
22
+ def load(path_or_data, **options)
23
+ new(path_or_data, **options).load
24
+ end
25
+ end
26
+
27
+ def initialize(path_or_data, **options)
28
+ @path_or_data = path_or_data
29
+ @options = options
30
+ end
31
+
32
+ def load
33
+ case @path_or_data
34
+ when Pathname
35
+ load_from_path(@path_or_data.to_path)
36
+ when /\A.+\.csv\z/i
37
+ load_from_path(@path_or_data)
38
+ else
39
+ load_data(@path_or_data)
40
+ end
41
+ end
42
+
43
+ private
44
+ def open_csv(path, **options)
45
+ CSV.open(path, **options) do |csv|
46
+ yield(csv)
47
+ end
48
+ end
49
+
50
+ def parse_csv_data(data, **options)
51
+ csv = CSV.new(data, **options)
52
+ begin
53
+ yield(csv)
54
+ ensure
55
+ csv.close
56
+ end
57
+ end
58
+
59
+ def read_csv(csv)
60
+ reader = CSVReader.new(csv)
61
+ reader.read
62
+ end
63
+
64
+ def load_from_path(path)
65
+ options = update_csv_parse_options(@options, :open_csv, path)
66
+ open_csv(path, **options) do |csv|
67
+ read_csv(csv)
68
+ end
69
+ end
70
+
71
+ def load_data(data)
72
+ options = update_csv_parse_options(@options, :parse_csv_data, data)
73
+ parse_csv_data(data, **options) do |csv|
74
+ read_csv(csv)
75
+ end
76
+ end
77
+
78
+ def selective_converter(target_index)
79
+ lambda do |field, field_info|
80
+ if target_index.nil? or field_info.index == target_index
81
+ yield(field)
82
+ else
83
+ field
84
+ end
85
+ end
86
+ end
87
+
88
+ BOOLEAN_CONVERTER = lambda do |field|
89
+ begin
90
+ encoded_field = field.encode(CSV::ConverterEncoding)
91
+ rescue EncodingError
92
+ field
93
+ else
94
+ case encoded_field
95
+ when "true"
96
+ true
97
+ when "false"
98
+ false
99
+ else
100
+ field
101
+ end
102
+ end
103
+ end
104
+
105
+ ISO8601_CONVERTER = lambda do |field|
106
+ begin
107
+ encoded_field = field.encode(CSV::ConverterEncoding)
108
+ rescue EncodingError
109
+ field
110
+ else
111
+ begin
112
+ Time.iso8601(encoded_field)
113
+ rescue ArgumentError
114
+ field
115
+ end
116
+ end
117
+ end
118
+
119
+ def update_csv_parse_options(options, create_csv, *args)
120
+ return options unless options.empty?
121
+
122
+ converters = [:all, BOOLEAN_CONVERTER, ISO8601_CONVERTER]
123
+ new_options = options.merge(converters: converters)
124
+ __send__(create_csv, *args, **new_options) do |csv|
125
+ new_options[:headers] = have_header?(csv)
126
+ end
127
+ __send__(create_csv, *args, **new_options) do |csv|
128
+ new_options[:converters] = detect_robust_converters(csv)
129
+ end
130
+ return new_options
131
+ end
132
+
133
+ def have_header?(csv)
134
+ row1 = csv.shift
135
+ return false if row1.nil?
136
+ return false if row1.any?(&:nil?)
137
+
138
+ row2 = csv.shift
139
+ return nil if row2.nil?
140
+ return true if row2.any?(&:nil?)
141
+
142
+ if row1.collect(&:class) != row2.collect(&:class)
143
+ return true
144
+ end
145
+
146
+ nil
147
+ end
148
+
149
+ def detect_robust_converters(csv)
150
+ column_types = []
151
+ csv.each do |row|
152
+ row.each_with_index do |(_name, value), i|
153
+ current_column_type = column_types[i]
154
+ next if current_column_type == :string
155
+
156
+ candidate_type = nil
157
+ case value
158
+ when nil
159
+ next
160
+ when "true", "false", true, false
161
+ candidate_type = :boolean
162
+ when Integer
163
+ candidate_type = :integer
164
+ when Float
165
+ candidate_type = :float
166
+ if current_column_type == :integer
167
+ column_types[i] = candidate_type
168
+ end
169
+ when Time
170
+ candidate_type = :time
171
+ when DateTime
172
+ candidate_type = :date_time
173
+ when Date
174
+ candidate_type = :date
175
+ else
176
+ candidate_type = :string
177
+ end
178
+
179
+ column_types[i] ||= candidate_type
180
+ if column_types[i] != candidate_type
181
+ column_types[i] = :string
182
+ end
183
+ end
184
+ end
185
+
186
+ converters = []
187
+ column_types.each_with_index do |type, i|
188
+ case type
189
+ when :boolean
190
+ converters << selective_converter(i, &BOOLEAN_CONVERTER)
191
+ when :integer
192
+ converters << selective_converter(i, &CSV::Converters[:integer])
193
+ when :float
194
+ converters << selective_converter(i, &CSV::Converters[:float])
195
+ when :time
196
+ converters << selective_converter(i, &ISO8601_CONVERTER)
197
+ when :date_time
198
+ converters << selective_converter(i, &CSV::Converters[:date_time])
199
+ when :date
200
+ converters << selective_converter(i, &CSV::Converters[:date])
201
+ end
202
+ end
203
+ converters
204
+ end
205
+ end
206
+ end
@@ -13,150 +13,39 @@
13
13
  # limitations under the License.
14
14
 
15
15
  require "csv"
16
- require "pathname"
17
- require "time"
18
16
 
19
17
  module Arrow
20
18
  class CSVReader
21
- class << self
22
- def read(csv, **options)
23
- case csv
24
- when Pathname
25
- path = csv.to_path
26
- options = update_csv_parse_options(options, :open_csv, path)
27
- open_csv(path, **options) do |_csv|
28
- read(_csv)
29
- end
30
- when /\A.+\.csv\z/i
31
- read(Pathname.new(csv), **options)
32
- when String
33
- options = update_csv_parse_options(options, :parse_csv_data, csv)
34
- parse_csv_data(csv, **options) do |_csv|
35
- read(_csv)
36
- end
37
- else
38
- new(csv).read
39
- end
40
- end
41
-
42
- private
43
- def open_csv(path, **options)
44
- CSV.open(path, **options) do |csv|
45
- yield(csv)
46
- end
47
- end
48
-
49
- def parse_csv_data(data, **options)
50
- csv = CSV.new(data, **options)
51
- begin
52
- yield(csv)
53
- ensure
54
- csv.close
55
- end
56
- end
57
-
58
- ISO8601_CONVERTER = lambda do |field|
59
- begin
60
- encoded_field = field.encode(CSV::ConverterEncoding)
61
- rescue EncodingError
62
- field
63
- else
64
- begin
65
- Time.iso8601(encoded_field)
66
- rescue ArgumentError
67
- field
68
- end
69
- end
70
- end
71
-
72
- def update_csv_parse_options(options, create_csv, *args)
73
- return options unless options.empty?
74
-
75
- new_options = options.merge(converters: [:all, ISO8601_CONVERTER])
76
- __send__(create_csv, *args, **new_options) do |csv|
77
- row1 = csv.shift
78
- if row1.nil?
79
- new_options[:headers] = false
80
- return new_options
81
- end
82
- if row1.any?(&:nil?)
83
- new_options[:headers] = false
84
- return new_options
85
- end
86
-
87
- row2 = csv.shift
88
- return new_options if row2.nil?
89
- if row2.any?(&:nil?)
90
- new_options[:headers] = true
91
- return new_options
92
- end
93
-
94
- if row1.collect(&:class) != row2.collect(&:class)
95
- new_options[:headers] = true
96
- return new_options
97
- end
98
-
99
- new_options
100
- end
101
- end
102
- end
103
-
104
19
  def initialize(csv)
105
20
  @csv = csv
106
21
  end
107
22
 
108
23
  def read
109
- builders = []
110
24
  values_set = []
111
25
  @csv.each do |row|
112
26
  if row.is_a?(CSV::Row)
113
27
  row = row.collect(&:last)
114
28
  end
115
29
  row.each_with_index do |value, i|
116
- builders[i] ||= create_builder(value)
117
30
  values = (values_set[i] ||= [])
118
- case value
119
- when Time
120
- value = value.to_i * (10 ** 9) + value.nsec
121
- end
122
31
  values << value
123
32
  end
124
33
  end
125
34
  return nil if values_set.empty?
126
35
 
127
36
  arrays = values_set.collect.with_index do |values, i|
128
- builders[i].build(values)
37
+ ArrayBuilder.build(values)
129
38
  end
130
39
  if @csv.headers
131
40
  names = @csv.headers
132
41
  else
133
- names = builders.size.times.collect(&:to_s)
134
- end
135
- fields = names.collect.with_index do |name, i|
136
- Arrow::Field.new(name, arrays[i].value_data_type)
137
- end
138
- schema = Schema.new(fields)
139
- columns = arrays.collect.with_index do |array, i|
140
- Column.new(fields[i], array)
42
+ names = arrays.size.times.collect(&:to_s)
141
43
  end
142
- Table.new(schema, columns)
143
- end
144
-
145
- private
146
- def create_builder(sample_value)
147
- case sample_value
148
- when Integer
149
- IntArrayBuilder.new
150
- when Float
151
- DoubleArrayBuilder.new
152
- when String
153
- StringArrayBuilder.new
154
- when Time
155
- data_type = TimestampDataType.new(:nano)
156
- TimestampArrayBuilder.new(data_type)
157
- else
158
- nil
44
+ raw_table = {}
45
+ names.each_with_index do |name, i|
46
+ raw_table[name] = arrays[i]
159
47
  end
48
+ Table.new(raw_table)
160
49
  end
161
50
  end
162
51
  end