daru 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +99 -0
  4. data/.rubocop_todo.yml +44 -0
  5. data/.travis.yml +3 -1
  6. data/CONTRIBUTING.md +5 -1
  7. data/History.md +43 -0
  8. data/README.md +3 -4
  9. data/benchmarks/duplicating.rb +45 -0
  10. data/benchmarks/group_by.rb +7 -7
  11. data/benchmarks/joining.rb +52 -0
  12. data/benchmarks/sorting.rb +9 -2
  13. data/benchmarks/statistics.rb +39 -0
  14. data/daru.gemspec +4 -4
  15. data/lib/daru.rb +9 -9
  16. data/lib/daru/accessors/array_wrapper.rb +15 -11
  17. data/lib/daru/accessors/dataframe_by_row.rb +1 -1
  18. data/lib/daru/accessors/gsl_wrapper.rb +30 -19
  19. data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
  20. data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
  21. data/lib/daru/core/group_by.rb +69 -16
  22. data/lib/daru/core/merge.rb +135 -151
  23. data/lib/daru/core/query.rb +9 -30
  24. data/lib/daru/dataframe.rb +476 -439
  25. data/lib/daru/date_time/index.rb +150 -137
  26. data/lib/daru/date_time/offsets.rb +45 -41
  27. data/lib/daru/extensions/rserve.rb +4 -4
  28. data/lib/daru/index.rb +88 -64
  29. data/lib/daru/io/io.rb +33 -34
  30. data/lib/daru/io/sql_data_source.rb +11 -11
  31. data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
  32. data/lib/daru/maths/arithmetic/vector.rb +9 -14
  33. data/lib/daru/maths/statistics/dataframe.rb +89 -61
  34. data/lib/daru/maths/statistics/vector.rb +226 -97
  35. data/lib/daru/monkeys.rb +23 -30
  36. data/lib/daru/plotting/dataframe.rb +27 -28
  37. data/lib/daru/plotting/vector.rb +12 -13
  38. data/lib/daru/vector.rb +221 -330
  39. data/lib/daru/version.rb +2 -2
  40. data/spec/core/group_by_spec.rb +16 -0
  41. data/spec/core/merge_spec.rb +30 -14
  42. data/spec/dataframe_spec.rb +268 -14
  43. data/spec/index_spec.rb +23 -5
  44. data/spec/io/io_spec.rb +37 -16
  45. data/spec/math/statistics/dataframe_spec.rb +40 -8
  46. data/spec/math/statistics/vector_spec.rb +135 -10
  47. data/spec/monkeys_spec.rb +3 -3
  48. data/spec/vector_spec.rb +157 -25
  49. metadata +41 -21
@@ -4,17 +4,17 @@ module Daru
4
4
  class << self
5
5
  def replace_keys_if_duplicates hash, matcher
6
6
  matched = nil
7
- hash.keys.each { |d|
7
+ hash.keys.each { |d|
8
8
  if matcher.match(Regexp.new(d.to_s))
9
9
  matched = d
10
10
  break
11
- end
11
+ end
12
12
  }
13
13
 
14
- if matched
15
- hash[matcher] = hash[matched]
16
- hash.delete matched
17
- end
14
+ return unless matched
15
+
16
+ hash[matcher] = hash[matched]
17
+ hash.delete matched
18
18
  end
19
19
 
20
20
  def resolve_duplicates df_hash1, df_hash2, on
@@ -29,198 +29,182 @@ module Daru
29
29
  end
30
30
 
31
31
  def hashify df
32
- hsh = df.to_hash
32
+ hsh = df.to_h
33
33
  hsh.each { |k,v| hsh[k] = v.to_a }
34
34
  hsh
35
35
  end
36
-
36
+
37
37
  def arrayify df
38
38
  arr = df.to_a
39
39
  col_names = arr[0][0].keys
40
- values = arr[0].map{|h| h.values}
40
+ values = arr[0].map(&:values)
41
41
 
42
- return col_names, values
42
+ [col_names, values]
43
43
  end
44
44
 
45
- def inner_join df1, df2, df_hash1, df_hash2, on
46
- joined_hash = {}
47
- ((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
48
- joined_hash[k] = []
49
- end
45
+ def arrayify_with_sort_keys(size, df_hash, on)
46
+ # Converting to a hash and then to an array is more complex
47
+ # than using df.to_a or df.map(:row). However, it's
48
+ # substantially faster this way.
50
49
 
51
- (0...df1.size).each do |id1|
52
- (0...df2.size).each do |id2|
53
- if on.all? { |n| df_hash1[n][id1] == df_hash2[n][id2] }
54
- joined_hash.each do |k,v|
55
- v << (df_hash1.has_key?(k) ? df_hash1[k][id1] : df_hash2[k][id2])
56
- end
57
- end
58
- end
50
+ # idx_keys = on.map { |key| df_hash.keys.index(key) }
51
+
52
+ (0...size).reduce([]) do |r, idx|
53
+ key_values = on.map { |col| df_hash[col][idx] }
54
+ row_values = df_hash.map { |_col, val| val[idx] }
55
+ r << [key_values, row_values]
59
56
  end
60
57
 
61
- Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
58
+ # Conceptually simpler and does the same thing, but slows down the
59
+ # total merge algorithm by 2x. Would be nice to improve the performance
60
+ # of df.map(:row)
61
+ #
62
+ # df.map(:row) do |row|
63
+ # key_values = on.map { |key| row[key] }
64
+ # [key_values, row.to_a]
65
+ # end
62
66
  end
63
67
 
64
- def bf_inner_join df1, df2, on
65
- col_names1, table1 = arrayify df1
66
- col_names2, table2 = arrayify df2
67
-
68
- #resolve duplicates
69
- indicies1 = on.map{|i| col_names1.index(i)}
70
- indicies2 = on.map{|i| col_names2.index(i)}
71
- col_names2.map! do |name|
72
- if (col_names1.include?(name))
73
- col_names1[col_names1.index(name)] = (name.to_s + "_1").to_sym unless on.include?(name)
74
- (name.to_s + "_2").to_sym
75
- else
76
- name
77
- end
78
- end
68
+ def verify_dataframes df_hash1, df_hash2, on
69
+ raise ArgumentError,
70
+ 'All fields in :on must be present in self' unless on.all? { |e| df_hash1[e] }
71
+ raise ArgumentError,
72
+ 'All fields in :on must be present in other DF' unless on.all? { |e| df_hash2[e] }
73
+ end
74
+ end
75
+ end
79
76
 
80
- #combine key columns to a single column value
81
- on_cols1 = table1.flat_map{|x| indicies1.map{|i| x[i].to_s}.join("+")}
82
- on_cols2 = table2.flat_map{|x| indicies2.map{|i| x[i].to_s}.join("+")}
77
+ class MergeFrame
78
+ def initialize(df1, df2, on: nil)
79
+ @df1 = df1
80
+ @df2 = df2
81
+ @on = on
82
+ end
83
83
 
84
- #parameters for a BF with approx 0.1% false positives
85
- m = on_cols2.size * 15
86
- k = 11
84
+ def inner _opts
85
+ merge_join(left: false, right: false)
86
+ end
87
87
 
88
- bf = BloomFilter::Native.new({:size => m, :hashes => k, :bucket => 1})
89
- on_cols2.each{|x| bf.insert(x)}
88
+ def left _opts
89
+ merge_join(left: true, right: false)
90
+ end
90
91
 
91
- x_ind = -1
92
- joined_new = on_cols1.map do |x|
93
- x_ind+=1
94
- if (bf.include?(x))
95
- {x_ind => on_cols2.each_index.select{|y_ind| on_cols2[y_ind] == x}}
96
- else
97
- {x_ind => []}
98
- end
99
- end
100
- .reduce({}) {|h,pairs| pairs.each {|k,v| (h[k] ||= []) << v}; h}
101
- .flat_map{|ind1, inds2| inds2.flatten.map{|ind2| [table1[ind1], table2[ind2]].flatten} if inds2.flatten.size > 0}
92
+ def right _opts
93
+ merge_join(left: false, right: true)
94
+ end
102
95
 
103
- joined_cols = [col_names1, col_names2].flatten
104
- df = Daru::DataFrame.rows(joined_new.compact, order: joined_cols)
105
- on.each{|x| df.delete_vector (x.to_s + "_2").to_sym}
96
+ def outer _opts
97
+ merge_join(left: true, right: true)
98
+ end
106
99
 
107
- df
108
- end
100
+ def merge_join(left: true, right: true)
101
+ MergeHelper.verify_dataframes df1_hash, df2_hash, @on
102
+ MergeHelper.resolve_duplicates df1_hash, df2_hash, @on
109
103
 
110
- def full_outer_join df1, df2, df_hash1, df_hash2, on
111
- left = left_outer_join df1, df2, df_hash1, df_hash2, on, true
112
- right = right_outer_join df1, df2, df_hash1, df_hash2, on, true
104
+ # TODO: Use native dataframe sorting.
105
+ # It would be ideal to reuse sorting functionality that is native
106
+ # to dataframes. Unfortunately, native dataframe sort introduces
107
+ # an overhead that reduces join performance by a factor of 4! Until
108
+ # that aspect is improved, we resort to a simpler array sort.
109
+ df1_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
110
+ df2_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
113
111
 
114
- Daru::DataFrame.rows(
115
- (left.values.transpose | right.values.transpose), order: left.keys)
116
- end
112
+ idx1 = 0
113
+ idx2 = 0
117
114
 
118
- def left_outer_join df1, df2, df_hash1, df_hash2, on, as_hash=false
119
- joined_hash = {}
120
- ((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
121
- joined_hash[k] = []
122
- end
115
+ while idx1 < @df1.size || idx2 < @df2.size
123
116
 
124
-
125
- (0...df1.size).each do |id1|
126
- joined = false
127
- (0...df2.size).each do |id2|
128
- if on.all? { |n| df_hash1[n][id1] == df_hash2[n][id2] }
129
- joined = true
130
- joined_hash.each do |k,v|
131
- v << (df_hash1.has_key?(k) ? df_hash1[k][id1] : df_hash2[k][id2])
132
- end
133
- end
134
- end
117
+ key1 = df1_array[idx1][0] if idx1 < @df1.size
118
+ key2 = df2_array[idx2][0] if idx2 < @df2.size
135
119
 
136
- unless joined
137
- df_hash1.keys.each do |k|
138
- joined_hash[k] << df_hash1[k][id1]
139
- end
120
+ if key1 == key2 && idx1 < @df1.size && idx2 < @df2.size
121
+ idx2_start = idx2
140
122
 
141
- (joined_hash.keys - df_hash1.keys).each do |k|
142
- joined_hash[k] << nil
143
- end
144
- joined = false
123
+ while (idx2 < @df2.size) && (df1_array[idx1][0] == df2_array[idx2][0])
124
+ add_merge_row_to_hash([df1_array[idx1], df2_array[idx2]], joined_hash)
125
+ idx2 += 1
145
126
  end
127
+
128
+ idx2 = idx2_start if idx1+1 < @df1.size && df1_array[idx1][0] == df1_array[idx1+1][0]
129
+ idx1 += 1
130
+ elsif ((key2.nil? || [key1,key2].sort == [key1,key2]) && idx1 < @df1.size) || idx2 == @df2.size
131
+ add_merge_row_to_hash([df1_array[idx1], nil], joined_hash) if left
132
+ idx1 += 1
133
+ elsif idx2 < @df2.size || idx1 == @df1.size
134
+ add_merge_row_to_hash([nil, df2_array[idx2]], joined_hash) if right
135
+ idx2 += 1
136
+ else
137
+ raise 'Unexpected condition met during merge'
146
138
  end
139
+ end
147
140
 
148
- return joined_hash if as_hash
149
- Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
141
+ Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
142
+ end
143
+
144
+ private
145
+
146
+ def joined_hash
147
+ return @joined_hash if @joined_hash
148
+ @joined_hash ||= {}
149
+
150
+ ((df1_keys - @on) | @on | (df2_keys - @on)).each do |k|
151
+ @joined_hash[k] = []
150
152
  end
151
153
 
152
- def right_outer_join df1, df2, df_hash1, df_hash2, on, as_hash=false
153
- joined_hash = {}
154
- ((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
155
- joined_hash[k] = []
156
- end
154
+ @joined_hash
155
+ end
157
156
 
158
- (0...df2.size).each do |id1|
159
- joined = false
160
- (0...df1.size).each do |id2|
161
- if on.all? { |n| df_hash2[n][id1] == df_hash1[n][id2] }
162
- joined = true
163
- joined_hash.each do |k,v|
164
- v << (df_hash2.has_key?(k) ? df_hash2[k][id1] : df_hash1[k][id2])
165
- end
166
- end
167
- end
157
+ def df1_hash
158
+ @df1_hash ||= MergeHelper.hashify @df1
159
+ end
168
160
 
169
- unless joined
170
- df_hash2.keys.each do |k|
171
- joined_hash[k] << df_hash2[k][id1]
172
- end
161
+ def df2_hash
162
+ @df2_hash ||= MergeHelper.hashify @df2
163
+ end
173
164
 
174
- (joined_hash.keys - df_hash2.keys).each do |k|
175
- joined_hash[k] << nil
176
- end
177
- joined = false
178
- end
179
- end
165
+ def df1_array
166
+ @df1_array ||= MergeHelper.arrayify_with_sort_keys @df1.size, df1_hash, @on
167
+ end
180
168
 
181
- return joined_hash if as_hash
182
- Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
183
- end
169
+ def df2_array
170
+ @df2_array ||= MergeHelper.arrayify_with_sort_keys @df2.size, df2_hash, @on
171
+ end
184
172
 
185
- def verify_dataframes df_hash1, df_hash2, on
186
- raise ArgumentError,
187
- "All fields in :on must be present in self" if !on.all? { |e| df_hash1[e] }
188
- raise ArgumentError,
189
- "All fields in :on must be present in other DF" if !on.all? { |e| df_hash2[e] }
173
+ def df1_keys
174
+ df1_hash.keys
175
+ end
176
+
177
+ def df2_keys
178
+ df2_hash.keys
179
+ end
180
+
181
+ # Private: The merge row contains two elements, the first is the row from the
182
+ # first dataframe, the second is the row from the second dataframe.
183
+ def add_merge_row_to_hash row, hash
184
+ @df1_key_to_index ||= df1_keys.each_with_index.map { |k,idx| [k, idx] }.to_h
185
+ @df2_key_to_index ||= df2_keys.each_with_index.map { |k,idx| [k, idx] }.to_h
186
+
187
+ hash.each do |k,v|
188
+ v ||= []
189
+
190
+ left = df1_keys.include?(k) ? row[0] && row[0][1][@df1_key_to_index[k]] : nil
191
+ right = df2_keys.include?(k) ? row[1] && row[1][1][@df2_key_to_index[k]] : nil
192
+
193
+ v << (left || right)
190
194
  end
191
195
  end
192
196
  end
197
+
193
198
  # Private module containing methods for join, merge, concat operations on
194
199
  # dataframes and vectors.
195
200
  # @private
196
201
  module Merge
197
202
  class << self
198
203
  def join df1, df2, opts={}
199
- helper = MergeHelper
200
-
201
- df_hash1 = helper.hashify df1
202
- df_hash2 = helper.hashify df2
203
204
  on = opts[:on]
204
205
 
205
- helper.verify_dataframes df_hash1, df_hash2, on
206
- helper.resolve_duplicates df_hash1, df_hash2, on
207
-
208
- case opts[:how]
209
- when :inner
210
- if Daru.has_bloomfilter_rb?
211
- helper.bf_inner_join df1, df2, on
212
- else
213
- helper.inner_join df1, df2, df_hash1, df_hash2, on
214
- end
215
- when :outer
216
- helper.full_outer_join df1, df2, df_hash1, df_hash2, on
217
- when :left
218
- helper.left_outer_join df1, df2, df_hash1, df_hash2, on
219
- when :right
220
- helper.right_outer_join df1, df2, df_hash1, df_hash2, on
221
- else
222
- raise ArgumentError, "Unrecognized option in :how => #{opts[:how]}"
223
- end
206
+ mf = MergeFrame.new df1, df2, on: on
207
+ mf.send opts[:how], {}
224
208
  end
225
209
  end
226
210
  end
@@ -9,31 +9,19 @@ module Daru
9
9
  end
10
10
 
11
11
  def & other
12
- new_bool = []
13
- other_barry = other.barry
14
- @barry.each_with_index do |b, i|
15
- new_bool << (b and other_barry[i])
16
- end
17
-
18
- BoolArray.new(new_bool)
12
+ BoolArray.new @barry.zip(other.barry).map { |b, o| b && o }
19
13
  end
20
14
 
21
15
  alias :and :&
22
16
 
23
17
  def | other
24
- new_bool = []
25
- other_barry = other.barry
26
- @barry.each_with_index do |b, i|
27
- new_bool << (b or other_barry[i])
28
- end
29
-
30
- BoolArray.new(new_bool)
18
+ BoolArray.new @barry.zip(other.barry).map { |b, o| b || o }
31
19
  end
32
20
 
33
21
  alias :or :|
34
22
 
35
23
  def !
36
- BoolArray.new(@barry.map { |b| !b })
24
+ BoolArray.new(@barry.map(&:!))
37
25
  end
38
26
 
39
27
  def == other
@@ -45,27 +33,17 @@ module Daru
45
33
  end
46
34
 
47
35
  def inspect
48
- "(#{self.class}:#{self.object_id} bool_arry=#{@barry})"
36
+ "(#{self.class}:#{object_id} bool_arry=#{@barry})"
49
37
  end
50
38
  end
51
39
 
52
40
  class << self
53
41
  def apply_scalar_operator operator, data, other
54
- arry = data.inject([]) do |memo,d|
55
- memo << (d.send(operator, other) ? true : false)
56
- memo
57
- end
58
-
59
- BoolArray.new(arry)
42
+ BoolArray.new data.map { |d| !!d.send(operator, other) }
60
43
  end
61
44
 
62
45
  def apply_vector_operator operator, vector, other
63
- bool_arry = []
64
- vector.each_with_index do |d, i|
65
- bool_arry << (d.send(operator, other[i]) ? true : false)
66
- end
67
-
68
- BoolArray.new(bool_arry)
46
+ BoolArray.new vector.zip(other).map { |d, o| !!d.send(operator, o) }
69
47
  end
70
48
 
71
49
  def df_where data_frame, bool_array
@@ -74,7 +52,8 @@ module Daru
74
52
  end
75
53
 
76
54
  Daru::DataFrame.new(
77
- vecs, order: data_frame.vectors, index: vecs[0].index, clone: false)
55
+ vecs, order: data_frame.vectors, index: vecs[0].index, clone: false
56
+ )
78
57
  end
79
58
 
80
59
  def vector_where data, index, bool_array, dtype
@@ -92,4 +71,4 @@ module Daru
92
71
  end
93
72
  end
94
73
  end
95
- end
74
+ end
@@ -1,14 +1,11 @@
1
- $:.unshift File.dirname(__FILE__)
2
-
3
- require 'accessors/dataframe_by_row.rb'
4
- require 'maths/arithmetic/dataframe.rb'
5
- require 'maths/statistics/dataframe.rb'
6
- require 'plotting/dataframe.rb'
7
- require 'io/io.rb'
1
+ require 'daru/accessors/dataframe_by_row.rb'
2
+ require 'daru/maths/arithmetic/dataframe.rb'
3
+ require 'daru/maths/statistics/dataframe.rb'
4
+ require 'daru/plotting/dataframe.rb'
5
+ require 'daru/io/io.rb'
8
6
 
9
7
  module Daru
10
8
  class DataFrame
11
-
12
9
  include Daru::Maths::Arithmetic::DataFrame
13
10
  include Daru::Maths::Statistics::DataFrame
14
11
  include Daru::Plotting::DataFrame if Daru.has_nyaplot?
@@ -115,31 +112,30 @@ module Daru
115
112
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
116
113
  # Daru::Vector objects.
117
114
  def rows source, opts={}
118
- df = nil
119
- if source.all? { |v| v.size == source[0].size }
120
- first = source[0]
121
- index = []
122
- opts[:order] ||=
123
- if first.is_a?(Daru::Vector) # assume that all are Vectors
124
- source.each { |vec| index << vec.name }
115
+ first = source.first
116
+
117
+ raise SizeError, 'All vectors must have same length' \
118
+ unless source.all? { |v| v.size == first.size }
119
+
120
+ index = []
121
+ opts[:order] ||=
122
+ case first
123
+ when Daru::Vector # assume that all are Vectors
124
+ index = source.map(&:name)
125
125
  first.index.to_a
126
- elsif first.is_a?(Array)
127
- Array.new(first.size) { |i| i.to_s }
126
+ when Array
127
+ Array.new(first.size, &:to_s)
128
128
  end
129
129
 
130
- if source.all? { |s| s.is_a?(Array) }
131
- df = Daru::DataFrame.new(source.transpose, opts)
132
- else # array of Daru::Vectors
133
- df = Daru::DataFrame.new({}, opts)
130
+ if source.all? { |s| s.is_a?(Array) }
131
+ Daru::DataFrame.new(source.transpose, opts)
132
+ else # array of Daru::Vectors
133
+ Daru::DataFrame.new({}, opts).tap do |df|
134
134
  source.each_with_index do |row, idx|
135
- df[(index[idx] || idx), :row] = row
135
+ df[index[idx] || idx, :row] = row
136
136
  end
137
137
  end
138
- else
139
- raise SizeError, "All vectors must have same length"
140
138
  end
141
-
142
- df
143
139
  end
144
140
 
145
141
  # Generates a new dataset, using three vectors
@@ -162,18 +158,16 @@ module Daru
162
158
  #
163
159
  # Useful to process outputs from databases
164
160
  def crosstab_by_assignation rows, columns, values
165
- raise "Three vectors should be equal size" if
166
- rows.size != columns.size or rows.size!=values.size
161
+ raise 'Three vectors should be equal size' if
162
+ rows.size != columns.size || rows.size!=values.size
167
163
 
168
164
  cols_values = columns.factors
169
165
  cols_n = cols_values.size
170
166
 
171
- h_rows = rows.factors.inject({}) do |a,v|
172
- a[v] = cols_values.inject({}) do |a1,v1|
167
+ h_rows = rows.factors.each_with_object({}) do |v, a|
168
+ a[v] = cols_values.each_with_object({}) do |v1, a1|
173
169
  a1[v1]=nil
174
- a1
175
170
  end
176
- a
177
171
  end
178
172
 
179
173
  values.each_index do |i|
@@ -250,7 +244,7 @@ module Daru
250
244
  @data = []
251
245
 
252
246
  temp_name = opts[:name]
253
- @name = temp_name || SecureRandom.uuid
247
+ @name = temp_name || SecureRandom.uuid
254
248
 
255
249
  if source.empty?
256
250
  @vectors = try_create_index vectors
@@ -266,7 +260,7 @@ module Daru
266
260
  @index = try_create_index(index || source[0].size)
267
261
  @vectors = try_create_index(vectors)
268
262
 
269
- @vectors.each_with_index do |vec,idx|
263
+ @vectors.each_with_index do |_vec,idx|
270
264
  @data << Daru::Vector.new(source[idx], index: @index)
271
265
  end
272
266
  elsif source.all? { |s| s.is_a?(Daru::Vector) }
@@ -276,18 +270,18 @@ module Daru
276
270
  end
277
271
  initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
278
272
  else # array of hashes
279
- if vectors.nil?
280
- @vectors = Daru::Index.new source[0].keys
281
- else
282
- @vectors = Daru::Index.new(
283
- (vectors + (source[0].keys - vectors)).uniq)
284
- end
273
+ @vectors =
274
+ if vectors.nil?
275
+ Daru::Index.new source[0].keys
276
+ else
277
+ Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
278
+ end
285
279
  @index = Daru::Index.new(index || source.size)
286
280
 
287
281
  @vectors.each do |name|
288
282
  v = []
289
- source.each do |hsh|
290
- v << (hsh[name] || hsh[name.to_s])
283
+ source.each do |h|
284
+ v << (h[name] || h[name.to_s])
291
285
  end
292
286
 
293
287
  @data << Daru::Vector.new(v, name: set_name(name), index: @index)
@@ -296,10 +290,10 @@ module Daru
296
290
  when Hash
297
291
  create_vectors_index_with vectors, source
298
292
  if all_daru_vectors_in_source? source
293
+ vectors_have_same_index = all_vectors_have_equal_indexes?(source)
299
294
  if !index.nil?
300
295
  @index = try_create_index index
301
- elsif all_vectors_have_equal_indexes?(source)
302
- vectors_have_same_index = true
296
+ elsif vectors_have_same_index
303
297
  @index = source.values[0].index.dup
304
298
  else
305
299
  all_indexes = []
@@ -320,14 +314,10 @@ module Daru
320
314
  if vectors_have_same_index
321
315
  v = source[vector].dup
322
316
  else
323
- v = Daru::Vector.new([], name: vector, index: @index)
317
+ v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
324
318
 
325
319
  @index.each do |idx|
326
- if source[vector].index.include? idx
327
- v[idx] = source[vector][idx]
328
- else
329
- v[idx] = nil
330
- end
320
+ v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
331
321
  end
332
322
  end
333
323
  @data << v
@@ -339,7 +329,8 @@ module Daru
339
329
  @index = try_create_index(index || source.values[0].size)
340
330
 
341
331
  @vectors.each do |name|
342
- @data << Daru::Vector.new(source[name].dup, name: set_name(name), index: @index)
332
+ meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
333
+ @data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
343
334
  end
344
335
  end
345
336
  end
@@ -350,17 +341,16 @@ module Daru
350
341
  update
351
342
  end
352
343
 
353
- def vector *args
354
- $stderr.puts "#vector has been deprecated in favour of #[]. Please use that."
344
+ def vector(*)
345
+ $stderr.puts '#vector has been deprecated in favour of #[]. Please use that.'
355
346
  self[*names]
356
347
  end
357
348
 
358
349
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
359
350
  # Defaults to *:vector*. Use of this method is not recommended for accessing
360
- # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
361
- # df.vector[:vec] for accessing vector with index *:vec*.
351
+ # rows. Use df.row[:a] for accessing row with index ':a'.
362
352
  def [](*names)
363
- if names[-1] == :vector or names[-1] == :row
353
+ if names[-1] == :vector || names[-1] == :row
364
354
  axis = names[-1]
365
355
  names = names[0..-2]
366
356
  else
@@ -368,9 +358,9 @@ module Daru
368
358
  end
369
359
 
370
360
  if axis == :vector
371
- access_vector *names
361
+ access_vector(*names)
372
362
  elsif axis == :row
373
- access_row *names
363
+ access_row(*names)
374
364
  else
375
365
  raise IndexError, "Expected axis to be row or vector not #{axis}"
376
366
  end
@@ -433,7 +423,7 @@ module Daru
433
423
 
434
424
  src = []
435
425
  vectors_to_dup.each do |vec|
436
- src << @data[@vectors[vec]].to_a.dup
426
+ src << @data[@vectors[vec]].dup
437
427
  end
438
428
  new_order = Daru::Index.new(vectors_to_dup)
439
429
 
@@ -454,11 +444,10 @@ module Daru
454
444
  # a view of the whole data frame otherwise.
455
445
  def clone *vectors_to_clone
456
446
  vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
457
- return super if vectors_to_clone.empty?
447
+ vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
458
448
 
459
- h = vectors_to_clone.inject({}) do |hsh, vec|
449
+ h = vectors_to_clone.each_with_object({}) do |vec, hsh|
460
450
  hsh[vec] = self[vec]
461
- hsh
462
451
  end
463
452
  Daru::DataFrame.new(h, clone: false)
464
453
  end
@@ -476,9 +465,8 @@ module Daru
476
465
  # Creates a new duplicate dataframe containing only rows
477
466
  # without a single missing value.
478
467
  def dup_only_valid vecs=nil
479
- rows_with_nil = @data.inject([]) do |memo, vector|
468
+ rows_with_nil = @data.each_with_object([]) do |vector, memo|
480
469
  memo.concat vector.missing_positions
481
- memo
482
470
  end.uniq
483
471
 
484
472
  row_indexes = @index.to_a
@@ -505,7 +493,7 @@ module Daru
505
493
  alias_method :each_column, :each_vector
506
494
 
507
495
  # Iterate over each vector along with the name of the vector
508
- def each_vector_with_index(&block)
496
+ def each_vector_with_index
509
497
  return to_enum(:each_vector_with_index) unless block_given?
510
498
 
511
499
  @vectors.each do |vector|
@@ -518,7 +506,7 @@ module Daru
518
506
  alias_method :each_column_with_index, :each_vector_with_index
519
507
 
520
508
  # Iterate over each row
521
- def each_row(&block)
509
+ def each_row
522
510
  return to_enum(:each_row) unless block_given?
523
511
 
524
512
  @index.each do |index|
@@ -528,7 +516,7 @@ module Daru
528
516
  self
529
517
  end
530
518
 
531
- def each_row_with_index(&block)
519
+ def each_row_with_index
532
520
  return to_enum(:each_row_with_index) unless block_given?
533
521
 
534
522
  @index.each do |index|
@@ -552,7 +540,7 @@ module Daru
552
540
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
553
541
  # or :row. Default to :vector.
554
542
  def each axis=:vector, &block
555
- if axis == :vector or axis == :column
543
+ if axis == :vector || axis == :column
556
544
  each_vector(&block)
557
545
  elsif axis == :row
558
546
  each_row(&block)
@@ -577,7 +565,7 @@ module Daru
577
565
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
578
566
  # or :row. Default to :vector.
579
567
  def collect axis=:vector, &block
580
- if axis == :vector or axis == :column
568
+ if axis == :vector || axis == :column
581
569
  collect_vectors(&block)
582
570
  elsif axis == :row
583
571
  collect_rows(&block)
@@ -603,7 +591,7 @@ module Daru
603
591
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
604
592
  # Default to :vector.
605
593
  def map axis=:vector, &block
606
- if axis == :vector or axis == :column
594
+ if axis == :vector || axis == :column
607
595
  map_vectors(&block)
608
596
  elsif axis == :row
609
597
  map_rows(&block)
@@ -621,7 +609,7 @@ module Daru
621
609
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
622
610
  # Default to :vector.
623
611
  def map! axis=:vector, &block
624
- if axis == :vector or axis == :column
612
+ if axis == :vector || axis == :column
625
613
  map_vectors!(&block)
626
614
  elsif axis == :row
627
615
  map_rows!(&block)
@@ -646,7 +634,7 @@ module Daru
646
634
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
647
635
  # Default to :vector.
648
636
  def recode axis=:vector, &block
649
- if axis == :vector or axis == :column
637
+ if axis == :vector || axis == :column
650
638
  recode_vectors(&block)
651
639
  elsif axis == :row
652
640
  recode_rows(&block)
@@ -682,17 +670,17 @@ module Daru
682
670
  # row[:a] + row[:d] < 100
683
671
  # end
684
672
  def filter axis=:vector, &block
685
- if axis == :vector or axis == :column
673
+ if axis == :vector || axis == :column
686
674
  filter_vectors(&block)
687
675
  elsif axis == :row
688
676
  filter_rows(&block)
689
677
  end
690
678
  end
691
679
 
692
- def recode_vectors &block
680
+ def recode_vectors
693
681
  block_given? or return to_enum(:recode_vectors)
694
682
 
695
- df = self.dup
683
+ df = dup
696
684
  df.each_vector_with_index do |v, i|
697
685
  ret = yield v
698
686
  ret.is_a?(Daru::Vector) or
@@ -703,10 +691,10 @@ module Daru
703
691
  df
704
692
  end
705
693
 
706
- def recode_rows &block
694
+ def recode_rows
707
695
  block_given? or return to_enum(:recode_rows)
708
696
 
709
- df = self.dup
697
+ df = dup
710
698
  df.each_row_with_index do |r, i|
711
699
  ret = yield r
712
700
  ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
@@ -717,7 +705,7 @@ module Daru
717
705
  end
718
706
 
719
707
  # Map each vector and return an Array.
720
- def map_vectors(&block)
708
+ def map_vectors
721
709
  return to_enum(:map_vectors) unless block_given?
722
710
 
723
711
  arry = []
@@ -729,7 +717,7 @@ module Daru
729
717
  end
730
718
 
731
719
  # Destructive form of #map_vectors
732
- def map_vectors!(&block)
720
+ def map_vectors!
733
721
  return to_enum(:map_vectors!) unless block_given?
734
722
 
735
723
  vectors.dup.each do |n|
@@ -742,7 +730,7 @@ module Daru
742
730
  end
743
731
 
744
732
  # Map vectors along with the index.
745
- def map_vectors_with_index(&block)
733
+ def map_vectors_with_index
746
734
  return to_enum(:map_vectors_with_index) unless block_given?
747
735
 
748
736
  dt = []
@@ -754,7 +742,7 @@ module Daru
754
742
  end
755
743
 
756
744
  # Map each row
757
- def map_rows(&block)
745
+ def map_rows
758
746
  return to_enum(:map_rows) unless block_given?
759
747
 
760
748
  dt = []
@@ -765,7 +753,7 @@ module Daru
765
753
  dt
766
754
  end
767
755
 
768
- def map_rows_with_index(&block)
756
+ def map_rows_with_index
769
757
  return to_enum(:map_rows_with_index) unless block_given?
770
758
 
771
759
  dt = []
@@ -776,13 +764,13 @@ module Daru
776
764
  dt
777
765
  end
778
766
 
779
- def map_rows!(&block)
767
+ def map_rows!
780
768
  return to_enum(:map_rows!) unless block_given?
781
769
 
782
770
  index.dup.each do |i|
783
- r = yield self.row[i]
771
+ r = yield row[i]
784
772
  r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
785
- self.row[i] = r
773
+ row[i] = r
786
774
  end
787
775
 
788
776
  self
@@ -790,7 +778,7 @@ module Daru
790
778
 
791
779
  # Retrieves a Daru::Vector, based on the result of calculation
792
780
  # performed on each row.
793
- def collect_rows &block
781
+ def collect_rows
794
782
  return to_enum(:collect_rows) unless block_given?
795
783
 
796
784
  data = []
@@ -801,7 +789,7 @@ module Daru
801
789
  Daru::Vector.new(data, index: @index)
802
790
  end
803
791
 
804
- def collect_row_with_index &block
792
+ def collect_row_with_index
805
793
  return to_enum(:collect_row_with_index) unless block_given?
806
794
 
807
795
  data = []
@@ -814,7 +802,7 @@ module Daru
814
802
 
815
803
  # Retrieves a Daru::Vector, based on the result of calculation
816
804
  # performed on each vector.
817
- def collect_vectors &block
805
+ def collect_vectors
818
806
  return to_enum(:collect_vectors) unless block_given?
819
807
 
820
808
  data = []
@@ -825,7 +813,7 @@ module Daru
825
813
  Daru::Vector.new(data, index: @vectors)
826
814
  end
827
815
 
828
- def collect_vector_with_index &block
816
+ def collect_vector_with_index
829
817
  return to_enum(:collect_vector_with_index) unless block_given?
830
818
 
831
819
  data = []
@@ -852,15 +840,19 @@ module Daru
852
840
  Matrix.rows(rows)
853
841
  end
854
842
 
855
-
856
843
  # Delete a vector
857
844
  def delete_vector vector
858
- if @vectors.include? vector
859
- @data.delete_at @vectors[vector]
860
- @vectors = Daru::Index.new @vectors.to_a - [vector]
861
- else
862
- raise IndexError, "Vector #{vector} does not exist."
863
- end
845
+ raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
846
+
847
+ @data.delete_at @vectors[vector]
848
+ @vectors = Daru::Index.new @vectors.to_a - [vector]
849
+
850
+ self
851
+ end
852
+
853
+ # Deletes a list of vectors
854
+ def delete_vectors *vectors
855
+ Array(vectors).each { |vec| delete_vector vec }
864
856
 
865
857
  self
866
858
  end
@@ -869,13 +861,10 @@ module Daru
869
861
  def delete_row index
870
862
  idx = named_index_for index
871
863
 
872
- if @index.include? idx
873
- @index = Daru::Index.new(@index.to_a - [idx])
874
- self.each_vector do |vector|
875
- vector.delete_at idx
876
- end
877
- else
878
- raise IndexError, "Index #{index} does not exist."
864
+ raise IndexError, "Index #{index} does not exist." unless @index.include? idx
865
+ @index = Daru::Index.new(@index.to_a - [idx])
866
+ each_vector do |vector|
867
+ vector.delete_at idx
879
868
  end
880
869
 
881
870
  set_size
@@ -895,7 +884,7 @@ module Daru
895
884
  ds_boot
896
885
  end
897
886
 
898
- def keep_row_if &block
887
+ def keep_row_if
899
888
  deletion = []
900
889
 
901
890
  @index.each do |index|
@@ -908,7 +897,7 @@ module Daru
908
897
  }
909
898
  end
910
899
 
911
- def keep_vector_if &block
900
+ def keep_vector_if
912
901
  @vectors.each do |vector|
913
902
  keep_vector = yield @data[@vectors[vector]], vector
914
903
 
@@ -923,27 +912,17 @@ module Daru
923
912
  d.push(row[vec]) if yield row
924
913
  end
925
914
 
926
- Daru::Vector.new(d)
915
+ Daru::Vector.new(d, metadata: self[vec].metadata.dup)
927
916
  end
928
917
 
929
918
  # Iterates over each row and retains it in a new DataFrame if the block returns
930
919
  # true for that row.
931
- def filter_rows &block
920
+ def filter_rows
932
921
  return to_enum(:filter_rows) unless block_given?
933
922
 
934
- df = Daru::DataFrame.new({}, order: @vectors.to_a)
935
- marked = []
936
-
937
- @index.each do |index|
938
- keep_row = yield access_row(index)
939
- marked << index if keep_row
940
- end
941
-
942
- marked.each do |idx|
943
- df.row[idx] = self[idx, :row]
944
- end
923
+ keep_rows = @index.map { |index| yield access_row(index) }
945
924
 
946
- df
925
+ where keep_rows
947
926
  end
948
927
 
949
928
  # Iterates over each vector and retains it in a new DataFrame if the block returns
@@ -951,8 +930,8 @@ module Daru
951
930
  def filter_vectors &block
952
931
  return to_enum(:filter_vectors) unless block_given?
953
932
 
954
- df = self.dup
955
- df.keep_vector_if &block
933
+ df = dup
934
+ df.keep_vector_if(&block)
956
935
 
957
936
  df
958
937
  end
@@ -962,7 +941,7 @@ module Daru
962
941
  #
963
942
  # The function returns an array with all errors.
964
943
  def verify(*tests)
965
- if(tests[0].is_a? Symbol)
944
+ if tests[0].is_a? Symbol
966
945
  id = tests[0]
967
946
  tests.shift
968
947
  else
@@ -974,13 +953,12 @@ module Daru
974
953
  each(:row) do |row|
975
954
  i += 1
976
955
  tests.each do |test|
977
- if !test[2].call(row)
978
- values = ""
979
- if test[1].size>0
980
- values = " (" + test[1].collect{ |k| "#{k}=#{row[k]}" }.join(", ") + ")"
981
- end
982
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
956
+ next if test[2].call(row)
957
+ values = ''
958
+ unless test[1].empty?
959
+ values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
983
960
  end
961
+ vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
984
962
  end
985
963
  end
986
964
  vr
@@ -1051,7 +1029,7 @@ module Daru
1051
1029
  alias :vector_missing_values :missing_values_rows
1052
1030
 
1053
1031
  def has_missing_data?
1054
- !!@data.any? { |v| v.has_missing_data? }
1032
+ !!@data.any?(&:has_missing_data?)
1055
1033
  end
1056
1034
 
1057
1035
  alias :flawed? :has_missing_data?
@@ -1075,9 +1053,9 @@ module Daru
1075
1053
  name = row[tree_keys.last]
1076
1054
  if !block
1077
1055
  current[name] ||= []
1078
- current[name].push(row.to_hash.delete_if { |key,value| tree_keys.include? key})
1056
+ current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
1079
1057
  else
1080
- current[name] = block.call(row, current,name)
1058
+ current[name] = yield(row, current, name)
1081
1059
  end
1082
1060
  end
1083
1061
 
@@ -1087,7 +1065,7 @@ module Daru
1087
1065
  def vector_count_characters vecs=nil
1088
1066
  vecs ||= @vectors.to_a
1089
1067
 
1090
- collect_row_with_index do |row, i|
1068
+ collect_rows do |row|
1091
1069
  vecs.inject(0) do |memo, vec|
1092
1070
  memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
1093
1071
  end
@@ -1129,7 +1107,7 @@ module Daru
1129
1107
  # row[:a] < 3 and row[:b] == 'b'
1130
1108
  # end #=> true
1131
1109
  def any? axis=:vector, &block
1132
- if axis == :vector or axis == :column
1110
+ if axis == :vector || axis == :column
1133
1111
  @data.any?(&block)
1134
1112
  elsif axis == :row
1135
1113
  each_row do |row|
@@ -1151,7 +1129,7 @@ module Daru
1151
1129
  # row[:a] < 10
1152
1130
  # end #=> true
1153
1131
  def all? axis=:vector, &block
1154
- if axis == :vector or axis == :column
1132
+ if axis == :vector || axis == :column
1155
1133
  @data.all?(&block)
1156
1134
  elsif axis == :row
1157
1135
  each_row do |row|
@@ -1236,46 +1214,52 @@ module Daru
1236
1214
  # # ["foo", "two", 3]=>[2, 4]}
1237
1215
  def group_by *vectors
1238
1216
  vectors.flatten!
1239
- vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
1240
- has_vector?(v) }
1217
+ vectors.each { |v|
1218
+ raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
1219
+ }
1241
1220
 
1242
1221
  Daru::Core::GroupBy.new(self, vectors)
1243
1222
  end
1244
1223
 
1245
1224
  def reindex_vectors new_vectors
1246
- raise ArgumentError, "Must pass the new index of type Index or its "\
1247
- "subclasses, not #{new_index.class}" unless new_vectors.kind_of?(Daru::Index)
1225
+ raise ArgumentError, 'Must pass the new index of type Index or its '\
1226
+ "subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
1248
1227
 
1249
1228
  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1250
1229
  new_vectors.each do |vec|
1251
- if @vectors.include?(vec)
1252
- cl[vec] = self[vec]
1253
- else
1254
- cl[vec] = [nil]*nrows
1255
- end
1230
+ cl[vec] = @vectors.include?(vec) ? self[vec] : cl[vec] = [nil]*nrows
1256
1231
  end
1257
1232
 
1258
1233
  cl
1259
1234
  end
1260
1235
 
1261
1236
  # Concatenate another DataFrame along corresponding columns.
1262
- # Very premature implementation. Use with caution.
1237
+ # If columns do not exist in both dataframes, they are filled with nils
1263
1238
  def concat other_df
1264
- vectors = []
1265
- @vectors.each do |v|
1266
- vectors << self[v].to_a.dup.concat(other_df[v].to_a)
1239
+ vectors = @vectors.to_a
1240
+ data = []
1241
+
1242
+ vectors.each do |v|
1243
+ other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
1244
+ data << self[v].dup.to_a.concat(other_vec)
1245
+ end
1246
+
1247
+ other_df.vectors.each do |v|
1248
+ next if vectors.include?(v)
1249
+ vectors << v
1250
+ data << ([nil] * size).concat(other_df[v].to_a)
1267
1251
  end
1268
1252
 
1269
- Daru::DataFrame.new(vectors, order: @vectors)
1253
+ Daru::DataFrame.new(data, order: vectors)
1270
1254
  end
1271
1255
 
1272
1256
  # Set a particular column as the new index of the DF
1273
1257
  def set_index new_index, opts={}
1274
- raise ArgumentError, "All elements in new index must be unique." if
1258
+ raise ArgumentError, 'All elements in new index must be unique.' if
1275
1259
  @size != self[new_index].uniq.size
1276
1260
 
1277
1261
  self.index = Daru::Index.new(self[new_index].to_a)
1278
- self.delete_vector(new_index) unless opts[:keep]
1262
+ delete_vector(new_index) unless opts[:keep]
1279
1263
 
1280
1264
  self
1281
1265
  end
@@ -1303,16 +1287,12 @@ module Daru
1303
1287
  # # a 1 11
1304
1288
  # # g nil nil
1305
1289
  def reindex new_index
1306
- raise ArgumentError, "Must pass the new index of type Index or its "\
1307
- "subclasses, not #{new_index.class}" unless new_index.kind_of?(Daru::Index)
1290
+ raise ArgumentError, 'Must pass the new index of type Index or its '\
1291
+ "subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
1308
1292
 
1309
1293
  cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1310
1294
  new_index.each do |idx|
1311
- if @index.include?(idx)
1312
- cl.row[idx] = self.row[idx]
1313
- else
1314
- cl.row[idx] = [nil]*ncols
1315
- end
1295
+ cl.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
1316
1296
  end
1317
1297
 
1318
1298
  cl
@@ -1330,7 +1310,7 @@ module Daru
1330
1310
  # df.index.to_a #=> ['a','b','c','d']
1331
1311
  # df.row['a'].to_a #=> [1,11]
1332
1312
  def index= idx
1333
- @data.each { |vec| vec.index = idx}
1313
+ @data.each { |vec| vec.index = idx }
1334
1314
  @index = idx
1335
1315
 
1336
1316
  self
@@ -1347,8 +1327,8 @@ module Daru
1347
1327
  # df.vectors = Daru::Index.new([:foo, :bar, :baz])
1348
1328
  # df.vectors.to_a #=> [:foo, :bar, :baz]
1349
1329
  def vectors= idx
1350
- raise ArgumentError, "Can only reindex with Index and its subclasses" unless
1351
- index.kind_of?(Daru::Index)
1330
+ raise ArgumentError, 'Can only reindex with Index and its subclasses' unless
1331
+ index.is_a?(Daru::Index)
1352
1332
  raise ArgumentError, "Specified index length #{idx.size} not equal to"\
1353
1333
  "dataframe size #{ncols}" if idx.size != ncols
1354
1334
 
@@ -1356,13 +1336,35 @@ module Daru
1356
1336
  self
1357
1337
  end
1358
1338
 
1339
+ # Renames the vectors
1340
+ #
1341
+ # == Arguments
1342
+ #
1343
+ # * name_map - A hash where the keys are the existing vector names and
1344
+ # the values are the new names. If a vector is renamed
1345
+ # to a vector name that is already in use, the existing
1346
+ # one is overwritten.
1347
+ #
1348
+ # == Usage
1349
+ #
1350
+ # df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1351
+ # df.rename_vectors :a => :alpha, :c => :gamma
1352
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
1353
+ def rename_vectors name_map
1354
+ existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a
1355
+ delete_vectors(*existing_targets)
1356
+
1357
+ new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
1358
+ self.vectors = Daru::Index.new new_names
1359
+ end
1360
+
1359
1361
  # Return the indexes of all the numeric vectors. Will include vectors with nils
1360
1362
  # along with numbers.
1361
1363
  def numeric_vectors
1362
1364
  numerics = []
1363
1365
 
1364
1366
  each_vector_with_index do |vec, i|
1365
- numerics << i if(vec.type == :numeric)
1367
+ numerics << i if vec.type == :numeric
1366
1368
  end
1367
1369
  numerics
1368
1370
  end
@@ -1371,7 +1373,7 @@ module Daru
1371
1373
  numerics = []
1372
1374
 
1373
1375
  @vectors.each do |v|
1374
- numerics << v if (self[v].type == :numeric)
1376
+ numerics << v if self[v].type == :numeric
1375
1377
  end
1376
1378
  numerics
1377
1379
  end
@@ -1382,9 +1384,8 @@ module Daru
1382
1384
  def only_numerics opts={}
1383
1385
  cln = opts[:clone] == false ? false : true
1384
1386
  nv = numeric_vectors
1385
- arry = nv.inject([]) do |arr, v|
1387
+ arry = nv.each_with_object([]) do |v, arr|
1386
1388
  arr << self[v]
1387
- arr
1388
1389
  end
1389
1390
 
1390
1391
  order = Index.new(nv)
@@ -1392,12 +1393,12 @@ module Daru
1392
1393
  end
1393
1394
 
1394
1395
  # Generate a summary of this DataFrame with ReportBuilder.
1395
- def summary(method = :to_text)
1396
+ def summary(method=:to_text)
1396
1397
  ReportBuilder.new(no_title: true).add(self).send(method)
1397
1398
  end
1398
1399
 
1399
1400
  def report_building(b) # :nodoc: #
1400
- b.section(:name=>@name) do |g|
1401
+ b.section(name: @name) do |g|
1401
1402
  g.text "Number of rows: #{nrows}"
1402
1403
  @vectors.each do |v|
1403
1404
  g.text "Element:[#{v}]"
@@ -1406,8 +1407,8 @@ module Daru
1406
1407
  end
1407
1408
  end
1408
1409
 
1409
- # Sorts a dataframe (ascending/descending)according to the given sequence of
1410
- # vectors, using the attributes provided in the blocks.
1410
+ # Sorts a dataframe (ascending/descending) in the given priority sequence of
1411
+ # vectors, with or without a block.
1411
1412
  #
1412
1413
  # @param order [Array] The order of vector names in which the DataFrame
1413
1414
  # should be sorted.
@@ -1415,42 +1416,121 @@ module Daru
1415
1416
  # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1416
1417
  # or descending order. Specify Array corresponding to *order* for multiple
1417
1418
  # sort orders.
1418
- # @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects to
1419
+ # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1419
1420
  # be used for sorting, for each vector name in *order* as a hash of
1420
- # vector name and lambda pairs. In case a lambda for a vector is not
1421
+ # vector name and lambda expressions. In case a lambda for a vector is not
1421
1422
  # specified, the default will be used.
1423
+ # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1424
+ # automatically or not when a block is provided.
1425
+ # If set to True, nils will appear at top after sorting.
1422
1426
  #
1423
- # == Usage
1427
+ # @example Sort a dataframe with a vector sequence.
1428
+ #
1429
+ #
1430
+ # df = Daru::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1431
+ #
1432
+ # df.sort [:a, :b]
1433
+ # # =>
1434
+ # # <Daru::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1435
+ # # a b
1436
+ # # 2 1 3
1437
+ # # 0 1 5
1438
+ # # 3 2 2
1439
+ # # 1 2 4
1440
+ # # 4 3 1
1441
+ #
1442
+ # @example Sort a dataframe without a block. Here nils will be handled automatically.
1443
+ #
1444
+ # df = Daru::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1445
+ #
1446
+ # df.sort([:a])
1447
+ # # =>
1448
+ # # <Daru::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1449
+ # # a b
1450
+ # # 1 nil 3
1451
+ # # 3 nil 1
1452
+ # # 0 -3 4
1453
+ # # 2 -1 2
1454
+ # # 4 5 4
1455
+ #
1456
+ # @example Sort a dataframe with a block with nils handled automatically.
1457
+ #
1458
+ # df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1424
1459
  #
1425
- # df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
1460
+ # df.sort [:b], by: {b: lambda { |a| a.length } }
1461
+ # # NoMethodError: undefined method `length' for nil:NilClass
1462
+ # # from (pry):8:in `block in __pry__'
1426
1463
  #
1427
- # #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
1428
- # # a b
1429
- # # 0 -3 4
1430
- # # 1 2 3
1431
- # # 2 -1 2
1432
- # # 3 4 1
1433
- # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
1464
+ # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1465
+ #
1466
+ # # =>
1467
+ # # <Daru::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1468
+ # # a b
1469
+ # # 2 1 nil
1470
+ # # 5 1 nil
1471
+ # # 4 -1 x
1472
+ # # 1 -1 aa
1473
+ # # 0 nil aaa
1474
+ # # 3 nil baaa
1475
+ #
1476
+ # @example Sort a dataframe with a block with nils handled manually.
1477
+ #
1478
+ # df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1479
+ #
1480
+ # # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
1481
+ # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1482
+ #
1483
+ # # =>
1484
+ # #<Daru::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1485
+ # # a b
1486
+ # # 4 -1 x
1487
+ # # 1 -1 aa
1488
+ # # 0 nil aaa
1489
+ # # 3 nil baaa
1490
+ # # 2 1 nil
1491
+ # # 5 1 nil
1492
+
1434
1493
  def sort! vector_order, opts={}
1435
- raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
1494
+ raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1436
1495
  opts = {
1437
1496
  ascending: true,
1438
- type: :quick_sort,
1497
+ handle_nils: false,
1439
1498
  by: {}
1440
1499
  }.merge(opts)
1441
1500
 
1442
- opts[:by] = create_logic_blocks vector_order, opts[:by]
1443
1501
  opts[:ascending] = sort_order_array vector_order, opts[:ascending]
1444
- idx = @index.to_a
1445
- send(opts[:type], vector_order, idx, opts[:by], opts[:ascending])
1446
- self.index = Daru::Index.new(idx)
1502
+ opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils]
1503
+ blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending]
1504
+
1505
+ block = lambda do |r1, r2|
1506
+ # Build left and right array to compare two rows
1507
+ left = build_array_from_blocks vector_order, opts, blocks, r1, r2
1508
+ right = build_array_from_blocks vector_order, opts, blocks, r2, r1
1509
+
1510
+ # Resolve conflict by Index if all attributes are same
1511
+ left << r1
1512
+ right << r2
1513
+ left <=> right
1514
+ end
1515
+
1516
+ idx = (0..@index.size-1).sort(&block)
1517
+
1518
+ old_index = @index.to_a
1519
+ self.index = Daru::Index.new(idx.map { |i| old_index[i] })
1520
+
1521
+ vectors.each do |v|
1522
+ @data[@vectors[v]] = Daru::Vector.new(
1523
+ idx.map { |i| @data[@vectors[v]].data[i] },
1524
+ name: self[v].name, metadata: self[v].metadata.dup, index: index
1525
+ )
1526
+ end
1447
1527
 
1448
1528
  self
1449
1529
  end
1450
1530
 
1451
1531
  # Non-destructive version of #sort!
1452
1532
  def sort vector_order, opts={}
1453
- self.dup.sort! vector_order, opts
1533
+ dup.sort! vector_order, opts
1454
1534
  end
1455
1535
 
1456
1536
  # Pivots a data frame on specified vectors and applies an aggregate function
@@ -1489,25 +1569,27 @@ module Daru
1489
1569
  # # [:foo] 10 12
1490
1570
  def pivot_table opts={}
1491
1571
  raise ArgumentError,
1492
- "Specify grouping index" if !opts[:index] or opts[:index].empty?
1572
+ 'Specify grouping index' if !opts[:index] || opts[:index].empty?
1493
1573
 
1494
1574
  index = opts[:index]
1495
1575
  vectors = opts[:vectors] || []
1496
1576
  aggregate_function = opts[:agg] || :mean
1497
1577
  values =
1498
- if opts[:values].is_a?(Symbol)
1499
- [opts[:values]]
1500
- elsif opts[:values].is_a?(Array)
1501
- opts[:values]
1502
- else # nil
1503
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
1504
- end
1578
+ if opts[:values].is_a?(Symbol)
1579
+ [opts[:values]]
1580
+ elsif opts[:values].is_a?(Array)
1581
+ opts[:values]
1582
+ else # nil
1583
+ (@vectors.to_a - (index | vectors)) & numeric_vector_names
1584
+ end
1505
1585
 
1506
- raise IndexError, "No numeric vectors to aggregate" if values.empty?
1586
+ raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1507
1587
 
1508
- grouped = group_by(index)
1588
+ grouped = group_by(index)
1509
1589
 
1510
- unless vectors.empty?
1590
+ if vectors.empty?
1591
+ grouped.send(aggregate_function)
1592
+ else
1511
1593
  super_hash = {}
1512
1594
  values.each do |value|
1513
1595
  grouped.groups.each do |group_name, row_numbers|
@@ -1548,8 +1630,6 @@ module Daru
1548
1630
  end
1549
1631
  end
1550
1632
  return pivoted_dataframe
1551
- else
1552
- grouped.send(aggregate_function)
1553
1633
  end
1554
1634
  end
1555
1635
 
@@ -1561,8 +1641,8 @@ module Daru
1561
1641
  raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
1562
1642
 
1563
1643
  new_fields = (@vectors.to_a + other_df.vectors.to_a)
1564
- .recode_repeated
1565
- .map(&:to_sym)
1644
+ .recode_repeated
1645
+ .map(&:to_sym)
1566
1646
  df_new = DataFrame.new({}, order: new_fields)
1567
1647
 
1568
1648
  (0...nrows).to_a.each do |i|
@@ -1603,7 +1683,6 @@ module Daru
1603
1683
  Daru::Core::Merge.join(self, other_df, opts)
1604
1684
  end
1605
1685
 
1606
-
1607
1686
  # Creates a new dataset for one to many relations
1608
1687
  # on a dataset, based on pattern of field names.
1609
1688
  #
@@ -1632,26 +1711,25 @@ module Daru
1632
1711
  # # ["white", "2", 20]
1633
1712
  # # ]
1634
1713
  def one_to_many(parent_fields, pattern)
1635
- re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
1714
+ re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
1636
1715
  ds_vars = parent_fields.dup
1637
1716
  vars = []
1638
1717
  max_n = 0
1639
- h = parent_fields.inject({}) { |a,v|
1718
+ h = parent_fields.each_with_object({}) { |v, a|
1640
1719
  a[v] = Daru::Vector.new([])
1641
- a
1642
1720
  }
1643
1721
  # Adding _col_id
1644
1722
  h['_col_id'] = Daru::Vector.new([])
1645
1723
  ds_vars.push('_col_id')
1646
1724
 
1647
1725
  @vectors.each do |f|
1648
- if f =~ re
1649
- if !vars.include? $1
1650
- vars.push($1)
1651
- h[$1] = Daru::Vector.new([])
1652
- end
1653
- max_n = $2.to_i if max_n < $2.to_i
1726
+ next unless f =~ re
1727
+ unless vars.include? $1
1728
+ vars.push($1)
1729
+ h[$1] = Daru::Vector.new([])
1654
1730
  end
1731
+
1732
+ max_n = $2.to_i if max_n < $2.to_i
1655
1733
  end
1656
1734
  ds = DataFrame.new(h, order: ds_vars+vars)
1657
1735
 
@@ -1662,12 +1740,12 @@ module Daru
1662
1740
  end
1663
1741
 
1664
1742
  max_n.times do |n1|
1665
- n = n1+1
1743
+ n = n1+1
1666
1744
  any_data = false
1667
1745
  vars.each do |v|
1668
- data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
1746
+ data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
1669
1747
  row_out[v] = data
1670
- any_data = true if !data.nil?
1748
+ any_data = true unless data.nil?
1671
1749
  end
1672
1750
 
1673
1751
  if any_data
@@ -1685,7 +1763,7 @@ module Daru
1685
1763
  i = 1
1686
1764
  split.each { |k,v|
1687
1765
  new_field = name_.to_s + join + i.to_s
1688
- v.rename name_.to_s + ":" + k.to_s
1766
+ v.rename name_.to_s + ':' + k.to_s
1689
1767
  self[new_field.to_sym] = v
1690
1768
  i += 1
1691
1769
  }
@@ -1707,11 +1785,11 @@ module Daru
1707
1785
  # ds.create_sql('names')
1708
1786
  # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
1709
1787
  #
1710
- def create_sql(table,charset="UTF8")
1788
+ def create_sql(table,charset='UTF8')
1711
1789
  sql = "CREATE TABLE #{table} ("
1712
- fields = self.vectors.to_a.collect do |f|
1790
+ fields = vectors.to_a.collect do |f|
1713
1791
  v = self[f]
1714
- f.to_s + " " + v.db_type
1792
+ f.to_s + ' ' + v.db_type
1715
1793
  end
1716
1794
 
1717
1795
  sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
@@ -1724,14 +1802,14 @@ module Daru
1724
1802
  numerics_as_arrays << self[n].to_a
1725
1803
  end
1726
1804
 
1727
- GSL::Matrix.alloc *numerics_as_arrays.transpose
1805
+ GSL::Matrix.alloc(*numerics_as_arrays.transpose)
1728
1806
  end
1729
1807
 
1730
1808
  # Convert all vectors of type *:numeric* into a Matrix.
1731
1809
  def to_matrix
1732
1810
  numerics_as_arrays = []
1733
1811
  each_vector do |vector|
1734
- numerics_as_arrays << vector.to_a if(vector.type == :numeric)
1812
+ numerics_as_arrays << vector.to_a if vector.type == :numeric
1735
1813
  end
1736
1814
 
1737
1815
  Matrix.columns numerics_as_arrays
@@ -1746,8 +1824,8 @@ module Daru
1746
1824
  def to_nmatrix
1747
1825
  numerics_as_arrays = []
1748
1826
  each_vector do |vector|
1749
- numerics_as_arrays << vector.to_a if(vector.type == :numeric and
1750
- vector.missing_positions.size == 0)
1827
+ numerics_as_arrays << vector.to_a if vector.type == :numeric &&
1828
+ vector.missing_positions.empty?
1751
1829
  end
1752
1830
 
1753
1831
  numerics_as_arrays.transpose.to_nm
@@ -1760,8 +1838,8 @@ module Daru
1760
1838
  # in the array of hashes, which has the same index.
1761
1839
  def to_a
1762
1840
  arry = [[],[]]
1763
- self.each_row do |row|
1764
- arry[0] << row.to_hash
1841
+ each_row do |row|
1842
+ arry[0] << row.to_h
1765
1843
  end
1766
1844
  arry[1] = @index.to_a
1767
1845
 
@@ -1772,15 +1850,15 @@ module Daru
1772
1850
  # in the JSON thus created.
1773
1851
  def to_json no_index=true
1774
1852
  if no_index
1775
- self.to_a[0].to_json
1853
+ to_a[0].to_json
1776
1854
  else
1777
- self.to_a.to_json
1855
+ to_a.to_json
1778
1856
  end
1779
1857
  end
1780
1858
 
1781
- # Converts DataFrame to a hash with keys as vector names and values as
1859
+ # Converts DataFrame to a hash (explicit) with keys as vector names and values as
1782
1860
  # the corresponding vectors.
1783
- def to_hash
1861
+ def to_h
1784
1862
  hsh = {}
1785
1863
  @vectors.each_with_index do |vec_name, idx|
1786
1864
  hsh[vec_name] = @data[idx]
@@ -1791,12 +1869,12 @@ module Daru
1791
1869
 
1792
1870
  # Convert to html for IRuby.
1793
1871
  def to_html threshold=30
1794
- html = "<table>" +
1795
- "<tr>" +
1796
- "<th colspan=\"#{@vectors.size+1}\">" +
1797
- "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
1798
- "</th>" +
1799
- "</tr>"
1872
+ html = '<table>' \
1873
+ '<tr>' \
1874
+ "<th colspan=\"#{@vectors.size+1}\">" \
1875
+ "Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \
1876
+ '</th>' \
1877
+ '</tr>'
1800
1878
  html +='<tr><th></th>'
1801
1879
  @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
1802
1880
  html += '</tr>'
@@ -1805,26 +1883,26 @@ module Daru
1805
1883
  html += '<tr>'
1806
1884
  html += '<td>' + index.to_s + '</td>'
1807
1885
 
1808
- self.row[index].each do |element|
1886
+ row[index].each do |element|
1809
1887
  html += '<td>' + element.to_s + '</td>'
1810
1888
  end
1811
1889
 
1812
1890
  html += '</tr>'
1813
- if num > threshold
1814
- html += '<tr>'
1815
- (@vectors.size + 1).times { html += '<td>...</td>' }
1816
- html += '</tr>'
1817
-
1818
- last_index = @index.to_a.last
1819
- last_row = self.row[last_index]
1820
- html += '<tr>'
1821
- html += "<td>" + last_index.to_s + "</td>"
1822
- (0..(ncols - 1)).to_a.each do |i|
1823
- html += '<td>' + last_row[i].to_s + '</td>'
1824
- end
1825
- html += '</tr>'
1826
- break
1891
+ next if num <= threshold
1892
+
1893
+ html += '<tr>'
1894
+ (@vectors.size + 1).times { html += '<td>...</td>' }
1895
+ html += '</tr>'
1896
+
1897
+ last_index = @index.to_a.last
1898
+ last_row = row[last_index]
1899
+ html += '<tr>'
1900
+ html += '<td>' + last_index.to_s + '</td>'
1901
+ (0..(ncols - 1)).to_a.each do |i|
1902
+ html += '<td>' + last_row[i].to_s + '</td>'
1827
1903
  end
1904
+ html += '</tr>'
1905
+ break
1828
1906
  end
1829
1907
  html += '</table>'
1830
1908
 
@@ -1841,7 +1919,7 @@ module Daru
1841
1919
  # assignment/deletion of elements is done. Updating data this way is called
1842
1920
  # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
1843
1921
  def update
1844
- @data.each { |v| v.update } if Daru.lazy_update
1922
+ @data.each(&:update) if Daru.lazy_update
1845
1923
  end
1846
1924
 
1847
1925
  # Rename the DataFrame.
@@ -1890,19 +1968,18 @@ module Daru
1890
1968
  Daru::IO.dataframe_write_sql self, dbh, table
1891
1969
  end
1892
1970
 
1893
-
1894
1971
  # Use marshalling to save dataframe to a file.
1895
1972
  def save filename
1896
1973
  Daru::IO.save self, filename
1897
1974
  end
1898
1975
 
1899
- def _dump depth
1900
- Marshal.dump({
1976
+ def _dump(_depth)
1977
+ Marshal.dump(
1901
1978
  data: @data,
1902
1979
  index: @index.to_a,
1903
1980
  order: @vectors.to_a,
1904
1981
  name: @name
1905
- })
1982
+ )
1906
1983
  end
1907
1984
 
1908
1985
  def self._load data
@@ -1939,29 +2016,29 @@ module Daru
1939
2016
  longest = [@name.to_s.size,
1940
2017
  (@vectors.map(&:to_s).map(&:size).max || 0),
1941
2018
  (@index .map(&:to_s).map(&:size).max || 0),
1942
- (@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
2019
+ (@data .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max
1943
2020
 
1944
2021
  name = @name || 'nil'
1945
- content = ""
2022
+ content = ''
1946
2023
  longest = spacing if longest > spacing
1947
2024
  formatter = "\n"
1948
2025
 
1949
2026
  (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
1950
- content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
1951
- name.to_s + " @size = " + @size.to_s + ">"
1952
- content += sprintf formatter, "" , *@vectors.map(&:to_s)
2027
+ content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
2028
+ name.to_s + ' @size = ' + @size.to_s + '>'
2029
+ content += formatter % ['', *@vectors.map(&:to_s)]
1953
2030
  row_num = 1
1954
2031
 
1955
- self.each_row_with_index do |row, index|
1956
- content += sprintf formatter, index.to_s, *row.to_hash.values.map { |e| (e || 'nil').to_s }
2032
+ each_row_with_index do |row, index|
2033
+ content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
1957
2034
  row_num += 1
1958
- if row_num > threshold
1959
- dots = []
2035
+ next if row_num <= threshold
1960
2036
 
1961
- (@vectors.size + 1).times { dots << "..." }
1962
- content += sprintf formatter, *dots
1963
- break
1964
- end
2037
+ dots = []
2038
+
2039
+ (@vectors.size + 1).times { dots << '...' }
2040
+ content += formatter % dots
2041
+ break
1965
2042
  end
1966
2043
  content += "\n"
1967
2044
 
@@ -1974,24 +2051,24 @@ module Daru
1974
2051
  end
1975
2052
 
1976
2053
  def == other
1977
- self.class == other.class and
1978
- @size == other.size and
1979
- @index == other.index and
1980
- @vectors == other.vectors and
1981
- @vectors.to_a.all? { |v| self[v] == other[v] }
2054
+ self.class == other.class &&
2055
+ @size == other.size &&
2056
+ @index == other.index &&
2057
+ @vectors == other.vectors &&
2058
+ @vectors.to_a.all? { |v| self[v] == other[v] }
1982
2059
  end
1983
2060
 
1984
2061
  def method_missing(name, *args, &block)
1985
- if md = name.match(/(.+)\=/)
1986
- insert_or_modify_vector name[/(.+)\=/].delete("=").to_sym, args[0]
1987
- elsif self.has_vector? name
2062
+ if name =~ /(.+)\=/
2063
+ insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
2064
+ elsif has_vector? name
1988
2065
  self[name]
1989
2066
  else
1990
2067
  super(name, *args, &block)
1991
2068
  end
1992
2069
  end
1993
2070
 
1994
- private
2071
+ private
1995
2072
 
1996
2073
  def possibly_multi_index? index
1997
2074
  if @index.is_a?(MultiIndex)
@@ -2001,101 +2078,51 @@ module Daru
2001
2078
  end
2002
2079
  end
2003
2080
 
2004
- def quick_sort vector_order, index, by, ascending
2005
- recursive_quick_sort vector_order, index, by, ascending, 0, @size-1
2006
- end
2007
-
2008
- # == Arguments
2009
- #
2010
- # vector_order -
2011
- # index -
2012
- # by -
2013
- # ascending -
2014
- # left_lower -
2015
- # right_upper -
2016
- def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
2017
- if left_lower < right_upper
2018
- left_upper, right_lower = partition(vector_order, index, by, ascending, left_lower, right_upper)
2019
- if left_upper - left_lower < right_upper - right_lower
2020
- recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
2021
- recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
2022
- else
2023
- recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
2024
- recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
2025
- end
2026
- end
2027
- end
2028
-
2029
- def partition vector_order, index, by, ascending, left_lower, right_upper
2030
- mindex = (left_lower + right_upper) / 2
2031
- mvalues = vector_order.inject([]) { |a, vector_name| a << self[vector_name][mindex]; a }
2032
- i = left_lower
2033
- j = right_upper
2034
- descending = ascending.map { |a| !a }
2035
-
2036
- i += 1 while(keep?(i, mvalues, vector_order, ascending , by, 0))
2037
- j -= 1 while(keep?(j, mvalues, vector_order, descending, by, 0))
2038
-
2039
- while i < j - 1
2040
- @data.each do |vector|
2041
- vector[i], vector[j] = vector[j], vector[i]
2042
- end
2043
- index[i], index[j] = index[j], index[i]
2044
- i += 1
2045
- j -= 1
2046
-
2047
- i += 1 while(keep?(i, mvalues, vector_order, ascending , by,0))
2048
- j -= 1 while(keep?(j, mvalues, vector_order, descending, by,0))
2049
- end
2050
-
2051
- if i <= j
2052
- if i < j
2053
- @data.each do |vector|
2054
- vector[i], vector[j] = vector[j], vector[i]
2081
+ def create_logic_blocks vector_order, _by, ascending
2082
+ # Create blocks to handle nils
2083
+ blocks = {}
2084
+ universal_block_ascending = ->(a) { [a.nil? ? 0 : 1, a] }
2085
+ universal_block_decending = ->(a) { [a.nil? ? 1 : 0, a] }
2086
+ vector_order.each_with_index do |vector, i|
2087
+ blocks[vector] =
2088
+ if ascending[i]
2089
+ universal_block_ascending
2090
+ else
2091
+ universal_block_decending
2055
2092
  end
2056
- index[i], index[j] = index[j], index[i]
2057
- end
2058
- i += 1
2059
- j -= 1
2060
2093
  end
2061
2094
 
2062
- [j,i]
2095
+ blocks
2063
2096
  end
2064
2097
 
2065
- def keep? current_index, mvalues, vector_order, sort_order, by, vector_order_index
2066
- vector_name = vector_order[vector_order_index]
2067
- if vector_name
2068
- vec = self[vector_name]
2069
- eval = by[vector_name].call(vec[current_index], mvalues[vector_order_index])
2098
+ def build_array_from_blocks vector_order, opts, blocks, r1, r2
2099
+ # Create an array to be used for comparison of two rows in sorting
2100
+ vector_order.map.each_with_index do |v, i|
2101
+ value = if opts[:ascending][i]
2102
+ @data[@vectors[v]].data[r1]
2103
+ else
2104
+ @data[@vectors[v]].data[r2]
2105
+ end
2070
2106
 
2071
- if sort_order[vector_order_index] # sort in ascending order
2072
- return false if eval == 1
2073
- return true if eval == -1
2074
- if eval == 0
2075
- keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
2076
- end
2077
- else # sort in descending order
2078
- return false if eval == -1
2079
- return true if eval == 1
2080
- if eval == 0
2081
- keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
2082
- end
2083
- end
2084
- end
2085
- end
2107
+ if opts[:by][v] && !opts[:handle_nils][i]
2108
+ # Block given and nils handled manually
2109
+ value = opts[:by][v].call value
2086
2110
 
2087
- def create_logic_blocks vector_order, by={}
2088
- universal_block = lambda { |a,b| a <=> b }
2089
- vector_order.each do |vector|
2090
- by[vector] ||= universal_block
2091
- end
2111
+ elsif opts[:by][v] && opts[:handle_nils][i]
2112
+ # Block given and nils handled automatically
2113
+ value = opts[:by][v].call value rescue nil
2114
+ blocks[v].call value
2092
2115
 
2093
- by
2116
+ else
2117
+ # Block not given and nils handled automatically
2118
+ blocks[v].call value
2119
+ end
2120
+ end
2094
2121
  end
2095
2122
 
2096
2123
  def sort_order_array vector_order, ascending
2097
- if ascending.is_a?(Array)
2098
- raise ArgumentError, "Specify same number of vector names and sort orders" if
2124
+ if ascending.is_a? Array
2125
+ raise ArgumentError, 'Specify same number of vector names and sort orders' if
2099
2126
  vector_order.size != ascending.size
2100
2127
  return ascending
2101
2128
  else
@@ -2103,6 +2130,16 @@ module Daru
2103
2130
  end
2104
2131
  end
2105
2132
 
2133
+ def handle_nils_array vector_order, handle_nils
2134
+ if handle_nils.is_a? Array
2135
+ raise ArgumentError, 'Specify same number of vector names and handle nils' if
2136
+ vector_order.size != handle_nils.size
2137
+ return handle_nils
2138
+ else
2139
+ Array.new(vector_order.size, handle_nils)
2140
+ end
2141
+ end
2142
+
2106
2143
  def vectors_index_for location
2107
2144
  if @vectors.include?(location)
2108
2145
  @vectors[location]
@@ -2118,39 +2155,35 @@ module Daru
2118
2155
  if @vectors.is_a?(MultiIndex)
2119
2156
  pos = @vectors[names]
2120
2157
 
2121
- if pos.is_a?(Integer)
2122
- return @data[pos]
2123
- else # MultiIndex
2124
- new_vectors = pos.map do |tuple|
2125
- @data[@vectors[tuple]]
2126
- end
2158
+ return @data[pos] if pos.is_a?(Integer)
2127
2159
 
2128
- if !location.is_a?(Range) and names.size < @vectors.width
2129
- pos = pos.drop_left_level names.size
2130
- end
2160
+ # MultiIndex
2161
+ new_vectors = pos.map do |tuple|
2162
+ @data[@vectors[tuple]]
2163
+ end
2131
2164
 
2132
- Daru::DataFrame.new(
2133
- new_vectors, index: @index, order: pos)
2165
+ if !location.is_a?(Range) && names.size < @vectors.width
2166
+ pos = pos.drop_left_level names.size
2134
2167
  end
2168
+
2169
+ Daru::DataFrame.new(new_vectors, index: @index, order: pos)
2135
2170
  else
2136
2171
  unless names[1]
2137
2172
  pos = @vectors[location]
2138
2173
 
2139
- if pos.is_a?(Numeric)
2140
- return @data[pos]
2141
- else
2142
- names = pos
2143
- end
2174
+ return @data[pos] if pos.is_a?(Numeric)
2175
+
2176
+ names = pos
2144
2177
  end
2145
2178
 
2146
- new_vcs = []
2179
+ new_vectors = {}
2147
2180
  names.each do |name|
2148
- new_vcs << @data[@vectors[name]].to_a
2181
+ new_vectors[name] = @data[@vectors[name]]
2149
2182
  end
2150
2183
 
2151
2184
  order = names.is_a?(Array) ? Daru::Index.new(names) : names
2152
- Daru::DataFrame.new(new_vcs, order: order,
2153
- index: @index, name: @name)
2185
+ Daru::DataFrame.new(new_vectors, order: order,
2186
+ index: @index, name: @name)
2154
2187
  end
2155
2188
  end
2156
2189
 
@@ -2161,16 +2194,15 @@ module Daru
2161
2194
  pos = @index[names]
2162
2195
  if pos.is_a?(Integer)
2163
2196
  return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
2164
- else
2165
- new_rows = pos.map { |tuple| populate_row_for(tuple) }
2197
+ end
2166
2198
 
2167
- if !location.is_a?(Range) and names.size < @index.width
2168
- pos = pos.drop_left_level names.size
2169
- end
2199
+ new_rows = pos.map { |tuple| populate_row_for(tuple) }
2170
2200
 
2171
- Daru::DataFrame.rows(
2172
- new_rows, order: @vectors, name: @name, index: pos)
2201
+ if !location.is_a?(Range) && names.size < @index.width
2202
+ pos = pos.drop_left_level names.size
2173
2203
  end
2204
+
2205
+ Daru::DataFrame.rows(new_rows, order: @vectors, name: @name, index: pos)
2174
2206
  else
2175
2207
  if names[1].nil?
2176
2208
  names = @index[location]
@@ -2189,7 +2221,7 @@ module Daru
2189
2221
  rows << self.row[name].to_a
2190
2222
  end
2191
2223
 
2192
- Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
2224
+ Daru::DataFrame.rows rows, index: names,name: @name, order: @vectors
2193
2225
  end
2194
2226
  end
2195
2227
 
@@ -2201,17 +2233,22 @@ module Daru
2201
2233
 
2202
2234
  def insert_or_modify_vector name, vector
2203
2235
  name = name[0] unless @vectors.is_a?(MultiIndex)
2204
- v = nil
2236
+ vec = nil
2205
2237
 
2206
2238
  if @index.empty?
2207
- v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
2208
- @index = v.index
2209
- assign_or_add_vector name, v
2239
+ vec = if vector.is_a?(Daru::Vector)
2240
+ vector
2241
+ else
2242
+ Daru::Vector.new(vector.to_a, name: set_name(name))
2243
+ end
2244
+
2245
+ @index = vec.index
2246
+ assign_or_add_vector name, vec
2210
2247
  set_size
2211
2248
 
2212
2249
  @data.map! do |v|
2213
- if v.size == 0
2214
- Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
2250
+ if v.empty?
2251
+ Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
2215
2252
  else
2216
2253
  v
2217
2254
  end
@@ -2219,15 +2256,11 @@ module Daru
2219
2256
  else
2220
2257
  if vector.is_a?(Daru::Vector)
2221
2258
  if vector.index == @index # so that index-by-index assignment is avoided when possible.
2222
- v = vector.dup
2259
+ vec = vector.dup
2223
2260
  else
2224
- v = Daru::Vector.new [], name: set_name(name), index: @index
2261
+ vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
2225
2262
  @index.each do |idx|
2226
- if vector.index.include? idx
2227
- v[idx] = vector[idx]
2228
- else
2229
- v[idx] = nil
2230
- end
2263
+ vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
2231
2264
  end
2232
2265
  end
2233
2266
  else
@@ -2235,26 +2268,30 @@ module Daru
2235
2268
  "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2236
2269
  @size != vector.size
2237
2270
 
2238
- v = Daru::Vector.new(vector, name: set_name(name), index: @index)
2271
+ vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
2239
2272
  end
2240
2273
 
2241
- assign_or_add_vector name, v
2274
+ assign_or_add_vector name, vec
2242
2275
  end
2243
2276
  end
2244
2277
 
2245
2278
  def assign_or_add_vector name, v
2246
- #FIXME: fix this jugaad. need to make changes in Indexing itself.
2247
- pos = @vectors[name]
2279
+ # FIXME: fix this jugaad. need to make changes in Indexing itself.
2280
+ begin
2281
+ pos = @vectors[name]
2282
+ rescue IndexError
2283
+ pos = name
2284
+ end
2248
2285
 
2249
- if !pos.kind_of?(Daru::Index) and pos == name and
2250
- (@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size))
2286
+ if !pos.is_a?(Daru::Index) && pos == name &&
2287
+ (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
2251
2288
  @data[pos] = v
2252
- elsif pos.kind_of?(Daru::Index)
2289
+ elsif pos.is_a?(Daru::Index)
2253
2290
  pos.each do |p|
2254
2291
  @data[@vectors[p]] = v
2255
2292
  end
2256
2293
  else
2257
- @vectors = @vectors | [name] if !@vectors.include?(name)
2294
+ @vectors |= [name] unless @vectors.include?(name)
2258
2295
  @data[@vectors[name]] = v
2259
2296
  end
2260
2297
  end
@@ -2264,21 +2301,21 @@ module Daru
2264
2301
  # TODO
2265
2302
  else
2266
2303
  name = name[0]
2267
- v =
2268
- if vector.is_a?(Daru::Vector)
2269
- vector
2270
- else
2271
- Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2272
- end
2304
+ vec =
2305
+ if vector.is_a?(Daru::Vector)
2306
+ vector
2307
+ else
2308
+ Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2309
+ end
2273
2310
 
2274
2311
  if @index.include? name
2275
- self.each_vector_with_index do |vector,i|
2276
- vector[name] = v.index.include?(i) ? v[i] : nil
2312
+ each_vector_with_index do |v,i|
2313
+ v[name] = vec.index.include?(i) ? vec[i] : nil
2277
2314
  end
2278
2315
  else
2279
- @index = @index | [name]
2280
- self.each_vector_with_index do |vector,i|
2281
- vector.concat((v.index.include?(i) ? v[i] : nil), name)
2316
+ @index |= [name]
2317
+ each_vector_with_index do |v,i|
2318
+ v.concat((vec.index.include?(i) ? vec[i] : nil), name)
2282
2319
  end
2283
2320
  end
2284
2321
 
@@ -2294,15 +2331,15 @@ module Daru
2294
2331
 
2295
2332
  def validate_labels
2296
2333
  raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
2297
- @vectors and @vectors.size != @data.size
2334
+ @vectors && @vectors.size != @data.size
2298
2335
 
2299
- raise IndexError, "Expected number of indexes same as number of rows" if
2300
- @index and @data[0] and @index.size != @data[0].size
2336
+ raise IndexError, 'Expected number of indexes same as number of rows' if
2337
+ @index && @data[0] && @index.size != @data[0].size
2301
2338
  end
2302
2339
 
2303
2340
  def validate_vector_sizes
2304
2341
  @data.each do |vector|
2305
- raise IndexError, "Expected vectors with equal length" if vector.size != @size
2342
+ raise IndexError, 'Expected vectors with equal length' if vector.size != @size
2306
2343
  end
2307
2344
  end
2308
2345
 
@@ -2332,14 +2369,14 @@ module Daru
2332
2369
  end
2333
2370
 
2334
2371
  def create_vectors_index_with vectors, source
2335
- vectors = source.keys.sort_by { |a| a.to_s } if vectors.nil?
2372
+ vectors = source.keys.sort_by(&:to_s) if vectors.nil?
2336
2373
 
2337
2374
  @vectors =
2338
- unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
2339
- Daru::Index.new((vectors + (source.keys - vectors)).uniq)
2340
- else
2341
- vectors
2342
- end
2375
+ if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
2376
+ vectors
2377
+ else
2378
+ Daru::Index.new((vectors + (source.keys - vectors)).uniq)
2379
+ end
2343
2380
  end
2344
2381
 
2345
2382
  def all_vectors_have_equal_indexes? source
@@ -2351,24 +2388,24 @@ module Daru
2351
2388
  end
2352
2389
 
2353
2390
  def try_create_index index
2354
- index.kind_of?(Index) ? index : Daru::Index.new(index)
2391
+ index.is_a?(Index) ? index : Daru::Index.new(index)
2355
2392
  end
2356
2393
 
2357
- def set_name potential_name
2394
+ def set_name potential_name # rubocop:disable Style/AccessorMethodName
2358
2395
  potential_name.is_a?(Array) ? potential_name.join : potential_name
2359
2396
  end
2360
2397
 
2361
2398
  def symbolize arry
2362
2399
  symbolized_arry =
2363
- if arry.all? { |e| e.is_a?(Array) }
2364
- arry.map do |sub_arry|
2365
- sub_arry.map do |e|
2366
- e.is_a?(Numeric) ? e : e.to_sym
2400
+ if arry.all? { |e| e.is_a?(Array) }
2401
+ arry.map do |sub_arry|
2402
+ sub_arry.map do |e|
2403
+ e.is_a?(Numeric) ? e : e.to_sym
2404
+ end
2367
2405
  end
2406
+ else
2407
+ arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
2368
2408
  end
2369
- else
2370
- arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
2371
- end
2372
2409
 
2373
2410
  symbolized_arry
2374
2411
  end