daru 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +99 -0
  4. data/.rubocop_todo.yml +44 -0
  5. data/.travis.yml +3 -1
  6. data/CONTRIBUTING.md +5 -1
  7. data/History.md +43 -0
  8. data/README.md +3 -4
  9. data/benchmarks/duplicating.rb +45 -0
  10. data/benchmarks/group_by.rb +7 -7
  11. data/benchmarks/joining.rb +52 -0
  12. data/benchmarks/sorting.rb +9 -2
  13. data/benchmarks/statistics.rb +39 -0
  14. data/daru.gemspec +4 -4
  15. data/lib/daru.rb +9 -9
  16. data/lib/daru/accessors/array_wrapper.rb +15 -11
  17. data/lib/daru/accessors/dataframe_by_row.rb +1 -1
  18. data/lib/daru/accessors/gsl_wrapper.rb +30 -19
  19. data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
  20. data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
  21. data/lib/daru/core/group_by.rb +69 -16
  22. data/lib/daru/core/merge.rb +135 -151
  23. data/lib/daru/core/query.rb +9 -30
  24. data/lib/daru/dataframe.rb +476 -439
  25. data/lib/daru/date_time/index.rb +150 -137
  26. data/lib/daru/date_time/offsets.rb +45 -41
  27. data/lib/daru/extensions/rserve.rb +4 -4
  28. data/lib/daru/index.rb +88 -64
  29. data/lib/daru/io/io.rb +33 -34
  30. data/lib/daru/io/sql_data_source.rb +11 -11
  31. data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
  32. data/lib/daru/maths/arithmetic/vector.rb +9 -14
  33. data/lib/daru/maths/statistics/dataframe.rb +89 -61
  34. data/lib/daru/maths/statistics/vector.rb +226 -97
  35. data/lib/daru/monkeys.rb +23 -30
  36. data/lib/daru/plotting/dataframe.rb +27 -28
  37. data/lib/daru/plotting/vector.rb +12 -13
  38. data/lib/daru/vector.rb +221 -330
  39. data/lib/daru/version.rb +2 -2
  40. data/spec/core/group_by_spec.rb +16 -0
  41. data/spec/core/merge_spec.rb +30 -14
  42. data/spec/dataframe_spec.rb +268 -14
  43. data/spec/index_spec.rb +23 -5
  44. data/spec/io/io_spec.rb +37 -16
  45. data/spec/math/statistics/dataframe_spec.rb +40 -8
  46. data/spec/math/statistics/vector_spec.rb +135 -10
  47. data/spec/monkeys_spec.rb +3 -3
  48. data/spec/vector_spec.rb +157 -25
  49. metadata +41 -21
@@ -4,17 +4,17 @@ module Daru
4
4
  class << self
5
5
  def replace_keys_if_duplicates hash, matcher
6
6
  matched = nil
7
- hash.keys.each { |d|
7
+ hash.keys.each { |d|
8
8
  if matcher.match(Regexp.new(d.to_s))
9
9
  matched = d
10
10
  break
11
- end
11
+ end
12
12
  }
13
13
 
14
- if matched
15
- hash[matcher] = hash[matched]
16
- hash.delete matched
17
- end
14
+ return unless matched
15
+
16
+ hash[matcher] = hash[matched]
17
+ hash.delete matched
18
18
  end
19
19
 
20
20
  def resolve_duplicates df_hash1, df_hash2, on
@@ -29,198 +29,182 @@ module Daru
29
29
  end
30
30
 
31
31
  def hashify df
32
- hsh = df.to_hash
32
+ hsh = df.to_h
33
33
  hsh.each { |k,v| hsh[k] = v.to_a }
34
34
  hsh
35
35
  end
36
-
36
+
37
37
  def arrayify df
38
38
  arr = df.to_a
39
39
  col_names = arr[0][0].keys
40
- values = arr[0].map{|h| h.values}
40
+ values = arr[0].map(&:values)
41
41
 
42
- return col_names, values
42
+ [col_names, values]
43
43
  end
44
44
 
45
- def inner_join df1, df2, df_hash1, df_hash2, on
46
- joined_hash = {}
47
- ((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
48
- joined_hash[k] = []
49
- end
45
+ def arrayify_with_sort_keys(size, df_hash, on)
46
+ # Converting to a hash and then to an array is more complex
47
+ # than using df.to_a or df.map(:row). However, it's
48
+ # substantially faster this way.
50
49
 
51
- (0...df1.size).each do |id1|
52
- (0...df2.size).each do |id2|
53
- if on.all? { |n| df_hash1[n][id1] == df_hash2[n][id2] }
54
- joined_hash.each do |k,v|
55
- v << (df_hash1.has_key?(k) ? df_hash1[k][id1] : df_hash2[k][id2])
56
- end
57
- end
58
- end
50
+ # idx_keys = on.map { |key| df_hash.keys.index(key) }
51
+
52
+ (0...size).reduce([]) do |r, idx|
53
+ key_values = on.map { |col| df_hash[col][idx] }
54
+ row_values = df_hash.map { |_col, val| val[idx] }
55
+ r << [key_values, row_values]
59
56
  end
60
57
 
61
- Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
58
+ # Conceptually simpler and does the same thing, but slows down the
59
+ # total merge algorithm by 2x. Would be nice to improve the performance
60
+ # of df.map(:row)
61
+ #
62
+ # df.map(:row) do |row|
63
+ # key_values = on.map { |key| row[key] }
64
+ # [key_values, row.to_a]
65
+ # end
62
66
  end
63
67
 
64
- def bf_inner_join df1, df2, on
65
- col_names1, table1 = arrayify df1
66
- col_names2, table2 = arrayify df2
67
-
68
- #resolve duplicates
69
- indicies1 = on.map{|i| col_names1.index(i)}
70
- indicies2 = on.map{|i| col_names2.index(i)}
71
- col_names2.map! do |name|
72
- if (col_names1.include?(name))
73
- col_names1[col_names1.index(name)] = (name.to_s + "_1").to_sym unless on.include?(name)
74
- (name.to_s + "_2").to_sym
75
- else
76
- name
77
- end
78
- end
68
+ def verify_dataframes df_hash1, df_hash2, on
69
+ raise ArgumentError,
70
+ 'All fields in :on must be present in self' unless on.all? { |e| df_hash1[e] }
71
+ raise ArgumentError,
72
+ 'All fields in :on must be present in other DF' unless on.all? { |e| df_hash2[e] }
73
+ end
74
+ end
75
+ end
79
76
 
80
- #combine key columns to a single column value
81
- on_cols1 = table1.flat_map{|x| indicies1.map{|i| x[i].to_s}.join("+")}
82
- on_cols2 = table2.flat_map{|x| indicies2.map{|i| x[i].to_s}.join("+")}
77
+ class MergeFrame
78
+ def initialize(df1, df2, on: nil)
79
+ @df1 = df1
80
+ @df2 = df2
81
+ @on = on
82
+ end
83
83
 
84
- #parameters for a BF with approx 0.1% false positives
85
- m = on_cols2.size * 15
86
- k = 11
84
+ def inner _opts
85
+ merge_join(left: false, right: false)
86
+ end
87
87
 
88
- bf = BloomFilter::Native.new({:size => m, :hashes => k, :bucket => 1})
89
- on_cols2.each{|x| bf.insert(x)}
88
+ def left _opts
89
+ merge_join(left: true, right: false)
90
+ end
90
91
 
91
- x_ind = -1
92
- joined_new = on_cols1.map do |x|
93
- x_ind+=1
94
- if (bf.include?(x))
95
- {x_ind => on_cols2.each_index.select{|y_ind| on_cols2[y_ind] == x}}
96
- else
97
- {x_ind => []}
98
- end
99
- end
100
- .reduce({}) {|h,pairs| pairs.each {|k,v| (h[k] ||= []) << v}; h}
101
- .flat_map{|ind1, inds2| inds2.flatten.map{|ind2| [table1[ind1], table2[ind2]].flatten} if inds2.flatten.size > 0}
92
+ def right _opts
93
+ merge_join(left: false, right: true)
94
+ end
102
95
 
103
- joined_cols = [col_names1, col_names2].flatten
104
- df = Daru::DataFrame.rows(joined_new.compact, order: joined_cols)
105
- on.each{|x| df.delete_vector (x.to_s + "_2").to_sym}
96
+ def outer _opts
97
+ merge_join(left: true, right: true)
98
+ end
106
99
 
107
- df
108
- end
100
+ def merge_join(left: true, right: true)
101
+ MergeHelper.verify_dataframes df1_hash, df2_hash, @on
102
+ MergeHelper.resolve_duplicates df1_hash, df2_hash, @on
109
103
 
110
- def full_outer_join df1, df2, df_hash1, df_hash2, on
111
- left = left_outer_join df1, df2, df_hash1, df_hash2, on, true
112
- right = right_outer_join df1, df2, df_hash1, df_hash2, on, true
104
+ # TODO: Use native dataframe sorting.
105
+ # It would be ideal to reuse sorting functionality that is native
106
+ # to dataframes. Unfortunately, native dataframe sort introduces
107
+ # an overhead that reduces join performance by a factor of 4! Until
108
+ # that aspect is improved, we resort to a simpler array sort.
109
+ df1_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
110
+ df2_array.sort_by! { |row| [row[0].nil? ? 0 : 1, row[0]] }
113
111
 
114
- Daru::DataFrame.rows(
115
- (left.values.transpose | right.values.transpose), order: left.keys)
116
- end
112
+ idx1 = 0
113
+ idx2 = 0
117
114
 
118
- def left_outer_join df1, df2, df_hash1, df_hash2, on, as_hash=false
119
- joined_hash = {}
120
- ((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
121
- joined_hash[k] = []
122
- end
115
+ while idx1 < @df1.size || idx2 < @df2.size
123
116
 
124
-
125
- (0...df1.size).each do |id1|
126
- joined = false
127
- (0...df2.size).each do |id2|
128
- if on.all? { |n| df_hash1[n][id1] == df_hash2[n][id2] }
129
- joined = true
130
- joined_hash.each do |k,v|
131
- v << (df_hash1.has_key?(k) ? df_hash1[k][id1] : df_hash2[k][id2])
132
- end
133
- end
134
- end
117
+ key1 = df1_array[idx1][0] if idx1 < @df1.size
118
+ key2 = df2_array[idx2][0] if idx2 < @df2.size
135
119
 
136
- unless joined
137
- df_hash1.keys.each do |k|
138
- joined_hash[k] << df_hash1[k][id1]
139
- end
120
+ if key1 == key2 && idx1 < @df1.size && idx2 < @df2.size
121
+ idx2_start = idx2
140
122
 
141
- (joined_hash.keys - df_hash1.keys).each do |k|
142
- joined_hash[k] << nil
143
- end
144
- joined = false
123
+ while (idx2 < @df2.size) && (df1_array[idx1][0] == df2_array[idx2][0])
124
+ add_merge_row_to_hash([df1_array[idx1], df2_array[idx2]], joined_hash)
125
+ idx2 += 1
145
126
  end
127
+
128
+ idx2 = idx2_start if idx1+1 < @df1.size && df1_array[idx1][0] == df1_array[idx1+1][0]
129
+ idx1 += 1
130
+ elsif ((key2.nil? || [key1,key2].sort == [key1,key2]) && idx1 < @df1.size) || idx2 == @df2.size
131
+ add_merge_row_to_hash([df1_array[idx1], nil], joined_hash) if left
132
+ idx1 += 1
133
+ elsif idx2 < @df2.size || idx1 == @df1.size
134
+ add_merge_row_to_hash([nil, df2_array[idx2]], joined_hash) if right
135
+ idx2 += 1
136
+ else
137
+ raise 'Unexpected condition met during merge'
146
138
  end
139
+ end
147
140
 
148
- return joined_hash if as_hash
149
- Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
141
+ Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
142
+ end
143
+
144
+ private
145
+
146
+ def joined_hash
147
+ return @joined_hash if @joined_hash
148
+ @joined_hash ||= {}
149
+
150
+ ((df1_keys - @on) | @on | (df2_keys - @on)).each do |k|
151
+ @joined_hash[k] = []
150
152
  end
151
153
 
152
- def right_outer_join df1, df2, df_hash1, df_hash2, on, as_hash=false
153
- joined_hash = {}
154
- ((df_hash1.keys - on) | on | (df_hash2.keys - on)).each do |k|
155
- joined_hash[k] = []
156
- end
154
+ @joined_hash
155
+ end
157
156
 
158
- (0...df2.size).each do |id1|
159
- joined = false
160
- (0...df1.size).each do |id2|
161
- if on.all? { |n| df_hash2[n][id1] == df_hash1[n][id2] }
162
- joined = true
163
- joined_hash.each do |k,v|
164
- v << (df_hash2.has_key?(k) ? df_hash2[k][id1] : df_hash1[k][id2])
165
- end
166
- end
167
- end
157
+ def df1_hash
158
+ @df1_hash ||= MergeHelper.hashify @df1
159
+ end
168
160
 
169
- unless joined
170
- df_hash2.keys.each do |k|
171
- joined_hash[k] << df_hash2[k][id1]
172
- end
161
+ def df2_hash
162
+ @df2_hash ||= MergeHelper.hashify @df2
163
+ end
173
164
 
174
- (joined_hash.keys - df_hash2.keys).each do |k|
175
- joined_hash[k] << nil
176
- end
177
- joined = false
178
- end
179
- end
165
+ def df1_array
166
+ @df1_array ||= MergeHelper.arrayify_with_sort_keys @df1.size, df1_hash, @on
167
+ end
180
168
 
181
- return joined_hash if as_hash
182
- Daru::DataFrame.new(joined_hash, order: joined_hash.keys)
183
- end
169
+ def df2_array
170
+ @df2_array ||= MergeHelper.arrayify_with_sort_keys @df2.size, df2_hash, @on
171
+ end
184
172
 
185
- def verify_dataframes df_hash1, df_hash2, on
186
- raise ArgumentError,
187
- "All fields in :on must be present in self" if !on.all? { |e| df_hash1[e] }
188
- raise ArgumentError,
189
- "All fields in :on must be present in other DF" if !on.all? { |e| df_hash2[e] }
173
+ def df1_keys
174
+ df1_hash.keys
175
+ end
176
+
177
+ def df2_keys
178
+ df2_hash.keys
179
+ end
180
+
181
+ # Private: The merge row contains two elements, the first is the row from the
182
+ # first dataframe, the second is the row from the second dataframe.
183
+ def add_merge_row_to_hash row, hash
184
+ @df1_key_to_index ||= df1_keys.each_with_index.map { |k,idx| [k, idx] }.to_h
185
+ @df2_key_to_index ||= df2_keys.each_with_index.map { |k,idx| [k, idx] }.to_h
186
+
187
+ hash.each do |k,v|
188
+ v ||= []
189
+
190
+ left = df1_keys.include?(k) ? row[0] && row[0][1][@df1_key_to_index[k]] : nil
191
+ right = df2_keys.include?(k) ? row[1] && row[1][1][@df2_key_to_index[k]] : nil
192
+
193
+ v << (left || right)
190
194
  end
191
195
  end
192
196
  end
197
+
193
198
  # Private module containing methods for join, merge, concat operations on
194
199
  # dataframes and vectors.
195
200
  # @private
196
201
  module Merge
197
202
  class << self
198
203
  def join df1, df2, opts={}
199
- helper = MergeHelper
200
-
201
- df_hash1 = helper.hashify df1
202
- df_hash2 = helper.hashify df2
203
204
  on = opts[:on]
204
205
 
205
- helper.verify_dataframes df_hash1, df_hash2, on
206
- helper.resolve_duplicates df_hash1, df_hash2, on
207
-
208
- case opts[:how]
209
- when :inner
210
- if Daru.has_bloomfilter_rb?
211
- helper.bf_inner_join df1, df2, on
212
- else
213
- helper.inner_join df1, df2, df_hash1, df_hash2, on
214
- end
215
- when :outer
216
- helper.full_outer_join df1, df2, df_hash1, df_hash2, on
217
- when :left
218
- helper.left_outer_join df1, df2, df_hash1, df_hash2, on
219
- when :right
220
- helper.right_outer_join df1, df2, df_hash1, df_hash2, on
221
- else
222
- raise ArgumentError, "Unrecognized option in :how => #{opts[:how]}"
223
- end
206
+ mf = MergeFrame.new df1, df2, on: on
207
+ mf.send opts[:how], {}
224
208
  end
225
209
  end
226
210
  end
@@ -9,31 +9,19 @@ module Daru
9
9
  end
10
10
 
11
11
  def & other
12
- new_bool = []
13
- other_barry = other.barry
14
- @barry.each_with_index do |b, i|
15
- new_bool << (b and other_barry[i])
16
- end
17
-
18
- BoolArray.new(new_bool)
12
+ BoolArray.new @barry.zip(other.barry).map { |b, o| b && o }
19
13
  end
20
14
 
21
15
  alias :and :&
22
16
 
23
17
  def | other
24
- new_bool = []
25
- other_barry = other.barry
26
- @barry.each_with_index do |b, i|
27
- new_bool << (b or other_barry[i])
28
- end
29
-
30
- BoolArray.new(new_bool)
18
+ BoolArray.new @barry.zip(other.barry).map { |b, o| b || o }
31
19
  end
32
20
 
33
21
  alias :or :|
34
22
 
35
23
  def !
36
- BoolArray.new(@barry.map { |b| !b })
24
+ BoolArray.new(@barry.map(&:!))
37
25
  end
38
26
 
39
27
  def == other
@@ -45,27 +33,17 @@ module Daru
45
33
  end
46
34
 
47
35
  def inspect
48
- "(#{self.class}:#{self.object_id} bool_arry=#{@barry})"
36
+ "(#{self.class}:#{object_id} bool_arry=#{@barry})"
49
37
  end
50
38
  end
51
39
 
52
40
  class << self
53
41
  def apply_scalar_operator operator, data, other
54
- arry = data.inject([]) do |memo,d|
55
- memo << (d.send(operator, other) ? true : false)
56
- memo
57
- end
58
-
59
- BoolArray.new(arry)
42
+ BoolArray.new data.map { |d| !!d.send(operator, other) }
60
43
  end
61
44
 
62
45
  def apply_vector_operator operator, vector, other
63
- bool_arry = []
64
- vector.each_with_index do |d, i|
65
- bool_arry << (d.send(operator, other[i]) ? true : false)
66
- end
67
-
68
- BoolArray.new(bool_arry)
46
+ BoolArray.new vector.zip(other).map { |d, o| !!d.send(operator, o) }
69
47
  end
70
48
 
71
49
  def df_where data_frame, bool_array
@@ -74,7 +52,8 @@ module Daru
74
52
  end
75
53
 
76
54
  Daru::DataFrame.new(
77
- vecs, order: data_frame.vectors, index: vecs[0].index, clone: false)
55
+ vecs, order: data_frame.vectors, index: vecs[0].index, clone: false
56
+ )
78
57
  end
79
58
 
80
59
  def vector_where data, index, bool_array, dtype
@@ -92,4 +71,4 @@ module Daru
92
71
  end
93
72
  end
94
73
  end
95
- end
74
+ end
@@ -1,14 +1,11 @@
1
- $:.unshift File.dirname(__FILE__)
2
-
3
- require 'accessors/dataframe_by_row.rb'
4
- require 'maths/arithmetic/dataframe.rb'
5
- require 'maths/statistics/dataframe.rb'
6
- require 'plotting/dataframe.rb'
7
- require 'io/io.rb'
1
+ require 'daru/accessors/dataframe_by_row.rb'
2
+ require 'daru/maths/arithmetic/dataframe.rb'
3
+ require 'daru/maths/statistics/dataframe.rb'
4
+ require 'daru/plotting/dataframe.rb'
5
+ require 'daru/io/io.rb'
8
6
 
9
7
  module Daru
10
8
  class DataFrame
11
-
12
9
  include Daru::Maths::Arithmetic::DataFrame
13
10
  include Daru::Maths::Statistics::DataFrame
14
11
  include Daru::Plotting::DataFrame if Daru.has_nyaplot?
@@ -115,31 +112,30 @@ module Daru
115
112
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
116
113
  # Daru::Vector objects.
117
114
  def rows source, opts={}
118
- df = nil
119
- if source.all? { |v| v.size == source[0].size }
120
- first = source[0]
121
- index = []
122
- opts[:order] ||=
123
- if first.is_a?(Daru::Vector) # assume that all are Vectors
124
- source.each { |vec| index << vec.name }
115
+ first = source.first
116
+
117
+ raise SizeError, 'All vectors must have same length' \
118
+ unless source.all? { |v| v.size == first.size }
119
+
120
+ index = []
121
+ opts[:order] ||=
122
+ case first
123
+ when Daru::Vector # assume that all are Vectors
124
+ index = source.map(&:name)
125
125
  first.index.to_a
126
- elsif first.is_a?(Array)
127
- Array.new(first.size) { |i| i.to_s }
126
+ when Array
127
+ Array.new(first.size, &:to_s)
128
128
  end
129
129
 
130
- if source.all? { |s| s.is_a?(Array) }
131
- df = Daru::DataFrame.new(source.transpose, opts)
132
- else # array of Daru::Vectors
133
- df = Daru::DataFrame.new({}, opts)
130
+ if source.all? { |s| s.is_a?(Array) }
131
+ Daru::DataFrame.new(source.transpose, opts)
132
+ else # array of Daru::Vectors
133
+ Daru::DataFrame.new({}, opts).tap do |df|
134
134
  source.each_with_index do |row, idx|
135
- df[(index[idx] || idx), :row] = row
135
+ df[index[idx] || idx, :row] = row
136
136
  end
137
137
  end
138
- else
139
- raise SizeError, "All vectors must have same length"
140
138
  end
141
-
142
- df
143
139
  end
144
140
 
145
141
  # Generates a new dataset, using three vectors
@@ -162,18 +158,16 @@ module Daru
162
158
  #
163
159
  # Useful to process outputs from databases
164
160
  def crosstab_by_assignation rows, columns, values
165
- raise "Three vectors should be equal size" if
166
- rows.size != columns.size or rows.size!=values.size
161
+ raise 'Three vectors should be equal size' if
162
+ rows.size != columns.size || rows.size!=values.size
167
163
 
168
164
  cols_values = columns.factors
169
165
  cols_n = cols_values.size
170
166
 
171
- h_rows = rows.factors.inject({}) do |a,v|
172
- a[v] = cols_values.inject({}) do |a1,v1|
167
+ h_rows = rows.factors.each_with_object({}) do |v, a|
168
+ a[v] = cols_values.each_with_object({}) do |v1, a1|
173
169
  a1[v1]=nil
174
- a1
175
170
  end
176
- a
177
171
  end
178
172
 
179
173
  values.each_index do |i|
@@ -250,7 +244,7 @@ module Daru
250
244
  @data = []
251
245
 
252
246
  temp_name = opts[:name]
253
- @name = temp_name || SecureRandom.uuid
247
+ @name = temp_name || SecureRandom.uuid
254
248
 
255
249
  if source.empty?
256
250
  @vectors = try_create_index vectors
@@ -266,7 +260,7 @@ module Daru
266
260
  @index = try_create_index(index || source[0].size)
267
261
  @vectors = try_create_index(vectors)
268
262
 
269
- @vectors.each_with_index do |vec,idx|
263
+ @vectors.each_with_index do |_vec,idx|
270
264
  @data << Daru::Vector.new(source[idx], index: @index)
271
265
  end
272
266
  elsif source.all? { |s| s.is_a?(Daru::Vector) }
@@ -276,18 +270,18 @@ module Daru
276
270
  end
277
271
  initialize(hsh, index: index, order: vectors, name: @name, clone: clone)
278
272
  else # array of hashes
279
- if vectors.nil?
280
- @vectors = Daru::Index.new source[0].keys
281
- else
282
- @vectors = Daru::Index.new(
283
- (vectors + (source[0].keys - vectors)).uniq)
284
- end
273
+ @vectors =
274
+ if vectors.nil?
275
+ Daru::Index.new source[0].keys
276
+ else
277
+ Daru::Index.new((vectors + (source[0].keys - vectors)).uniq)
278
+ end
285
279
  @index = Daru::Index.new(index || source.size)
286
280
 
287
281
  @vectors.each do |name|
288
282
  v = []
289
- source.each do |hsh|
290
- v << (hsh[name] || hsh[name.to_s])
283
+ source.each do |h|
284
+ v << (h[name] || h[name.to_s])
291
285
  end
292
286
 
293
287
  @data << Daru::Vector.new(v, name: set_name(name), index: @index)
@@ -296,10 +290,10 @@ module Daru
296
290
  when Hash
297
291
  create_vectors_index_with vectors, source
298
292
  if all_daru_vectors_in_source? source
293
+ vectors_have_same_index = all_vectors_have_equal_indexes?(source)
299
294
  if !index.nil?
300
295
  @index = try_create_index index
301
- elsif all_vectors_have_equal_indexes?(source)
302
- vectors_have_same_index = true
296
+ elsif vectors_have_same_index
303
297
  @index = source.values[0].index.dup
304
298
  else
305
299
  all_indexes = []
@@ -320,14 +314,10 @@ module Daru
320
314
  if vectors_have_same_index
321
315
  v = source[vector].dup
322
316
  else
323
- v = Daru::Vector.new([], name: vector, index: @index)
317
+ v = Daru::Vector.new([], name: vector, metadata: source[vector].metadata.dup, index: @index)
324
318
 
325
319
  @index.each do |idx|
326
- if source[vector].index.include? idx
327
- v[idx] = source[vector][idx]
328
- else
329
- v[idx] = nil
330
- end
320
+ v[idx] = source[vector].index.include?(idx) ? source[vector][idx] : nil
331
321
  end
332
322
  end
333
323
  @data << v
@@ -339,7 +329,8 @@ module Daru
339
329
  @index = try_create_index(index || source.values[0].size)
340
330
 
341
331
  @vectors.each do |name|
342
- @data << Daru::Vector.new(source[name].dup, name: set_name(name), index: @index)
332
+ meta_opt = source[name].respond_to?(:metadata) ? {metadata: source[name].metadata.dup} : {}
333
+ @data << Daru::Vector.new(source[name].dup, name: set_name(name), **meta_opt, index: @index)
343
334
  end
344
335
  end
345
336
  end
@@ -350,17 +341,16 @@ module Daru
350
341
  update
351
342
  end
352
343
 
353
- def vector *args
354
- $stderr.puts "#vector has been deprecated in favour of #[]. Please use that."
344
+ def vector(*)
345
+ $stderr.puts '#vector has been deprecated in favour of #[]. Please use that.'
355
346
  self[*names]
356
347
  end
357
348
 
358
349
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
359
350
  # Defaults to *:vector*. Use of this method is not recommended for accessing
360
- # rows or vectors. Use df.row[:a] for accessing row with index ':a' or
361
- # df.vector[:vec] for accessing vector with index *:vec*.
351
+ # rows. Use df.row[:a] for accessing row with index ':a'.
362
352
  def [](*names)
363
- if names[-1] == :vector or names[-1] == :row
353
+ if names[-1] == :vector || names[-1] == :row
364
354
  axis = names[-1]
365
355
  names = names[0..-2]
366
356
  else
@@ -368,9 +358,9 @@ module Daru
368
358
  end
369
359
 
370
360
  if axis == :vector
371
- access_vector *names
361
+ access_vector(*names)
372
362
  elsif axis == :row
373
- access_row *names
363
+ access_row(*names)
374
364
  else
375
365
  raise IndexError, "Expected axis to be row or vector not #{axis}"
376
366
  end
@@ -433,7 +423,7 @@ module Daru
433
423
 
434
424
  src = []
435
425
  vectors_to_dup.each do |vec|
436
- src << @data[@vectors[vec]].to_a.dup
426
+ src << @data[@vectors[vec]].dup
437
427
  end
438
428
  new_order = Daru::Index.new(vectors_to_dup)
439
429
 
@@ -454,11 +444,10 @@ module Daru
454
444
  # a view of the whole data frame otherwise.
455
445
  def clone *vectors_to_clone
456
446
  vectors_to_clone.flatten! unless vectors_to_clone.all? { |a| !a.is_a?(Array) }
457
- return super if vectors_to_clone.empty?
447
+ vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
458
448
 
459
- h = vectors_to_clone.inject({}) do |hsh, vec|
449
+ h = vectors_to_clone.each_with_object({}) do |vec, hsh|
460
450
  hsh[vec] = self[vec]
461
- hsh
462
451
  end
463
452
  Daru::DataFrame.new(h, clone: false)
464
453
  end
@@ -476,9 +465,8 @@ module Daru
476
465
  # Creates a new duplicate dataframe containing only rows
477
466
  # without a single missing value.
478
467
  def dup_only_valid vecs=nil
479
- rows_with_nil = @data.inject([]) do |memo, vector|
468
+ rows_with_nil = @data.each_with_object([]) do |vector, memo|
480
469
  memo.concat vector.missing_positions
481
- memo
482
470
  end.uniq
483
471
 
484
472
  row_indexes = @index.to_a
@@ -505,7 +493,7 @@ module Daru
505
493
  alias_method :each_column, :each_vector
506
494
 
507
495
  # Iterate over each vector alongwith the name of the vector
508
- def each_vector_with_index(&block)
496
+ def each_vector_with_index
509
497
  return to_enum(:each_vector_with_index) unless block_given?
510
498
 
511
499
  @vectors.each do |vector|
@@ -518,7 +506,7 @@ module Daru
518
506
  alias_method :each_column_with_index, :each_vector_with_index
519
507
 
520
508
  # Iterate over each row
521
- def each_row(&block)
509
+ def each_row
522
510
  return to_enum(:each_row) unless block_given?
523
511
 
524
512
  @index.each do |index|
@@ -528,7 +516,7 @@ module Daru
528
516
  self
529
517
  end
530
518
 
531
- def each_row_with_index(&block)
519
+ def each_row_with_index
532
520
  return to_enum(:each_row_with_index) unless block_given?
533
521
 
534
522
  @index.each do |index|
@@ -552,7 +540,7 @@ module Daru
552
540
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
553
541
  # or :row. Default to :vector.
554
542
  def each axis=:vector, &block
555
- if axis == :vector or axis == :column
543
+ if axis == :vector || axis == :column
556
544
  each_vector(&block)
557
545
  elsif axis == :row
558
546
  each_row(&block)
@@ -577,7 +565,7 @@ module Daru
577
565
  # * +axis+ - The axis to iterate over. Can be :vector (or :column)
578
566
  # or :row. Default to :vector.
579
567
  def collect axis=:vector, &block
580
- if axis == :vector or axis == :column
568
+ if axis == :vector || axis == :column
581
569
  collect_vectors(&block)
582
570
  elsif axis == :row
583
571
  collect_rows(&block)
@@ -603,7 +591,7 @@ module Daru
603
591
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
604
592
  # Default to :vector.
605
593
  def map axis=:vector, &block
606
- if axis == :vector or axis == :column
594
+ if axis == :vector || axis == :column
607
595
  map_vectors(&block)
608
596
  elsif axis == :row
609
597
  map_rows(&block)
@@ -621,7 +609,7 @@ module Daru
621
609
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
622
610
  # Default to :vector.
623
611
  def map! axis=:vector, &block
624
- if axis == :vector or axis == :column
612
+ if axis == :vector || axis == :column
625
613
  map_vectors!(&block)
626
614
  elsif axis == :row
627
615
  map_rows!(&block)
@@ -646,7 +634,7 @@ module Daru
646
634
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
647
635
  # Default to :vector.
648
636
  def recode axis=:vector, &block
649
- if axis == :vector or axis == :column
637
+ if axis == :vector || axis == :column
650
638
  recode_vectors(&block)
651
639
  elsif axis == :row
652
640
  recode_rows(&block)
@@ -682,17 +670,17 @@ module Daru
682
670
  # row[:a] + row[:d] < 100
683
671
  # end
684
672
  def filter axis=:vector, &block
685
- if axis == :vector or axis == :column
673
+ if axis == :vector || axis == :column
686
674
  filter_vectors(&block)
687
675
  elsif axis == :row
688
676
  filter_rows(&block)
689
677
  end
690
678
  end
691
679
 
692
- def recode_vectors &block
680
+ def recode_vectors
693
681
  block_given? or return to_enum(:recode_vectors)
694
682
 
695
- df = self.dup
683
+ df = dup
696
684
  df.each_vector_with_index do |v, i|
697
685
  ret = yield v
698
686
  ret.is_a?(Daru::Vector) or
@@ -703,10 +691,10 @@ module Daru
703
691
  df
704
692
  end
705
693
 
706
- def recode_rows &block
694
+ def recode_rows
707
695
  block_given? or return to_enum(:recode_rows)
708
696
 
709
- df = self.dup
697
+ df = dup
710
698
  df.each_row_with_index do |r, i|
711
699
  ret = yield r
712
700
  ret.is_a?(Daru::Vector) or raise TypeError, "Every iteration must return Daru::Vector not #{ret.class}"
@@ -717,7 +705,7 @@ module Daru
717
705
  end
718
706
 
719
707
  # Map each vector and return an Array.
720
- def map_vectors(&block)
708
+ def map_vectors
721
709
  return to_enum(:map_vectors) unless block_given?
722
710
 
723
711
  arry = []
@@ -729,7 +717,7 @@ module Daru
729
717
  end
730
718
 
731
719
  # Destructive form of #map_vectors
732
- def map_vectors!(&block)
720
+ def map_vectors!
733
721
  return to_enum(:map_vectors!) unless block_given?
734
722
 
735
723
  vectors.dup.each do |n|
@@ -742,7 +730,7 @@ module Daru
742
730
  end
743
731
 
744
732
  # Map vectors alongwith the index.
745
- def map_vectors_with_index(&block)
733
+ def map_vectors_with_index
746
734
  return to_enum(:map_vectors_with_index) unless block_given?
747
735
 
748
736
  dt = []
@@ -754,7 +742,7 @@ module Daru
754
742
  end
755
743
 
756
744
  # Map each row
757
- def map_rows(&block)
745
+ def map_rows
758
746
  return to_enum(:map_rows) unless block_given?
759
747
 
760
748
  dt = []
@@ -765,7 +753,7 @@ module Daru
765
753
  dt
766
754
  end
767
755
 
768
- def map_rows_with_index(&block)
756
+ def map_rows_with_index
769
757
  return to_enum(:map_rows_with_index) unless block_given?
770
758
 
771
759
  dt = []
@@ -776,13 +764,13 @@ module Daru
776
764
  dt
777
765
  end
778
766
 
779
- def map_rows!(&block)
767
+ def map_rows!
780
768
  return to_enum(:map_rows!) unless block_given?
781
769
 
782
770
  index.dup.each do |i|
783
- r = yield self.row[i]
771
+ r = yield row[i]
784
772
  r.is_a?(Daru::Vector) or raise TypeError, "Returned object must be Daru::Vector not #{r.class}"
785
- self.row[i] = r
773
+ row[i] = r
786
774
  end
787
775
 
788
776
  self
@@ -790,7 +778,7 @@ module Daru
790
778
 
791
779
  # Retrieves a Daru::Vector, based on the result of calculation
792
780
  # performed on each row.
793
- def collect_rows &block
781
+ def collect_rows
794
782
  return to_enum(:collect_rows) unless block_given?
795
783
 
796
784
  data = []
@@ -801,7 +789,7 @@ module Daru
801
789
  Daru::Vector.new(data, index: @index)
802
790
  end
803
791
 
804
- def collect_row_with_index &block
792
+ def collect_row_with_index
805
793
  return to_enum(:collect_row_with_index) unless block_given?
806
794
 
807
795
  data = []
@@ -814,7 +802,7 @@ module Daru
814
802
 
815
803
  # Retrives a Daru::Vector, based on the result of calculation
816
804
  # performed on each vector.
817
- def collect_vectors &block
805
+ def collect_vectors
818
806
  return to_enum(:collect_vectors) unless block_given?
819
807
 
820
808
  data = []
@@ -825,7 +813,7 @@ module Daru
825
813
  Daru::Vector.new(data, index: @vectors)
826
814
  end
827
815
 
828
- def collect_vector_with_index &block
816
+ def collect_vector_with_index
829
817
  return to_enum(:collect_vector_with_index) unless block_given?
830
818
 
831
819
  data = []
@@ -852,15 +840,19 @@ module Daru
852
840
  Matrix.rows(rows)
853
841
  end
854
842
 
855
-
856
843
  # Delete a vector
857
844
  def delete_vector vector
858
- if @vectors.include? vector
859
- @data.delete_at @vectors[vector]
860
- @vectors = Daru::Index.new @vectors.to_a - [vector]
861
- else
862
- raise IndexError, "Vector #{vector} does not exist."
863
- end
845
+ raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
846
+
847
+ @data.delete_at @vectors[vector]
848
+ @vectors = Daru::Index.new @vectors.to_a - [vector]
849
+
850
+ self
851
+ end
852
+
853
+ # Deletes a list of vectors
854
+ def delete_vectors *vectors
855
+ Array(vectors).each { |vec| delete_vector vec }
864
856
 
865
857
  self
866
858
  end
@@ -869,13 +861,10 @@ module Daru
869
861
  def delete_row index
870
862
  idx = named_index_for index
871
863
 
872
- if @index.include? idx
873
- @index = Daru::Index.new(@index.to_a - [idx])
874
- self.each_vector do |vector|
875
- vector.delete_at idx
876
- end
877
- else
878
- raise IndexError, "Index #{index} does not exist."
864
+ raise IndexError, "Index #{index} does not exist." unless @index.include? idx
865
+ @index = Daru::Index.new(@index.to_a - [idx])
866
+ each_vector do |vector|
867
+ vector.delete_at idx
879
868
  end
880
869
 
881
870
  set_size
@@ -895,7 +884,7 @@ module Daru
895
884
  ds_boot
896
885
  end
897
886
 
898
- def keep_row_if &block
887
+ def keep_row_if
899
888
  deletion = []
900
889
 
901
890
  @index.each do |index|
@@ -908,7 +897,7 @@ module Daru
908
897
  }
909
898
  end
910
899
 
911
- def keep_vector_if &block
900
+ def keep_vector_if
912
901
  @vectors.each do |vector|
913
902
  keep_vector = yield @data[@vectors[vector]], vector
914
903
 
@@ -923,27 +912,17 @@ module Daru
923
912
  d.push(row[vec]) if yield row
924
913
  end
925
914
 
926
- Daru::Vector.new(d)
915
+ Daru::Vector.new(d, metadata: self[vec].metadata.dup)
927
916
  end
928
917
 
929
918
  # Iterates over each row and retains it in a new DataFrame if the block returns
930
919
  # true for that row.
931
- def filter_rows &block
920
+ def filter_rows
932
921
  return to_enum(:filter_rows) unless block_given?
933
922
 
934
- df = Daru::DataFrame.new({}, order: @vectors.to_a)
935
- marked = []
936
-
937
- @index.each do |index|
938
- keep_row = yield access_row(index)
939
- marked << index if keep_row
940
- end
941
-
942
- marked.each do |idx|
943
- df.row[idx] = self[idx, :row]
944
- end
923
+ keep_rows = @index.map { |index| yield access_row(index) }
945
924
 
946
- df
925
+ where keep_rows
947
926
  end
948
927
 
949
928
  # Iterates over each vector and retains it in a new DataFrame if the block returns
@@ -951,8 +930,8 @@ module Daru
951
930
  def filter_vectors &block
952
931
  return to_enum(:filter_vectors) unless block_given?
953
932
 
954
- df = self.dup
955
- df.keep_vector_if &block
933
+ df = dup
934
+ df.keep_vector_if(&block)
956
935
 
957
936
  df
958
937
  end
@@ -962,7 +941,7 @@ module Daru
962
941
  #
963
942
  # The function returns an array with all errors.
964
943
  def verify(*tests)
965
- if(tests[0].is_a? Symbol)
944
+ if tests[0].is_a? Symbol
966
945
  id = tests[0]
967
946
  tests.shift
968
947
  else
@@ -974,13 +953,12 @@ module Daru
974
953
  each(:row) do |row|
975
954
  i += 1
976
955
  tests.each do |test|
977
- if !test[2].call(row)
978
- values = ""
979
- if test[1].size>0
980
- values = " (" + test[1].collect{ |k| "#{k}=#{row[k]}" }.join(", ") + ")"
981
- end
982
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
956
+ next if test[2].call(row)
957
+ values = ''
958
+ unless test[1].empty?
959
+ values = ' (' + test[1].collect { |k| "#{k}=#{row[k]}" }.join(', ') + ')'
983
960
  end
961
+ vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
984
962
  end
985
963
  end
986
964
  vr
@@ -1051,7 +1029,7 @@ module Daru
1051
1029
  alias :vector_missing_values :missing_values_rows
1052
1030
 
1053
1031
  def has_missing_data?
1054
- !!@data.any? { |v| v.has_missing_data? }
1032
+ !!@data.any?(&:has_missing_data?)
1055
1033
  end
1056
1034
 
1057
1035
  alias :flawed? :has_missing_data?
@@ -1075,9 +1053,9 @@ module Daru
1075
1053
  name = row[tree_keys.last]
1076
1054
  if !block
1077
1055
  current[name] ||= []
1078
- current[name].push(row.to_hash.delete_if { |key,value| tree_keys.include? key})
1056
+ current[name].push(row.to_h.delete_if { |key,_value| tree_keys.include? key })
1079
1057
  else
1080
- current[name] = block.call(row, current,name)
1058
+ current[name] = yield(row, current, name)
1081
1059
  end
1082
1060
  end
1083
1061
 
@@ -1087,7 +1065,7 @@ module Daru
1087
1065
  def vector_count_characters vecs=nil
1088
1066
  vecs ||= @vectors.to_a
1089
1067
 
1090
- collect_row_with_index do |row, i|
1068
+ collect_rows do |row|
1091
1069
  vecs.inject(0) do |memo, vec|
1092
1070
  memo + (row[vec].nil? ? 0 : row[vec].to_s.size)
1093
1071
  end
@@ -1129,7 +1107,7 @@ module Daru
1129
1107
  # row[:a] < 3 and row[:b] == 'b'
1130
1108
  # end #=> true
1131
1109
  def any? axis=:vector, &block
1132
- if axis == :vector or axis == :column
1110
+ if axis == :vector || axis == :column
1133
1111
  @data.any?(&block)
1134
1112
  elsif axis == :row
1135
1113
  each_row do |row|
@@ -1151,7 +1129,7 @@ module Daru
1151
1129
  # row[:a] < 10
1152
1130
  # end #=> true
1153
1131
  def all? axis=:vector, &block
1154
- if axis == :vector or axis == :column
1132
+ if axis == :vector || axis == :column
1155
1133
  @data.all?(&block)
1156
1134
  elsif axis == :row
1157
1135
  each_row do |row|
@@ -1236,46 +1214,52 @@ module Daru
1236
1214
  # # ["foo", "two", 3]=>[2, 4]}
1237
1215
  def group_by *vectors
1238
1216
  vectors.flatten!
1239
- vectors.each { |v| raise(ArgumentError, "Vector #{v} does not exist") unless
1240
- has_vector?(v) }
1217
+ vectors.each { |v|
1218
+ raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
1219
+ }
1241
1220
 
1242
1221
  Daru::Core::GroupBy.new(self, vectors)
1243
1222
  end
1244
1223
 
1245
1224
  def reindex_vectors new_vectors
1246
- raise ArgumentError, "Must pass the new index of type Index or its "\
1247
- "subclasses, not #{new_index.class}" unless new_vectors.kind_of?(Daru::Index)
1225
+ raise ArgumentError, 'Must pass the new index of type Index or its '\
1226
+ "subclasses, not #{new_index.class}" unless new_vectors.is_a?(Daru::Index)
1248
1227
 
1249
1228
  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1250
1229
  new_vectors.each do |vec|
1251
- if @vectors.include?(vec)
1252
- cl[vec] = self[vec]
1253
- else
1254
- cl[vec] = [nil]*nrows
1255
- end
1230
+ cl[vec] = @vectors.include?(vec) ? self[vec] : cl[vec] = [nil]*nrows
1256
1231
  end
1257
1232
 
1258
1233
  cl
1259
1234
  end
1260
1235
 
1261
1236
  # Concatenate another DataFrame along corresponding columns.
1262
- # Very premature implementation. Use with caution.
1237
+ # If columns do not exist in both dataframes, they are filled with nils
1263
1238
  def concat other_df
1264
- vectors = []
1265
- @vectors.each do |v|
1266
- vectors << self[v].to_a.dup.concat(other_df[v].to_a)
1239
+ vectors = @vectors.to_a
1240
+ data = []
1241
+
1242
+ vectors.each do |v|
1243
+ other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
1244
+ data << self[v].dup.to_a.concat(other_vec)
1245
+ end
1246
+
1247
+ other_df.vectors.each do |v|
1248
+ next if vectors.include?(v)
1249
+ vectors << v
1250
+ data << ([nil] * size).concat(other_df[v].to_a)
1267
1251
  end
1268
1252
 
1269
- Daru::DataFrame.new(vectors, order: @vectors)
1253
+ Daru::DataFrame.new(data, order: vectors)
1270
1254
  end
1271
1255
 
1272
1256
  # Set a particular column as the new DF index
1273
1257
  def set_index new_index, opts={}
1274
- raise ArgumentError, "All elements in new index must be unique." if
1258
+ raise ArgumentError, 'All elements in new index must be unique.' if
1275
1259
  @size != self[new_index].uniq.size
1276
1260
 
1277
1261
  self.index = Daru::Index.new(self[new_index].to_a)
1278
- self.delete_vector(new_index) unless opts[:keep]
1262
+ delete_vector(new_index) unless opts[:keep]
1279
1263
 
1280
1264
  self
1281
1265
  end
@@ -1303,16 +1287,12 @@ module Daru
1303
1287
  # # a 1 11
1304
1288
  # # g nil nil
1305
1289
  def reindex new_index
1306
- raise ArgumentError, "Must pass the new index of type Index or its "\
1307
- "subclasses, not #{new_index.class}" unless new_index.kind_of?(Daru::Index)
1290
+ raise ArgumentError, 'Must pass the new index of type Index or its '\
1291
+ "subclasses, not #{new_index.class}" unless new_index.is_a?(Daru::Index)
1308
1292
 
1309
1293
  cl = Daru::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1310
1294
  new_index.each do |idx|
1311
- if @index.include?(idx)
1312
- cl.row[idx] = self.row[idx]
1313
- else
1314
- cl.row[idx] = [nil]*ncols
1315
- end
1295
+ cl.row[idx] = @index.include?(idx) ? row[idx] : [nil]*ncols
1316
1296
  end
1317
1297
 
1318
1298
  cl
@@ -1330,7 +1310,7 @@ module Daru
1330
1310
  # df.index.to_a #=> ['a','b','c','d']
1331
1311
  # df.row['a'].to_a #=> [1,11]
1332
1312
  def index= idx
1333
- @data.each { |vec| vec.index = idx}
1313
+ @data.each { |vec| vec.index = idx }
1334
1314
  @index = idx
1335
1315
 
1336
1316
  self
@@ -1347,8 +1327,8 @@ module Daru
1347
1327
  # df.vectors = Daru::Index.new([:foo, :bar, :baz])
1348
1328
  # df.vectors.to_a #=> [:foo, :bar, :baz]
1349
1329
  def vectors= idx
1350
- raise ArgumentError, "Can only reindex with Index and its subclasses" unless
1351
- index.kind_of?(Daru::Index)
1330
+ raise ArgumentError, 'Can only reindex with Index and its subclasses' unless
1331
+ index.is_a?(Daru::Index)
1352
1332
  raise ArgumentError, "Specified index length #{idx.size} not equal to"\
1353
1333
  "dataframe size #{ncols}" if idx.size != ncols
1354
1334
 
@@ -1356,13 +1336,35 @@ module Daru
1356
1336
  self
1357
1337
  end
1358
1338
 
1339
+ # Renames the vectors
1340
+ #
1341
+ # == Arguments
1342
+ #
1343
+ # * name_map - A hash where the keys are the existing vector names and
1344
+ # the values are the new names. If a vector is renamed
1345
+ # to a vector name that is already in use, the existing
1346
+ # one is overwritten.
1347
+ #
1348
+ # == Usage
1349
+ #
1350
+ # df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1351
+ # df.rename_vectors :a => :alpha, :c => :gamma
1352
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
1353
+ def rename_vectors name_map
1354
+ existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a
1355
+ delete_vectors(*existing_targets)
1356
+
1357
+ new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
1358
+ self.vectors = Daru::Index.new new_names
1359
+ end
1360
+
1359
1361
  # Return the indexes of all the numeric vectors. Will include vectors with nils
1360
1362
  # along with numbers.
1361
1363
  def numeric_vectors
1362
1364
  numerics = []
1363
1365
 
1364
1366
  each_vector_with_index do |vec, i|
1365
- numerics << i if(vec.type == :numeric)
1367
+ numerics << i if vec.type == :numeric
1366
1368
  end
1367
1369
  numerics
1368
1370
  end
@@ -1371,7 +1373,7 @@ module Daru
1371
1373
  numerics = []
1372
1374
 
1373
1375
  @vectors.each do |v|
1374
- numerics << v if (self[v].type == :numeric)
1376
+ numerics << v if self[v].type == :numeric
1375
1377
  end
1376
1378
  numerics
1377
1379
  end
@@ -1382,9 +1384,8 @@ module Daru
1382
1384
  def only_numerics opts={}
1383
1385
  cln = opts[:clone] == false ? false : true
1384
1386
  nv = numeric_vectors
1385
- arry = nv.inject([]) do |arr, v|
1387
+ arry = nv.each_with_object([]) do |v, arr|
1386
1388
  arr << self[v]
1387
- arr
1388
1389
  end
1389
1390
 
1390
1391
  order = Index.new(nv)
@@ -1392,12 +1393,12 @@ module Daru
1392
1393
  end
1393
1394
 
1394
1395
  # Generate a summary of this DataFrame with ReportBuilder.
1395
- def summary(method = :to_text)
1396
+ def summary(method=:to_text)
1396
1397
  ReportBuilder.new(no_title: true).add(self).send(method)
1397
1398
  end
1398
1399
 
1399
1400
  def report_building(b) # :nodoc: #
1400
- b.section(:name=>@name) do |g|
1401
+ b.section(name: @name) do |g|
1401
1402
  g.text "Number of rows: #{nrows}"
1402
1403
  @vectors.each do |v|
1403
1404
  g.text "Element:[#{v}]"
@@ -1406,8 +1407,8 @@ module Daru
1406
1407
  end
1407
1408
  end
1408
1409
 
1409
- # Sorts a dataframe (ascending/descending)according to the given sequence of
1410
- # vectors, using the attributes provided in the blocks.
1410
+ # Sorts a dataframe (ascending/descending) in the given priority sequence of
1411
+ # vectors, with or without a block.
1411
1412
  #
1412
1413
  # @param order [Array] The order of vector names in which the DataFrame
1413
1414
  # should be sorted.
@@ -1415,42 +1416,121 @@ module Daru
1415
1416
  # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1416
1417
  # or descending order. Specify Array corresponding to *order* for multiple
1417
1418
  # sort orders.
1418
- # @option opts [Hash] :by ({|a,b| a <=> b}) Specify attributes of objects to
1419
+ # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects
1419
1420
  # to be used for sorting, for each vector name in *order* as a hash of
1420
- # vector name and lambda pairs. In case a lambda for a vector is not
1421
+ # vector name and lambda expressions. In case a lambda for a vector is not
1421
1422
  # specified, the default will be used.
1423
+ # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1424
+ # automatically or not when a block is provided.
1425
+ # If set to True, nils will appear at top after sorting.
1422
1426
  #
1423
- # == Usage
1427
+ # @example Sort a dataframe with a vector sequence.
1428
+ #
1429
+ #
1430
+ # df = Daru::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1431
+ #
1432
+ # df.sort [:a, :b]
1433
+ # # =>
1434
+ # # <Daru::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1435
+ # # a b
1436
+ # # 2 1 3
1437
+ # # 0 1 5
1438
+ # # 3 2 2
1439
+ # # 1 2 4
1440
+ # # 4 3 1
1441
+ #
1442
+ # @example Sort a dataframe without a block. Here nils will be handled automatically.
1443
+ #
1444
+ # df = Daru::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1445
+ #
1446
+ # df.sort([:a])
1447
+ # # =>
1448
+ # # <Daru::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1449
+ # # a b
1450
+ # # 1 nil 3
1451
+ # # 3 nil 1
1452
+ # # 0 -3 4
1453
+ # # 2 -1 2
1454
+ # # 4 5 4
1455
+ #
1456
+ # @example Sort a dataframe with a block with nils handled automatically.
1457
+ #
1458
+ # df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1424
1459
  #
1425
- # df = Daru::DataFrame.new({a: [-3,2,-1,4], b: [4,3,2,1]})
1460
+ # df.sort [:b], by: {b: lambda { |a| a.length } }
1461
+ # # NoMethodError: undefined method `length' for nil:NilClass
1462
+ # # from (pry):8:in `block in __pry__'
1426
1463
  #
1427
- # #<Daru::DataFrame:140630680 @name = 04e00197-f8d5-4161-bca2-93266bfabc6f @size = 4>
1428
- # # a b
1429
- # # 0 -3 4
1430
- # # 1 2 3
1431
- # # 2 -1 2
1432
- # # 3 4 1
1433
- # df.sort([:a], by: { a: lambda { |a,b| a.abs <=> b.abs } })
1464
+ # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1465
+ #
1466
+ # # =>
1467
+ # # <Daru::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1468
+ # # a b
1469
+ # # 2 1 nil
1470
+ # # 5 1 nil
1471
+ # # 4 -1 x
1472
+ # # 1 -1 aa
1473
+ # # 0 nil aaa
1474
+ # # 3 nil baaa
1475
+ #
1476
+ # @example Sort a dataframe with a block with nils handled manually.
1477
+ #
1478
+ # df = Daru::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1479
+ #
1480
+ # # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
1481
+ # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1482
+ #
1483
+ # # =>
1484
+ # #<Daru::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1485
+ # # a b
1486
+ # # 4 -1 x
1487
+ # # 1 -1 aa
1488
+ # # 0 nil aaa
1489
+ # # 3 nil baaa
1490
+ # # 2 1 nil
1491
+ # # 5 1 nil
1492
+
1434
1493
  def sort! vector_order, opts={}
1435
- raise ArgumentError, "Required atleast one vector name" if vector_order.size < 1
1494
+ raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1436
1495
  opts = {
1437
1496
  ascending: true,
1438
- type: :quick_sort,
1497
+ handle_nils: false,
1439
1498
  by: {}
1440
1499
  }.merge(opts)
1441
1500
 
1442
- opts[:by] = create_logic_blocks vector_order, opts[:by]
1443
1501
  opts[:ascending] = sort_order_array vector_order, opts[:ascending]
1444
- idx = @index.to_a
1445
- send(opts[:type], vector_order, idx, opts[:by], opts[:ascending])
1446
- self.index = Daru::Index.new(idx)
1502
+ opts[:handle_nils] = handle_nils_array vector_order, opts[:handle_nils]
1503
+ blocks = create_logic_blocks vector_order, opts[:by], opts[:ascending]
1504
+
1505
+ block = lambda do |r1, r2|
1506
+ # Build left and right array to compare two rows
1507
+ left = build_array_from_blocks vector_order, opts, blocks, r1, r2
1508
+ right = build_array_from_blocks vector_order, opts, blocks, r2, r1
1509
+
1510
+ # Resolve conflict by Index if all attributes are same
1511
+ left << r1
1512
+ right << r2
1513
+ left <=> right
1514
+ end
1515
+
1516
+ idx = (0..@index.size-1).sort(&block)
1517
+
1518
+ old_index = @index.to_a
1519
+ self.index = Daru::Index.new(idx.map { |i| old_index[i] })
1520
+
1521
+ vectors.each do |v|
1522
+ @data[@vectors[v]] = Daru::Vector.new(
1523
+ idx.map { |i| @data[@vectors[v]].data[i] },
1524
+ name: self[v].name, metadata: self[v].metadata.dup, index: index
1525
+ )
1526
+ end
1447
1527
 
1448
1528
  self
1449
1529
  end
1450
1530
 
1451
1531
  # Non-destructive version of #sort!
1452
1532
  def sort vector_order, opts={}
1453
- self.dup.sort! vector_order, opts
1533
+ dup.sort! vector_order, opts
1454
1534
  end
1455
1535
 
1456
1536
  # Pivots a data frame on specified vectors and applies an aggregate function
@@ -1489,25 +1569,27 @@ module Daru
1489
1569
  # # [:foo] 10 12
1490
1570
  def pivot_table opts={}
1491
1571
  raise ArgumentError,
1492
- "Specify grouping index" if !opts[:index] or opts[:index].empty?
1572
+ 'Specify grouping index' if !opts[:index] || opts[:index].empty?
1493
1573
 
1494
1574
  index = opts[:index]
1495
1575
  vectors = opts[:vectors] || []
1496
1576
  aggregate_function = opts[:agg] || :mean
1497
1577
  values =
1498
- if opts[:values].is_a?(Symbol)
1499
- [opts[:values]]
1500
- elsif opts[:values].is_a?(Array)
1501
- opts[:values]
1502
- else # nil
1503
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
1504
- end
1578
+ if opts[:values].is_a?(Symbol)
1579
+ [opts[:values]]
1580
+ elsif opts[:values].is_a?(Array)
1581
+ opts[:values]
1582
+ else # nil
1583
+ (@vectors.to_a - (index | vectors)) & numeric_vector_names
1584
+ end
1505
1585
 
1506
- raise IndexError, "No numeric vectors to aggregate" if values.empty?
1586
+ raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1507
1587
 
1508
- grouped = group_by(index)
1588
+ grouped = group_by(index)
1509
1589
 
1510
- unless vectors.empty?
1590
+ if vectors.empty?
1591
+ grouped.send(aggregate_function)
1592
+ else
1511
1593
  super_hash = {}
1512
1594
  values.each do |value|
1513
1595
  grouped.groups.each do |group_name, row_numbers|
@@ -1548,8 +1630,6 @@ module Daru
1548
1630
  end
1549
1631
  end
1550
1632
  return pivoted_dataframe
1551
- else
1552
- grouped.send(aggregate_function)
1553
1633
  end
1554
1634
  end
1555
1635
 
@@ -1561,8 +1641,8 @@ module Daru
1561
1641
  raise "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}" unless nrows == other_df.nrows
1562
1642
 
1563
1643
  new_fields = (@vectors.to_a + other_df.vectors.to_a)
1564
- .recode_repeated
1565
- .map(&:to_sym)
1644
+ .recode_repeated
1645
+ .map(&:to_sym)
1566
1646
  df_new = DataFrame.new({}, order: new_fields)
1567
1647
 
1568
1648
  (0...nrows).to_a.each do |i|
@@ -1603,7 +1683,6 @@ module Daru
1603
1683
  Daru::Core::Merge.join(self, other_df, opts)
1604
1684
  end
1605
1685
 
1606
-
1607
1686
  # Creates a new dataset for one to many relations
1608
1687
  # on a dataset, based on pattern of field names.
1609
1688
  #
@@ -1632,26 +1711,25 @@ module Daru
1632
1711
  # # ["white", "2", 20]
1633
1712
  # # ]
1634
1713
  def one_to_many(parent_fields, pattern)
1635
- re = Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
1714
+ re = Regexp.new pattern.gsub('%v','(.+?)').gsub('%n','(\\d+?)')
1636
1715
  ds_vars = parent_fields.dup
1637
1716
  vars = []
1638
1717
  max_n = 0
1639
- h = parent_fields.inject({}) { |a,v|
1718
+ h = parent_fields.each_with_object({}) { |v, a|
1640
1719
  a[v] = Daru::Vector.new([])
1641
- a
1642
1720
  }
1643
1721
  # Adding _col_id
1644
1722
  h['_col_id'] = Daru::Vector.new([])
1645
1723
  ds_vars.push('_col_id')
1646
1724
 
1647
1725
  @vectors.each do |f|
1648
- if f =~ re
1649
- if !vars.include? $1
1650
- vars.push($1)
1651
- h[$1] = Daru::Vector.new([])
1652
- end
1653
- max_n = $2.to_i if max_n < $2.to_i
1726
+ next unless f =~ re
1727
+ unless vars.include? $1
1728
+ vars.push($1)
1729
+ h[$1] = Daru::Vector.new([])
1654
1730
  end
1731
+
1732
+ max_n = $2.to_i if max_n < $2.to_i
1655
1733
  end
1656
1734
  ds = DataFrame.new(h, order: ds_vars+vars)
1657
1735
 
@@ -1662,12 +1740,12 @@ module Daru
1662
1740
  end
1663
1741
 
1664
1742
  max_n.times do |n1|
1665
- n = n1+1
1743
+ n = n1+1
1666
1744
  any_data = false
1667
1745
  vars.each do |v|
1668
- data = row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
1746
+ data = row[pattern.gsub('%v',v.to_s).gsub('%n',n.to_s)]
1669
1747
  row_out[v] = data
1670
- any_data = true if !data.nil?
1748
+ any_data = true unless data.nil?
1671
1749
  end
1672
1750
 
1673
1751
  if any_data
@@ -1685,7 +1763,7 @@ module Daru
1685
1763
  i = 1
1686
1764
  split.each { |k,v|
1687
1765
  new_field = name_.to_s + join + i.to_s
1688
- v.rename name_.to_s + ":" + k.to_s
1766
+ v.rename name_.to_s + ':' + k.to_s
1689
1767
  self[new_field.to_sym] = v
1690
1768
  i += 1
1691
1769
  }
@@ -1707,11 +1785,11 @@ module Daru
1707
1785
  # ds.create_sql('names')
1708
1786
  # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
1709
1787
  #
1710
- def create_sql(table,charset="UTF8")
1788
+ def create_sql(table,charset='UTF8')
1711
1789
  sql = "CREATE TABLE #{table} ("
1712
- fields = self.vectors.to_a.collect do |f|
1790
+ fields = vectors.to_a.collect do |f|
1713
1791
  v = self[f]
1714
- f.to_s + " " + v.db_type
1792
+ f.to_s + ' ' + v.db_type
1715
1793
  end
1716
1794
 
1717
1795
  sql + fields.join(",\n ")+") CHARACTER SET=#{charset};"
@@ -1724,14 +1802,14 @@ module Daru
1724
1802
  numerics_as_arrays << self[n].to_a
1725
1803
  end
1726
1804
 
1727
- GSL::Matrix.alloc *numerics_as_arrays.transpose
1805
+ GSL::Matrix.alloc(*numerics_as_arrays.transpose)
1728
1806
  end
1729
1807
 
1730
1808
  # Convert all vectors of type *:numeric* into a Matrix.
1731
1809
  def to_matrix
1732
1810
  numerics_as_arrays = []
1733
1811
  each_vector do |vector|
1734
- numerics_as_arrays << vector.to_a if(vector.type == :numeric)
1812
+ numerics_as_arrays << vector.to_a if vector.type == :numeric
1735
1813
  end
1736
1814
 
1737
1815
  Matrix.columns numerics_as_arrays
@@ -1746,8 +1824,8 @@ module Daru
1746
1824
  def to_nmatrix
1747
1825
  numerics_as_arrays = []
1748
1826
  each_vector do |vector|
1749
- numerics_as_arrays << vector.to_a if(vector.type == :numeric and
1750
- vector.missing_positions.size == 0)
1827
+ numerics_as_arrays << vector.to_a if vector.type == :numeric &&
1828
+ vector.missing_positions.empty?
1751
1829
  end
1752
1830
 
1753
1831
  numerics_as_arrays.transpose.to_nm
@@ -1760,8 +1838,8 @@ module Daru
1760
1838
  # in the array of hashes, which has the same index.
1761
1839
  def to_a
1762
1840
  arry = [[],[]]
1763
- self.each_row do |row|
1764
- arry[0] << row.to_hash
1841
+ each_row do |row|
1842
+ arry[0] << row.to_h
1765
1843
  end
1766
1844
  arry[1] = @index.to_a
1767
1845
 
@@ -1772,15 +1850,15 @@ module Daru
1772
1850
  # in the JSON thus created.
1773
1851
  def to_json no_index=true
1774
1852
  if no_index
1775
- self.to_a[0].to_json
1853
+ to_a[0].to_json
1776
1854
  else
1777
- self.to_a.to_json
1855
+ to_a.to_json
1778
1856
  end
1779
1857
  end
1780
1858
 
1781
- # Converts DataFrame to a hash with keys as vector names and values as
1859
+ # Converts DataFrame to a hash (explicit) with keys as vector names and values as
1782
1860
  # the corresponding vectors.
1783
- def to_hash
1861
+ def to_h
1784
1862
  hsh = {}
1785
1863
  @vectors.each_with_index do |vec_name, idx|
1786
1864
  hsh[vec_name] = @data[idx]
@@ -1791,12 +1869,12 @@ module Daru
1791
1869
 
1792
1870
  # Convert to html for IRuby.
1793
1871
  def to_html threshold=30
1794
- html = "<table>" +
1795
- "<tr>" +
1796
- "<th colspan=\"#{@vectors.size+1}\">" +
1797
- "Daru::DataFrame:#{self.object_id} " + " rows: #{nrows} " + " cols: #{ncols}"
1798
- "</th>" +
1799
- "</tr>"
1872
+ html = '<table>' \
1873
+ '<tr>' \
1874
+ "<th colspan=\"#{@vectors.size+1}\">" \
1875
+ "Daru::DataFrame:#{object_id} " + " rows: #{nrows} " + " cols: #{ncols}" \
1876
+ '</th>' \
1877
+ '</tr>'
1800
1878
  html +='<tr><th></th>'
1801
1879
  @vectors.each { |vector| html += '<th>' + vector.to_s + '</th>' }
1802
1880
  html += '</tr>'
@@ -1805,26 +1883,26 @@ module Daru
1805
1883
  html += '<tr>'
1806
1884
  html += '<td>' + index.to_s + '</td>'
1807
1885
 
1808
- self.row[index].each do |element|
1886
+ row[index].each do |element|
1809
1887
  html += '<td>' + element.to_s + '</td>'
1810
1888
  end
1811
1889
 
1812
1890
  html += '</tr>'
1813
- if num > threshold
1814
- html += '<tr>'
1815
- (@vectors.size + 1).times { html += '<td>...</td>' }
1816
- html += '</tr>'
1817
-
1818
- last_index = @index.to_a.last
1819
- last_row = self.row[last_index]
1820
- html += '<tr>'
1821
- html += "<td>" + last_index.to_s + "</td>"
1822
- (0..(ncols - 1)).to_a.each do |i|
1823
- html += '<td>' + last_row[i].to_s + '</td>'
1824
- end
1825
- html += '</tr>'
1826
- break
1891
+ next if num <= threshold
1892
+
1893
+ html += '<tr>'
1894
+ (@vectors.size + 1).times { html += '<td>...</td>' }
1895
+ html += '</tr>'
1896
+
1897
+ last_index = @index.to_a.last
1898
+ last_row = row[last_index]
1899
+ html += '<tr>'
1900
+ html += '<td>' + last_index.to_s + '</td>'
1901
+ (0..(ncols - 1)).to_a.each do |i|
1902
+ html += '<td>' + last_row[i].to_s + '</td>'
1827
1903
  end
1904
+ html += '</tr>'
1905
+ break
1828
1906
  end
1829
1907
  html += '</table>'
1830
1908
 
@@ -1841,7 +1919,7 @@ module Daru
1841
1919
  # assignment/deletion of elements is done. Updating data this way is called
1842
1920
  # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
1843
1921
  def update
1844
- @data.each { |v| v.update } if Daru.lazy_update
1922
+ @data.each(&:update) if Daru.lazy_update
1845
1923
  end
1846
1924
 
1847
1925
  # Rename the DataFrame.
@@ -1890,19 +1968,18 @@ module Daru
1890
1968
  Daru::IO.dataframe_write_sql self, dbh, table
1891
1969
  end
1892
1970
 
1893
-
1894
1971
  # Use marshalling to save dataframe to a file.
1895
1972
  def save filename
1896
1973
  Daru::IO.save self, filename
1897
1974
  end
1898
1975
 
1899
- def _dump depth
1900
- Marshal.dump({
1976
+ def _dump(_depth)
1977
+ Marshal.dump(
1901
1978
  data: @data,
1902
1979
  index: @index.to_a,
1903
1980
  order: @vectors.to_a,
1904
1981
  name: @name
1905
- })
1982
+ )
1906
1983
  end
1907
1984
 
1908
1985
  def self._load data
@@ -1939,29 +2016,29 @@ module Daru
1939
2016
  longest = [@name.to_s.size,
1940
2017
  (@vectors.map(&:to_s).map(&:size).max || 0),
1941
2018
  (@index .map(&:to_s).map(&:size).max || 0),
1942
- (@data .map{ |v| v.map(&:to_s).map(&:size).max}.max || 0)].max
2019
+ (@data .map { |v| v.map(&:to_s).map(&:size).max }.max || 0)].max
1943
2020
 
1944
2021
  name = @name || 'nil'
1945
- content = ""
2022
+ content = ''
1946
2023
  longest = spacing if longest > spacing
1947
2024
  formatter = "\n"
1948
2025
 
1949
2026
  (@vectors.size + 1).times { formatter += "%#{longest}.#{longest}s " }
1950
- content += "\n#<" + self.class.to_s + ":" + self.object_id.to_s + " @name = " +
1951
- name.to_s + " @size = " + @size.to_s + ">"
1952
- content += sprintf formatter, "" , *@vectors.map(&:to_s)
2027
+ content += "\n#<" + self.class.to_s + ':' + object_id.to_s + ' @name = ' +
2028
+ name.to_s + ' @size = ' + @size.to_s + '>'
2029
+ content += formatter % ['', *@vectors.map(&:to_s)]
1953
2030
  row_num = 1
1954
2031
 
1955
- self.each_row_with_index do |row, index|
1956
- content += sprintf formatter, index.to_s, *row.to_hash.values.map { |e| (e || 'nil').to_s }
2032
+ each_row_with_index do |row, index|
2033
+ content += formatter % [index.to_s, *row.to_h.values.map { |e| (e || 'nil').to_s }]
1957
2034
  row_num += 1
1958
- if row_num > threshold
1959
- dots = []
2035
+ next if row_num <= threshold
1960
2036
 
1961
- (@vectors.size + 1).times { dots << "..." }
1962
- content += sprintf formatter, *dots
1963
- break
1964
- end
2037
+ dots = []
2038
+
2039
+ (@vectors.size + 1).times { dots << '...' }
2040
+ content += formatter % dots
2041
+ break
1965
2042
  end
1966
2043
  content += "\n"
1967
2044
 
@@ -1974,24 +2051,24 @@ module Daru
1974
2051
  end
1975
2052
 
1976
2053
  def == other
1977
- self.class == other.class and
1978
- @size == other.size and
1979
- @index == other.index and
1980
- @vectors == other.vectors and
1981
- @vectors.to_a.all? { |v| self[v] == other[v] }
2054
+ self.class == other.class &&
2055
+ @size == other.size &&
2056
+ @index == other.index &&
2057
+ @vectors == other.vectors &&
2058
+ @vectors.to_a.all? { |v| self[v] == other[v] }
1982
2059
  end
1983
2060
 
1984
2061
  def method_missing(name, *args, &block)
1985
- if md = name.match(/(.+)\=/)
1986
- insert_or_modify_vector name[/(.+)\=/].delete("=").to_sym, args[0]
1987
- elsif self.has_vector? name
2062
+ if name =~ /(.+)\=/
2063
+ insert_or_modify_vector name[/(.+)\=/].delete('=').to_sym, args[0]
2064
+ elsif has_vector? name
1988
2065
  self[name]
1989
2066
  else
1990
2067
  super(name, *args, &block)
1991
2068
  end
1992
2069
  end
1993
2070
 
1994
- private
2071
+ private
1995
2072
 
1996
2073
  def possibly_multi_index? index
1997
2074
  if @index.is_a?(MultiIndex)
@@ -2001,101 +2078,51 @@ module Daru
2001
2078
  end
2002
2079
  end
2003
2080
 
2004
- def quick_sort vector_order, index, by, ascending
2005
- recursive_quick_sort vector_order, index, by, ascending, 0, @size-1
2006
- end
2007
-
2008
- # == Arguments
2009
- #
2010
- # vector_order -
2011
- # index -
2012
- # by -
2013
- # ascending -
2014
- # left_lower -
2015
- # right_upper -
2016
- def recursive_quick_sort vector_order, index, by, ascending, left_lower, right_upper
2017
- if left_lower < right_upper
2018
- left_upper, right_lower = partition(vector_order, index, by, ascending, left_lower, right_upper)
2019
- if left_upper - left_lower < right_upper - right_lower
2020
- recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
2021
- recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
2022
- else
2023
- recursive_quick_sort(vector_order, index, by, ascending, right_lower, right_upper)
2024
- recursive_quick_sort(vector_order, index, by, ascending, left_lower, left_upper)
2025
- end
2026
- end
2027
- end
2028
-
2029
- def partition vector_order, index, by, ascending, left_lower, right_upper
2030
- mindex = (left_lower + right_upper) / 2
2031
- mvalues = vector_order.inject([]) { |a, vector_name| a << self[vector_name][mindex]; a }
2032
- i = left_lower
2033
- j = right_upper
2034
- descending = ascending.map { |a| !a }
2035
-
2036
- i += 1 while(keep?(i, mvalues, vector_order, ascending , by, 0))
2037
- j -= 1 while(keep?(j, mvalues, vector_order, descending, by, 0))
2038
-
2039
- while i < j - 1
2040
- @data.each do |vector|
2041
- vector[i], vector[j] = vector[j], vector[i]
2042
- end
2043
- index[i], index[j] = index[j], index[i]
2044
- i += 1
2045
- j -= 1
2046
-
2047
- i += 1 while(keep?(i, mvalues, vector_order, ascending , by,0))
2048
- j -= 1 while(keep?(j, mvalues, vector_order, descending, by,0))
2049
- end
2050
-
2051
- if i <= j
2052
- if i < j
2053
- @data.each do |vector|
2054
- vector[i], vector[j] = vector[j], vector[i]
2081
+ def create_logic_blocks vector_order, _by, ascending
2082
+ # Create blocks to handle nils
2083
+ blocks = {}
2084
+ universal_block_ascending = ->(a) { [a.nil? ? 0 : 1, a] }
2085
+ universal_block_decending = ->(a) { [a.nil? ? 1 : 0, a] }
2086
+ vector_order.each_with_index do |vector, i|
2087
+ blocks[vector] =
2088
+ if ascending[i]
2089
+ universal_block_ascending
2090
+ else
2091
+ universal_block_decending
2055
2092
  end
2056
- index[i], index[j] = index[j], index[i]
2057
- end
2058
- i += 1
2059
- j -= 1
2060
2093
  end
2061
2094
 
2062
- [j,i]
2095
+ blocks
2063
2096
  end
2064
2097
 
2065
- def keep? current_index, mvalues, vector_order, sort_order, by, vector_order_index
2066
- vector_name = vector_order[vector_order_index]
2067
- if vector_name
2068
- vec = self[vector_name]
2069
- eval = by[vector_name].call(vec[current_index], mvalues[vector_order_index])
2098
+ def build_array_from_blocks vector_order, opts, blocks, r1, r2
2099
+ # Create an array to be used for comparison of two rows in sorting
2100
+ vector_order.map.each_with_index do |v, i|
2101
+ value = if opts[:ascending][i]
2102
+ @data[@vectors[v]].data[r1]
2103
+ else
2104
+ @data[@vectors[v]].data[r2]
2105
+ end
2070
2106
 
2071
- if sort_order[vector_order_index] # sort in ascending order
2072
- return false if eval == 1
2073
- return true if eval == -1
2074
- if eval == 0
2075
- keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
2076
- end
2077
- else # sort in descending order
2078
- return false if eval == -1
2079
- return true if eval == 1
2080
- if eval == 0
2081
- keep?(current_index, mvalues, vector_order, sort_order, by, vector_order_index + 1)
2082
- end
2083
- end
2084
- end
2085
- end
2107
+ if opts[:by][v] && !opts[:handle_nils][i]
2108
+ # Block given and nils handled manually
2109
+ value = opts[:by][v].call value
2086
2110
 
2087
- def create_logic_blocks vector_order, by={}
2088
- universal_block = lambda { |a,b| a <=> b }
2089
- vector_order.each do |vector|
2090
- by[vector] ||= universal_block
2091
- end
2111
+ elsif opts[:by][v] && opts[:handle_nils][i]
2112
+ # Block given and nils handled automatically
2113
+ value = opts[:by][v].call value rescue nil
2114
+ blocks[v].call value
2092
2115
 
2093
- by
2116
+ else
2117
+ # Block not given and nils handled automatically
2118
+ blocks[v].call value
2119
+ end
2120
+ end
2094
2121
  end
2095
2122
 
2096
2123
  def sort_order_array vector_order, ascending
2097
- if ascending.is_a?(Array)
2098
- raise ArgumentError, "Specify same number of vector names and sort orders" if
2124
+ if ascending.is_a? Array
2125
+ raise ArgumentError, 'Specify same number of vector names and sort orders' if
2099
2126
  vector_order.size != ascending.size
2100
2127
  return ascending
2101
2128
  else
@@ -2103,6 +2130,16 @@ module Daru
2103
2130
  end
2104
2131
  end
2105
2132
 
2133
+ def handle_nils_array vector_order, handle_nils
2134
+ if handle_nils.is_a? Array
2135
+ raise ArgumentError, 'Specify same number of vector names and handle nils' if
2136
+ vector_order.size != handle_nils.size
2137
+ return handle_nils
2138
+ else
2139
+ Array.new(vector_order.size, handle_nils)
2140
+ end
2141
+ end
2142
+
2106
2143
  def vectors_index_for location
2107
2144
  if @vectors.include?(location)
2108
2145
  @vectors[location]
@@ -2118,39 +2155,35 @@ module Daru
2118
2155
  if @vectors.is_a?(MultiIndex)
2119
2156
  pos = @vectors[names]
2120
2157
 
2121
- if pos.is_a?(Integer)
2122
- return @data[pos]
2123
- else # MultiIndex
2124
- new_vectors = pos.map do |tuple|
2125
- @data[@vectors[tuple]]
2126
- end
2158
+ return @data[pos] if pos.is_a?(Integer)
2127
2159
 
2128
- if !location.is_a?(Range) and names.size < @vectors.width
2129
- pos = pos.drop_left_level names.size
2130
- end
2160
+ # MultiIndex
2161
+ new_vectors = pos.map do |tuple|
2162
+ @data[@vectors[tuple]]
2163
+ end
2131
2164
 
2132
- Daru::DataFrame.new(
2133
- new_vectors, index: @index, order: pos)
2165
+ if !location.is_a?(Range) && names.size < @vectors.width
2166
+ pos = pos.drop_left_level names.size
2134
2167
  end
2168
+
2169
+ Daru::DataFrame.new(new_vectors, index: @index, order: pos)
2135
2170
  else
2136
2171
  unless names[1]
2137
2172
  pos = @vectors[location]
2138
2173
 
2139
- if pos.is_a?(Numeric)
2140
- return @data[pos]
2141
- else
2142
- names = pos
2143
- end
2174
+ return @data[pos] if pos.is_a?(Numeric)
2175
+
2176
+ names = pos
2144
2177
  end
2145
2178
 
2146
- new_vcs = []
2179
+ new_vectors = {}
2147
2180
  names.each do |name|
2148
- new_vcs << @data[@vectors[name]].to_a
2181
+ new_vectors[name] = @data[@vectors[name]]
2149
2182
  end
2150
2183
 
2151
2184
  order = names.is_a?(Array) ? Daru::Index.new(names) : names
2152
- Daru::DataFrame.new(new_vcs, order: order,
2153
- index: @index, name: @name)
2185
+ Daru::DataFrame.new(new_vectors, order: order,
2186
+ index: @index, name: @name)
2154
2187
  end
2155
2188
  end
2156
2189
 
@@ -2161,16 +2194,15 @@ module Daru
2161
2194
  pos = @index[names]
2162
2195
  if pos.is_a?(Integer)
2163
2196
  return Daru::Vector.new(populate_row_for(pos), index: @vectors, name: pos)
2164
- else
2165
- new_rows = pos.map { |tuple| populate_row_for(tuple) }
2197
+ end
2166
2198
 
2167
- if !location.is_a?(Range) and names.size < @index.width
2168
- pos = pos.drop_left_level names.size
2169
- end
2199
+ new_rows = pos.map { |tuple| populate_row_for(tuple) }
2170
2200
 
2171
- Daru::DataFrame.rows(
2172
- new_rows, order: @vectors, name: @name, index: pos)
2201
+ if !location.is_a?(Range) && names.size < @index.width
2202
+ pos = pos.drop_left_level names.size
2173
2203
  end
2204
+
2205
+ Daru::DataFrame.rows(new_rows, order: @vectors, name: @name, index: pos)
2174
2206
  else
2175
2207
  if names[1].nil?
2176
2208
  names = @index[location]
@@ -2189,7 +2221,7 @@ module Daru
2189
2221
  rows << self.row[name].to_a
2190
2222
  end
2191
2223
 
2192
- Daru::DataFrame.rows rows, index: names ,name: @name, order: @vectors
2224
+ Daru::DataFrame.rows rows, index: names,name: @name, order: @vectors
2193
2225
  end
2194
2226
  end
2195
2227
 
@@ -2201,17 +2233,22 @@ module Daru
2201
2233
 
2202
2234
  def insert_or_modify_vector name, vector
2203
2235
  name = name[0] unless @vectors.is_a?(MultiIndex)
2204
- v = nil
2236
+ vec = nil
2205
2237
 
2206
2238
  if @index.empty?
2207
- v = vector.is_a?(Daru::Vector) ? vector : Daru::Vector.new(vector.to_a)
2208
- @index = v.index
2209
- assign_or_add_vector name, v
2239
+ vec = if vector.is_a?(Daru::Vector)
2240
+ vector
2241
+ else
2242
+ Daru::Vector.new(vector.to_a, name: set_name(name))
2243
+ end
2244
+
2245
+ @index = vec.index
2246
+ assign_or_add_vector name, vec
2210
2247
  set_size
2211
2248
 
2212
2249
  @data.map! do |v|
2213
- if v.size == 0
2214
- Daru::Vector.new([nil]*@size, name: set_name(name), index: @index)
2250
+ if v.empty?
2251
+ Daru::Vector.new([nil]*@size, name: set_name(name), metadata: v.metadata, index: @index)
2215
2252
  else
2216
2253
  v
2217
2254
  end
@@ -2219,15 +2256,11 @@ module Daru
2219
2256
  else
2220
2257
  if vector.is_a?(Daru::Vector)
2221
2258
  if vector.index == @index # so that index-by-index assignment is avoided when possible.
2222
- v = vector.dup
2259
+ vec = vector.dup
2223
2260
  else
2224
- v = Daru::Vector.new [], name: set_name(name), index: @index
2261
+ vec = Daru::Vector.new [], name: set_name(name), metadata: vector.metadata.dup, index: @index
2225
2262
  @index.each do |idx|
2226
- if vector.index.include? idx
2227
- v[idx] = vector[idx]
2228
- else
2229
- v[idx] = nil
2230
- end
2263
+ vec[idx] = vector.index.include?(idx) ? vector[idx] : nil
2231
2264
  end
2232
2265
  end
2233
2266
  else
@@ -2235,26 +2268,30 @@ module Daru
2235
2268
  "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}" if
2236
2269
  @size != vector.size
2237
2270
 
2238
- v = Daru::Vector.new(vector, name: set_name(name), index: @index)
2271
+ vec = Daru::Vector.new(vector, name: set_name(name), index: @index)
2239
2272
  end
2240
2273
 
2241
- assign_or_add_vector name, v
2274
+ assign_or_add_vector name, vec
2242
2275
  end
2243
2276
  end
2244
2277
 
2245
2278
  def assign_or_add_vector name, v
2246
- #FIXME: fix this jugaad. need to make changes in Indexing itself.
2247
- pos = @vectors[name]
2279
+ # FIXME: fix this jugaad. need to make changes in Indexing itself.
2280
+ begin
2281
+ pos = @vectors[name]
2282
+ rescue IndexError
2283
+ pos = name
2284
+ end
2248
2285
 
2249
- if !pos.kind_of?(Daru::Index) and pos == name and
2250
- (@vectors.include?(name) or (pos.is_a?(Integer) and pos < @data.size))
2286
+ if !pos.is_a?(Daru::Index) && pos == name &&
2287
+ (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
2251
2288
  @data[pos] = v
2252
- elsif pos.kind_of?(Daru::Index)
2289
+ elsif pos.is_a?(Daru::Index)
2253
2290
  pos.each do |p|
2254
2291
  @data[@vectors[p]] = v
2255
2292
  end
2256
2293
  else
2257
- @vectors = @vectors | [name] if !@vectors.include?(name)
2294
+ @vectors |= [name] unless @vectors.include?(name)
2258
2295
  @data[@vectors[name]] = v
2259
2296
  end
2260
2297
  end
@@ -2264,21 +2301,21 @@ module Daru
2264
2301
  # TODO
2265
2302
  else
2266
2303
  name = name[0]
2267
- v =
2268
- if vector.is_a?(Daru::Vector)
2269
- vector
2270
- else
2271
- Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2272
- end
2304
+ vec =
2305
+ if vector.is_a?(Daru::Vector)
2306
+ vector
2307
+ else
2308
+ Daru::Vector.new(vector, name: set_name(name), index: @vectors)
2309
+ end
2273
2310
 
2274
2311
  if @index.include? name
2275
- self.each_vector_with_index do |vector,i|
2276
- vector[name] = v.index.include?(i) ? v[i] : nil
2312
+ each_vector_with_index do |v,i|
2313
+ v[name] = vec.index.include?(i) ? vec[i] : nil
2277
2314
  end
2278
2315
  else
2279
- @index = @index | [name]
2280
- self.each_vector_with_index do |vector,i|
2281
- vector.concat((v.index.include?(i) ? v[i] : nil), name)
2316
+ @index |= [name]
2317
+ each_vector_with_index do |v,i|
2318
+ v.concat((vec.index.include?(i) ? vec[i] : nil), name)
2282
2319
  end
2283
2320
  end
2284
2321
 
@@ -2294,15 +2331,15 @@ module Daru
2294
2331
 
2295
2332
  def validate_labels
2296
2333
  raise IndexError, "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})." if
2297
- @vectors and @vectors.size != @data.size
2334
+ @vectors && @vectors.size != @data.size
2298
2335
 
2299
- raise IndexError, "Expected number of indexes same as number of rows" if
2300
- @index and @data[0] and @index.size != @data[0].size
2336
+ raise IndexError, 'Expected number of indexes same as number of rows' if
2337
+ @index && @data[0] && @index.size != @data[0].size
2301
2338
  end
2302
2339
 
2303
2340
  def validate_vector_sizes
2304
2341
  @data.each do |vector|
2305
- raise IndexError, "Expected vectors with equal length" if vector.size != @size
2342
+ raise IndexError, 'Expected vectors with equal length' if vector.size != @size
2306
2343
  end
2307
2344
  end
2308
2345
 
@@ -2332,14 +2369,14 @@ module Daru
2332
2369
  end
2333
2370
 
2334
2371
  def create_vectors_index_with vectors, source
2335
- vectors = source.keys.sort_by { |a| a.to_s } if vectors.nil?
2372
+ vectors = source.keys.sort_by(&:to_s) if vectors.nil?
2336
2373
 
2337
2374
  @vectors =
2338
- unless vectors.is_a?(Index) or vectors.is_a?(MultiIndex)
2339
- Daru::Index.new((vectors + (source.keys - vectors)).uniq)
2340
- else
2341
- vectors
2342
- end
2375
+ if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
2376
+ vectors
2377
+ else
2378
+ Daru::Index.new((vectors + (source.keys - vectors)).uniq)
2379
+ end
2343
2380
  end
2344
2381
 
2345
2382
  def all_vectors_have_equal_indexes? source
@@ -2351,24 +2388,24 @@ module Daru
2351
2388
  end
2352
2389
 
2353
2390
  def try_create_index index
2354
- index.kind_of?(Index) ? index : Daru::Index.new(index)
2391
+ index.is_a?(Index) ? index : Daru::Index.new(index)
2355
2392
  end
2356
2393
 
2357
- def set_name potential_name
2394
+ def set_name potential_name # rubocop:disable Style/AccessorMethodName
2358
2395
  potential_name.is_a?(Array) ? potential_name.join : potential_name
2359
2396
  end
2360
2397
 
2361
2398
  def symbolize arry
2362
2399
  symbolized_arry =
2363
- if arry.all? { |e| e.is_a?(Array) }
2364
- arry.map do |sub_arry|
2365
- sub_arry.map do |e|
2366
- e.is_a?(Numeric) ? e : e.to_sym
2400
+ if arry.all? { |e| e.is_a?(Array) }
2401
+ arry.map do |sub_arry|
2402
+ sub_arry.map do |e|
2403
+ e.is_a?(Numeric) ? e : e.to_sym
2404
+ end
2367
2405
  end
2406
+ else
2407
+ arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
2368
2408
  end
2369
- else
2370
- arry.map { |e| e.is_a?(Numeric) ? e : e.to_sym }
2371
- end
2372
2409
 
2373
2410
  symbolized_arry
2374
2411
  end