daru 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +99 -0
  4. data/.rubocop_todo.yml +44 -0
  5. data/.travis.yml +3 -1
  6. data/CONTRIBUTING.md +5 -1
  7. data/History.md +43 -0
  8. data/README.md +3 -4
  9. data/benchmarks/duplicating.rb +45 -0
  10. data/benchmarks/group_by.rb +7 -7
  11. data/benchmarks/joining.rb +52 -0
  12. data/benchmarks/sorting.rb +9 -2
  13. data/benchmarks/statistics.rb +39 -0
  14. data/daru.gemspec +4 -4
  15. data/lib/daru.rb +9 -9
  16. data/lib/daru/accessors/array_wrapper.rb +15 -11
  17. data/lib/daru/accessors/dataframe_by_row.rb +1 -1
  18. data/lib/daru/accessors/gsl_wrapper.rb +30 -19
  19. data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
  20. data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
  21. data/lib/daru/core/group_by.rb +69 -16
  22. data/lib/daru/core/merge.rb +135 -151
  23. data/lib/daru/core/query.rb +9 -30
  24. data/lib/daru/dataframe.rb +476 -439
  25. data/lib/daru/date_time/index.rb +150 -137
  26. data/lib/daru/date_time/offsets.rb +45 -41
  27. data/lib/daru/extensions/rserve.rb +4 -4
  28. data/lib/daru/index.rb +88 -64
  29. data/lib/daru/io/io.rb +33 -34
  30. data/lib/daru/io/sql_data_source.rb +11 -11
  31. data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
  32. data/lib/daru/maths/arithmetic/vector.rb +9 -14
  33. data/lib/daru/maths/statistics/dataframe.rb +89 -61
  34. data/lib/daru/maths/statistics/vector.rb +226 -97
  35. data/lib/daru/monkeys.rb +23 -30
  36. data/lib/daru/plotting/dataframe.rb +27 -28
  37. data/lib/daru/plotting/vector.rb +12 -13
  38. data/lib/daru/vector.rb +221 -330
  39. data/lib/daru/version.rb +2 -2
  40. data/spec/core/group_by_spec.rb +16 -0
  41. data/spec/core/merge_spec.rb +30 -14
  42. data/spec/dataframe_spec.rb +268 -14
  43. data/spec/index_spec.rb +23 -5
  44. data/spec/io/io_spec.rb +37 -16
  45. data/spec/math/statistics/dataframe_spec.rb +40 -8
  46. data/spec/math/statistics/vector_spec.rb +135 -10
  47. data/spec/monkeys_spec.rb +3 -3
  48. data/spec/vector_spec.rb +157 -25
  49. metadata +41 -21
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  *************************************************************************
32
32
  Thank you for installing daru!
33
33
 
34
- oOOOOOo
34
+ oOOOOOo
35
35
  ,| oO
36
36
  //| |
37
37
  \\\\| |
@@ -39,8 +39,8 @@ Thank you for installing daru!
39
39
  `-----`
40
40
 
41
41
 
42
- Hope you love daru! For enhanced interactivity and better visualizations,
43
- consider using gnuplotrb and nyaplot with iruby. For statistics use the
42
+ Hope you love daru! For enhanced interactivity and better visualizations,
43
+ consider using gnuplotrb and nyaplot with iruby. For statistics use the
44
44
  statsample family.
45
45
 
46
46
  Read the README for interesting use cases and examples.
@@ -63,9 +63,9 @@ EOF
63
63
  spec.add_development_dependency 'nmatrix', '~> 0.2.1'
64
64
  spec.add_development_dependency 'distribution', '~> 0.7'
65
65
  spec.add_development_dependency 'rb-gsl', '~>1.16'
66
- spec.add_development_dependency 'bloomfilter-rb', '~> 2.1'
67
66
  spec.add_development_dependency 'dbd-sqlite3'
68
67
  spec.add_development_dependency 'dbi'
69
68
  spec.add_development_dependency 'activerecord', '~> 4.0'
70
69
  spec.add_development_dependency 'sqlite3'
70
+ spec.add_development_dependency 'rubocop', '>= 0.40.0'
71
71
  end
@@ -11,7 +11,7 @@ module Daru
11
11
  'THU' => 4,
12
12
  'FRI' => 5,
13
13
  'SAT' => 6
14
- }
14
+ }.freeze
15
15
 
16
16
  MONTH_DAYS = {
17
17
  1 => 31,
@@ -26,19 +26,20 @@ module Daru
26
26
  10 => 31,
27
27
  11 => 30,
28
28
  12 => 31
29
- }
30
-
31
- SPLIT_TOKEN = ','
29
+ }.freeze
30
+
31
+ @lazy_update = false
32
+
33
+ SPLIT_TOKEN = ','.freeze
34
+
32
35
  class << self
33
- @@lazy_update = false
34
-
35
36
  # A variable which will set whether Vector metadata is updated immediately or lazily.
36
37
  # Call the #update method every time a values are set or removed in order to update
37
38
  # metadata like positions of missing values.
38
39
  attr_accessor :lazy_update
39
-
40
+
40
41
  def create_has_library(library)
41
- lib_underscore = library.to_s.gsub(/-/, '_')
42
+ lib_underscore = library.to_s.tr('-', '_')
42
43
  define_singleton_method("has_#{lib_underscore}?") do
43
44
  cv = "@@#{lib_underscore}"
44
45
  unless class_variable_defined? cv
@@ -58,7 +59,6 @@ module Daru
58
59
  create_has_library :gsl
59
60
  create_has_library :nmatrix
60
61
  create_has_library :nyaplot
61
- create_has_library :'bloomfilter-rb'
62
62
  end
63
63
 
64
64
  autoload :Spreadsheet, 'spreadsheet'
@@ -6,7 +6,7 @@ module Daru
6
6
  extend Forwardable
7
7
 
8
8
  def_delegators :@data, :slice!
9
-
9
+
10
10
  def each(&block)
11
11
  @data.each(&block)
12
12
  self
@@ -66,34 +66,38 @@ module Daru
66
66
  ArrayWrapper.new @data.dup, @context
67
67
  end
68
68
 
69
+ def compact
70
+ @data - @context.missing_values
71
+ end
72
+
69
73
  def mean
70
- sum.quo(@size - @context.missing_positions.size).to_f
74
+ values_to_sum = compact
75
+ return nil if values_to_sum.empty?
76
+ sum = values_to_sum.inject :+
77
+ sum.quo(values_to_sum.size).to_f
71
78
  end
72
79
 
73
80
  def product
74
- @data.inject(1) { |m,e| m*e unless e.nil? }
81
+ compact.inject :*
75
82
  end
76
83
 
77
84
  def max
78
- @data.max
85
+ compact.max
79
86
  end
80
87
 
81
88
  def min
82
- @data.min
89
+ compact.min
83
90
  end
84
91
 
85
92
  def sum
86
- @data.inject(0) do |memo ,e|
87
- memo += e unless e.nil? #TODO: Remove this conditional somehow!
88
- memo
89
- end
93
+ compact.inject :+
90
94
  end
91
95
 
92
- private
96
+ private
93
97
 
94
98
  def set_size
95
99
  @size = @data.size
96
100
  end
97
101
  end
98
102
  end
99
- end
103
+ end
@@ -14,4 +14,4 @@ module Daru
14
14
  end
15
15
  end
16
16
  end
17
- end
17
+ end
@@ -3,33 +3,33 @@ module Daru
3
3
  module GSLStatistics
4
4
  def vector_standardized_compute(m,sd)
5
5
  Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
6
- index: @context.index, name: @context.name
6
+ index: @context.index, name: @context.name
7
7
  end
8
-
8
+
9
9
  def vector_centered_compute(m)
10
- Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
11
- index: @context.index, name: @context.name
10
+ Daru::Vector.new @data.collect { |x| (x.to_f - m) }, dtype: :gsl,
11
+ index: @context.index, name: @context.name
12
12
  end
13
13
 
14
14
  def sample_with_replacement(sample=1)
15
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
16
- Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
17
- index: @context.index, name: @context.name)
15
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
16
+ Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
17
+ index: @context.index, name: @context.name)
18
18
  end
19
-
19
+
20
20
  def sample_without_replacement(sample=1)
21
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
21
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
22
22
  r.choose(@data, sample).to_a
23
23
  end
24
24
 
25
25
  def median
26
- GSL::Stats::median_from_sorted_data(@data.sort)
26
+ GSL::Stats.median_from_sorted_data(@data.sort)
27
27
  end
28
-
28
+
29
29
  def variance_sample(m)
30
- @data.variance_m
30
+ @data.variance(m)
31
31
  end
32
-
32
+
33
33
  def standard_deviation_sample(m)
34
34
  @data.sd(m)
35
35
  end
@@ -56,16 +56,27 @@ module Daru
56
56
  extend Forwardable
57
57
  include Daru::Accessors::GSLStatistics
58
58
 
59
- def_delegators :@data, :[], :size, :to_a, :each, :mean,
60
- :sum, :prod, :max, :min
61
-
62
- alias :product :prod
59
+ def_delegators :@data, :[], :size, :to_a, :each
63
60
 
64
61
  attr_reader :data
65
62
 
63
+ def compact
64
+ # set missing to [] incase @context is not Daru::Vector
65
+ missing = @context.missing_values rescue []
66
+ ::GSL::Vector.alloc(@data.to_a - missing.map(&:to_f))
67
+ end
68
+
69
+ [:mean, :min, :max, :prod, :sum].each do |method|
70
+ define_method(method) do
71
+ compact.send(method.to_sym) rescue nil
72
+ end
73
+ end
74
+
75
+ alias :product :prod
76
+
66
77
  def each(&block)
67
78
  @data.each(&block)
68
- self
79
+ self
69
80
  end
70
81
 
71
82
  def map!(&block)
@@ -110,4 +121,4 @@ module Daru
110
121
  end
111
122
  end
112
123
  end
113
- end if Daru.has_gsl?
124
+ end if Daru.has_gsl?
@@ -1,9 +1,7 @@
1
1
  module Daru
2
2
  module Accessors
3
-
4
3
  # Internal class for wrapping MDArray
5
4
  class MDArrayWrapper
6
-
7
5
  end
8
6
  end
9
- end
7
+ end
@@ -19,7 +19,7 @@ module Daru
19
19
  end
20
20
 
21
21
  attr_reader :size, :data, :nm_dtype
22
-
22
+
23
23
  def initialize vector, context, nm_dtype=:int32
24
24
  @size = vector.size
25
25
  @data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
@@ -32,47 +32,47 @@ module Daru
32
32
  return @data[*index] if index[0] < @size
33
33
  nil
34
34
  end
35
-
35
+
36
36
  def []= index, value
37
- raise ArgumentError, "Index #{index} does not exist" if
38
- index > @size and index < @data.size
37
+ raise ArgumentError, "Index #{index} does not exist" if
38
+ index > @size && index < @data.size
39
39
  resize if index >= @data.size
40
40
  @size += 1 if index == @size
41
-
41
+
42
42
  @data = @data.cast(dtype: :object) if value.nil?
43
43
  @data[index] = value
44
- end
45
-
44
+ end
45
+
46
46
  def == other
47
47
  @data[0...@size] == other[0...@size] and @size == other.size
48
48
  end
49
-
49
+
50
50
  def delete_at index
51
51
  arry = @data.to_a
52
52
  arry.delete_at index
53
53
  @data = NMatrix.new [(2*@size-1)], arry, dtype: @nm_dtype
54
54
  @size -= 1
55
55
  end
56
-
56
+
57
57
  def index key
58
58
  @data.to_a.index key
59
59
  end
60
-
60
+
61
61
  def << element
62
62
  resize if @size >= @data.size
63
63
  self[@size] = element
64
64
  end
65
-
65
+
66
66
  def to_a
67
67
  @data[0...@size].to_a
68
68
  end
69
-
69
+
70
70
  def dup
71
71
  NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
72
72
  end
73
73
 
74
- def resize size = @size*2
75
- raise ArgumentError, "Size must be greater than current size" if size < @size
74
+ def resize size=@size*2
75
+ raise ArgumentError, 'Size must be greater than current size' if size < @size
76
76
 
77
77
  @data = NMatrix.new [size], @data.to_a, dtype: @nm_dtype
78
78
  end
@@ -98,4 +98,4 @@ module Daru
98
98
  end
99
99
  end
100
100
  end
101
- end if Daru.has_nmatrix?
101
+ end if Daru.has_nmatrix?
@@ -1,12 +1,11 @@
1
1
  module Daru
2
2
  module Core
3
3
  class GroupBy
4
-
5
4
  attr_reader :groups
6
5
 
7
6
  # Iterate over each group created by group_by. A DataFrame is yielded in
8
7
  # block.
9
- def each_group &block
8
+ def each_group
10
9
  groups.keys.each do |k|
11
10
  yield get_group(k)
12
11
  end
@@ -18,7 +17,14 @@ module Daru
18
17
  @context = context
19
18
  vectors = names.map { |vec| context[vec].to_a }
20
19
  tuples = vectors[0].zip(*vectors[1..-1])
21
- keys = tuples.uniq.sort { |a,b| a && b ? a.compact <=> b.compact : a ? 1 : -1 }
20
+ keys =
21
+ tuples.uniq.sort do |a,b|
22
+ if a && b
23
+ a.compact <=> b.compact
24
+ else
25
+ a ? 1 : -1
26
+ end
27
+ end
22
28
 
23
29
  keys.each do |key|
24
30
  @groups[key] = all_indices_for(tuples, key)
@@ -29,13 +35,13 @@ module Daru
29
35
  # Get a Daru::Vector of the size of each group.
30
36
  def size
31
37
  index =
32
- if multi_indexed_grouping?
33
- Daru::MultiIndex.from_tuples @groups.keys
34
- else
35
- Daru::Index.new @groups.keys.flatten
36
- end
38
+ if multi_indexed_grouping?
39
+ Daru::MultiIndex.from_tuples @groups.keys
40
+ else
41
+ Daru::Index.new @groups.keys.flatten
42
+ end
37
43
 
38
- values = @groups.values.map { |e| e.size }
44
+ values = @groups.values.map(&:size)
39
45
  Daru::Vector.new(values, index: index, name: :size)
40
46
  end
41
47
 
@@ -194,11 +200,57 @@ module Daru
194
200
  indexes.each do |idx|
195
201
  rows << transpose[idx]
196
202
  end
203
+
204
+ new_index =
205
+ begin
206
+ @context.index[indexes]
207
+ rescue IndexError
208
+ indexes
209
+ end
197
210
  Daru::DataFrame.rows(
198
- rows, index: @context.index[indexes], order: @context.vectors)
211
+ rows, index: new_index, order: @context.vectors
212
+ )
213
+ end
214
+
215
+ # Iteratively applies a function to the values in a group and accumulates the result.
216
+ # @param init (nil) The initial value of the accumulator.
217
+ # @param block [Proc] A proc or lambda that accepts two arguments. The first argument
218
+ # is the accumulated result. The second argument is a DataFrame row.
219
+ # @example Usage of reduce
220
+ # df = Daru::DataFrame.new({
221
+ # a: ['a','b'] * 3,
222
+ # b: [1,2,3] * 2,
223
+ # c: 'A'..'F'
224
+ # })
225
+ # df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
226
+ # # =>
227
+ # # #<Daru::Vector:70343147159900 @name = nil @metadata = {} @size = 2 >
228
+ # # nil
229
+ # # a ACE
230
+ # # b BDF
231
+ def reduce(init=nil)
232
+ result_hash = @groups.each_with_object({}) do |(group, indices), h|
233
+ group_indices = indices.map { |v| @context.index.to_a[v] }
234
+
235
+ grouped_result = init
236
+ group_indices.each do |idx|
237
+ grouped_result = yield(grouped_result, @context.row[idx])
238
+ end
239
+
240
+ h[group] = grouped_result
241
+ end
242
+
243
+ index =
244
+ if multi_indexed_grouping?
245
+ Daru::MultiIndex.from_tuples result_hash.keys
246
+ else
247
+ Daru::Index.new result_hash.keys.flatten
248
+ end
249
+
250
+ Daru::Vector.new(result_hash.values, index: index)
199
251
  end
200
252
 
201
- private
253
+ private
202
254
 
203
255
  def select_groups_from method, quantity
204
256
  selection = @context
@@ -219,13 +271,13 @@ module Daru
219
271
  multi_index = multi_indexed_grouping?
220
272
  rows, order = [], []
221
273
 
222
- @groups.each do |group, indexes|
274
+ @groups.each do |_group, indexes|
223
275
  single_row = []
224
276
  @non_group_vectors.each do |ngvector|
225
277
  vec = @context[ngvector]
226
- if method_type == :numeric and vec.type == :numeric
278
+ if method_type == :numeric && vec.type == :numeric
227
279
  slice = vec[*indexes]
228
- single_row << (slice.is_a?(Numeric) ? slice : slice.send(method))
280
+ single_row << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
229
281
  end
230
282
  end
231
283
 
@@ -234,7 +286,7 @@ module Daru
234
286
 
235
287
  @non_group_vectors.each do |ngvec|
236
288
  order << ngvec if
237
- (method_type == :numeric and @context[ngvec].type == :numeric)
289
+ method_type == :numeric && @context[ngvec].type == :numeric
238
290
  end
239
291
 
240
292
  index = @groups.keys
@@ -256,7 +308,8 @@ module Daru
256
308
  end
257
309
 
258
310
  def multi_indexed_grouping?
259
- @groups.keys[0][1] ? true : false
311
+ return false unless @groups.keys[0]
312
+ @groups.keys[0].size > 1 ? true : false
260
313
  end
261
314
  end
262
315
  end