daru 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop.yml +99 -0
  4. data/.rubocop_todo.yml +44 -0
  5. data/.travis.yml +3 -1
  6. data/CONTRIBUTING.md +5 -1
  7. data/History.md +43 -0
  8. data/README.md +3 -4
  9. data/benchmarks/duplicating.rb +45 -0
  10. data/benchmarks/group_by.rb +7 -7
  11. data/benchmarks/joining.rb +52 -0
  12. data/benchmarks/sorting.rb +9 -2
  13. data/benchmarks/statistics.rb +39 -0
  14. data/daru.gemspec +4 -4
  15. data/lib/daru.rb +9 -9
  16. data/lib/daru/accessors/array_wrapper.rb +15 -11
  17. data/lib/daru/accessors/dataframe_by_row.rb +1 -1
  18. data/lib/daru/accessors/gsl_wrapper.rb +30 -19
  19. data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
  20. data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
  21. data/lib/daru/core/group_by.rb +69 -16
  22. data/lib/daru/core/merge.rb +135 -151
  23. data/lib/daru/core/query.rb +9 -30
  24. data/lib/daru/dataframe.rb +476 -439
  25. data/lib/daru/date_time/index.rb +150 -137
  26. data/lib/daru/date_time/offsets.rb +45 -41
  27. data/lib/daru/extensions/rserve.rb +4 -4
  28. data/lib/daru/index.rb +88 -64
  29. data/lib/daru/io/io.rb +33 -34
  30. data/lib/daru/io/sql_data_source.rb +11 -11
  31. data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
  32. data/lib/daru/maths/arithmetic/vector.rb +9 -14
  33. data/lib/daru/maths/statistics/dataframe.rb +89 -61
  34. data/lib/daru/maths/statistics/vector.rb +226 -97
  35. data/lib/daru/monkeys.rb +23 -30
  36. data/lib/daru/plotting/dataframe.rb +27 -28
  37. data/lib/daru/plotting/vector.rb +12 -13
  38. data/lib/daru/vector.rb +221 -330
  39. data/lib/daru/version.rb +2 -2
  40. data/spec/core/group_by_spec.rb +16 -0
  41. data/spec/core/merge_spec.rb +30 -14
  42. data/spec/dataframe_spec.rb +268 -14
  43. data/spec/index_spec.rb +23 -5
  44. data/spec/io/io_spec.rb +37 -16
  45. data/spec/math/statistics/dataframe_spec.rb +40 -8
  46. data/spec/math/statistics/vector_spec.rb +135 -10
  47. data/spec/monkeys_spec.rb +3 -3
  48. data/spec/vector_spec.rb +157 -25
  49. metadata +41 -21
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
31
31
  *************************************************************************
32
32
  Thank you for installing daru!
33
33
 
34
- oOOOOOo
34
+ oOOOOOo
35
35
  ,| oO
36
36
  //| |
37
37
  \\\\| |
@@ -39,8 +39,8 @@ Thank you for installing daru!
39
39
  `-----`
40
40
 
41
41
 
42
- Hope you love daru! For enhanced interactivity and better visualizations,
43
- consider using gnuplotrb and nyaplot with iruby. For statistics use the
42
+ Hope you love daru! For enhanced interactivity and better visualizations,
43
+ consider using gnuplotrb and nyaplot with iruby. For statistics use the
44
44
  statsample family.
45
45
 
46
46
  Read the README for interesting use cases and examples.
@@ -63,9 +63,9 @@ EOF
63
63
  spec.add_development_dependency 'nmatrix', '~> 0.2.1'
64
64
  spec.add_development_dependency 'distribution', '~> 0.7'
65
65
  spec.add_development_dependency 'rb-gsl', '~>1.16'
66
- spec.add_development_dependency 'bloomfilter-rb', '~> 2.1'
67
66
  spec.add_development_dependency 'dbd-sqlite3'
68
67
  spec.add_development_dependency 'dbi'
69
68
  spec.add_development_dependency 'activerecord', '~> 4.0'
70
69
  spec.add_development_dependency 'sqlite3'
70
+ spec.add_development_dependency 'rubocop', '>= 0.40.0'
71
71
  end
@@ -11,7 +11,7 @@ module Daru
11
11
  'THU' => 4,
12
12
  'FRI' => 5,
13
13
  'SAT' => 6
14
- }
14
+ }.freeze
15
15
 
16
16
  MONTH_DAYS = {
17
17
  1 => 31,
@@ -26,19 +26,20 @@ module Daru
26
26
  10 => 31,
27
27
  11 => 30,
28
28
  12 => 31
29
- }
30
-
31
- SPLIT_TOKEN = ','
29
+ }.freeze
30
+
31
+ @lazy_update = false
32
+
33
+ SPLIT_TOKEN = ','.freeze
34
+
32
35
  class << self
33
- @@lazy_update = false
34
-
35
36
  # A variable which will set whether Vector metadata is updated immediately or lazily.
36
37
  # Call the #update method every time a values are set or removed in order to update
37
38
  # metadata like positions of missing values.
38
39
  attr_accessor :lazy_update
39
-
40
+
40
41
  def create_has_library(library)
41
- lib_underscore = library.to_s.gsub(/-/, '_')
42
+ lib_underscore = library.to_s.tr('-', '_')
42
43
  define_singleton_method("has_#{lib_underscore}?") do
43
44
  cv = "@@#{lib_underscore}"
44
45
  unless class_variable_defined? cv
@@ -58,7 +59,6 @@ module Daru
58
59
  create_has_library :gsl
59
60
  create_has_library :nmatrix
60
61
  create_has_library :nyaplot
61
- create_has_library :'bloomfilter-rb'
62
62
  end
63
63
 
64
64
  autoload :Spreadsheet, 'spreadsheet'
@@ -6,7 +6,7 @@ module Daru
6
6
  extend Forwardable
7
7
 
8
8
  def_delegators :@data, :slice!
9
-
9
+
10
10
  def each(&block)
11
11
  @data.each(&block)
12
12
  self
@@ -66,34 +66,38 @@ module Daru
66
66
  ArrayWrapper.new @data.dup, @context
67
67
  end
68
68
 
69
+ def compact
70
+ @data - @context.missing_values
71
+ end
72
+
69
73
  def mean
70
- sum.quo(@size - @context.missing_positions.size).to_f
74
+ values_to_sum = compact
75
+ return nil if values_to_sum.empty?
76
+ sum = values_to_sum.inject :+
77
+ sum.quo(values_to_sum.size).to_f
71
78
  end
72
79
 
73
80
  def product
74
- @data.inject(1) { |m,e| m*e unless e.nil? }
81
+ compact.inject :*
75
82
  end
76
83
 
77
84
  def max
78
- @data.max
85
+ compact.max
79
86
  end
80
87
 
81
88
  def min
82
- @data.min
89
+ compact.min
83
90
  end
84
91
 
85
92
  def sum
86
- @data.inject(0) do |memo ,e|
87
- memo += e unless e.nil? #TODO: Remove this conditional somehow!
88
- memo
89
- end
93
+ compact.inject :+
90
94
  end
91
95
 
92
- private
96
+ private
93
97
 
94
98
  def set_size
95
99
  @size = @data.size
96
100
  end
97
101
  end
98
102
  end
99
- end
103
+ end
@@ -14,4 +14,4 @@ module Daru
14
14
  end
15
15
  end
16
16
  end
17
- end
17
+ end
@@ -3,33 +3,33 @@ module Daru
3
3
  module GSLStatistics
4
4
  def vector_standardized_compute(m,sd)
5
5
  Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
6
- index: @context.index, name: @context.name
6
+ index: @context.index, name: @context.name
7
7
  end
8
-
8
+
9
9
  def vector_centered_compute(m)
10
- Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
11
- index: @context.index, name: @context.name
10
+ Daru::Vector.new @data.collect { |x| (x.to_f - m) }, dtype: :gsl,
11
+ index: @context.index, name: @context.name
12
12
  end
13
13
 
14
14
  def sample_with_replacement(sample=1)
15
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
16
- Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
17
- index: @context.index, name: @context.name)
15
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
16
+ Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
17
+ index: @context.index, name: @context.name)
18
18
  end
19
-
19
+
20
20
  def sample_without_replacement(sample=1)
21
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
21
+ r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
22
22
  r.choose(@data, sample).to_a
23
23
  end
24
24
 
25
25
  def median
26
- GSL::Stats::median_from_sorted_data(@data.sort)
26
+ GSL::Stats.median_from_sorted_data(@data.sort)
27
27
  end
28
-
28
+
29
29
  def variance_sample(m)
30
- @data.variance_m
30
+ @data.variance(m)
31
31
  end
32
-
32
+
33
33
  def standard_deviation_sample(m)
34
34
  @data.sd(m)
35
35
  end
@@ -56,16 +56,27 @@ module Daru
56
56
  extend Forwardable
57
57
  include Daru::Accessors::GSLStatistics
58
58
 
59
- def_delegators :@data, :[], :size, :to_a, :each, :mean,
60
- :sum, :prod, :max, :min
61
-
62
- alias :product :prod
59
+ def_delegators :@data, :[], :size, :to_a, :each
63
60
 
64
61
  attr_reader :data
65
62
 
63
+ def compact
64
+ # set missing to [] incase @context is not Daru::Vector
65
+ missing = @context.missing_values rescue []
66
+ ::GSL::Vector.alloc(@data.to_a - missing.map(&:to_f))
67
+ end
68
+
69
+ [:mean, :min, :max, :prod, :sum].each do |method|
70
+ define_method(method) do
71
+ compact.send(method.to_sym) rescue nil
72
+ end
73
+ end
74
+
75
+ alias :product :prod
76
+
66
77
  def each(&block)
67
78
  @data.each(&block)
68
- self
79
+ self
69
80
  end
70
81
 
71
82
  def map!(&block)
@@ -110,4 +121,4 @@ module Daru
110
121
  end
111
122
  end
112
123
  end
113
- end if Daru.has_gsl?
124
+ end if Daru.has_gsl?
@@ -1,9 +1,7 @@
1
1
  module Daru
2
2
  module Accessors
3
-
4
3
  # Internal class for wrapping MDArray
5
4
  class MDArrayWrapper
6
-
7
5
  end
8
6
  end
9
- end
7
+ end
@@ -19,7 +19,7 @@ module Daru
19
19
  end
20
20
 
21
21
  attr_reader :size, :data, :nm_dtype
22
-
22
+
23
23
  def initialize vector, context, nm_dtype=:int32
24
24
  @size = vector.size
25
25
  @data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
@@ -32,47 +32,47 @@ module Daru
32
32
  return @data[*index] if index[0] < @size
33
33
  nil
34
34
  end
35
-
35
+
36
36
  def []= index, value
37
- raise ArgumentError, "Index #{index} does not exist" if
38
- index > @size and index < @data.size
37
+ raise ArgumentError, "Index #{index} does not exist" if
38
+ index > @size && index < @data.size
39
39
  resize if index >= @data.size
40
40
  @size += 1 if index == @size
41
-
41
+
42
42
  @data = @data.cast(dtype: :object) if value.nil?
43
43
  @data[index] = value
44
- end
45
-
44
+ end
45
+
46
46
  def == other
47
47
  @data[0...@size] == other[0...@size] and @size == other.size
48
48
  end
49
-
49
+
50
50
  def delete_at index
51
51
  arry = @data.to_a
52
52
  arry.delete_at index
53
53
  @data = NMatrix.new [(2*@size-1)], arry, dtype: @nm_dtype
54
54
  @size -= 1
55
55
  end
56
-
56
+
57
57
  def index key
58
58
  @data.to_a.index key
59
59
  end
60
-
60
+
61
61
  def << element
62
62
  resize if @size >= @data.size
63
63
  self[@size] = element
64
64
  end
65
-
65
+
66
66
  def to_a
67
67
  @data[0...@size].to_a
68
68
  end
69
-
69
+
70
70
  def dup
71
71
  NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
72
72
  end
73
73
 
74
- def resize size = @size*2
75
- raise ArgumentError, "Size must be greater than current size" if size < @size
74
+ def resize size=@size*2
75
+ raise ArgumentError, 'Size must be greater than current size' if size < @size
76
76
 
77
77
  @data = NMatrix.new [size], @data.to_a, dtype: @nm_dtype
78
78
  end
@@ -98,4 +98,4 @@ module Daru
98
98
  end
99
99
  end
100
100
  end
101
- end if Daru.has_nmatrix?
101
+ end if Daru.has_nmatrix?
@@ -1,12 +1,11 @@
1
1
  module Daru
2
2
  module Core
3
3
  class GroupBy
4
-
5
4
  attr_reader :groups
6
5
 
7
6
  # Iterate over each group created by group_by. A DataFrame is yielded in
8
7
  # block.
9
- def each_group &block
8
+ def each_group
10
9
  groups.keys.each do |k|
11
10
  yield get_group(k)
12
11
  end
@@ -18,7 +17,14 @@ module Daru
18
17
  @context = context
19
18
  vectors = names.map { |vec| context[vec].to_a }
20
19
  tuples = vectors[0].zip(*vectors[1..-1])
21
- keys = tuples.uniq.sort { |a,b| a && b ? a.compact <=> b.compact : a ? 1 : -1 }
20
+ keys =
21
+ tuples.uniq.sort do |a,b|
22
+ if a && b
23
+ a.compact <=> b.compact
24
+ else
25
+ a ? 1 : -1
26
+ end
27
+ end
22
28
 
23
29
  keys.each do |key|
24
30
  @groups[key] = all_indices_for(tuples, key)
@@ -29,13 +35,13 @@ module Daru
29
35
  # Get a Daru::Vector of the size of each group.
30
36
  def size
31
37
  index =
32
- if multi_indexed_grouping?
33
- Daru::MultiIndex.from_tuples @groups.keys
34
- else
35
- Daru::Index.new @groups.keys.flatten
36
- end
38
+ if multi_indexed_grouping?
39
+ Daru::MultiIndex.from_tuples @groups.keys
40
+ else
41
+ Daru::Index.new @groups.keys.flatten
42
+ end
37
43
 
38
- values = @groups.values.map { |e| e.size }
44
+ values = @groups.values.map(&:size)
39
45
  Daru::Vector.new(values, index: index, name: :size)
40
46
  end
41
47
 
@@ -194,11 +200,57 @@ module Daru
194
200
  indexes.each do |idx|
195
201
  rows << transpose[idx]
196
202
  end
203
+
204
+ new_index =
205
+ begin
206
+ @context.index[indexes]
207
+ rescue IndexError
208
+ indexes
209
+ end
197
210
  Daru::DataFrame.rows(
198
- rows, index: @context.index[indexes], order: @context.vectors)
211
+ rows, index: new_index, order: @context.vectors
212
+ )
213
+ end
214
+
215
+ # Iteratively applies a function to the values in a group and accumulates the result.
216
+ # @param init (nil) The initial value of the accumulator.
217
+ # @param block [Proc] A proc or lambda that accepts two arguments. The first argument
218
+ # is the accumulated result. The second argument is a DataFrame row.
219
+ # @example Usage of reduce
220
+ # df = Daru::DataFrame.new({
221
+ # a: ['a','b'] * 3,
222
+ # b: [1,2,3] * 2,
223
+ # c: 'A'..'F'
224
+ # })
225
+ # df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
226
+ # # =>
227
+ # # #<Daru::Vector:70343147159900 @name = nil @metadata = {} @size = 2 >
228
+ # # nil
229
+ # # a ACE
230
+ # # b BDF
231
+ def reduce(init=nil)
232
+ result_hash = @groups.each_with_object({}) do |(group, indices), h|
233
+ group_indices = indices.map { |v| @context.index.to_a[v] }
234
+
235
+ grouped_result = init
236
+ group_indices.each do |idx|
237
+ grouped_result = yield(grouped_result, @context.row[idx])
238
+ end
239
+
240
+ h[group] = grouped_result
241
+ end
242
+
243
+ index =
244
+ if multi_indexed_grouping?
245
+ Daru::MultiIndex.from_tuples result_hash.keys
246
+ else
247
+ Daru::Index.new result_hash.keys.flatten
248
+ end
249
+
250
+ Daru::Vector.new(result_hash.values, index: index)
199
251
  end
200
252
 
201
- private
253
+ private
202
254
 
203
255
  def select_groups_from method, quantity
204
256
  selection = @context
@@ -219,13 +271,13 @@ module Daru
219
271
  multi_index = multi_indexed_grouping?
220
272
  rows, order = [], []
221
273
 
222
- @groups.each do |group, indexes|
274
+ @groups.each do |_group, indexes|
223
275
  single_row = []
224
276
  @non_group_vectors.each do |ngvector|
225
277
  vec = @context[ngvector]
226
- if method_type == :numeric and vec.type == :numeric
278
+ if method_type == :numeric && vec.type == :numeric
227
279
  slice = vec[*indexes]
228
- single_row << (slice.is_a?(Numeric) ? slice : slice.send(method))
280
+ single_row << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
229
281
  end
230
282
  end
231
283
 
@@ -234,7 +286,7 @@ module Daru
234
286
 
235
287
  @non_group_vectors.each do |ngvec|
236
288
  order << ngvec if
237
- (method_type == :numeric and @context[ngvec].type == :numeric)
289
+ method_type == :numeric && @context[ngvec].type == :numeric
238
290
  end
239
291
 
240
292
  index = @groups.keys
@@ -256,7 +308,8 @@ module Daru
256
308
  end
257
309
 
258
310
  def multi_indexed_grouping?
259
- @groups.keys[0][1] ? true : false
311
+ return false unless @groups.keys[0]
312
+ @groups.keys[0].size > 1 ? true : false
260
313
  end
261
314
  end
262
315
  end