daru 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +99 -0
- data/.rubocop_todo.yml +44 -0
- data/.travis.yml +3 -1
- data/CONTRIBUTING.md +5 -1
- data/History.md +43 -0
- data/README.md +3 -4
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +7 -7
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/sorting.rb +9 -2
- data/benchmarks/statistics.rb +39 -0
- data/daru.gemspec +4 -4
- data/lib/daru.rb +9 -9
- data/lib/daru/accessors/array_wrapper.rb +15 -11
- data/lib/daru/accessors/dataframe_by_row.rb +1 -1
- data/lib/daru/accessors/gsl_wrapper.rb +30 -19
- data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
- data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
- data/lib/daru/core/group_by.rb +69 -16
- data/lib/daru/core/merge.rb +135 -151
- data/lib/daru/core/query.rb +9 -30
- data/lib/daru/dataframe.rb +476 -439
- data/lib/daru/date_time/index.rb +150 -137
- data/lib/daru/date_time/offsets.rb +45 -41
- data/lib/daru/extensions/rserve.rb +4 -4
- data/lib/daru/index.rb +88 -64
- data/lib/daru/io/io.rb +33 -34
- data/lib/daru/io/sql_data_source.rb +11 -11
- data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
- data/lib/daru/maths/arithmetic/vector.rb +9 -14
- data/lib/daru/maths/statistics/dataframe.rb +89 -61
- data/lib/daru/maths/statistics/vector.rb +226 -97
- data/lib/daru/monkeys.rb +23 -30
- data/lib/daru/plotting/dataframe.rb +27 -28
- data/lib/daru/plotting/vector.rb +12 -13
- data/lib/daru/vector.rb +221 -330
- data/lib/daru/version.rb +2 -2
- data/spec/core/group_by_spec.rb +16 -0
- data/spec/core/merge_spec.rb +30 -14
- data/spec/dataframe_spec.rb +268 -14
- data/spec/index_spec.rb +23 -5
- data/spec/io/io_spec.rb +37 -16
- data/spec/math/statistics/dataframe_spec.rb +40 -8
- data/spec/math/statistics/vector_spec.rb +135 -10
- data/spec/monkeys_spec.rb +3 -3
- data/spec/vector_spec.rb +157 -25
- metadata +41 -21
data/daru.gemspec
CHANGED
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
|
|
31
31
|
*************************************************************************
|
32
32
|
Thank you for installing daru!
|
33
33
|
|
34
|
-
oOOOOOo
|
34
|
+
oOOOOOo
|
35
35
|
,| oO
|
36
36
|
//| |
|
37
37
|
\\\\| |
|
@@ -39,8 +39,8 @@ Thank you for installing daru!
|
|
39
39
|
`-----`
|
40
40
|
|
41
41
|
|
42
|
-
Hope you love daru! For enhanced interactivity and better visualizations,
|
43
|
-
consider using gnuplotrb and nyaplot with iruby. For statistics use the
|
42
|
+
Hope you love daru! For enhanced interactivity and better visualizations,
|
43
|
+
consider using gnuplotrb and nyaplot with iruby. For statistics use the
|
44
44
|
statsample family.
|
45
45
|
|
46
46
|
Read the README for interesting use cases and examples.
|
@@ -63,9 +63,9 @@ EOF
|
|
63
63
|
spec.add_development_dependency 'nmatrix', '~> 0.2.1'
|
64
64
|
spec.add_development_dependency 'distribution', '~> 0.7'
|
65
65
|
spec.add_development_dependency 'rb-gsl', '~>1.16'
|
66
|
-
spec.add_development_dependency 'bloomfilter-rb', '~> 2.1'
|
67
66
|
spec.add_development_dependency 'dbd-sqlite3'
|
68
67
|
spec.add_development_dependency 'dbi'
|
69
68
|
spec.add_development_dependency 'activerecord', '~> 4.0'
|
70
69
|
spec.add_development_dependency 'sqlite3'
|
70
|
+
spec.add_development_dependency 'rubocop', '>= 0.40.0'
|
71
71
|
end
|
data/lib/daru.rb
CHANGED
@@ -11,7 +11,7 @@ module Daru
|
|
11
11
|
'THU' => 4,
|
12
12
|
'FRI' => 5,
|
13
13
|
'SAT' => 6
|
14
|
-
}
|
14
|
+
}.freeze
|
15
15
|
|
16
16
|
MONTH_DAYS = {
|
17
17
|
1 => 31,
|
@@ -26,19 +26,20 @@ module Daru
|
|
26
26
|
10 => 31,
|
27
27
|
11 => 30,
|
28
28
|
12 => 31
|
29
|
-
}
|
30
|
-
|
31
|
-
|
29
|
+
}.freeze
|
30
|
+
|
31
|
+
@lazy_update = false
|
32
|
+
|
33
|
+
SPLIT_TOKEN = ','.freeze
|
34
|
+
|
32
35
|
class << self
|
33
|
-
@@lazy_update = false
|
34
|
-
|
35
36
|
# A variable which will set whether Vector metadata is updated immediately or lazily.
|
36
37
|
# Call the #update method every time values are set or removed in order to update
|
37
38
|
# metadata like positions of missing values.
|
38
39
|
attr_accessor :lazy_update
|
39
|
-
|
40
|
+
|
40
41
|
def create_has_library(library)
|
41
|
-
lib_underscore = library.to_s.
|
42
|
+
lib_underscore = library.to_s.tr('-', '_')
|
42
43
|
define_singleton_method("has_#{lib_underscore}?") do
|
43
44
|
cv = "@@#{lib_underscore}"
|
44
45
|
unless class_variable_defined? cv
|
@@ -58,7 +59,6 @@ module Daru
|
|
58
59
|
create_has_library :gsl
|
59
60
|
create_has_library :nmatrix
|
60
61
|
create_has_library :nyaplot
|
61
|
-
create_has_library :'bloomfilter-rb'
|
62
62
|
end
|
63
63
|
|
64
64
|
autoload :Spreadsheet, 'spreadsheet'
|
@@ -6,7 +6,7 @@ module Daru
|
|
6
6
|
extend Forwardable
|
7
7
|
|
8
8
|
def_delegators :@data, :slice!
|
9
|
-
|
9
|
+
|
10
10
|
def each(&block)
|
11
11
|
@data.each(&block)
|
12
12
|
self
|
@@ -66,34 +66,38 @@ module Daru
|
|
66
66
|
ArrayWrapper.new @data.dup, @context
|
67
67
|
end
|
68
68
|
|
69
|
+
def compact
|
70
|
+
@data - @context.missing_values
|
71
|
+
end
|
72
|
+
|
69
73
|
def mean
|
70
|
-
|
74
|
+
values_to_sum = compact
|
75
|
+
return nil if values_to_sum.empty?
|
76
|
+
sum = values_to_sum.inject :+
|
77
|
+
sum.quo(values_to_sum.size).to_f
|
71
78
|
end
|
72
79
|
|
73
80
|
def product
|
74
|
-
|
81
|
+
compact.inject :*
|
75
82
|
end
|
76
83
|
|
77
84
|
def max
|
78
|
-
|
85
|
+
compact.max
|
79
86
|
end
|
80
87
|
|
81
88
|
def min
|
82
|
-
|
89
|
+
compact.min
|
83
90
|
end
|
84
91
|
|
85
92
|
def sum
|
86
|
-
|
87
|
-
memo += e unless e.nil? #TODO: Remove this conditional somehow!
|
88
|
-
memo
|
89
|
-
end
|
93
|
+
compact.inject :+
|
90
94
|
end
|
91
95
|
|
92
|
-
|
96
|
+
private
|
93
97
|
|
94
98
|
def set_size
|
95
99
|
@size = @data.size
|
96
100
|
end
|
97
101
|
end
|
98
102
|
end
|
99
|
-
end
|
103
|
+
end
|
@@ -3,33 +3,33 @@ module Daru
|
|
3
3
|
module GSLStatistics
|
4
4
|
def vector_standardized_compute(m,sd)
|
5
5
|
Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
|
6
|
-
|
6
|
+
index: @context.index, name: @context.name
|
7
7
|
end
|
8
|
-
|
8
|
+
|
9
9
|
def vector_centered_compute(m)
|
10
|
-
Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
|
11
|
-
|
10
|
+
Daru::Vector.new @data.collect { |x| (x.to_f - m) }, dtype: :gsl,
|
11
|
+
index: @context.index, name: @context.name
|
12
12
|
end
|
13
13
|
|
14
14
|
def sample_with_replacement(sample=1)
|
15
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(
|
16
|
-
Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
|
17
|
-
|
15
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
|
16
|
+
Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
|
17
|
+
index: @context.index, name: @context.name)
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
def sample_without_replacement(sample=1)
|
21
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(
|
21
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
|
22
22
|
r.choose(@data, sample).to_a
|
23
23
|
end
|
24
24
|
|
25
25
|
def median
|
26
|
-
GSL::Stats
|
26
|
+
GSL::Stats.median_from_sorted_data(@data.sort)
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
def variance_sample(m)
|
30
|
-
@data.
|
30
|
+
@data.variance(m)
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
def standard_deviation_sample(m)
|
34
34
|
@data.sd(m)
|
35
35
|
end
|
@@ -56,16 +56,27 @@ module Daru
|
|
56
56
|
extend Forwardable
|
57
57
|
include Daru::Accessors::GSLStatistics
|
58
58
|
|
59
|
-
def_delegators :@data, :[], :size, :to_a, :each
|
60
|
-
:sum, :prod, :max, :min
|
61
|
-
|
62
|
-
alias :product :prod
|
59
|
+
def_delegators :@data, :[], :size, :to_a, :each
|
63
60
|
|
64
61
|
attr_reader :data
|
65
62
|
|
63
|
+
def compact
|
64
|
+
# set missing to [] in case @context is not a Daru::Vector
|
65
|
+
missing = @context.missing_values rescue []
|
66
|
+
::GSL::Vector.alloc(@data.to_a - missing.map(&:to_f))
|
67
|
+
end
|
68
|
+
|
69
|
+
[:mean, :min, :max, :prod, :sum].each do |method|
|
70
|
+
define_method(method) do
|
71
|
+
compact.send(method.to_sym) rescue nil
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
alias :product :prod
|
76
|
+
|
66
77
|
def each(&block)
|
67
78
|
@data.each(&block)
|
68
|
-
self
|
79
|
+
self
|
69
80
|
end
|
70
81
|
|
71
82
|
def map!(&block)
|
@@ -110,4 +121,4 @@ module Daru
|
|
110
121
|
end
|
111
122
|
end
|
112
123
|
end
|
113
|
-
end if Daru.has_gsl?
|
124
|
+
end if Daru.has_gsl?
|
@@ -19,7 +19,7 @@ module Daru
|
|
19
19
|
end
|
20
20
|
|
21
21
|
attr_reader :size, :data, :nm_dtype
|
22
|
-
|
22
|
+
|
23
23
|
def initialize vector, context, nm_dtype=:int32
|
24
24
|
@size = vector.size
|
25
25
|
@data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
|
@@ -32,47 +32,47 @@ module Daru
|
|
32
32
|
return @data[*index] if index[0] < @size
|
33
33
|
nil
|
34
34
|
end
|
35
|
-
|
35
|
+
|
36
36
|
def []= index, value
|
37
|
-
raise ArgumentError, "Index #{index} does not exist" if
|
38
|
-
index > @size
|
37
|
+
raise ArgumentError, "Index #{index} does not exist" if
|
38
|
+
index > @size && index < @data.size
|
39
39
|
resize if index >= @data.size
|
40
40
|
@size += 1 if index == @size
|
41
|
-
|
41
|
+
|
42
42
|
@data = @data.cast(dtype: :object) if value.nil?
|
43
43
|
@data[index] = value
|
44
|
-
end
|
45
|
-
|
44
|
+
end
|
45
|
+
|
46
46
|
def == other
|
47
47
|
@data[0...@size] == other[0...@size] and @size == other.size
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
def delete_at index
|
51
51
|
arry = @data.to_a
|
52
52
|
arry.delete_at index
|
53
53
|
@data = NMatrix.new [(2*@size-1)], arry, dtype: @nm_dtype
|
54
54
|
@size -= 1
|
55
55
|
end
|
56
|
-
|
56
|
+
|
57
57
|
def index key
|
58
58
|
@data.to_a.index key
|
59
59
|
end
|
60
|
-
|
60
|
+
|
61
61
|
def << element
|
62
62
|
resize if @size >= @data.size
|
63
63
|
self[@size] = element
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
def to_a
|
67
67
|
@data[0...@size].to_a
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
def dup
|
71
71
|
NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
|
72
72
|
end
|
73
73
|
|
74
|
-
def resize size
|
75
|
-
raise ArgumentError,
|
74
|
+
def resize size=@size*2
|
75
|
+
raise ArgumentError, 'Size must be greater than current size' if size < @size
|
76
76
|
|
77
77
|
@data = NMatrix.new [size], @data.to_a, dtype: @nm_dtype
|
78
78
|
end
|
@@ -98,4 +98,4 @@ module Daru
|
|
98
98
|
end
|
99
99
|
end
|
100
100
|
end
|
101
|
-
end if Daru.has_nmatrix?
|
101
|
+
end if Daru.has_nmatrix?
|
data/lib/daru/core/group_by.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
module Daru
|
2
2
|
module Core
|
3
3
|
class GroupBy
|
4
|
-
|
5
4
|
attr_reader :groups
|
6
5
|
|
7
6
|
# Iterate over each group created by group_by. A DataFrame is yielded to
|
8
7
|
# the block.
|
9
|
-
def each_group
|
8
|
+
def each_group
|
10
9
|
groups.keys.each do |k|
|
11
10
|
yield get_group(k)
|
12
11
|
end
|
@@ -18,7 +17,14 @@ module Daru
|
|
18
17
|
@context = context
|
19
18
|
vectors = names.map { |vec| context[vec].to_a }
|
20
19
|
tuples = vectors[0].zip(*vectors[1..-1])
|
21
|
-
keys =
|
20
|
+
keys =
|
21
|
+
tuples.uniq.sort do |a,b|
|
22
|
+
if a && b
|
23
|
+
a.compact <=> b.compact
|
24
|
+
else
|
25
|
+
a ? 1 : -1
|
26
|
+
end
|
27
|
+
end
|
22
28
|
|
23
29
|
keys.each do |key|
|
24
30
|
@groups[key] = all_indices_for(tuples, key)
|
@@ -29,13 +35,13 @@ module Daru
|
|
29
35
|
# Get a Daru::Vector of the size of each group.
|
30
36
|
def size
|
31
37
|
index =
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
38
|
+
if multi_indexed_grouping?
|
39
|
+
Daru::MultiIndex.from_tuples @groups.keys
|
40
|
+
else
|
41
|
+
Daru::Index.new @groups.keys.flatten
|
42
|
+
end
|
37
43
|
|
38
|
-
values = @groups.values.map
|
44
|
+
values = @groups.values.map(&:size)
|
39
45
|
Daru::Vector.new(values, index: index, name: :size)
|
40
46
|
end
|
41
47
|
|
@@ -194,11 +200,57 @@ module Daru
|
|
194
200
|
indexes.each do |idx|
|
195
201
|
rows << transpose[idx]
|
196
202
|
end
|
203
|
+
|
204
|
+
new_index =
|
205
|
+
begin
|
206
|
+
@context.index[indexes]
|
207
|
+
rescue IndexError
|
208
|
+
indexes
|
209
|
+
end
|
197
210
|
Daru::DataFrame.rows(
|
198
|
-
rows, index:
|
211
|
+
rows, index: new_index, order: @context.vectors
|
212
|
+
)
|
213
|
+
end
|
214
|
+
|
215
|
+
# Iteratively applies a function to the values in a group and accumulates the result.
|
216
|
+
# @param init (nil) The initial value of the accumulator.
|
217
|
+
# @param block [Proc] A proc or lambda that accepts two arguments. The first argument
|
218
|
+
# is the accumulated result. The second argument is a DataFrame row.
|
219
|
+
# @example Usage of reduce
|
220
|
+
# df = Daru::DataFrame.new({
|
221
|
+
# a: ['a','b'] * 3,
|
222
|
+
# b: [1,2,3] * 2,
|
223
|
+
# c: 'A'..'F'
|
224
|
+
# })
|
225
|
+
# df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
|
226
|
+
# # =>
|
227
|
+
# # #<Daru::Vector:70343147159900 @name = nil @metadata = {} @size = 2 >
|
228
|
+
# # nil
|
229
|
+
# # a ACE
|
230
|
+
# # b BDF
|
231
|
+
def reduce(init=nil)
|
232
|
+
result_hash = @groups.each_with_object({}) do |(group, indices), h|
|
233
|
+
group_indices = indices.map { |v| @context.index.to_a[v] }
|
234
|
+
|
235
|
+
grouped_result = init
|
236
|
+
group_indices.each do |idx|
|
237
|
+
grouped_result = yield(grouped_result, @context.row[idx])
|
238
|
+
end
|
239
|
+
|
240
|
+
h[group] = grouped_result
|
241
|
+
end
|
242
|
+
|
243
|
+
index =
|
244
|
+
if multi_indexed_grouping?
|
245
|
+
Daru::MultiIndex.from_tuples result_hash.keys
|
246
|
+
else
|
247
|
+
Daru::Index.new result_hash.keys.flatten
|
248
|
+
end
|
249
|
+
|
250
|
+
Daru::Vector.new(result_hash.values, index: index)
|
199
251
|
end
|
200
252
|
|
201
|
-
|
253
|
+
private
|
202
254
|
|
203
255
|
def select_groups_from method, quantity
|
204
256
|
selection = @context
|
@@ -219,13 +271,13 @@ module Daru
|
|
219
271
|
multi_index = multi_indexed_grouping?
|
220
272
|
rows, order = [], []
|
221
273
|
|
222
|
-
@groups.each do |
|
274
|
+
@groups.each do |_group, indexes|
|
223
275
|
single_row = []
|
224
276
|
@non_group_vectors.each do |ngvector|
|
225
277
|
vec = @context[ngvector]
|
226
|
-
if method_type == :numeric
|
278
|
+
if method_type == :numeric && vec.type == :numeric
|
227
279
|
slice = vec[*indexes]
|
228
|
-
single_row << (slice.is_a?(
|
280
|
+
single_row << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
|
229
281
|
end
|
230
282
|
end
|
231
283
|
|
@@ -234,7 +286,7 @@ module Daru
|
|
234
286
|
|
235
287
|
@non_group_vectors.each do |ngvec|
|
236
288
|
order << ngvec if
|
237
|
-
|
289
|
+
method_type == :numeric && @context[ngvec].type == :numeric
|
238
290
|
end
|
239
291
|
|
240
292
|
index = @groups.keys
|
@@ -256,7 +308,8 @@ module Daru
|
|
256
308
|
end
|
257
309
|
|
258
310
|
def multi_indexed_grouping?
|
259
|
-
@groups.keys[0]
|
311
|
+
return false unless @groups.keys[0]
|
312
|
+
@groups.keys[0].size > 1 ? true : false
|
260
313
|
end
|
261
314
|
end
|
262
315
|
end
|