daru 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.rubocop.yml +99 -0
- data/.rubocop_todo.yml +44 -0
- data/.travis.yml +3 -1
- data/CONTRIBUTING.md +5 -1
- data/History.md +43 -0
- data/README.md +3 -4
- data/benchmarks/duplicating.rb +45 -0
- data/benchmarks/group_by.rb +7 -7
- data/benchmarks/joining.rb +52 -0
- data/benchmarks/sorting.rb +9 -2
- data/benchmarks/statistics.rb +39 -0
- data/daru.gemspec +4 -4
- data/lib/daru.rb +9 -9
- data/lib/daru/accessors/array_wrapper.rb +15 -11
- data/lib/daru/accessors/dataframe_by_row.rb +1 -1
- data/lib/daru/accessors/gsl_wrapper.rb +30 -19
- data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
- data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
- data/lib/daru/core/group_by.rb +69 -16
- data/lib/daru/core/merge.rb +135 -151
- data/lib/daru/core/query.rb +9 -30
- data/lib/daru/dataframe.rb +476 -439
- data/lib/daru/date_time/index.rb +150 -137
- data/lib/daru/date_time/offsets.rb +45 -41
- data/lib/daru/extensions/rserve.rb +4 -4
- data/lib/daru/index.rb +88 -64
- data/lib/daru/io/io.rb +33 -34
- data/lib/daru/io/sql_data_source.rb +11 -11
- data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
- data/lib/daru/maths/arithmetic/vector.rb +9 -14
- data/lib/daru/maths/statistics/dataframe.rb +89 -61
- data/lib/daru/maths/statistics/vector.rb +226 -97
- data/lib/daru/monkeys.rb +23 -30
- data/lib/daru/plotting/dataframe.rb +27 -28
- data/lib/daru/plotting/vector.rb +12 -13
- data/lib/daru/vector.rb +221 -330
- data/lib/daru/version.rb +2 -2
- data/spec/core/group_by_spec.rb +16 -0
- data/spec/core/merge_spec.rb +30 -14
- data/spec/dataframe_spec.rb +268 -14
- data/spec/index_spec.rb +23 -5
- data/spec/io/io_spec.rb +37 -16
- data/spec/math/statistics/dataframe_spec.rb +40 -8
- data/spec/math/statistics/vector_spec.rb +135 -10
- data/spec/monkeys_spec.rb +3 -3
- data/spec/vector_spec.rb +157 -25
- metadata +41 -21
data/daru.gemspec
CHANGED
@@ -31,7 +31,7 @@ Gem::Specification.new do |spec|
|
|
31
31
|
*************************************************************************
|
32
32
|
Thank you for installing daru!
|
33
33
|
|
34
|
-
oOOOOOo
|
34
|
+
oOOOOOo
|
35
35
|
,| oO
|
36
36
|
//| |
|
37
37
|
\\\\| |
|
@@ -39,8 +39,8 @@ Thank you for installing daru!
|
|
39
39
|
`-----`
|
40
40
|
|
41
41
|
|
42
|
-
Hope you love daru! For enhanced interactivity and better visualizations,
|
43
|
-
consider using gnuplotrb and nyaplot with iruby. For statistics use the
|
42
|
+
Hope you love daru! For enhanced interactivity and better visualizations,
|
43
|
+
consider using gnuplotrb and nyaplot with iruby. For statistics use the
|
44
44
|
statsample family.
|
45
45
|
|
46
46
|
Read the README for interesting use cases and examples.
|
@@ -63,9 +63,9 @@ EOF
|
|
63
63
|
spec.add_development_dependency 'nmatrix', '~> 0.2.1'
|
64
64
|
spec.add_development_dependency 'distribution', '~> 0.7'
|
65
65
|
spec.add_development_dependency 'rb-gsl', '~>1.16'
|
66
|
-
spec.add_development_dependency 'bloomfilter-rb', '~> 2.1'
|
67
66
|
spec.add_development_dependency 'dbd-sqlite3'
|
68
67
|
spec.add_development_dependency 'dbi'
|
69
68
|
spec.add_development_dependency 'activerecord', '~> 4.0'
|
70
69
|
spec.add_development_dependency 'sqlite3'
|
70
|
+
spec.add_development_dependency 'rubocop', '>= 0.40.0'
|
71
71
|
end
|
data/lib/daru.rb
CHANGED
@@ -11,7 +11,7 @@ module Daru
|
|
11
11
|
'THU' => 4,
|
12
12
|
'FRI' => 5,
|
13
13
|
'SAT' => 6
|
14
|
-
}
|
14
|
+
}.freeze
|
15
15
|
|
16
16
|
MONTH_DAYS = {
|
17
17
|
1 => 31,
|
@@ -26,19 +26,20 @@ module Daru
|
|
26
26
|
10 => 31,
|
27
27
|
11 => 30,
|
28
28
|
12 => 31
|
29
|
-
}
|
30
|
-
|
31
|
-
|
29
|
+
}.freeze
|
30
|
+
|
31
|
+
@lazy_update = false
|
32
|
+
|
33
|
+
SPLIT_TOKEN = ','.freeze
|
34
|
+
|
32
35
|
class << self
|
33
|
-
@@lazy_update = false
|
34
|
-
|
35
36
|
# A variable which will set whether Vector metadata is updated immediately or lazily.
|
36
37
|
# Call the #update method every time values are set or removed in order to update
|
37
38
|
# metadata like positions of missing values.
|
38
39
|
attr_accessor :lazy_update
|
39
|
-
|
40
|
+
|
40
41
|
def create_has_library(library)
|
41
|
-
lib_underscore = library.to_s.
|
42
|
+
lib_underscore = library.to_s.tr('-', '_')
|
42
43
|
define_singleton_method("has_#{lib_underscore}?") do
|
43
44
|
cv = "@@#{lib_underscore}"
|
44
45
|
unless class_variable_defined? cv
|
@@ -58,7 +59,6 @@ module Daru
|
|
58
59
|
create_has_library :gsl
|
59
60
|
create_has_library :nmatrix
|
60
61
|
create_has_library :nyaplot
|
61
|
-
create_has_library :'bloomfilter-rb'
|
62
62
|
end
|
63
63
|
|
64
64
|
autoload :Spreadsheet, 'spreadsheet'
|
@@ -6,7 +6,7 @@ module Daru
|
|
6
6
|
extend Forwardable
|
7
7
|
|
8
8
|
def_delegators :@data, :slice!
|
9
|
-
|
9
|
+
|
10
10
|
def each(&block)
|
11
11
|
@data.each(&block)
|
12
12
|
self
|
@@ -66,34 +66,38 @@ module Daru
|
|
66
66
|
ArrayWrapper.new @data.dup, @context
|
67
67
|
end
|
68
68
|
|
69
|
+
def compact
|
70
|
+
@data - @context.missing_values
|
71
|
+
end
|
72
|
+
|
69
73
|
def mean
|
70
|
-
|
74
|
+
values_to_sum = compact
|
75
|
+
return nil if values_to_sum.empty?
|
76
|
+
sum = values_to_sum.inject :+
|
77
|
+
sum.quo(values_to_sum.size).to_f
|
71
78
|
end
|
72
79
|
|
73
80
|
def product
|
74
|
-
|
81
|
+
compact.inject :*
|
75
82
|
end
|
76
83
|
|
77
84
|
def max
|
78
|
-
|
85
|
+
compact.max
|
79
86
|
end
|
80
87
|
|
81
88
|
def min
|
82
|
-
|
89
|
+
compact.min
|
83
90
|
end
|
84
91
|
|
85
92
|
def sum
|
86
|
-
|
87
|
-
memo += e unless e.nil? #TODO: Remove this conditional somehow!
|
88
|
-
memo
|
89
|
-
end
|
93
|
+
compact.inject :+
|
90
94
|
end
|
91
95
|
|
92
|
-
|
96
|
+
private
|
93
97
|
|
94
98
|
def set_size
|
95
99
|
@size = @data.size
|
96
100
|
end
|
97
101
|
end
|
98
102
|
end
|
99
|
-
end
|
103
|
+
end
|
@@ -3,33 +3,33 @@ module Daru
|
|
3
3
|
module GSLStatistics
|
4
4
|
def vector_standardized_compute(m,sd)
|
5
5
|
Daru::Vector.new @data.collect { |x| (x.to_f - m).quo(sd) }, dtype: :gsl,
|
6
|
-
|
6
|
+
index: @context.index, name: @context.name
|
7
7
|
end
|
8
|
-
|
8
|
+
|
9
9
|
def vector_centered_compute(m)
|
10
|
-
Daru::Vector.new @data.collect {|x| (x.to_f - m)}, dtype: :gsl,
|
11
|
-
|
10
|
+
Daru::Vector.new @data.collect { |x| (x.to_f - m) }, dtype: :gsl,
|
11
|
+
index: @context.index, name: @context.name
|
12
12
|
end
|
13
13
|
|
14
14
|
def sample_with_replacement(sample=1)
|
15
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(
|
16
|
-
Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
|
17
|
-
|
15
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
|
16
|
+
Daru::Vector.new(r.sample(@data, sample).to_a, dtype: :gsl,
|
17
|
+
index: @context.index, name: @context.name)
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
def sample_without_replacement(sample=1)
|
21
|
-
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(
|
21
|
+
r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10_000))
|
22
22
|
r.choose(@data, sample).to_a
|
23
23
|
end
|
24
24
|
|
25
25
|
def median
|
26
|
-
GSL::Stats
|
26
|
+
GSL::Stats.median_from_sorted_data(@data.sort)
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
def variance_sample(m)
|
30
|
-
@data.
|
30
|
+
@data.variance(m)
|
31
31
|
end
|
32
|
-
|
32
|
+
|
33
33
|
def standard_deviation_sample(m)
|
34
34
|
@data.sd(m)
|
35
35
|
end
|
@@ -56,16 +56,27 @@ module Daru
|
|
56
56
|
extend Forwardable
|
57
57
|
include Daru::Accessors::GSLStatistics
|
58
58
|
|
59
|
-
def_delegators :@data, :[], :size, :to_a, :each
|
60
|
-
:sum, :prod, :max, :min
|
61
|
-
|
62
|
-
alias :product :prod
|
59
|
+
def_delegators :@data, :[], :size, :to_a, :each
|
63
60
|
|
64
61
|
attr_reader :data
|
65
62
|
|
63
|
+
def compact
|
64
|
+
# set missing to [] in case @context is not Daru::Vector
|
65
|
+
missing = @context.missing_values rescue []
|
66
|
+
::GSL::Vector.alloc(@data.to_a - missing.map(&:to_f))
|
67
|
+
end
|
68
|
+
|
69
|
+
[:mean, :min, :max, :prod, :sum].each do |method|
|
70
|
+
define_method(method) do
|
71
|
+
compact.send(method.to_sym) rescue nil
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
alias :product :prod
|
76
|
+
|
66
77
|
def each(&block)
|
67
78
|
@data.each(&block)
|
68
|
-
self
|
79
|
+
self
|
69
80
|
end
|
70
81
|
|
71
82
|
def map!(&block)
|
@@ -110,4 +121,4 @@ module Daru
|
|
110
121
|
end
|
111
122
|
end
|
112
123
|
end
|
113
|
-
end if Daru.has_gsl?
|
124
|
+
end if Daru.has_gsl?
|
@@ -19,7 +19,7 @@ module Daru
|
|
19
19
|
end
|
20
20
|
|
21
21
|
attr_reader :size, :data, :nm_dtype
|
22
|
-
|
22
|
+
|
23
23
|
def initialize vector, context, nm_dtype=:int32
|
24
24
|
@size = vector.size
|
25
25
|
@data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
|
@@ -32,47 +32,47 @@ module Daru
|
|
32
32
|
return @data[*index] if index[0] < @size
|
33
33
|
nil
|
34
34
|
end
|
35
|
-
|
35
|
+
|
36
36
|
def []= index, value
|
37
|
-
raise ArgumentError, "Index #{index} does not exist" if
|
38
|
-
index > @size
|
37
|
+
raise ArgumentError, "Index #{index} does not exist" if
|
38
|
+
index > @size && index < @data.size
|
39
39
|
resize if index >= @data.size
|
40
40
|
@size += 1 if index == @size
|
41
|
-
|
41
|
+
|
42
42
|
@data = @data.cast(dtype: :object) if value.nil?
|
43
43
|
@data[index] = value
|
44
|
-
end
|
45
|
-
|
44
|
+
end
|
45
|
+
|
46
46
|
def == other
|
47
47
|
@data[0...@size] == other[0...@size] and @size == other.size
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
def delete_at index
|
51
51
|
arry = @data.to_a
|
52
52
|
arry.delete_at index
|
53
53
|
@data = NMatrix.new [(2*@size-1)], arry, dtype: @nm_dtype
|
54
54
|
@size -= 1
|
55
55
|
end
|
56
|
-
|
56
|
+
|
57
57
|
def index key
|
58
58
|
@data.to_a.index key
|
59
59
|
end
|
60
|
-
|
60
|
+
|
61
61
|
def << element
|
62
62
|
resize if @size >= @data.size
|
63
63
|
self[@size] = element
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
def to_a
|
67
67
|
@data[0...@size].to_a
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
def dup
|
71
71
|
NMatrixWrapper.new @data[0...@size].to_a, @context, @nm_dtype
|
72
72
|
end
|
73
73
|
|
74
|
-
def resize size
|
75
|
-
raise ArgumentError,
|
74
|
+
def resize size=@size*2
|
75
|
+
raise ArgumentError, 'Size must be greater than current size' if size < @size
|
76
76
|
|
77
77
|
@data = NMatrix.new [size], @data.to_a, dtype: @nm_dtype
|
78
78
|
end
|
@@ -98,4 +98,4 @@ module Daru
|
|
98
98
|
end
|
99
99
|
end
|
100
100
|
end
|
101
|
-
end if Daru.has_nmatrix?
|
101
|
+
end if Daru.has_nmatrix?
|
data/lib/daru/core/group_by.rb
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
module Daru
|
2
2
|
module Core
|
3
3
|
class GroupBy
|
4
|
-
|
5
4
|
attr_reader :groups
|
6
5
|
|
7
6
|
# Iterate over each group created by group_by. A DataFrame is yielded in
|
8
7
|
# block.
|
9
|
-
def each_group
|
8
|
+
def each_group
|
10
9
|
groups.keys.each do |k|
|
11
10
|
yield get_group(k)
|
12
11
|
end
|
@@ -18,7 +17,14 @@ module Daru
|
|
18
17
|
@context = context
|
19
18
|
vectors = names.map { |vec| context[vec].to_a }
|
20
19
|
tuples = vectors[0].zip(*vectors[1..-1])
|
21
|
-
keys =
|
20
|
+
keys =
|
21
|
+
tuples.uniq.sort do |a,b|
|
22
|
+
if a && b
|
23
|
+
a.compact <=> b.compact
|
24
|
+
else
|
25
|
+
a ? 1 : -1
|
26
|
+
end
|
27
|
+
end
|
22
28
|
|
23
29
|
keys.each do |key|
|
24
30
|
@groups[key] = all_indices_for(tuples, key)
|
@@ -29,13 +35,13 @@ module Daru
|
|
29
35
|
# Get a Daru::Vector of the size of each group.
|
30
36
|
def size
|
31
37
|
index =
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
38
|
+
if multi_indexed_grouping?
|
39
|
+
Daru::MultiIndex.from_tuples @groups.keys
|
40
|
+
else
|
41
|
+
Daru::Index.new @groups.keys.flatten
|
42
|
+
end
|
37
43
|
|
38
|
-
values = @groups.values.map
|
44
|
+
values = @groups.values.map(&:size)
|
39
45
|
Daru::Vector.new(values, index: index, name: :size)
|
40
46
|
end
|
41
47
|
|
@@ -194,11 +200,57 @@ module Daru
|
|
194
200
|
indexes.each do |idx|
|
195
201
|
rows << transpose[idx]
|
196
202
|
end
|
203
|
+
|
204
|
+
new_index =
|
205
|
+
begin
|
206
|
+
@context.index[indexes]
|
207
|
+
rescue IndexError
|
208
|
+
indexes
|
209
|
+
end
|
197
210
|
Daru::DataFrame.rows(
|
198
|
-
rows, index:
|
211
|
+
rows, index: new_index, order: @context.vectors
|
212
|
+
)
|
213
|
+
end
|
214
|
+
|
215
|
+
# Iteratively applies a function to the values in a group and accumulates the result.
|
216
|
+
# @param init (nil) The initial value of the accumulator.
|
217
|
+
# @param block [Proc] A proc or lambda that accepts two arguments. The first argument
|
218
|
+
# is the accumulated result. The second argument is a DataFrame row.
|
219
|
+
# @example Usage of reduce
|
220
|
+
# df = Daru::DataFrame.new({
|
221
|
+
# a: ['a','b'] * 3,
|
222
|
+
# b: [1,2,3] * 2,
|
223
|
+
# c: 'A'..'F'
|
224
|
+
# })
|
225
|
+
# df.group_by([:a]).reduce('') { |result, row| result += row[:c]; result }
|
226
|
+
# # =>
|
227
|
+
# # #<Daru::Vector:70343147159900 @name = nil @metadata = {} @size = 2 >
|
228
|
+
# # nil
|
229
|
+
# # a ACE
|
230
|
+
# # b BDF
|
231
|
+
def reduce(init=nil)
|
232
|
+
result_hash = @groups.each_with_object({}) do |(group, indices), h|
|
233
|
+
group_indices = indices.map { |v| @context.index.to_a[v] }
|
234
|
+
|
235
|
+
grouped_result = init
|
236
|
+
group_indices.each do |idx|
|
237
|
+
grouped_result = yield(grouped_result, @context.row[idx])
|
238
|
+
end
|
239
|
+
|
240
|
+
h[group] = grouped_result
|
241
|
+
end
|
242
|
+
|
243
|
+
index =
|
244
|
+
if multi_indexed_grouping?
|
245
|
+
Daru::MultiIndex.from_tuples result_hash.keys
|
246
|
+
else
|
247
|
+
Daru::Index.new result_hash.keys.flatten
|
248
|
+
end
|
249
|
+
|
250
|
+
Daru::Vector.new(result_hash.values, index: index)
|
199
251
|
end
|
200
252
|
|
201
|
-
|
253
|
+
private
|
202
254
|
|
203
255
|
def select_groups_from method, quantity
|
204
256
|
selection = @context
|
@@ -219,13 +271,13 @@ module Daru
|
|
219
271
|
multi_index = multi_indexed_grouping?
|
220
272
|
rows, order = [], []
|
221
273
|
|
222
|
-
@groups.each do |
|
274
|
+
@groups.each do |_group, indexes|
|
223
275
|
single_row = []
|
224
276
|
@non_group_vectors.each do |ngvector|
|
225
277
|
vec = @context[ngvector]
|
226
|
-
if method_type == :numeric
|
278
|
+
if method_type == :numeric && vec.type == :numeric
|
227
279
|
slice = vec[*indexes]
|
228
|
-
single_row << (slice.is_a?(
|
280
|
+
single_row << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
|
229
281
|
end
|
230
282
|
end
|
231
283
|
|
@@ -234,7 +286,7 @@ module Daru
|
|
234
286
|
|
235
287
|
@non_group_vectors.each do |ngvec|
|
236
288
|
order << ngvec if
|
237
|
-
|
289
|
+
method_type == :numeric && @context[ngvec].type == :numeric
|
238
290
|
end
|
239
291
|
|
240
292
|
index = @groups.keys
|
@@ -256,7 +308,8 @@ module Daru
|
|
256
308
|
end
|
257
309
|
|
258
310
|
def multi_indexed_grouping?
|
259
|
-
@groups.keys[0]
|
311
|
+
return false unless @groups.keys[0]
|
312
|
+
@groups.keys[0].size > 1 ? true : false
|
260
313
|
end
|
261
314
|
end
|
262
315
|
end
|