daru 0.1.5 → 0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.gitignore +1 -0
- data/.rubocop.yml +21 -7
- data/.travis.yml +10 -5
- data/CONTRIBUTING.md +15 -10
- data/History.md +124 -2
- data/README.md +37 -9
- data/ReleasePolicy.md +20 -0
- data/benchmarks/db_loading.rb +34 -0
- data/benchmarks/statistics.rb +6 -6
- data/benchmarks/where_clause.rb +1 -1
- data/benchmarks/where_vs_filter.rb +1 -1
- data/daru.gemspec +17 -41
- data/lib/daru.rb +10 -13
- data/lib/daru/accessors/gsl_wrapper.rb +1 -1
- data/lib/daru/accessors/nmatrix_wrapper.rb +2 -0
- data/lib/daru/category.rb +29 -15
- data/lib/daru/configuration.rb +34 -0
- data/lib/daru/core/group_by.rb +158 -77
- data/lib/daru/core/merge.rb +12 -3
- data/lib/daru/core/query.rb +20 -4
- data/lib/daru/dataframe.rb +692 -118
- data/lib/daru/date_time/index.rb +14 -11
- data/lib/daru/date_time/offsets.rb +9 -1
- data/lib/daru/extensions/which_dsl.rb +55 -0
- data/lib/daru/formatters/table.rb +3 -5
- data/lib/daru/index/categorical_index.rb +4 -4
- data/lib/daru/index/index.rb +131 -42
- data/lib/daru/index/multi_index.rb +118 -10
- data/lib/daru/io/csv/converters.rb +21 -0
- data/lib/daru/io/io.rb +105 -33
- data/lib/daru/io/sql_data_source.rb +10 -0
- data/lib/daru/iruby/templates/dataframe.html.erb +4 -51
- data/lib/daru/iruby/templates/dataframe_mi.html.erb +3 -56
- data/lib/daru/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
- data/lib/daru/iruby/templates/dataframe_mi_thead.html.erb +21 -0
- data/lib/daru/iruby/templates/dataframe_tbody.html.erb +28 -0
- data/lib/daru/iruby/templates/dataframe_thead.html.erb +21 -0
- data/lib/daru/iruby/templates/vector.html.erb +3 -25
- data/lib/daru/iruby/templates/vector_mi.html.erb +3 -34
- data/lib/daru/iruby/templates/vector_mi_tbody.html.erb +26 -0
- data/lib/daru/iruby/templates/vector_mi_thead.html.erb +8 -0
- data/lib/daru/iruby/templates/vector_tbody.html.erb +17 -0
- data/lib/daru/iruby/templates/vector_thead.html.erb +8 -0
- data/lib/daru/maths/arithmetic/vector.rb +38 -2
- data/lib/daru/maths/statistics/dataframe.rb +28 -30
- data/lib/daru/maths/statistics/vector.rb +295 -41
- data/lib/daru/plotting/gruff/dataframe.rb +13 -15
- data/lib/daru/plotting/nyaplot/category.rb +1 -1
- data/lib/daru/plotting/nyaplot/dataframe.rb +15 -4
- data/lib/daru/plotting/nyaplot/vector.rb +1 -2
- data/lib/daru/vector.rb +308 -96
- data/lib/daru/version.rb +1 -1
- data/profile/vector_new.rb +9 -0
- data/spec/accessors/gsl_wrapper_spec.rb +38 -35
- data/spec/accessors/nmatrix_wrapper_spec.rb +25 -22
- data/spec/category_spec.rb +24 -20
- data/spec/core/group_by_spec.rb +238 -4
- data/spec/core/merge_spec.rb +1 -1
- data/spec/core/query_spec.rb +65 -50
- data/spec/daru_spec.rb +22 -0
- data/spec/dataframe_spec.rb +473 -16
- data/spec/date_time/date_time_index_helper_spec.rb +72 -0
- data/spec/date_time/index_spec.rb +34 -16
- data/spec/date_time/offsets_spec.rb +14 -0
- data/spec/extensions/rserve_spec.rb +1 -1
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/boolean_converter_test.csv +5 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/fixtures/eciresults.html +394 -0
- data/spec/fixtures/empty_rows_test.csv +17 -0
- data/spec/fixtures/macau.html +3691 -0
- data/spec/fixtures/macd_data.csv +150 -0
- data/spec/fixtures/matrix_test.csv +55 -55
- data/spec/fixtures/moneycontrol.html +6812 -0
- data/spec/fixtures/string_converter_test.csv +5 -0
- data/spec/fixtures/test_xls.xls +0 -0
- data/spec/fixtures/test_xls_2.xls +0 -0
- data/spec/fixtures/url_test.txt~ +0 -0
- data/spec/fixtures/valid_markup.html +62 -0
- data/spec/fixtures/wiki_climate.html +1243 -0
- data/spec/fixtures/wiki_table_info.html +631 -0
- data/spec/formatters/table_formatter_spec.rb +29 -0
- data/spec/index/categorical_index_spec.rb +33 -33
- data/spec/index/index_spec.rb +160 -41
- data/spec/index/multi_index_spec.rb +143 -33
- data/spec/io/io_spec.rb +246 -2
- data/spec/io/sql_data_source_spec.rb +31 -41
- data/spec/iruby/dataframe_spec.rb +17 -19
- data/spec/iruby/vector_spec.rb +26 -28
- data/spec/maths/arithmetic/dataframe_spec.rb +1 -1
- data/spec/maths/arithmetic/vector_spec.rb +18 -0
- data/spec/maths/statistics/vector_spec.rb +153 -15
- data/spec/plotting/gruff/category_spec.rb +3 -3
- data/spec/plotting/gruff/dataframe_spec.rb +14 -4
- data/spec/plotting/gruff/vector_spec.rb +9 -9
- data/spec/plotting/nyaplot/category_spec.rb +5 -9
- data/spec/plotting/nyaplot/dataframe_spec.rb +95 -47
- data/spec/plotting/nyaplot/vector_spec.rb +5 -11
- data/spec/shared/vector_display_spec.rb +12 -14
- data/spec/spec_helper.rb +30 -7
- data/spec/support/matchers.rb +5 -0
- data/spec/vector_spec.rb +306 -72
- metadata +96 -55
- data/spec/fixtures/stock_data.csv +0 -500
@@ -0,0 +1,34 @@
|
|
1
|
+
$:.unshift File.expand_path("../../lib", __FILE__)
|
2
|
+
|
3
|
+
require 'benchmark'
|
4
|
+
require 'daru'
|
5
|
+
require 'sqlite3'
|
6
|
+
require 'dbi'
|
7
|
+
require 'active_record'
|
8
|
+
|
9
|
+
db_name = 'daru_test.sqlite'
|
10
|
+
FileUtils.rm(db_name) if File.file?(db_name)
|
11
|
+
|
12
|
+
SQLite3::Database.new(db_name).tap do |db|
|
13
|
+
db.execute "create table accounts(id integer, name varchar, age integer, primary key(id))"
|
14
|
+
|
15
|
+
values = 1.upto(100_000).map { |i| %!(#{i},"name_#{i}",#{rand(100)})! }.join(",")
|
16
|
+
db.execute "insert into accounts values #{values}"
|
17
|
+
end
|
18
|
+
|
19
|
+
ActiveRecord::Base.establish_connection("sqlite3:#{db_name}")
|
20
|
+
ActiveRecord::Base.connection
|
21
|
+
|
22
|
+
class Account < ActiveRecord::Base; end
|
23
|
+
|
24
|
+
Benchmark.bm do |x|
|
25
|
+
x.report("DataFrame.from_sql") do
|
26
|
+
Daru::DataFrame.from_sql(ActiveRecord::Base.connection, "SELECT * FROM accounts")
|
27
|
+
end
|
28
|
+
|
29
|
+
x.report("DataFrame.from_activerecord") do
|
30
|
+
Daru::DataFrame.from_activerecord(Account.all)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
FileUtils.rm(db_name)
|
data/benchmarks/statistics.rb
CHANGED
@@ -5,26 +5,26 @@ vector = Daru::Vector.new(
|
|
5
5
|
(10**6).times.map.to_a.shuffle,
|
6
6
|
missing_values: 100.times.map.to_a.shuffle
|
7
7
|
)
|
8
|
-
|
8
|
+
|
9
9
|
vector_gsl = Daru::Vector.new(
|
10
10
|
10000.times.map.to_a.shuffle,
|
11
11
|
missing_values: 100.times.map.to_a.shuffle,
|
12
12
|
dtype: :gsl
|
13
|
-
)
|
14
|
-
|
13
|
+
)
|
14
|
+
|
15
15
|
Benchmark.bm do |x|
|
16
16
|
x.report("Mean of a vector") do
|
17
17
|
vector.mean
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
x.report("Minimum of a vector") do
|
21
21
|
vector.min
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
x.report("Mean of a vector with data type gsl") do
|
25
25
|
vector_gsl.mean
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
x.report "Minimum of a vector with data type gsl" do
|
29
29
|
vector_gsl.min
|
30
30
|
end
|
data/benchmarks/where_clause.rb
CHANGED
data/daru.gemspec
CHANGED
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
|
|
19
19
|
spec.email = ['sameer.deshmukh93@gmail.com']
|
20
20
|
spec.summary = %q{Data Analysis in RUby}
|
21
21
|
spec.description = Daru::DESCRIPTION
|
22
|
-
spec.homepage = "http://github.com/
|
22
|
+
spec.homepage = "http://github.com/SciRuby/daru"
|
23
23
|
spec.license = 'BSD-2'
|
24
24
|
|
25
25
|
spec.files = `git ls-files -z`.split("\x0")
|
@@ -27,35 +27,12 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
28
28
|
spec.require_paths = ["lib"]
|
29
29
|
|
30
|
-
|
31
|
-
|
32
|
-
Thank you for installing daru!
|
30
|
+
# it is required by NMatrix, yet we want to specify clearly which minimal version is OK
|
31
|
+
spec.add_runtime_dependency 'packable', '~> 1.3.13'
|
33
32
|
|
34
|
-
oOOOOOo
|
35
|
-
,| oO
|
36
|
-
//| |
|
37
|
-
\\\\| |
|
38
|
-
`| |
|
39
|
-
`-----`
|
40
|
-
|
41
|
-
|
42
|
-
Hope you love daru! For enhanced interactivity and better visualizations,
|
43
|
-
consider using gnuplotrb and nyaplot with iruby. For statistics use the
|
44
|
-
statsample family.
|
45
|
-
|
46
|
-
Read the README for interesting use cases and examples.
|
47
|
-
|
48
|
-
Cheers!
|
49
|
-
*************************************************************************
|
50
|
-
EOF
|
51
|
-
|
52
|
-
|
53
|
-
spec.add_runtime_dependency 'backports'
|
54
|
-
|
55
|
-
spec.add_development_dependency 'reportbuilder', '~> 1.4'
|
56
33
|
spec.add_development_dependency 'spreadsheet', '~> 1.1.1'
|
57
|
-
spec.add_development_dependency 'bundler', '
|
58
|
-
spec.add_development_dependency 'rake', '~>
|
34
|
+
spec.add_development_dependency 'bundler', '>= 1.10'
|
35
|
+
spec.add_development_dependency 'rake', '~>13.0'
|
59
36
|
spec.add_development_dependency 'pry', '~> 0.10'
|
60
37
|
spec.add_development_dependency 'pry-byebug'
|
61
38
|
spec.add_development_dependency 'rserve-client', '~> 0.3'
|
@@ -63,23 +40,22 @@ EOF
|
|
63
40
|
spec.add_development_dependency 'rspec-its'
|
64
41
|
spec.add_development_dependency 'awesome_print'
|
65
42
|
spec.add_development_dependency 'nyaplot', '~> 0.1.5'
|
66
|
-
spec.add_development_dependency 'nmatrix', '~> 0.2.1'
|
43
|
+
spec.add_development_dependency 'nmatrix', '~> 0.2.1' if ENV['DARU_TEST_NMATRIX']
|
67
44
|
spec.add_development_dependency 'distribution', '~> 0.7'
|
68
|
-
spec.add_development_dependency 'gsl', '~>2.1.0.2'
|
45
|
+
spec.add_development_dependency 'gsl', '~>2.1.0.2' if ENV['DARU_TEST_GSL']
|
69
46
|
spec.add_development_dependency 'dbd-sqlite3'
|
70
47
|
spec.add_development_dependency 'dbi'
|
71
|
-
spec.add_development_dependency 'activerecord', '~>
|
72
|
-
spec.add_development_dependency '
|
73
|
-
|
48
|
+
spec.add_development_dependency 'activerecord', '~> 6.0'
|
49
|
+
spec.add_development_dependency 'mechanize'
|
50
|
+
# issue : https://github.com/SciRuby/daru/issues/493 occurred
|
51
|
+
# with latest version of sqlite3
|
52
|
+
spec.add_development_dependency 'sqlite3'
|
53
|
+
spec.add_development_dependency 'rubocop', '~> 0.49.0'
|
74
54
|
spec.add_development_dependency 'ruby-prof'
|
75
55
|
spec.add_development_dependency 'simplecov'
|
76
56
|
spec.add_development_dependency 'gruff'
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
if RUBY_VERSION >= '2.2.5'
|
83
|
-
spec.add_development_dependency 'guard-rspec'
|
84
|
-
end
|
57
|
+
spec.add_development_dependency 'webmock'
|
58
|
+
|
59
|
+
spec.add_development_dependency 'nokogiri'
|
60
|
+
spec.add_development_dependency 'guard-rspec'
|
85
61
|
end
|
data/lib/daru.rb
CHANGED
@@ -38,11 +38,13 @@ module Daru
|
|
38
38
|
|
39
39
|
@plotting_library = :nyaplot
|
40
40
|
|
41
|
+
@error_stream = $stderr
|
42
|
+
|
41
43
|
class << self
|
42
44
|
# A variable which will set whether Vector metadata is updated immediately or lazily.
|
43
45
|
# Call the #update method every time values are set or removed in order to update
|
44
46
|
# metadata like positions of missing values.
|
45
|
-
attr_accessor :lazy_update
|
47
|
+
attr_accessor :lazy_update, :error_stream
|
46
48
|
attr_reader :plotting_library
|
47
49
|
|
48
50
|
def create_has_library(library)
|
@@ -72,6 +74,10 @@ module Daru
|
|
72
74
|
raise ArgumentError, "Unsupported library #{lib}"
|
73
75
|
end
|
74
76
|
end
|
77
|
+
|
78
|
+
def error msg
|
79
|
+
error_stream.puts msg if error_stream
|
80
|
+
end
|
75
81
|
end
|
76
82
|
|
77
83
|
create_has_library :gsl
|
@@ -80,16 +86,6 @@ module Daru
|
|
80
86
|
create_has_library :gruff
|
81
87
|
end
|
82
88
|
|
83
|
-
[['reportbuilder', '~>1.4'], ['spreadsheet', '~>1.1.1']].each do |lib|
|
84
|
-
begin
|
85
|
-
gem lib[0], lib[1]
|
86
|
-
require lib[0]
|
87
|
-
rescue LoadError
|
88
|
-
STDERR.puts "\nInstall the #{lib[0]} gem version #{lib[1]} for using"\
|
89
|
-
" #{lib[0]} functions."
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
89
|
autoload :CSV, 'csv'
|
94
90
|
require 'matrix'
|
95
91
|
require 'forwardable'
|
@@ -98,11 +94,14 @@ require 'date'
|
|
98
94
|
|
99
95
|
require 'daru/version.rb'
|
100
96
|
|
97
|
+
require 'open-uri'
|
98
|
+
|
101
99
|
require 'daru/index/index.rb'
|
102
100
|
require 'daru/index/multi_index.rb'
|
103
101
|
require 'daru/index/categorical_index.rb'
|
104
102
|
|
105
103
|
require 'daru/helpers/array.rb'
|
104
|
+
require 'daru/configuration.rb'
|
106
105
|
require 'daru/vector.rb'
|
107
106
|
require 'daru/dataframe.rb'
|
108
107
|
require 'daru/monkeys.rb'
|
@@ -116,5 +115,3 @@ require 'daru/core/merge.rb'
|
|
116
115
|
|
117
116
|
require 'daru/date_time/offsets.rb'
|
118
117
|
require 'daru/date_time/index.rb'
|
119
|
-
|
120
|
-
require 'backports'
|
@@ -25,6 +25,8 @@ if Daru.has_nmatrix?
|
|
25
25
|
attr_reader :size, :data, :nm_dtype
|
26
26
|
|
27
27
|
def initialize vector, context, nm_dtype=:int32
|
28
|
+
# To avoid arrays with nils throwing TypeError for nil nm_dtype
|
29
|
+
nm_dtype = :object if nm_dtype.nil? && vector.any?(&:nil?)
|
28
30
|
@size = vector.size
|
29
31
|
@data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
|
30
32
|
@context = context
|
data/lib/daru/category.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Daru
|
2
2
|
module Category # rubocop:disable Metrics/ModuleLength
|
3
|
+
UNDEFINED = Object.new.freeze
|
4
|
+
|
3
5
|
attr_accessor :base_category
|
4
6
|
attr_reader :index, :coding_scheme, :name
|
5
7
|
|
@@ -72,6 +74,13 @@ module Daru
|
|
72
74
|
end
|
73
75
|
end
|
74
76
|
|
77
|
+
# this method is overwritten: see Daru::Category#plotting_library=
|
78
|
+
def plot(*args, **options, &b)
|
79
|
+
init_plotting_library
|
80
|
+
|
81
|
+
plot(*args, **options, &b)
|
82
|
+
end
|
83
|
+
|
75
84
|
alias_method :rename, :name=
|
76
85
|
|
77
86
|
# Returns an enumerator that enumerates on categorical data
|
@@ -113,7 +122,7 @@ module Daru
|
|
113
122
|
end
|
114
123
|
|
115
124
|
# Associates a category to the vector.
|
116
|
-
# @param [Array]
|
125
|
+
# @param [Array] new_categories new categories to be associated
|
117
126
|
# @example
|
118
127
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
119
128
|
# dv.add_category :b
|
@@ -131,7 +140,10 @@ module Daru
|
|
131
140
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
132
141
|
# dv.count :a
|
133
142
|
# # => 2
|
134
|
-
|
143
|
+
# dv.count
|
144
|
+
# # => 5
|
145
|
+
def count category=UNDEFINED
|
146
|
+
return @cat_hash.values.map(&:size).inject(&:+) if category == UNDEFINED # count all
|
135
147
|
raise ArgumentError, "Invalid category #{category}" unless
|
136
148
|
categories.include?(category)
|
137
149
|
|
@@ -167,9 +179,9 @@ module Daru
|
|
167
179
|
end
|
168
180
|
|
169
181
|
# Returns vector for indexes/positions specified
|
170
|
-
# @param [Array]
|
182
|
+
# @param [Array] indexes for which values have to be retrieved
|
171
183
|
# @note Since it accepts both indexes and positions, in case of collision,
|
172
|
-
#
|
184
|
+
# argument will be treated as index
|
173
185
|
# @return vector containing values specified at specified indexes/positions
|
174
186
|
# @example
|
175
187
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c],
|
@@ -196,7 +208,7 @@ module Daru
|
|
196
208
|
end
|
197
209
|
|
198
210
|
# Returns vector for positions specified.
|
199
|
-
# @param [Array]
|
211
|
+
# @param [Array] positions at which values are to be retrieved.
|
200
212
|
# @return vector containing values specified at specified positions
|
201
213
|
# @example
|
202
214
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
@@ -223,7 +235,7 @@ module Daru
|
|
223
235
|
|
224
236
|
# Modifies values at specified indexes/positions.
|
225
237
|
# @note In order to add a new category you need to associate it via #add_category
|
226
|
-
# @param [Array]
|
238
|
+
# @param [Array] indexes at which to modify value
|
227
239
|
# @param [object] val value to assign at specific indexes/positions
|
228
240
|
# @return modified vector
|
229
241
|
# @example
|
@@ -461,7 +473,7 @@ module Daru
|
|
461
473
|
@coding_scheme = scheme
|
462
474
|
end
|
463
475
|
|
464
|
-
CODING_SCHEMES = [
|
476
|
+
CODING_SCHEMES = %i[dummy deviation helmert simple].freeze
|
465
477
|
|
466
478
|
# Contrast code the vector according to the coding scheme set.
|
467
479
|
# @note To set the coding scheme use #coding_scheme=
|
@@ -584,7 +596,7 @@ module Daru
|
|
584
596
|
alias :gteq :mteq
|
585
597
|
|
586
598
|
# For querying the data
|
587
|
-
# @param [object] arel like query syntax
|
599
|
+
# @param bool_array [object] arel like query syntax
|
588
600
|
# @return [Daru::Vector] Vector which makes the conditions true
|
589
601
|
# @example
|
590
602
|
# dv = Daru::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
|
@@ -658,7 +670,7 @@ module Daru
|
|
658
670
|
end
|
659
671
|
|
660
672
|
# Check if any one of mentioned values occur in the vector
|
661
|
-
# @param [Array]
|
673
|
+
# @param [Array] values to check for
|
662
674
|
# @return [true, false] returns true if any one of specified values
|
663
675
|
# occur in the vector
|
664
676
|
# @example
|
@@ -670,7 +682,7 @@ module Daru
|
|
670
682
|
end
|
671
683
|
|
672
684
|
# Return a vector with specified values removed
|
673
|
-
# @param [Array]
|
685
|
+
# @param [Array] values to reject from resultant vector
|
674
686
|
# @return [Daru::Vector] vector with specified values removed
|
675
687
|
# @example
|
676
688
|
# dv = Daru::Vector.new [1, 2, nil, Float::NAN], type: :category
|
@@ -689,7 +701,7 @@ module Daru
|
|
689
701
|
end
|
690
702
|
|
691
703
|
# Count the number of values specified
|
692
|
-
# @param [Array]
|
704
|
+
# @param [Array] values to count for
|
693
705
|
# @return [Integer] the number of times the values mentioned occurs
|
694
706
|
# @example
|
695
707
|
# dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
|
@@ -702,7 +714,7 @@ module Daru
|
|
702
714
|
end
|
703
715
|
|
704
716
|
# Return indexes of values specified
|
705
|
-
# @param [Array]
|
717
|
+
# @param [Array] values to find indexes for
|
706
718
|
# @return [Array] array of indexes of values specified
|
707
719
|
# @example
|
708
720
|
# dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
|
@@ -743,6 +755,11 @@ module Daru
|
|
743
755
|
|
744
756
|
private
|
745
757
|
|
758
|
+
# Will lazily load the plotting library being used
|
759
|
+
def init_plotting_library
|
760
|
+
self.plotting_library = Daru.plotting_library
|
761
|
+
end
|
762
|
+
|
746
763
|
def validate_categories input_categories
|
747
764
|
raise ArgumentError, 'Input categories and speculated categories mismatch' unless
|
748
765
|
(categories - input_categories).empty?
|
@@ -763,9 +780,6 @@ module Daru
|
|
763
780
|
# To link every instance to its category,
|
764
781
|
# it stores integer for every instance representing its category
|
765
782
|
@array = map_cat_int.values_at(*data)
|
766
|
-
|
767
|
-
# Include plotting functionality
|
768
|
-
self.plotting_library = Daru.plotting_library
|
769
783
|
end
|
770
784
|
|
771
785
|
def category_from_position position
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Daru
|
2
|
+
# Defines constants and methods related to configuration
|
3
|
+
module Configuration
|
4
|
+
INSPECT_OPTIONS_KEYS = [
|
5
|
+
:max_rows,
|
6
|
+
# Terminal
|
7
|
+
:spacing
|
8
|
+
].freeze
|
9
|
+
|
10
|
+
# Jupyter
|
11
|
+
DEFAULT_MAX_ROWS = 30
|
12
|
+
|
13
|
+
# Terminal
|
14
|
+
DEFAULT_SPACING = 10
|
15
|
+
|
16
|
+
attr_accessor(*INSPECT_OPTIONS_KEYS)
|
17
|
+
|
18
|
+
def configure
|
19
|
+
yield self
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.extended(base)
|
23
|
+
base.reset_options
|
24
|
+
end
|
25
|
+
|
26
|
+
def reset_options
|
27
|
+
self.max_rows = DEFAULT_MAX_ROWS
|
28
|
+
|
29
|
+
self.spacing = DEFAULT_SPACING
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
extend Configuration
|
34
|
+
end
|
data/lib/daru/core/group_by.rb
CHANGED
@@ -1,30 +1,107 @@
|
|
1
1
|
module Daru
|
2
2
|
module Core
|
3
3
|
class GroupBy
|
4
|
-
|
4
|
+
class << self
|
5
|
+
extend Gem::Deprecate
|
6
|
+
|
7
|
+
# @private
|
8
|
+
def group_by_index_to_positions(indexes_with_positions, sort: false)
|
9
|
+
index_to_positions = {}
|
10
|
+
|
11
|
+
indexes_with_positions.each do |idx, position|
|
12
|
+
(index_to_positions[idx] ||= []) << position
|
13
|
+
end
|
14
|
+
|
15
|
+
if sort # TODO: maybe add a more "stable" sorting option?
|
16
|
+
sorted_keys = index_to_positions.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
|
17
|
+
index_to_positions = sorted_keys.map { |k| [k, index_to_positions[k]] }.to_h
|
18
|
+
end
|
19
|
+
|
20
|
+
index_to_positions
|
21
|
+
end
|
22
|
+
alias get_positions_group_map_on group_by_index_to_positions
|
23
|
+
deprecate :get_positions_group_map_on, :group_by_index_to_positions, 2019, 10
|
24
|
+
|
25
|
+
# @private
|
26
|
+
def get_positions_group_for_aggregation(multi_index, level=-1)
|
27
|
+
raise unless multi_index.is_a?(Daru::MultiIndex)
|
28
|
+
|
29
|
+
new_index = multi_index.dup
|
30
|
+
new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer
|
31
|
+
|
32
|
+
group_by_index_to_positions(new_index.each_with_index)
|
33
|
+
end
|
34
|
+
|
35
|
+
# @private
|
36
|
+
def get_positions_group_map_for_df(df, group_by_keys, sort: true)
|
37
|
+
indexes_with_positions = df[*group_by_keys].to_df.each_row.map(&:to_a).each_with_index
|
38
|
+
|
39
|
+
group_by_index_to_positions(indexes_with_positions, sort: sort)
|
40
|
+
end
|
41
|
+
|
42
|
+
# @private
|
43
|
+
def group_map_from_positions_to_indexes(positions_group_map, index)
|
44
|
+
positions_group_map.map { |k, positions| [k, positions.map { |pos| index.at(pos) }] }.to_h
|
45
|
+
end
|
46
|
+
|
47
|
+
# @private
|
48
|
+
def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
|
49
|
+
return nil if group_map == {}
|
50
|
+
|
51
|
+
new_index = group_map.flat_map { |group, values| values.map { |val| group + [val] } }
|
52
|
+
new_index = Daru::MultiIndex.from_tuples(new_index)
|
53
|
+
|
54
|
+
return Daru::DataFrame.new({}, index: new_index) if remaining_vectors == []
|
55
|
+
|
56
|
+
new_rows_order = group_map.values.flatten
|
57
|
+
new_df = df[*remaining_vectors].to_df.get_sub_dataframe(new_rows_order, by_position: from_position)
|
58
|
+
new_df.index = new_index
|
59
|
+
|
60
|
+
new_df
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
# The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
|
65
|
+
attr_reader :group_vectors, :non_group_vectors
|
66
|
+
|
67
|
+
# lazy accessor/attr_reader for the attribute groups
|
68
|
+
def groups
|
69
|
+
@groups ||= GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
|
70
|
+
end
|
71
|
+
alias :groups_by_idx :groups
|
72
|
+
|
73
|
+
# lazy accessor/attr_reader for the attribute df
|
74
|
+
def df
|
75
|
+
@df ||= GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
|
76
|
+
end
|
77
|
+
alias :grouped_df :df
|
5
78
|
|
6
79
|
# Iterate over each group created by group_by. A DataFrame is yielded in
|
7
80
|
# block.
|
8
81
|
def each_group
|
82
|
+
return to_enum(:each_group) unless block_given?
|
83
|
+
|
9
84
|
groups.keys.each do |k|
|
10
85
|
yield get_group(k)
|
11
86
|
end
|
12
87
|
end
|
13
88
|
|
14
|
-
TUPLE_SORTER = lambda do |
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
89
|
+
TUPLE_SORTER = lambda do |left, right|
|
90
|
+
return -1 unless right
|
91
|
+
return 1 unless left
|
92
|
+
|
93
|
+
left = left.compact
|
94
|
+
right = right.compact
|
95
|
+
return left <=> right || 0 if left.length == right.length
|
96
|
+
left.length <=> right.length
|
20
97
|
end
|
21
98
|
|
22
99
|
def initialize context, names
|
23
|
-
@
|
100
|
+
@group_vectors = names
|
24
101
|
@non_group_vectors = context.vectors.to_a - names
|
25
|
-
|
26
|
-
|
27
|
-
|
102
|
+
|
103
|
+
@context = context # TODO: maybe rename in @original_df
|
104
|
+
|
28
105
|
# FIXME: It feels like we don't want to sort here. Ruby's #group_by
|
29
106
|
# never sorts:
|
30
107
|
#
|
@@ -32,24 +109,14 @@ module Daru
|
|
32
109
|
# # => {4=>["test"], 2=>["me"], 6=>["please"]}
|
33
110
|
#
|
34
111
|
# - zverok, 2016-09-12
|
35
|
-
|
36
|
-
|
37
|
-
keys.each do |key|
|
38
|
-
@groups[key] = all_indices_for(tuples, key)
|
39
|
-
end
|
40
|
-
@groups.freeze
|
112
|
+
@groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
|
41
113
|
end
|
42
114
|
|
43
115
|
# Get a Daru::Vector of the size of each group.
|
44
116
|
def size
|
45
|
-
index =
|
46
|
-
if multi_indexed_grouping?
|
47
|
-
Daru::MultiIndex.from_tuples @groups.keys
|
48
|
-
else
|
49
|
-
Daru::Index.new @groups.keys.flatten
|
50
|
-
end
|
117
|
+
index = get_grouped_index
|
51
118
|
|
52
|
-
values = @
|
119
|
+
values = @groups_by_pos.values.map(&:size)
|
53
120
|
Daru::Vector.new(values, index: index, name: :size)
|
54
121
|
end
|
55
122
|
|
@@ -196,27 +263,20 @@ module Daru
|
|
196
263
|
# # a b c d
|
197
264
|
# # 5 bar two 6 66
|
198
265
|
def get_group group
|
199
|
-
indexes =
|
266
|
+
indexes = groups_by_idx[group]
|
200
267
|
elements = @context.each_vector.map(&:to_a)
|
201
268
|
transpose = elements.transpose
|
202
269
|
rows = indexes.each.map { |idx| transpose[idx] }
|
203
270
|
|
204
|
-
new_index =
|
205
|
-
begin
|
206
|
-
@context.index[indexes]
|
207
|
-
rescue IndexError
|
208
|
-
indexes
|
209
|
-
end
|
210
|
-
|
211
271
|
Daru::DataFrame.rows(
|
212
|
-
rows, index:
|
272
|
+
rows, index: indexes, order: @context.vectors
|
213
273
|
)
|
214
274
|
end
|
215
275
|
|
216
276
|
# Iteratively applies a function to the values in a group and accumulates the result.
|
217
277
|
# @param init (nil) The initial value of the accumulator.
|
218
|
-
# @
|
219
|
-
#
|
278
|
+
# @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
|
279
|
+
# is the accumulated result. The second argument is a DataFrame row.
|
220
280
|
# @example Usage of reduce
|
221
281
|
# df = Daru::DataFrame.new({
|
222
282
|
# a: ['a','b'] * 3,
|
@@ -230,7 +290,7 @@ module Daru
|
|
230
290
|
# # a ACE
|
231
291
|
# # b BDF
|
232
292
|
def reduce(init=nil)
|
233
|
-
result_hash =
|
293
|
+
result_hash = groups_by_idx.each_with_object({}) do |(group, indices), h|
|
234
294
|
group_indices = indices.map { |v| @context.index.to_a[v] }
|
235
295
|
|
236
296
|
grouped_result = init
|
@@ -241,23 +301,64 @@ module Daru
|
|
241
301
|
h[group] = grouped_result
|
242
302
|
end
|
243
303
|
|
244
|
-
index =
|
245
|
-
if multi_indexed_grouping?
|
246
|
-
Daru::MultiIndex.from_tuples result_hash.keys
|
247
|
-
else
|
248
|
-
Daru::Index.new result_hash.keys.flatten
|
249
|
-
end
|
304
|
+
index = get_grouped_index(result_hash.keys)
|
250
305
|
|
251
306
|
Daru::Vector.new(result_hash.values, index: index)
|
252
307
|
end
|
253
308
|
|
309
|
+
def inspect
|
310
|
+
grouped_df.inspect
|
311
|
+
end
|
312
|
+
|
313
|
+
# Function to use for aggregating the data.
|
314
|
+
# `group_by` is using Daru::DataFrame#aggregate
|
315
|
+
#
|
316
|
+
# @param options [Hash] options for the columns you want in the resultant dataframe
|
317
|
+
#
|
318
|
+
# @return [Daru::DataFrame]
|
319
|
+
#
|
320
|
+
# @example
|
321
|
+
#
|
322
|
+
# df = Daru::DataFrame.new(
|
323
|
+
# name: ['Ram','Krishna','Ram','Krishna','Krishna'],
|
324
|
+
# visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
|
325
|
+
#
|
326
|
+
# => #<Daru::DataFrame(5x2)>
|
327
|
+
# name visited
|
328
|
+
# 0 Ram Hyderabad
|
329
|
+
# 1 Krishna Delhi
|
330
|
+
# 2 Ram Mumbai
|
331
|
+
# 3 Krishna Raipur
|
332
|
+
# 4 Krishna Banglore
|
333
|
+
#
|
334
|
+
# df.group_by(:name)
|
335
|
+
# => #<Daru::DataFrame(5x1)>
|
336
|
+
# visited
|
337
|
+
# Krishna 1 Delhi
|
338
|
+
# 3 Raipur
|
339
|
+
# 4 Banglore
|
340
|
+
# Ram 0 Hyderabad
|
341
|
+
# 2 Mumbai
|
342
|
+
#
|
343
|
+
# df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
|
344
|
+
# => #<Daru::DataFrame(2x1)>
|
345
|
+
# visited
|
346
|
+
# Krishna Delhi,Raipur,Banglore
|
347
|
+
# Ram Hyderabad,Mumbai
|
348
|
+
#
|
349
|
+
def aggregate(options={})
|
350
|
+
new_index = get_grouped_index
|
351
|
+
|
352
|
+
@context.aggregate(options) { [@groups_by_pos.values, new_index] }
|
353
|
+
end
|
354
|
+
|
254
355
|
private
|
255
356
|
|
256
357
|
def select_groups_from method, quantity
|
257
358
|
selection = @context
|
258
359
|
rows, indexes = [], []
|
259
360
|
|
260
|
-
|
361
|
+
groups_by_idx.each_value do |index|
|
261
362
|
index.send(method, quantity).each do |idx|
|
262
363
|
rows << selection.row[idx].to_a
|
263
364
|
indexes << idx
|
@@ -268,50 +369,30 @@ module Daru
|
|
268
369
|
Daru::DataFrame.rows(rows, order: @context.vectors, index: indexes)
|
269
370
|
end
|
270
371
|
|
271
|
-
def
|
272
|
-
|
273
|
-
|
274
|
-
end
|
372
|
+
def select_numeric_non_group_vectors
|
373
|
+
@non_group_vectors.select { |ngvec| @context[ngvec].type == :numeric }
|
374
|
+
end
|
275
375
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
slice.is_a?(Daru::Vector) ? slice.send(method) : slice
|
280
|
-
end
|
281
|
-
end
|
376
|
+
def apply_method method_type, method
|
377
|
+
raise 'To implement' if method_type != :numeric
|
378
|
+
aggregation_options = select_numeric_non_group_vectors.map { |k| [k, method] }.to_h
|
282
379
|
|
283
|
-
|
284
|
-
order = Daru::Index.new(order)
|
285
|
-
Daru::DataFrame.new(rows.transpose, index: index, order: order)
|
380
|
+
aggregate(aggregation_options)
|
286
381
|
end
|
287
382
|
|
288
|
-
def
|
289
|
-
if
|
290
|
-
Daru::MultiIndex.from_tuples(@groups.keys)
|
291
|
-
else
|
292
|
-
Daru::Index.new(@groups.keys.flatten)
|
293
|
-
end
|
294
|
-
end
|
383
|
+
def get_grouped_index(index_tuples=nil)
|
384
|
+
index_tuples = @groups_by_pos.keys if index_tuples.nil?
|
295
385
|
|
296
|
-
|
297
|
-
|
298
|
-
while found
|
299
|
-
found = arry[index+1..-1].index(element)
|
300
|
-
if found
|
301
|
-
index = index + found + 1
|
302
|
-
indexes << index
|
303
|
-
end
|
304
|
-
end
|
305
|
-
if indexes.count == 1
|
306
|
-
[@context.index.at(*indexes)]
|
386
|
+
if multi_indexed_grouping?
|
387
|
+
Daru::MultiIndex.from_tuples(index_tuples)
|
307
388
|
else
|
308
|
-
|
389
|
+
Daru::Index.new(index_tuples.flatten)
|
309
390
|
end
|
310
391
|
end
|
311
392
|
|
312
393
|
def multi_indexed_grouping?
|
313
|
-
return false unless @
|
314
|
-
@
|
394
|
+
return false unless @groups_by_pos.keys[0]
|
395
|
+
@groups_by_pos.keys[0].size > 1
|
315
396
|
end
|
316
397
|
end
|
317
398
|
end
|