daru 0.1.5 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +5 -5
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.gitignore +1 -0
  4. data/.rubocop.yml +21 -7
  5. data/.travis.yml +10 -5
  6. data/CONTRIBUTING.md +15 -10
  7. data/History.md +124 -2
  8. data/README.md +37 -9
  9. data/ReleasePolicy.md +20 -0
  10. data/benchmarks/db_loading.rb +34 -0
  11. data/benchmarks/statistics.rb +6 -6
  12. data/benchmarks/where_clause.rb +1 -1
  13. data/benchmarks/where_vs_filter.rb +1 -1
  14. data/daru.gemspec +17 -41
  15. data/lib/daru.rb +10 -13
  16. data/lib/daru/accessors/gsl_wrapper.rb +1 -1
  17. data/lib/daru/accessors/nmatrix_wrapper.rb +2 -0
  18. data/lib/daru/category.rb +29 -15
  19. data/lib/daru/configuration.rb +34 -0
  20. data/lib/daru/core/group_by.rb +158 -77
  21. data/lib/daru/core/merge.rb +12 -3
  22. data/lib/daru/core/query.rb +20 -4
  23. data/lib/daru/dataframe.rb +692 -118
  24. data/lib/daru/date_time/index.rb +14 -11
  25. data/lib/daru/date_time/offsets.rb +9 -1
  26. data/lib/daru/extensions/which_dsl.rb +55 -0
  27. data/lib/daru/formatters/table.rb +3 -5
  28. data/lib/daru/index/categorical_index.rb +4 -4
  29. data/lib/daru/index/index.rb +131 -42
  30. data/lib/daru/index/multi_index.rb +118 -10
  31. data/lib/daru/io/csv/converters.rb +21 -0
  32. data/lib/daru/io/io.rb +105 -33
  33. data/lib/daru/io/sql_data_source.rb +10 -0
  34. data/lib/daru/iruby/templates/dataframe.html.erb +4 -51
  35. data/lib/daru/iruby/templates/dataframe_mi.html.erb +3 -56
  36. data/lib/daru/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  37. data/lib/daru/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  38. data/lib/daru/iruby/templates/dataframe_tbody.html.erb +28 -0
  39. data/lib/daru/iruby/templates/dataframe_thead.html.erb +21 -0
  40. data/lib/daru/iruby/templates/vector.html.erb +3 -25
  41. data/lib/daru/iruby/templates/vector_mi.html.erb +3 -34
  42. data/lib/daru/iruby/templates/vector_mi_tbody.html.erb +26 -0
  43. data/lib/daru/iruby/templates/vector_mi_thead.html.erb +8 -0
  44. data/lib/daru/iruby/templates/vector_tbody.html.erb +17 -0
  45. data/lib/daru/iruby/templates/vector_thead.html.erb +8 -0
  46. data/lib/daru/maths/arithmetic/vector.rb +38 -2
  47. data/lib/daru/maths/statistics/dataframe.rb +28 -30
  48. data/lib/daru/maths/statistics/vector.rb +295 -41
  49. data/lib/daru/plotting/gruff/dataframe.rb +13 -15
  50. data/lib/daru/plotting/nyaplot/category.rb +1 -1
  51. data/lib/daru/plotting/nyaplot/dataframe.rb +15 -4
  52. data/lib/daru/plotting/nyaplot/vector.rb +1 -2
  53. data/lib/daru/vector.rb +308 -96
  54. data/lib/daru/version.rb +1 -1
  55. data/profile/vector_new.rb +9 -0
  56. data/spec/accessors/gsl_wrapper_spec.rb +38 -35
  57. data/spec/accessors/nmatrix_wrapper_spec.rb +25 -22
  58. data/spec/category_spec.rb +24 -20
  59. data/spec/core/group_by_spec.rb +238 -4
  60. data/spec/core/merge_spec.rb +1 -1
  61. data/spec/core/query_spec.rb +65 -50
  62. data/spec/daru_spec.rb +22 -0
  63. data/spec/dataframe_spec.rb +473 -16
  64. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  65. data/spec/date_time/index_spec.rb +34 -16
  66. data/spec/date_time/offsets_spec.rb +14 -0
  67. data/spec/extensions/rserve_spec.rb +1 -1
  68. data/spec/extensions/which_dsl_spec.rb +38 -0
  69. data/spec/fixtures/boolean_converter_test.csv +5 -0
  70. data/spec/fixtures/duplicates.csv +32 -0
  71. data/spec/fixtures/eciresults.html +394 -0
  72. data/spec/fixtures/empty_rows_test.csv +17 -0
  73. data/spec/fixtures/macau.html +3691 -0
  74. data/spec/fixtures/macd_data.csv +150 -0
  75. data/spec/fixtures/matrix_test.csv +55 -55
  76. data/spec/fixtures/moneycontrol.html +6812 -0
  77. data/spec/fixtures/string_converter_test.csv +5 -0
  78. data/spec/fixtures/test_xls.xls +0 -0
  79. data/spec/fixtures/test_xls_2.xls +0 -0
  80. data/spec/fixtures/url_test.txt~ +0 -0
  81. data/spec/fixtures/valid_markup.html +62 -0
  82. data/spec/fixtures/wiki_climate.html +1243 -0
  83. data/spec/fixtures/wiki_table_info.html +631 -0
  84. data/spec/formatters/table_formatter_spec.rb +29 -0
  85. data/spec/index/categorical_index_spec.rb +33 -33
  86. data/spec/index/index_spec.rb +160 -41
  87. data/spec/index/multi_index_spec.rb +143 -33
  88. data/spec/io/io_spec.rb +246 -2
  89. data/spec/io/sql_data_source_spec.rb +31 -41
  90. data/spec/iruby/dataframe_spec.rb +17 -19
  91. data/spec/iruby/vector_spec.rb +26 -28
  92. data/spec/maths/arithmetic/dataframe_spec.rb +1 -1
  93. data/spec/maths/arithmetic/vector_spec.rb +18 -0
  94. data/spec/maths/statistics/vector_spec.rb +153 -15
  95. data/spec/plotting/gruff/category_spec.rb +3 -3
  96. data/spec/plotting/gruff/dataframe_spec.rb +14 -4
  97. data/spec/plotting/gruff/vector_spec.rb +9 -9
  98. data/spec/plotting/nyaplot/category_spec.rb +5 -9
  99. data/spec/plotting/nyaplot/dataframe_spec.rb +95 -47
  100. data/spec/plotting/nyaplot/vector_spec.rb +5 -11
  101. data/spec/shared/vector_display_spec.rb +12 -14
  102. data/spec/spec_helper.rb +30 -7
  103. data/spec/support/matchers.rb +5 -0
  104. data/spec/vector_spec.rb +306 -72
  105. metadata +96 -55
  106. data/spec/fixtures/stock_data.csv +0 -500
@@ -0,0 +1,34 @@
1
+ $:.unshift File.expand_path("../../lib", __FILE__)
2
+
3
+ require 'benchmark'
4
+ require 'daru'
5
+ require 'sqlite3'
6
+ require 'dbi'
7
+ require 'active_record'
8
+
9
+ db_name = 'daru_test.sqlite'
10
+ FileUtils.rm(db_name) if File.file?(db_name)
11
+
12
+ SQLite3::Database.new(db_name).tap do |db|
13
+ db.execute "create table accounts(id integer, name varchar, age integer, primary key(id))"
14
+
15
+ values = 1.upto(100_000).map { |i| %!(#{i},"name_#{i}",#{rand(100)})! }.join(",")
16
+ db.execute "insert into accounts values #{values}"
17
+ end
18
+
19
+ ActiveRecord::Base.establish_connection("sqlite3:#{db_name}")
20
+ ActiveRecord::Base.connection
21
+
22
+ class Account < ActiveRecord::Base; end
23
+
24
+ Benchmark.bm do |x|
25
+ x.report("DataFrame.from_sql") do
26
+ Daru::DataFrame.from_sql(ActiveRecord::Base.connection, "SELECT * FROM accounts")
27
+ end
28
+
29
+ x.report("DataFrame.from_activerecord") do
30
+ Daru::DataFrame.from_activerecord(Account.all)
31
+ end
32
+ end
33
+
34
+ FileUtils.rm(db_name)
@@ -5,26 +5,26 @@ vector = Daru::Vector.new(
5
5
  (10**6).times.map.to_a.shuffle,
6
6
  missing_values: 100.times.map.to_a.shuffle
7
7
  )
8
-
8
+
9
9
  vector_gsl = Daru::Vector.new(
10
10
  10000.times.map.to_a.shuffle,
11
11
  missing_values: 100.times.map.to_a.shuffle,
12
12
  dtype: :gsl
13
- )
14
-
13
+ )
14
+
15
15
  Benchmark.bm do |x|
16
16
  x.report("Mean of a vector") do
17
17
  vector.mean
18
18
  end
19
-
19
+
20
20
  x.report("Minimum of a vector") do
21
21
  vector.min
22
22
  end
23
-
23
+
24
24
  x.report("Mean of a vector with data type gsl") do
25
25
  vector_gsl.mean
26
26
  end
27
-
27
+
28
28
  x.report "Minimum of a vector with data type gsl" do
29
29
  vector_gsl.min
30
30
  end
@@ -35,7 +35,7 @@ Benchmark.bm do |x|
35
35
  end
36
36
 
37
37
  # ====== Benchmarks ======
38
- #
38
+ #
39
39
  # Benchmarking DataFrame#where
40
40
  #
41
41
  # user system total real
@@ -22,7 +22,7 @@ Benchmark.bm do |x|
22
22
  end
23
23
 
24
24
  # ===== Benchmarks =====
25
- #
25
+ #
26
26
  # user system total real
27
27
  # where 0.000000 0.000000 0.000000 ( 0.002575)
28
28
  # filter_rows 0.210000 0.000000 0.210000 ( 0.205403)
@@ -19,7 +19,7 @@ Gem::Specification.new do |spec|
19
19
  spec.email = ['sameer.deshmukh93@gmail.com']
20
20
  spec.summary = %q{Data Analysis in RUby}
21
21
  spec.description = Daru::DESCRIPTION
22
- spec.homepage = "http://github.com/v0dro/daru"
22
+ spec.homepage = "http://github.com/SciRuby/daru"
23
23
  spec.license = 'BSD-2'
24
24
 
25
25
  spec.files = `git ls-files -z`.split("\x0")
@@ -27,35 +27,12 @@ Gem::Specification.new do |spec|
27
27
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
28
28
  spec.require_paths = ["lib"]
29
29
 
30
- spec.post_install_message = <<-EOF
31
- *************************************************************************
32
- Thank you for installing daru!
30
+ # it is required by NMatrix, yet we want to specify clearly which minimal version is OK
31
+ spec.add_runtime_dependency 'packable', '~> 1.3.13'
33
32
 
34
- oOOOOOo
35
- ,| oO
36
- //| |
37
- \\\\| |
38
- `| |
39
- `-----`
40
-
41
-
42
- Hope you love daru! For enhanced interactivity and better visualizations,
43
- consider using gnuplotrb and nyaplot with iruby. For statistics use the
44
- statsample family.
45
-
46
- Read the README for interesting use cases and examples.
47
-
48
- Cheers!
49
- *************************************************************************
50
- EOF
51
-
52
-
53
- spec.add_runtime_dependency 'backports'
54
-
55
- spec.add_development_dependency 'reportbuilder', '~> 1.4'
56
33
  spec.add_development_dependency 'spreadsheet', '~> 1.1.1'
57
- spec.add_development_dependency 'bundler', '~> 1.10'
58
- spec.add_development_dependency 'rake', '~>10.5'
34
+ spec.add_development_dependency 'bundler', '>= 1.10'
35
+ spec.add_development_dependency 'rake', '~>13.0'
59
36
  spec.add_development_dependency 'pry', '~> 0.10'
60
37
  spec.add_development_dependency 'pry-byebug'
61
38
  spec.add_development_dependency 'rserve-client', '~> 0.3'
@@ -63,23 +40,22 @@ EOF
63
40
  spec.add_development_dependency 'rspec-its'
64
41
  spec.add_development_dependency 'awesome_print'
65
42
  spec.add_development_dependency 'nyaplot', '~> 0.1.5'
66
- spec.add_development_dependency 'nmatrix', '~> 0.2.1'
43
+ spec.add_development_dependency 'nmatrix', '~> 0.2.1' if ENV['DARU_TEST_NMATRIX']
67
44
  spec.add_development_dependency 'distribution', '~> 0.7'
68
- spec.add_development_dependency 'gsl', '~>2.1.0.2'
45
+ spec.add_development_dependency 'gsl', '~>2.1.0.2' if ENV['DARU_TEST_GSL']
69
46
  spec.add_development_dependency 'dbd-sqlite3'
70
47
  spec.add_development_dependency 'dbi'
71
- spec.add_development_dependency 'activerecord', '~> 4.0'
72
- spec.add_development_dependency 'sqlite3'
73
- spec.add_development_dependency 'rubocop', '>= 0.40.0'
48
+ spec.add_development_dependency 'activerecord', '~> 6.0'
49
+ spec.add_development_dependency 'mechanize'
50
+ # issue : https://github.com/SciRuby/daru/issues/493 occurred
51
+ # with latest version of sqlite3
52
+ spec.add_development_dependency 'sqlite3'
53
+ spec.add_development_dependency 'rubocop', '~> 0.49.0'
74
54
  spec.add_development_dependency 'ruby-prof'
75
55
  spec.add_development_dependency 'simplecov'
76
56
  spec.add_development_dependency 'gruff'
77
- if RUBY_VERSION < '2.1.0'
78
- spec.add_development_dependency 'nokogiri', '<= 1.6.8.1'
79
- else
80
- spec.add_development_dependency 'nokogiri'
81
- end
82
- if RUBY_VERSION >= '2.2.5'
83
- spec.add_development_dependency 'guard-rspec'
84
- end
57
+ spec.add_development_dependency 'webmock'
58
+
59
+ spec.add_development_dependency 'nokogiri'
60
+ spec.add_development_dependency 'guard-rspec'
85
61
  end
@@ -38,11 +38,13 @@ module Daru
38
38
 
39
39
  @plotting_library = :nyaplot
40
40
 
41
+ @error_stream = $stderr
42
+
41
43
  class << self
42
44
  # A variable which will set whether Vector metadata is updated immediately or lazily.
43
45
  # Call the #update method every time values are set or removed in order to update
44
46
  # metadata like positions of missing values.
45
- attr_accessor :lazy_update
47
+ attr_accessor :lazy_update, :error_stream
46
48
  attr_reader :plotting_library
47
49
 
48
50
  def create_has_library(library)
@@ -72,6 +74,10 @@ module Daru
72
74
  raise ArgumentError, "Unsupported library #{lib}"
73
75
  end
74
76
  end
77
+
78
+ def error msg
79
+ error_stream.puts msg if error_stream
80
+ end
75
81
  end
76
82
 
77
83
  create_has_library :gsl
@@ -80,16 +86,6 @@ module Daru
80
86
  create_has_library :gruff
81
87
  end
82
88
 
83
- [['reportbuilder', '~>1.4'], ['spreadsheet', '~>1.1.1']].each do |lib|
84
- begin
85
- gem lib[0], lib[1]
86
- require lib[0]
87
- rescue LoadError
88
- STDERR.puts "\nInstall the #{lib[0]} gem version #{lib[1]} for using"\
89
- " #{lib[0]} functions."
90
- end
91
- end
92
-
93
89
  autoload :CSV, 'csv'
94
90
  require 'matrix'
95
91
  require 'forwardable'
@@ -98,11 +94,14 @@ require 'date'
98
94
 
99
95
  require 'daru/version.rb'
100
96
 
97
+ require 'open-uri'
98
+
101
99
  require 'daru/index/index.rb'
102
100
  require 'daru/index/multi_index.rb'
103
101
  require 'daru/index/categorical_index.rb'
104
102
 
105
103
  require 'daru/helpers/array.rb'
104
+ require 'daru/configuration.rb'
106
105
  require 'daru/vector.rb'
107
106
  require 'daru/dataframe.rb'
108
107
  require 'daru/monkeys.rb'
@@ -116,5 +115,3 @@ require 'daru/core/merge.rb'
116
115
 
117
116
  require 'daru/date_time/offsets.rb'
118
117
  require 'daru/date_time/index.rb'
119
-
120
- require 'backports'
@@ -65,7 +65,7 @@ if Daru.has_gsl?
65
65
  ::GSL::Vector.alloc(@data.to_a - [Float::NAN])
66
66
  end
67
67
 
68
- [:mean, :min, :max, :prod, :sum].each do |method|
68
+ %i[mean min max prod sum].each do |method|
69
69
  define_method(method) do
70
70
  compact.send(method.to_sym) rescue nil
71
71
  end
@@ -25,6 +25,8 @@ if Daru.has_nmatrix?
25
25
  attr_reader :size, :data, :nm_dtype
26
26
 
27
27
  def initialize vector, context, nm_dtype=:int32
28
+ # To avoid arrays with nils throwing TypeError for nil nm_dtype
29
+ nm_dtype = :object if nm_dtype.nil? && vector.any?(&:nil?)
28
30
  @size = vector.size
29
31
  @data = NMatrix.new [@size*2], vector.to_a, dtype: nm_dtype
30
32
  @context = context
@@ -1,5 +1,7 @@
1
1
  module Daru
2
2
  module Category # rubocop:disable Metrics/ModuleLength
3
+ UNDEFINED = Object.new.freeze
4
+
3
5
  attr_accessor :base_category
4
6
  attr_reader :index, :coding_scheme, :name
5
7
 
@@ -72,6 +74,13 @@ module Daru
72
74
  end
73
75
  end
74
76
 
77
+ # this method is overwritten: see Daru::Category#plotting_library=
78
+ def plot(*args, **options, &b)
79
+ init_plotting_library
80
+
81
+ plot(*args, **options, &b)
82
+ end
83
+
75
84
  alias_method :rename, :name=
76
85
 
77
86
  # Returns an enumerator that enumerates on categorical data
@@ -113,7 +122,7 @@ module Daru
113
122
  end
114
123
 
115
124
  # Associates a category to the vector.
116
- # @param [Array] *new_categories new categories to be associated
125
+ # @param [Array] new_categories new categories to be associated
117
126
  # @example
118
127
  # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
119
128
  # dv.add_category :b
@@ -131,7 +140,10 @@ module Daru
131
140
  # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
132
141
  # dv.count :a
133
142
  # # => 2
134
- def count category
143
+ # dv.count
144
+ # # => 5
145
+ def count category=UNDEFINED
146
+ return @cat_hash.values.map(&:size).inject(&:+) if category == UNDEFINED # count all
135
147
  raise ArgumentError, "Invalid category #{category}" unless
136
148
  categories.include?(category)
137
149
 
@@ -167,9 +179,9 @@ module Daru
167
179
  end
168
180
 
169
181
  # Returns vector for indexes/positions specified
170
- # @param [Array] *indexes indexes/positions for which values has to be retrived
182
+ # @param [Array] indexes for which values have to be retrieved
171
183
  # @note Since it accepts both indexes and positions. In case of collision,
172
- # arguement will be treated as index
184
+ # argument will be treated as index
173
185
  # @return vector containing values specified at specified indexes/positions
174
186
  # @example
175
187
  # dv = Daru::Vector.new [:a, 1, :a, 1, :c],
@@ -196,7 +208,7 @@ module Daru
196
208
  end
197
209
 
198
210
  # Returns vector for positions specified.
199
- # @param [Array] *positions positions at which values to be retrived.
211
+ # @param [Array] positions at which values are to be retrieved.
200
212
  # @return vector containing values specified at specified positions
201
213
  # @example
202
214
  # dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
@@ -223,7 +235,7 @@ module Daru
223
235
 
224
236
  # Modifies values at specified indexes/positions.
225
237
  # @note In order to add a new category you need to associate it via #add_category
226
- # @param [Array] *indexes indexes/positions at which to modify value
238
+ # @param [Array] indexes at which to modify value
227
239
  # @param [object] val value to assign at specific indexes/positions
228
240
  # @return modified vector
229
241
  # @example
@@ -461,7 +473,7 @@ module Daru
461
473
  @coding_scheme = scheme
462
474
  end
463
475
 
464
- CODING_SCHEMES = [:dummy, :deviation, :helmert, :simple].freeze
476
+ CODING_SCHEMES = %i[dummy deviation helmert simple].freeze
465
477
 
466
478
  # Contrast code the vector according to the coding scheme set.
467
479
  # @note To set the coding scheme use #coding_scheme=
@@ -584,7 +596,7 @@ module Daru
584
596
  alias :gteq :mteq
585
597
 
586
598
  # For querying the data
587
- # @param [object] arel like query syntax
599
+ # @param bool_array [object] arel like query syntax
588
600
  # @return [Daru::Vector] Vector which makes the conditions true
589
601
  # @example
590
602
  # dv = Daru::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
@@ -658,7 +670,7 @@ module Daru
658
670
  end
659
671
 
660
672
  # Check if any one of mentioned values occur in the vector
661
- # @param [Array] *values values to check for
673
+ # @param [Array] values to check for
662
674
  # @return [true, false] returns true if any one of specified values
663
675
  # occur in the vector
664
676
  # @example
@@ -670,7 +682,7 @@ module Daru
670
682
  end
671
683
 
672
684
  # Return a vector with specified values removed
673
- # @param [Array] *values values to reject from resultant vector
685
+ # @param [Array] values to reject from resultant vector
674
686
  # @return [Daru::Vector] vector with specified values removed
675
687
  # @example
676
688
  # dv = Daru::Vector.new [1, 2, nil, Float::NAN], type: :category
@@ -689,7 +701,7 @@ module Daru
689
701
  end
690
702
 
691
703
  # Count the number of values specified
692
- # @param [Array] *values values to count for
704
+ # @param [Array] values to count for
693
705
  # @return [Integer] the number of times the values mentioned occurs
694
706
  # @example
695
707
  # dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
@@ -702,7 +714,7 @@ module Daru
702
714
  end
703
715
 
704
716
  # Return indexes of values specified
705
- # @param [Array] *values values to find indexes for
717
+ # @param [Array] values to find indexes for
706
718
  # @return [Array] array of indexes of values specified
707
719
  # @example
708
720
  # dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
@@ -743,6 +755,11 @@ module Daru
743
755
 
744
756
  private
745
757
 
758
+ # Will lazily load the plotting library being used
759
+ def init_plotting_library
760
+ self.plotting_library = Daru.plotting_library
761
+ end
762
+
746
763
  def validate_categories input_categories
747
764
  raise ArgumentError, 'Input categories and speculated categories mismatch' unless
748
765
  (categories - input_categories).empty?
@@ -763,9 +780,6 @@ module Daru
763
780
  # To link every instance to its category,
764
781
  # it stores integer for every instance representing its category
765
782
  @array = map_cat_int.values_at(*data)
766
-
767
- # Include plotting functionality
768
- self.plotting_library = Daru.plotting_library
769
783
  end
770
784
 
771
785
  def category_from_position position
@@ -0,0 +1,34 @@
1
+ module Daru
2
+ # Defines constants and methods related to configuration
3
+ module Configuration
4
+ INSPECT_OPTIONS_KEYS = [
5
+ :max_rows,
6
+ # Terminal
7
+ :spacing
8
+ ].freeze
9
+
10
+ # Jupyter
11
+ DEFAULT_MAX_ROWS = 30
12
+
13
+ # Terminal
14
+ DEFAULT_SPACING = 10
15
+
16
+ attr_accessor(*INSPECT_OPTIONS_KEYS)
17
+
18
+ def configure
19
+ yield self
20
+ end
21
+
22
+ def self.extended(base)
23
+ base.reset_options
24
+ end
25
+
26
+ def reset_options
27
+ self.max_rows = DEFAULT_MAX_ROWS
28
+
29
+ self.spacing = DEFAULT_SPACING
30
+ end
31
+ end
32
+
33
+ extend Configuration
34
+ end
@@ -1,30 +1,107 @@
1
1
  module Daru
2
2
  module Core
3
3
  class GroupBy
4
- attr_reader :groups
4
+ class << self
5
+ extend Gem::Deprecate
6
+
7
+ # @private
8
+ def group_by_index_to_positions(indexes_with_positions, sort: false)
9
+ index_to_positions = {}
10
+
11
+ indexes_with_positions.each do |idx, position|
12
+ (index_to_positions[idx] ||= []) << position
13
+ end
14
+
15
+ if sort # TODO: maybe add a more "stable" sorting option?
16
+ sorted_keys = index_to_positions.keys.sort(&Daru::Core::GroupBy::TUPLE_SORTER)
17
+ index_to_positions = sorted_keys.map { |k| [k, index_to_positions[k]] }.to_h
18
+ end
19
+
20
+ index_to_positions
21
+ end
22
+ alias get_positions_group_map_on group_by_index_to_positions
23
+ deprecate :get_positions_group_map_on, :group_by_index_to_positions, 2019, 10
24
+
25
+ # @private
26
+ def get_positions_group_for_aggregation(multi_index, level=-1)
27
+ raise unless multi_index.is_a?(Daru::MultiIndex)
28
+
29
+ new_index = multi_index.dup
30
+ new_index.remove_layer(level) # TODO: recheck code of Daru::MultiIndex#remove_layer
31
+
32
+ group_by_index_to_positions(new_index.each_with_index)
33
+ end
34
+
35
+ # @private
36
+ def get_positions_group_map_for_df(df, group_by_keys, sort: true)
37
+ indexes_with_positions = df[*group_by_keys].to_df.each_row.map(&:to_a).each_with_index
38
+
39
+ group_by_index_to_positions(indexes_with_positions, sort: sort)
40
+ end
41
+
42
+ # @private
43
+ def group_map_from_positions_to_indexes(positions_group_map, index)
44
+ positions_group_map.map { |k, positions| [k, positions.map { |pos| index.at(pos) }] }.to_h
45
+ end
46
+
47
+ # @private
48
+ def df_from_group_map(df, group_map, remaining_vectors, from_position: true)
49
+ return nil if group_map == {}
50
+
51
+ new_index = group_map.flat_map { |group, values| values.map { |val| group + [val] } }
52
+ new_index = Daru::MultiIndex.from_tuples(new_index)
53
+
54
+ return Daru::DataFrame.new({}, index: new_index) if remaining_vectors == []
55
+
56
+ new_rows_order = group_map.values.flatten
57
+ new_df = df[*remaining_vectors].to_df.get_sub_dataframe(new_rows_order, by_position: from_position)
58
+ new_df.index = new_index
59
+
60
+ new_df
61
+ end
62
+ end
63
+
64
+ # The group_by was done over the vectors in group_vectors; the remaining vectors are the non_group_vectors
65
+ attr_reader :group_vectors, :non_group_vectors
66
+
67
+ # lazy accessor/attr_reader for the attribute groups
68
+ def groups
69
+ @groups ||= GroupBy.group_map_from_positions_to_indexes(@groups_by_pos, @context.index)
70
+ end
71
+ alias :groups_by_idx :groups
72
+
73
+ # lazy accessor/attr_reader for the attribute df
74
+ def df
75
+ @df ||= GroupBy.df_from_group_map(@context, @groups_by_pos, @non_group_vectors)
76
+ end
77
+ alias :grouped_df :df
5
78
 
6
79
  # Iterate over each group created by group_by. A DataFrame is yielded in
7
80
  # block.
8
81
  def each_group
82
+ return to_enum(:each_group) unless block_given?
83
+
9
84
  groups.keys.each do |k|
10
85
  yield get_group(k)
11
86
  end
12
87
  end
13
88
 
14
- TUPLE_SORTER = lambda do |a, b|
15
- if a && b
16
- a.compact <=> b.compact
17
- else
18
- a ? 1 : -1
19
- end
89
+ TUPLE_SORTER = lambda do |left, right|
90
+ return -1 unless right
91
+ return 1 unless left
92
+
93
+ left = left.compact
94
+ right = right.compact
95
+ return left <=> right || 0 if left.length == right.length
96
+ left.length <=> right.length
20
97
  end
21
98
 
22
99
  def initialize context, names
23
- @groups = {}
100
+ @group_vectors = names
24
101
  @non_group_vectors = context.vectors.to_a - names
25
- @context = context
26
- vectors = names.map { |vec| context[vec].to_a }
27
- tuples = vectors[0].zip(*vectors[1..-1])
102
+
103
+ @context = context # TODO: maybe rename in @original_df
104
+
28
105
  # FIXME: It feels like we don't want to sort here. Ruby's #group_by
29
106
  # never sorts:
30
107
  #
@@ -32,24 +109,14 @@ module Daru
32
109
  # # => {4=>["test"], 2=>["me"], 6=>["please"]}
33
110
  #
34
111
  # - zverok, 2016-09-12
35
- keys = tuples.uniq.sort(&TUPLE_SORTER)
36
-
37
- keys.each do |key|
38
- @groups[key] = all_indices_for(tuples, key)
39
- end
40
- @groups.freeze
112
+ @groups_by_pos = GroupBy.get_positions_group_map_for_df(@context, @group_vectors, sort: true)
41
113
  end
42
114
 
43
115
  # Get a Daru::Vector of the size of each group.
44
116
  def size
45
- index =
46
- if multi_indexed_grouping?
47
- Daru::MultiIndex.from_tuples @groups.keys
48
- else
49
- Daru::Index.new @groups.keys.flatten
50
- end
117
+ index = get_grouped_index
51
118
 
52
- values = @groups.values.map(&:size)
119
+ values = @groups_by_pos.values.map(&:size)
53
120
  Daru::Vector.new(values, index: index, name: :size)
54
121
  end
55
122
 
@@ -196,27 +263,20 @@ module Daru
196
263
  # # a b c d
197
264
  # # 5 bar two 6 66
198
265
  def get_group group
199
- indexes = @groups[group]
266
+ indexes = groups_by_idx[group]
200
267
  elements = @context.each_vector.map(&:to_a)
201
268
  transpose = elements.transpose
202
269
  rows = indexes.each.map { |idx| transpose[idx] }
203
270
 
204
- new_index =
205
- begin
206
- @context.index[indexes]
207
- rescue IndexError
208
- indexes
209
- end
210
-
211
271
  Daru::DataFrame.rows(
212
- rows, index: new_index, order: @context.vectors
272
+ rows, index: indexes, order: @context.vectors
213
273
  )
214
274
  end
215
275
 
216
276
  # Iteratively applies a function to the values in a group and accumulates the result.
217
277
  # @param init (nil) The initial value of the accumulator.
218
- # @param block [Proc] A proc or lambda that accepts two arguments. The first argument
219
- # is the accumulated result. The second argument is a DataFrame row.
278
+ # @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
279
+ # is the accumulated result. The second argument is a DataFrame row.
220
280
  # @example Usage of reduce
221
281
  # df = Daru::DataFrame.new({
222
282
  # a: ['a','b'] * 3,
@@ -230,7 +290,7 @@ module Daru
230
290
  # # a ACE
231
291
  # # b BDF
232
292
  def reduce(init=nil)
233
- result_hash = @groups.each_with_object({}) do |(group, indices), h|
293
+ result_hash = groups_by_idx.each_with_object({}) do |(group, indices), h|
234
294
  group_indices = indices.map { |v| @context.index.to_a[v] }
235
295
 
236
296
  grouped_result = init
@@ -241,23 +301,64 @@ module Daru
241
301
  h[group] = grouped_result
242
302
  end
243
303
 
244
- index =
245
- if multi_indexed_grouping?
246
- Daru::MultiIndex.from_tuples result_hash.keys
247
- else
248
- Daru::Index.new result_hash.keys.flatten
249
- end
304
+ index = get_grouped_index(result_hash.keys)
250
305
 
251
306
  Daru::Vector.new(result_hash.values, index: index)
252
307
  end
253
308
 
309
+ def inspect
310
+ grouped_df.inspect
311
+ end
312
+
313
+ # Function to use for aggregating the data.
314
+ # `group_by` is using Daru::DataFrame#aggregate
315
+ #
316
+ # @param options [Hash] options for column, you want in resultant dataframe
317
+ #
318
+ # @return [Daru::DataFrame]
319
+ #
320
+ # @example
321
+ #
322
+ # df = Daru::DataFrame.new(
323
+ # name: ['Ram','Krishna','Ram','Krishna','Krishna'],
324
+ # visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
325
+ #
326
+ # => #<Daru::DataFrame(5x2)>
327
+ # name visited
328
+ # 0 Ram Hyderabad
329
+ # 1 Krishna Delhi
330
+ # 2 Ram Mumbai
331
+ # 3 Krishna Raipur
332
+ # 4 Krishna Banglore
333
+ #
334
+ # df.group_by(:name)
335
+ # => #<Daru::DataFrame(5x1)>
336
+ # visited
337
+ # Krishna 1 Delhi
338
+ # 3 Raipur
339
+ # 4 Banglore
340
+ # Ram 0 Hyderabad
341
+ # 2 Mumbai
342
+ #
343
+ # df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
344
+ # => #<Daru::DataFrame(2x1)>
345
+ # visited
346
+ # Krishna Delhi,Raipur,Banglore
347
+ # Ram Hyderabad,Mumbai
348
+ #
349
+ def aggregate(options={})
350
+ new_index = get_grouped_index
351
+
352
+ @context.aggregate(options) { [@groups_by_pos.values, new_index] }
353
+ end
354
+
254
355
  private
255
356
 
256
357
  def select_groups_from method, quantity
257
358
  selection = @context
258
359
  rows, indexes = [], []
259
360
 
260
- @groups.each_value do |index|
361
+ groups_by_idx.each_value do |index|
261
362
  index.send(method, quantity).each do |idx|
262
363
  rows << selection.row[idx].to_a
263
364
  indexes << idx
@@ -268,50 +369,30 @@ module Daru
268
369
  Daru::DataFrame.rows(rows, order: @context.vectors, index: indexes)
269
370
  end
270
371
 
271
- def apply_method method_type, method
272
- order = @non_group_vectors.select do |ngvec|
273
- method_type == :numeric && @context[ngvec].type == :numeric
274
- end
372
+ def select_numeric_non_group_vectors
373
+ @non_group_vectors.select { |ngvec| @context[ngvec].type == :numeric }
374
+ end
275
375
 
276
- rows = @groups.map do |_group, indexes|
277
- order.map do |ngvector|
278
- slice = @context[ngvector][*indexes]
279
- slice.is_a?(Daru::Vector) ? slice.send(method) : slice
280
- end
281
- end
376
+ def apply_method method_type, method
377
+ raise 'To implement' if method_type != :numeric
378
+ aggregation_options = select_numeric_non_group_vectors.map { |k| [k, method] }.to_h
282
379
 
283
- index = apply_method_index
284
- order = Daru::Index.new(order)
285
- Daru::DataFrame.new(rows.transpose, index: index, order: order)
380
+ aggregate(aggregation_options)
286
381
  end
287
382
 
288
- def apply_method_index
289
- if multi_indexed_grouping?
290
- Daru::MultiIndex.from_tuples(@groups.keys)
291
- else
292
- Daru::Index.new(@groups.keys.flatten)
293
- end
294
- end
383
+ def get_grouped_index(index_tuples=nil)
384
+ index_tuples = @groups_by_pos.keys if index_tuples.nil?
295
385
 
296
- def all_indices_for arry, element
297
- found, index, indexes = -1, -1, []
298
- while found
299
- found = arry[index+1..-1].index(element)
300
- if found
301
- index = index + found + 1
302
- indexes << index
303
- end
304
- end
305
- if indexes.count == 1
306
- [@context.index.at(*indexes)]
386
+ if multi_indexed_grouping?
387
+ Daru::MultiIndex.from_tuples(index_tuples)
307
388
  else
308
- @context.index.at(*indexes).to_a
389
+ Daru::Index.new(index_tuples.flatten)
309
390
  end
310
391
  end
311
392
 
312
393
  def multi_indexed_grouping?
313
- return false unless @groups.keys[0]
314
- @groups.keys[0].size > 1
394
+ return false unless @groups_by_pos.keys[0]
395
+ @groups_by_pos.keys[0].size > 1
315
396
  end
316
397
  end
317
398
  end