daru 0.1.6 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.rubocop.yml +1 -0
- data/.travis.yml +5 -0
- data/History.md +28 -0
- data/README.md +6 -0
- data/ReleasePolicy.md +20 -0
- data/daru.gemspec +4 -0
- data/lib/daru.rb +1 -2
- data/lib/daru/category.rb +15 -10
- data/lib/daru/core/group_by.rb +51 -8
- data/lib/daru/dataframe.rb +267 -28
- data/lib/daru/date_time/index.rb +1 -1
- data/lib/daru/date_time/offsets.rb +1 -1
- data/lib/daru/extensions/which_dsl.rb +55 -0
- data/lib/daru/index/categorical_index.rb +4 -4
- data/lib/daru/index/index.rb +5 -5
- data/lib/daru/index/multi_index.rb +11 -2
- data/lib/daru/io/io.rb +1 -1
- data/lib/daru/maths/arithmetic/vector.rb +38 -2
- data/lib/daru/maths/statistics/dataframe.rb +19 -19
- data/lib/daru/maths/statistics/vector.rb +225 -78
- data/lib/daru/plotting/nyaplot/dataframe.rb +11 -0
- data/lib/daru/vector.rb +55 -13
- data/lib/daru/version.rb +1 -1
- data/profile/vector_new.rb +9 -0
- data/spec/category_spec.rb +5 -1
- data/spec/core/group_by_spec.rb +128 -0
- data/spec/dataframe_spec.rb +125 -10
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/io/io_spec.rb +2 -2
- data/spec/maths/arithmetic/vector_spec.rb +18 -0
- data/spec/maths/statistics/vector_spec.rb +54 -38
- data/spec/plotting/nyaplot/dataframe_spec.rb +23 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/vector_spec.rb +39 -0
- metadata +25 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69452b32fd8ef0ef7fb4ed58ab53ffa8aa15806d
|
4
|
+
data.tar.gz: 56927c77adbe7941eb2ca9a5e44d705931aad237
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e7511133b3409f7821cfec944a950d53df57bcd5893bb8a9557c013f31bf1e4a9cc07bbe1c143c63684f00f7d8d8f1adf3b31df732508e667ba6677f47d1d96
|
7
|
+
data.tar.gz: fc4beb70106372a276b21e0da645951595e5674f56e4422752aeeabc9cc2156983add90e59486aea4d88386fbeb2896d15f7ede30667bc84027abd900ee42e0e
|
@@ -0,0 +1,18 @@
|
|
1
|
+
Heya! We are glad you are going to contribute to Daru by creating an issue, and kindly ask you to
|
2
|
+
follow the simple rules:
|
3
|
+
|
4
|
+
1. If it is a bug report, please provide **self-contained** Ruby code for reproducing the bug.
|
5
|
+
This means if Daru contributors just copy-paste the code from issue into `this-is-bug.rb` and run
|
6
|
+
`ruby this-is-bug.rb`, it will be reproduced. If the bug is hard to spot (e.g. it is not some
|
7
|
+
`NoMethodError`, but the differences in data structure), please show it with comment in code or
|
8
|
+
plain text in the issue.
|
9
|
+
2. If it is a feature request, try to do the following (if possible):
|
10
|
+
* show how new feature will work with small code example;
|
11
|
+
* explain the use case (if it is not 200% obvious);
|
12
|
+
* if you are aware of it, show how it works in pandas and/or R.
|
13
|
+
3. If it is just a question ("how to do this or that" or "why Daru does this or that") feel free to
|
14
|
+
write it in any form that is convenient to you, but remember code examples and use cases are always
|
15
|
+
welcome.
|
16
|
+
|
17
|
+
Thanks! And please remove this text when finished with your issue description :)
|
18
|
+
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
@@ -9,12 +9,17 @@ rvm:
|
|
9
9
|
- '2.4.0'
|
10
10
|
|
11
11
|
matrix:
|
12
|
+
allow_failures:
|
13
|
+
- rvm: '2.0'
|
12
14
|
fast_finish:
|
13
15
|
true
|
14
16
|
|
15
17
|
script:
|
18
|
+
- bundle add yard-junk
|
19
|
+
- bundle install
|
16
20
|
- bundle exec rspec
|
17
21
|
- bundle exec rubocop
|
22
|
+
- bundle exec yard-junk
|
18
23
|
|
19
24
|
install:
|
20
25
|
- gem install bundler
|
data/History.md
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
# 0.2.0 (31 October 2017)
|
2
|
+
* Major Enhancements
|
3
|
+
- Add `DataFrame#which` query DSL (experimental! @rainchen)
|
4
|
+
- Add `DataFrame/Vector#rolling_fillna` (@baarkerlounger)
|
5
|
+
- Add `GroupBy#aggregate` (@shekharrajak)
|
6
|
+
- Add `DataFrame#uniq` (@baarkerlounger)
|
7
|
+
|
8
|
+
* Minor Enhancements
|
9
|
+
- Allow `Vector#count` to be called without param for category type Vector (@rainchen)
|
10
|
+
- Add option to `DataFrame#vector_sum` to skip nils (@parthm)
|
11
|
+
- Add installation instructions to README.md (@koishimasato)
|
12
|
+
- Add release policy documentation (@baarkerlounger)
|
13
|
+
- Set index as DataFrame's default x axis for nyaplot (@matugm)
|
14
|
+
|
15
|
+
* Fixes
|
16
|
+
- Fix `DataFrame/Vector#to_s` when name is a symbol (@baarkerlounger)
|
17
|
+
- Force `Vector#proportions` to return float (@rainchen)
|
18
|
+
- `DataFrame#new` creates empty DataFrame when given empty hash (@parthm)
|
19
|
+
- Remove unnecessary backports dependencies (@zverok)
|
20
|
+
- Specify minimum packable dependency (@zverok)
|
21
|
+
- Preserve key/column order when creating DataFrame from hash (@baarkerlounger)
|
22
|
+
- Fix `DataFrame#add_row` for DF with multi-index (@zverok)
|
23
|
+
- Fix `Vector#min, `#max`, `#index_of_min`, `#index_of_max` (0.1.6 regression) (@athityakumar)
|
24
|
+
- Integrate yard-junk into CI (@rohitner)
|
25
|
+
- Remove Travis spec restriction (@zverok)
|
26
|
+
- Fix tuple sorting for DataFrames with nils (@baarkerlounger)
|
27
|
+
- Fix merge on index dropping default index (@rohitner)
|
28
|
+
|
1
29
|
# 0.1.6 (04 August 2017)
|
2
30
|
* Major Enhancements
|
3
31
|
- Add support for reading HTML tables into DataFrames (@athityakumar)
|
data/README.md
CHANGED
@@ -26,6 +26,12 @@ daru makes it easy and intuitive to process data predominantly through 2 data st
|
|
26
26
|
* Quickly reducing data with pivot tables for quick data summary.
|
27
27
|
* Import and export data from and to Excel, CSV, SQL Databases, ActiveRecord and plain text files.
|
28
28
|
|
29
|
+
## Installation
|
30
|
+
|
31
|
+
```console
|
32
|
+
$ gem install daru
|
33
|
+
```
|
34
|
+
|
29
35
|
## Notebooks
|
30
36
|
|
31
37
|
#### Notebooks on most use cases
|
data/ReleasePolicy.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem Release Policy
|
2
|
+
|
3
|
+
Applicable to Daru > 0.1.6
|
4
|
+
|
5
|
+
## Versioning
|
6
|
+
|
7
|
+
Daru follows semantic versioning whereby the version number is always in the form MAJOR.MINOR.PATCH
|
8
|
+
|
9
|
+
* Patch bump = Bug fixes
|
10
|
+
* Minor bump = New features but backwards compatible
|
11
|
+
* Major bump = API breaking changes
|
12
|
+
|
13
|
+
For Major and Minor bumps release candidates should be released around 2 weeks prior to the bump and are indicated by MAJOR.MINOR.0.rc.
|
14
|
+
|
15
|
+
For more information see the full semantic versioning specification at http://semver.org/.
|
16
|
+
|
17
|
+
## Release Timing
|
18
|
+
|
19
|
+
Patch releases should be done after every fix of a major bug (as tagged in the github issue tracker).
|
20
|
+
Major releases should be kept to the minimum.
|
data/daru.gemspec
CHANGED
@@ -52,6 +52,9 @@ EOF
|
|
52
52
|
|
53
53
|
spec.add_runtime_dependency 'backports'
|
54
54
|
|
55
|
+
# it is required by NMatrix, yet we want to specify clearly which minimal version is OK
|
56
|
+
spec.add_runtime_dependency 'packable', '~> 1.3.9'
|
57
|
+
|
55
58
|
spec.add_development_dependency 'spreadsheet', '~> 1.1.1'
|
56
59
|
spec.add_development_dependency 'bundler', '~> 1.10'
|
57
60
|
spec.add_development_dependency 'rake', '~>10.5'
|
@@ -75,6 +78,7 @@ EOF
|
|
75
78
|
spec.add_development_dependency 'simplecov'
|
76
79
|
spec.add_development_dependency 'gruff'
|
77
80
|
spec.add_development_dependency 'webmock'
|
81
|
+
|
78
82
|
if RUBY_VERSION < '2.1.0'
|
79
83
|
spec.add_development_dependency 'nokogiri', '<= 1.6.8.1'
|
80
84
|
else
|
data/lib/daru.rb
CHANGED
@@ -105,6 +105,7 @@ require 'date'
|
|
105
105
|
require 'daru/version.rb'
|
106
106
|
|
107
107
|
require 'open-uri'
|
108
|
+
require 'backports/2.1.0/array/to_h'
|
108
109
|
|
109
110
|
require 'daru/index/index.rb'
|
110
111
|
require 'daru/index/multi_index.rb'
|
@@ -124,5 +125,3 @@ require 'daru/core/merge.rb'
|
|
124
125
|
|
125
126
|
require 'daru/date_time/offsets.rb'
|
126
127
|
require 'daru/date_time/index.rb'
|
127
|
-
|
128
|
-
require 'backports'
|
data/lib/daru/category.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Daru
|
2
2
|
module Category # rubocop:disable Metrics/ModuleLength
|
3
|
+
UNDEFINED = Object.new.freeze
|
4
|
+
|
3
5
|
attr_accessor :base_category
|
4
6
|
attr_reader :index, :coding_scheme, :name
|
5
7
|
|
@@ -113,7 +115,7 @@ module Daru
|
|
113
115
|
end
|
114
116
|
|
115
117
|
# Associates a category to the vector.
|
116
|
-
# @param [Array]
|
118
|
+
# @param [Array] new_categories new categories to be associated
|
117
119
|
# @example
|
118
120
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
119
121
|
# dv.add_category :b
|
@@ -131,7 +133,10 @@ module Daru
|
|
131
133
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
132
134
|
# dv.count :a
|
133
135
|
# # => 2
|
134
|
-
|
136
|
+
# dv.count
|
137
|
+
# # => 5
|
138
|
+
def count category=UNDEFINED
|
139
|
+
return @cat_hash.values.map(&:size).inject(&:+) if category == UNDEFINED # count all
|
135
140
|
raise ArgumentError, "Invalid category #{category}" unless
|
136
141
|
categories.include?(category)
|
137
142
|
|
@@ -167,7 +172,7 @@ module Daru
|
|
167
172
|
end
|
168
173
|
|
169
174
|
# Returns vector for indexes/positions specified
|
170
|
-
# @param [Array]
|
175
|
+
# @param [Array] indexes for which values have to be retrieved
|
171
176
|
# @note Since it accepts both indexes and positions. In case of collision,
|
172
177
|
# argument will be treated as index
|
173
178
|
# @return vector containing values specified at specified indexes/positions
|
@@ -196,7 +201,7 @@ module Daru
|
|
196
201
|
end
|
197
202
|
|
198
203
|
# Returns vector for positions specified.
|
199
|
-
# @param [Array]
|
204
|
+
# @param [Array] positions at which values are to be retrieved.
|
200
205
|
# @return vector containing values specified at specified positions
|
201
206
|
# @example
|
202
207
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
@@ -223,7 +228,7 @@ module Daru
|
|
223
228
|
|
224
229
|
# Modifies values at specified indexes/positions.
|
225
230
|
# @note In order to add a new category you need to associate it via #add_category
|
226
|
-
# @param [Array]
|
231
|
+
# @param [Array] indexes at which to modify value
|
227
232
|
# @param [object] val value to assign at specific indexes/positions
|
228
233
|
# @return modified vector
|
229
234
|
# @example
|
@@ -584,7 +589,7 @@ module Daru
|
|
584
589
|
alias :gteq :mteq
|
585
590
|
|
586
591
|
# For querying the data
|
587
|
-
# @param [object] arel like query syntax
|
592
|
+
# @param bool_array [object] arel like query syntax
|
588
593
|
# @return [Daru::Vector] Vector which makes the conditions true
|
589
594
|
# @example
|
590
595
|
# dv = Daru::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
|
@@ -658,7 +663,7 @@ module Daru
|
|
658
663
|
end
|
659
664
|
|
660
665
|
# Check if any one of mentioned values occur in the vector
|
661
|
-
# @param [Array]
|
666
|
+
# @param [Array] values to check for
|
662
667
|
# @return [true, false] returns true if any one of specified values
|
663
668
|
# occur in the vector
|
664
669
|
# @example
|
@@ -670,7 +675,7 @@ module Daru
|
|
670
675
|
end
|
671
676
|
|
672
677
|
# Return a vector with specified values removed
|
673
|
-
# @param [Array]
|
678
|
+
# @param [Array] values to reject from resultant vector
|
674
679
|
# @return [Daru::Vector] vector with specified values removed
|
675
680
|
# @example
|
676
681
|
# dv = Daru::Vector.new [1, 2, nil, Float::NAN], type: :category
|
@@ -689,7 +694,7 @@ module Daru
|
|
689
694
|
end
|
690
695
|
|
691
696
|
# Count the number of values specified
|
692
|
-
# @param [Array]
|
697
|
+
# @param [Array] values to count for
|
693
698
|
# @return [Integer] the number of times the values mentioned occurs
|
694
699
|
# @example
|
695
700
|
# dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
|
@@ -702,7 +707,7 @@ module Daru
|
|
702
707
|
end
|
703
708
|
|
704
709
|
# Return indexes of values specified
|
705
|
-
# @param [Array]
|
710
|
+
# @param [Array] values to find indexes for
|
706
711
|
# @return [Array] array of indexes of values specified
|
707
712
|
# @example
|
708
713
|
# dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
|
data/lib/daru/core/group_by.rb
CHANGED
@@ -11,12 +11,14 @@ module Daru
|
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
|
-
TUPLE_SORTER = lambda do |
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
TUPLE_SORTER = lambda do |left, right|
|
15
|
+
return -1 unless right
|
16
|
+
return 1 unless left
|
17
|
+
|
18
|
+
left = left.compact
|
19
|
+
right = right.compact
|
20
|
+
return left <=> right || 0 if left.length == right.length
|
21
|
+
left.length <=> right.length
|
20
22
|
end
|
21
23
|
|
22
24
|
def initialize context, names
|
@@ -203,8 +205,8 @@ module Daru
|
|
203
205
|
|
204
206
|
# Iteratively applies a function to the values in a group and accumulates the result.
|
205
207
|
# @param init (nil) The initial value of the accumulator.
|
206
|
-
# @
|
207
|
-
#
|
208
|
+
# @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
|
209
|
+
# is the accumulated result. The second argument is a DataFrame row.
|
208
210
|
# @example Usage of reduce
|
209
211
|
# df = Daru::DataFrame.new({
|
210
212
|
# a: ['a','b'] * 3,
|
@@ -243,6 +245,47 @@ module Daru
|
|
243
245
|
@df.inspect
|
244
246
|
end
|
245
247
|
|
248
|
+
# Function to use for aggregating the data.
|
249
|
+
# `group_by` is using Daru::DataFrame#aggregate
|
250
|
+
#
|
251
|
+
# @param options [Hash] options for the columns you want in the resultant dataframe
|
252
|
+
#
|
253
|
+
# @return [Daru::DataFrame]
|
254
|
+
#
|
255
|
+
# @example
|
256
|
+
#
|
257
|
+
# df = Daru::DataFrame.new(
|
258
|
+
# name: ['Ram','Krishna','Ram','Krishna','Krishna'],
|
259
|
+
# visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
|
260
|
+
#
|
261
|
+
# => #<Daru::DataFrame(5x2)>
|
262
|
+
# name visited
|
263
|
+
# 0 Ram Hyderabad
|
264
|
+
# 1 Krishna Delhi
|
265
|
+
# 2 Ram Mumbai
|
266
|
+
# 3 Krishna Raipur
|
267
|
+
# 4 Krishna Banglore
|
268
|
+
#
|
269
|
+
# df.group_by(:name)
|
270
|
+
# => #<Daru::DataFrame(5x1)>
|
271
|
+
# visited
|
272
|
+
# Krishna 1 Delhi
|
273
|
+
# 3 Raipur
|
274
|
+
# 4 Banglore
|
275
|
+
# Ram 0 Hyderabad
|
276
|
+
# 2 Mumbai
|
277
|
+
#
|
278
|
+
# df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
|
279
|
+
# => #<Daru::DataFrame(2x1)>
|
280
|
+
# visited
|
281
|
+
# Krishna Delhi,Raipur,Banglore
|
282
|
+
# Ram Hyderabad,Mumbai
|
283
|
+
#
|
284
|
+
def aggregate(options={})
|
285
|
+
@df.index = @df.index.remove_layer(@df.index.levels.size - 1)
|
286
|
+
@df.aggregate(options)
|
287
|
+
end
|
288
|
+
|
246
289
|
private
|
247
290
|
|
248
291
|
def init_groups_df tuples, names
|
data/lib/daru/dataframe.rb
CHANGED
@@ -84,7 +84,7 @@ module Daru
|
|
84
84
|
# Read a dataframe from AR::Relation
|
85
85
|
#
|
86
86
|
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
87
|
-
# @
|
87
|
+
# @param fields [Array] Field names to be loaded (optional)
|
88
88
|
#
|
89
89
|
# @return A dataframe containing the data loaded from the relation
|
90
90
|
#
|
@@ -277,6 +277,17 @@ module Daru
|
|
277
277
|
# Default to *true*.
|
278
278
|
#
|
279
279
|
# == Usage
|
280
|
+
#
|
281
|
+
# df = Daru::DataFrame.new
|
282
|
+
# # =>
|
283
|
+
# # <Daru::DataFrame(0x0)>
|
284
|
+
# # Creates an empty DataFrame with no rows or columns.
|
285
|
+
#
|
286
|
+
# df = Daru::DataFrame.new({}, order: [:a, :b])
|
287
|
+
# #<Daru::DataFrame(0x2)>
|
288
|
+
# a b
|
289
|
+
# # Creates a DataFrame with no rows and columns :a and :b
|
290
|
+
#
|
280
291
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
281
292
|
# index: [:a, :b, :c, :d], name: :spider_man)
|
282
293
|
#
|
@@ -329,7 +340,7 @@ module Daru
|
|
329
340
|
# # 1 4 14 44
|
330
341
|
# # 2 5 15 55
|
331
342
|
|
332
|
-
def initialize source, opts={} # rubocop:disable Metrics/MethodLength
|
343
|
+
def initialize source={}, opts={} # rubocop:disable Metrics/MethodLength
|
333
344
|
vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
|
334
345
|
@data = []
|
335
346
|
@name = opts[:name]
|
@@ -375,7 +386,7 @@ module Daru
|
|
375
386
|
end
|
376
387
|
|
377
388
|
# Retrieve rows by positions
|
378
|
-
# @param [Array<Integer>]
|
389
|
+
# @param [Array<Integer>] positions of rows to retrieve
|
379
390
|
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
380
391
|
# @example
|
381
392
|
# df = Daru::DataFrame.new({
|
@@ -405,7 +416,7 @@ module Daru
|
|
405
416
|
|
406
417
|
# Set rows by positions
|
407
418
|
# @param [Array<Integer>] positions positions of rows to set
|
408
|
-
# @
|
419
|
+
# @param [Array, Daru::Vector] vector vector to be assigned
|
409
420
|
# @example
|
410
421
|
# df = Daru::DataFrame.new({
|
411
422
|
# a: [1, 2, 3],
|
@@ -438,7 +449,7 @@ module Daru
|
|
438
449
|
end
|
439
450
|
|
440
451
|
# Retrieve vectors by positions
|
441
|
-
# @param [Array<Integer>]
|
452
|
+
# @param [Array<Integer>] positions of vectors to retrieve
|
442
453
|
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
443
454
|
# @example
|
444
455
|
# df = Daru::DataFrame.new({
|
@@ -522,7 +533,7 @@ module Daru
|
|
522
533
|
end
|
523
534
|
|
524
535
|
def add_row row, index=nil
|
525
|
-
self.row[index || @size] = row
|
536
|
+
self.row[*(index || @size)] = row
|
526
537
|
end
|
527
538
|
|
528
539
|
def add_vector n, vector
|
@@ -597,7 +608,7 @@ module Daru
|
|
597
608
|
|
598
609
|
# Returns a dataframe in which rows with any of the mentioned values
|
599
610
|
# are ignored.
|
600
|
-
# @param [Array]
|
611
|
+
# @param [Array] values to reject to form the new dataframe
|
601
612
|
# @return [Daru::DataFrame] Data Frame with only rows which doesn't
|
602
613
|
# contain the mentioned values
|
603
614
|
# @example
|
@@ -650,6 +661,88 @@ module Daru
|
|
650
661
|
self
|
651
662
|
end
|
652
663
|
|
664
|
+
# Rolling fillna
|
665
|
+
# replace all Float::NAN and nil values with the preceding or following value
|
666
|
+
#
|
667
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
668
|
+
#
|
669
|
+
# @example
|
670
|
+
# df = Daru::DataFrame.new({
|
671
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
672
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
673
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
674
|
+
# })
|
675
|
+
#
|
676
|
+
# => #<Daru::DataFrame(8x3)>
|
677
|
+
# a b c
|
678
|
+
# 0 1 a a
|
679
|
+
# 1 2 b NaN
|
680
|
+
# 2 3 nil 3
|
681
|
+
# 3 nil NaN 4
|
682
|
+
# 4 NaN nil 3
|
683
|
+
# 5 nil 3 5
|
684
|
+
# 6 1 5 nil
|
685
|
+
# 7 7 nil 7
|
686
|
+
#
|
687
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
688
|
+
# => #<Daru::DataFrame(8x3)>
|
689
|
+
# a b c
|
690
|
+
# 0 1 a a
|
691
|
+
# 1 2 b a
|
692
|
+
# 2 3 b 3
|
693
|
+
# 3 3 b 4
|
694
|
+
# 4 3 b 3
|
695
|
+
# 5 3 3 5
|
696
|
+
# 6 1 5 5
|
697
|
+
# 7 7 5 7
|
698
|
+
#
|
699
|
+
def rolling_fillna!(direction=:forward)
|
700
|
+
@data.each { |vec| vec.rolling_fillna!(direction) }
|
701
|
+
end
|
702
|
+
|
703
|
+
def rolling_fillna(direction=:forward)
|
704
|
+
dup.rolling_fillna!(direction)
|
705
|
+
end
|
706
|
+
|
707
|
+
# Return unique rows by vector specified or all vectors
|
708
|
+
#
|
709
|
+
# @param vtrs [String, Symbol] vector name(s) that should be considered
|
710
|
+
#
|
711
|
+
# @example
|
712
|
+
#
|
713
|
+
# => #<Daru::DataFrame(6x2)>
|
714
|
+
# a b
|
715
|
+
# 0 1 a
|
716
|
+
# 1 2 b
|
717
|
+
# 2 3 c
|
718
|
+
# 3 4 d
|
719
|
+
# 2 3 c
|
720
|
+
# 3 4 f
|
721
|
+
#
|
722
|
+
# 2.3.3 :> df.uniq
|
723
|
+
# => #<Daru::DataFrame(5x2)>
|
724
|
+
# a b
|
725
|
+
# 0 1 a
|
726
|
+
# 1 2 b
|
727
|
+
# 2 3 c
|
728
|
+
# 3 4 d
|
729
|
+
# 3 4 f
|
730
|
+
#
|
731
|
+
# 2.3.3 :> df.uniq(:a)
|
732
|
+
# => #<Daru::DataFrame(4x2)>
|
733
|
+
# a b
|
734
|
+
# 0 1 a
|
735
|
+
# 1 2 b
|
736
|
+
# 2 3 c
|
737
|
+
# 3 4 d
|
738
|
+
#
|
739
|
+
def uniq(*vtrs)
|
740
|
+
vecs = vtrs.empty? ? vectors.map(&:to_s) : Array(vtrs)
|
741
|
+
grouped = group_by(vecs)
|
742
|
+
indexes = grouped.groups.values.map { |v| v[0] }.sort
|
743
|
+
row[*indexes]
|
744
|
+
end
|
745
|
+
|
653
746
|
# Iterate over each index of the DataFrame.
|
654
747
|
def each_index &block
|
655
748
|
return to_enum(:each_index) unless block_given?
|
@@ -1024,9 +1117,9 @@ module Daru
|
|
1024
1117
|
dup.tap { |df| df.keep_vector_if(&block) }
|
1025
1118
|
end
|
1026
1119
|
|
1027
|
-
# Test each row with one or more tests.
|
1028
|
-
#
|
1029
|
-
#
|
1120
|
+
# Test each row with one or more tests.
|
1121
|
+
# @param tests [Proc] Each test is a Proc with the form
|
1122
|
+
# *Proc.new {|row| row[:age] > 0}*
|
1030
1123
|
# The function returns an array with all errors.
|
1031
1124
|
#
|
1032
1125
|
# FIXME: description here is too sparse. As far as I can get,
|
@@ -1128,7 +1221,7 @@ module Daru
|
|
1128
1221
|
deprecate :flawed?, :include_values?, 2016, 10
|
1129
1222
|
|
1130
1223
|
# Check if any of given values occur in the data frame
|
1131
|
-
# @param [Array]
|
1224
|
+
# @param [Array] values to check for
|
1132
1225
|
# @return [true, false] true if any of the given values occur in the
|
1133
1226
|
# dataframe, false otherwise
|
1134
1227
|
# @example
|
@@ -1259,13 +1352,60 @@ module Daru
|
|
1259
1352
|
|
1260
1353
|
alias :last :tail
|
1261
1354
|
|
1262
|
-
#
|
1263
|
-
#
|
1264
|
-
|
1355
|
+
# Sum all numeric/specified vectors in the DataFrame.
|
1356
|
+
#
|
1357
|
+
# Returns a new vector containing the sum of all numeric
|
1358
|
+
# or specified vectors of the DataFrame. By default, if the vector
|
1359
|
+
# contains a nil, the sum is nil.
|
1360
|
+
# With :skipnil argument set to true, nil values are assumed to be
|
1361
|
+
# 0 (zero) and the sum vector is returned.
|
1362
|
+
#
|
1363
|
+
# @param args [Array] List of vectors to sum. Default is nil in which case
|
1364
|
+
# all numeric vectors are summed.
|
1365
|
+
#
|
1366
|
+
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
1367
|
+
#
|
1368
|
+
# @return Vector with sum of all vectors specified in the argument.
|
1369
|
+
# If the vecs parameter is empty, sums all numeric vectors.
|
1370
|
+
#
|
1371
|
+
# @example
|
1372
|
+
# df = Daru::DataFrame.new({
|
1373
|
+
# a: [1, 2, nil],
|
1374
|
+
# b: [2, 1, 3],
|
1375
|
+
# c: [1, 1, 1]
|
1376
|
+
# })
|
1377
|
+
# => #<Daru::DataFrame(3x3)>
|
1378
|
+
# a b c
|
1379
|
+
# 0 1 2 1
|
1380
|
+
# 1 2 1 1
|
1381
|
+
# 2 nil 3 1
|
1382
|
+
# df.vector_sum [:a, :c]
|
1383
|
+
# => #<Daru::Vector(3)>
|
1384
|
+
# 0 2
|
1385
|
+
# 1 3
|
1386
|
+
# 2 nil
|
1387
|
+
# df.vector_sum
|
1388
|
+
# => #<Daru::Vector(3)>
|
1389
|
+
# 0 4
|
1390
|
+
# 1 4
|
1391
|
+
# 2 nil
|
1392
|
+
# df.vector_sum skipnil: true
|
1393
|
+
# => #<Daru::Vector(3)>
|
1394
|
+
# c
|
1395
|
+
# 0 4
|
1396
|
+
# 1 4
|
1397
|
+
# 2 4
|
1398
|
+
#
|
1399
|
+
def vector_sum(*args)
|
1400
|
+
defaults = {vecs: nil, skipnil: false}
|
1401
|
+
options = args.last.is_a?(::Hash) ? args.pop : {}
|
1402
|
+
options = defaults.merge(options)
|
1403
|
+
vecs = args[0] || options[:vecs]
|
1404
|
+
skipnil = args[1] || options[:skipnil]
|
1405
|
+
|
1265
1406
|
vecs ||= numeric_vectors
|
1266
1407
|
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
|
1267
|
-
|
1268
|
-
vecs.inject(sum) { |memo, n| memo + self[n] }
|
1408
|
+
vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
|
1269
1409
|
end
|
1270
1410
|
|
1271
1411
|
# Calculate mean of the rows of the dataframe.
|
@@ -1427,7 +1567,7 @@ module Daru
|
|
1427
1567
|
|
1428
1568
|
# Reassign vectors with a new index of type Daru::Index or any of its subclasses.
|
1429
1569
|
#
|
1430
|
-
# @param [Daru::Index] idx The new index object on which the vectors are to
|
1570
|
+
# @param new_index [Daru::Index] The new index object on which the vectors are to
|
1431
1571
|
# be indexed. Must of the same size as ncols.
|
1432
1572
|
# @example Reassigning vectors of a DataFrame
|
1433
1573
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
@@ -1513,9 +1653,9 @@ module Daru
|
|
1513
1653
|
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
1514
1654
|
# vectors, with or without a block.
|
1515
1655
|
#
|
1516
|
-
# @param
|
1656
|
+
# @param vector_order [Array] The order of vector names in which the DataFrame
|
1517
1657
|
# should be sorted.
|
1518
|
-
# @param [Hash] opts The options to sort with.
|
1658
|
+
# @param opts [Hash] The options to sort with.
|
1519
1659
|
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
1520
1660
|
# or descending order. Specify Array corresponding to *order* for multiple
|
1521
1661
|
# sort orders.
|
@@ -1684,12 +1824,11 @@ module Daru
|
|
1684
1824
|
|
1685
1825
|
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1686
1826
|
new_fields = ArrayHelper.recode_repeated(new_fields)
|
1687
|
-
|
1688
1827
|
DataFrame.new({}, order: new_fields).tap do |df_new|
|
1689
1828
|
(0...nrows).each do |i|
|
1690
1829
|
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
1691
1830
|
end
|
1692
|
-
|
1831
|
+
df_new.index = @index if @index == other_df.index
|
1693
1832
|
df_new.update
|
1694
1833
|
end
|
1695
1834
|
end
|
@@ -2035,7 +2174,7 @@ module Daru
|
|
2035
2174
|
end
|
2036
2175
|
|
2037
2176
|
# Converts the specified non category type vectors to category type vectors
|
2038
|
-
# @param [Array]
|
2177
|
+
# @param [Array] names of non category type vectors to be converted
|
2039
2178
|
# @return [Daru::DataFrame] data frame in which specified vectors have been
|
2040
2179
|
# converted to category type
|
2041
2180
|
# @example
|
@@ -2126,8 +2265,88 @@ module Daru
|
|
2126
2265
|
res
|
2127
2266
|
end
|
2128
2267
|
|
2268
|
+
# Function to use for aggregating the data.
|
2269
|
+
#
|
2270
|
+
# @param options [Hash] options for the columns you want in the resultant dataframe
|
2271
|
+
#
|
2272
|
+
# @return [Daru::DataFrame]
|
2273
|
+
#
|
2274
|
+
# @example
|
2275
|
+
# df = Daru::DataFrame.new(
|
2276
|
+
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
2277
|
+
# => #<Daru::DataFrame(5x2)>
|
2278
|
+
# col num
|
2279
|
+
# 0 a 52
|
2280
|
+
# 1 b 12
|
2281
|
+
# 2 c 7
|
2282
|
+
# 3 d 17
|
2283
|
+
# 4 e 1
|
2284
|
+
#
|
2285
|
+
# df.aggregate(num_100_times: ->(df) { df.num*100 })
|
2286
|
+
# => #<Daru::DataFrame(5x1)>
|
2287
|
+
# num_100_ti
|
2288
|
+
# 0 5200
|
2289
|
+
# 1 1200
|
2290
|
+
# 2 700
|
2291
|
+
# 3 1700
|
2292
|
+
# 4 100
|
2293
|
+
#
|
2294
|
+
# When we have duplicate index :
|
2295
|
+
#
|
2296
|
+
# idx = Daru::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
2297
|
+
# df = Daru::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
2298
|
+
# => #<Daru::DataFrame(5x1)>
|
2299
|
+
# num
|
2300
|
+
# a 52
|
2301
|
+
# b 12
|
2302
|
+
# a 7
|
2303
|
+
# a 17
|
2304
|
+
# c 1
|
2305
|
+
#
|
2306
|
+
# df.aggregate(num: :mean)
|
2307
|
+
# => #<Daru::DataFrame(3x1)>
|
2308
|
+
# num
|
2309
|
+
# a 25.3333333
|
2310
|
+
# b 12
|
2311
|
+
# c 1
|
2312
|
+
#
|
2313
|
+
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
2314
|
+
# internally.
|
2315
|
+
def aggregate(options={})
|
2316
|
+
colmn_value, index_tuples = aggregated_colmn_value(options)
|
2317
|
+
Daru::DataFrame.new(
|
2318
|
+
colmn_value, index: index_tuples, order: options.keys
|
2319
|
+
)
|
2320
|
+
end
|
2321
|
+
|
2129
2322
|
private
|
2130
2323
|
|
2324
|
+
# Do the `method` (`method` can be :sum, :mean, :std, :median, etc or
|
2325
|
+
# lambda), on the column.
|
2326
|
+
def apply_method_on_colmns colmn, index_tuples, method
|
2327
|
+
rows = []
|
2328
|
+
index_tuples.each do |indexes|
|
2329
|
+
# If single element then also make it vector.
|
2330
|
+
slice = Daru::Vector.new(Array(self[colmn][*indexes]))
|
2331
|
+
case method
|
2332
|
+
when Symbol
|
2333
|
+
rows << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
|
2334
|
+
when Proc
|
2335
|
+
rows << method.call(slice)
|
2336
|
+
end
|
2337
|
+
end
|
2338
|
+
rows
|
2339
|
+
end
|
2340
|
+
|
2341
|
+
def apply_method_on_df index_tuples, method
|
2342
|
+
rows = []
|
2343
|
+
index_tuples.each do |indexes|
|
2344
|
+
slice = row[*indexes]
|
2345
|
+
rows << method.call(slice)
|
2346
|
+
end
|
2347
|
+
rows
|
2348
|
+
end
|
2349
|
+
|
2131
2350
|
def headers
|
2132
2351
|
Daru::Index.new(Array(index.name) + @vectors.to_a)
|
2133
2352
|
end
|
@@ -2224,9 +2443,7 @@ module Daru
|
|
2224
2443
|
rescue IndexError
|
2225
2444
|
raise IndexError, "Specified vector #{names.first} does not exist"
|
2226
2445
|
end
|
2227
|
-
|
2228
2446
|
return @data[pos] if pos.is_a?(Numeric)
|
2229
|
-
|
2230
2447
|
names = pos
|
2231
2448
|
end
|
2232
2449
|
|
@@ -2396,7 +2613,7 @@ module Daru
|
|
2396
2613
|
end
|
2397
2614
|
|
2398
2615
|
def create_vectors_index_with vectors, source
|
2399
|
-
vectors = source.keys
|
2616
|
+
vectors = source.keys if vectors.nil?
|
2400
2617
|
|
2401
2618
|
@vectors =
|
2402
2619
|
if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
|
@@ -2443,9 +2660,7 @@ module Daru
|
|
2443
2660
|
@index = Index.coerce(index || source[0].size)
|
2444
2661
|
@vectors = Index.coerce(vectors)
|
2445
2662
|
|
2446
|
-
|
2447
|
-
Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
|
2448
|
-
end
|
2663
|
+
update_data source, vectors
|
2449
2664
|
end
|
2450
2665
|
|
2451
2666
|
def initialize_from_array_of_vectors source, vectors, index, opts
|
@@ -2694,6 +2909,30 @@ module Daru
|
|
2694
2909
|
end
|
2695
2910
|
end
|
2696
2911
|
|
2912
|
+
def update_data source, vectors
|
2913
|
+
@data = @vectors.each_with_index.map do |_vec,idx|
|
2914
|
+
Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
|
2915
|
+
end
|
2916
|
+
end
|
2917
|
+
|
2918
|
+
def aggregated_colmn_value(options)
|
2919
|
+
colmn_value = []
|
2920
|
+
index_tuples = Array(@index).uniq
|
2921
|
+
options.keys.each do |vec|
|
2922
|
+
do_this_on_vec = options[vec]
|
2923
|
+
colmn_value << if @vectors.include?(vec)
|
2924
|
+
apply_method_on_colmns(
|
2925
|
+
vec, index_tuples, do_this_on_vec
|
2926
|
+
)
|
2927
|
+
else
|
2928
|
+
apply_method_on_df(
|
2929
|
+
index_tuples, do_this_on_vec
|
2930
|
+
)
|
2931
|
+
end
|
2932
|
+
end
|
2933
|
+
[colmn_value, index_tuples]
|
2934
|
+
end
|
2935
|
+
|
2697
2936
|
# coerce ranges, integers and array in appropriate ways
|
2698
2937
|
def coerce_positions *positions, size
|
2699
2938
|
if positions.size == 1
|