daru 0.1.6 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE.md +18 -0
- data/.rubocop.yml +1 -0
- data/.travis.yml +5 -0
- data/History.md +28 -0
- data/README.md +6 -0
- data/ReleasePolicy.md +20 -0
- data/daru.gemspec +4 -0
- data/lib/daru.rb +1 -2
- data/lib/daru/category.rb +15 -10
- data/lib/daru/core/group_by.rb +51 -8
- data/lib/daru/dataframe.rb +267 -28
- data/lib/daru/date_time/index.rb +1 -1
- data/lib/daru/date_time/offsets.rb +1 -1
- data/lib/daru/extensions/which_dsl.rb +55 -0
- data/lib/daru/index/categorical_index.rb +4 -4
- data/lib/daru/index/index.rb +5 -5
- data/lib/daru/index/multi_index.rb +11 -2
- data/lib/daru/io/io.rb +1 -1
- data/lib/daru/maths/arithmetic/vector.rb +38 -2
- data/lib/daru/maths/statistics/dataframe.rb +19 -19
- data/lib/daru/maths/statistics/vector.rb +225 -78
- data/lib/daru/plotting/nyaplot/dataframe.rb +11 -0
- data/lib/daru/vector.rb +55 -13
- data/lib/daru/version.rb +1 -1
- data/profile/vector_new.rb +9 -0
- data/spec/category_spec.rb +5 -1
- data/spec/core/group_by_spec.rb +128 -0
- data/spec/dataframe_spec.rb +125 -10
- data/spec/extensions/which_dsl_spec.rb +38 -0
- data/spec/fixtures/duplicates.csv +32 -0
- data/spec/io/io_spec.rb +2 -2
- data/spec/maths/arithmetic/vector_spec.rb +18 -0
- data/spec/maths/statistics/vector_spec.rb +54 -38
- data/spec/plotting/nyaplot/dataframe_spec.rb +23 -0
- data/spec/spec_helper.rb +1 -1
- data/spec/vector_spec.rb +39 -0
- metadata +25 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69452b32fd8ef0ef7fb4ed58ab53ffa8aa15806d
|
4
|
+
data.tar.gz: 56927c77adbe7941eb2ca9a5e44d705931aad237
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e7511133b3409f7821cfec944a950d53df57bcd5893bb8a9557c013f31bf1e4a9cc07bbe1c143c63684f00f7d8d8f1adf3b31df732508e667ba6677f47d1d96
|
7
|
+
data.tar.gz: fc4beb70106372a276b21e0da645951595e5674f56e4422752aeeabc9cc2156983add90e59486aea4d88386fbeb2896d15f7ede30667bc84027abd900ee42e0e
|
@@ -0,0 +1,18 @@
|
|
1
|
+
Heya! We are glad you are going to contribute to Daru by creating an issue, and kindly ask you to
|
2
|
+
follow the simple rules:
|
3
|
+
|
4
|
+
1. If it is a bug report, please provide **self-contained** Ruby code for reproducing the bug.
|
5
|
+
This means if Daru contributors just copy-paste the code from issue into `this-is-bug.rb` and run
|
6
|
+
`ruby this-is-bug.rb`, it will be reproduced. If the bug is hard to spot (e.g. it is not some
|
7
|
+
`NoMethodError`, but the differences in data structure), please show it with comment in code or
|
8
|
+
plain text in the issue.
|
9
|
+
2. If it is a feature request, try to do the following (if possible):
|
10
|
+
* show how new feature will work with small code example;
|
11
|
+
* explain the use case (if it is not 200% obvious);
|
12
|
+
* if you are aware of it, show how it works in pandas and/or R.
|
13
|
+
3. If it is just a question ("how to do this or that" or "why Daru does this or that") feel free to
|
14
|
+
write it in any form that is convenient to you, but remember code examples and use cases are always
|
15
|
+
welcome.
|
16
|
+
|
17
|
+
Thanks! And please remove this text when finished with your issue description :)
|
18
|
+
|
data/.rubocop.yml
CHANGED
data/.travis.yml
CHANGED
@@ -9,12 +9,17 @@ rvm:
|
|
9
9
|
- '2.4.0'
|
10
10
|
|
11
11
|
matrix:
|
12
|
+
allow_failures:
|
13
|
+
- rvm: '2.0'
|
12
14
|
fast_finish:
|
13
15
|
true
|
14
16
|
|
15
17
|
script:
|
18
|
+
- bundle add yard-junk
|
19
|
+
- bundle install
|
16
20
|
- bundle exec rspec
|
17
21
|
- bundle exec rubocop
|
22
|
+
- bundle exec yard-junk
|
18
23
|
|
19
24
|
install:
|
20
25
|
- gem install bundler
|
data/History.md
CHANGED
@@ -1,3 +1,31 @@
|
|
1
|
+
# 0.2.0 (31 October 2017)
|
2
|
+
* Major Enhancements
|
3
|
+
- Add `DataFrame#which` query DSL (experimental! @rainchen)
|
4
|
+
- Add `DataFrame/Vector#rolling_fillna` (@baarkerlounger)
|
5
|
+
- Add `GroupBy#aggregate` (@shekharrajak)
|
6
|
+
- Add `DataFrame#uniq` (@baarkerlounger)
|
7
|
+
|
8
|
+
* Minor Enhancements
|
9
|
+
- Allow `Vector#count` to be called without param for category type Vector (@rainchen)
|
10
|
+
- Add option to `DataFrame#vector_sum` to skip nils (@parthm)
|
11
|
+
- Add installation instructions to README.md (@koishimasato)
|
12
|
+
- Add release policy documentation (@baarkerlounger)
|
13
|
+
- Set index as DataFrame's default x axis for nyaplot (@matugm)
|
14
|
+
|
15
|
+
* Fixes
|
16
|
+
- Fix `DataFrame/Vector#to_s` when name is a symbol (@baarkerlounger)
|
17
|
+
- Force `Vector#proportions` to return float (@rainchen)
|
18
|
+
- `DataFrame#new` creates empty DataFrame when given empty hash (@parthm)
|
19
|
+
- Remove unnecessary backports dependencies (@zverok)
|
20
|
+
- Specify minimum packable dependency (@zverok)
|
21
|
+
- Preserve key/column order when creating DataFrame from hash (@baarkerlounger)
|
22
|
+
- Fix `DataFrame#add_row` for DF with multi-index (@zverok)
|
23
|
+
- Fix `Vector#min`, `#max`, `#index_of_min`, `#index_of_max` (0.1.6 regression) (@athityakumar)
|
24
|
+
- Integrate yard-junk into CI (@rohitner)
|
25
|
+
- Remove Travis spec restriction (@zverok)
|
26
|
+
- Fix tuple sorting for DataFrames with nils (@baarkerlounger)
|
27
|
+
- Fix merge on index dropping default index (@rohitner)
|
28
|
+
|
1
29
|
# 0.1.6 (04 August 2017)
|
2
30
|
* Major Enhancements
|
3
31
|
- Add support for reading HTML tables into DataFrames (@athityakumar)
|
data/README.md
CHANGED
@@ -26,6 +26,12 @@ daru makes it easy and intuitive to process data predominantly through 2 data st
|
|
26
26
|
* Quickly reducing data with pivot tables for quick data summary.
|
27
27
|
* Import and export data from and to Excel, CSV, SQL Databases, ActiveRecord and plain text files.
|
28
28
|
|
29
|
+
## Installation
|
30
|
+
|
31
|
+
```console
|
32
|
+
$ gem install daru
|
33
|
+
```
|
34
|
+
|
29
35
|
## Notebooks
|
30
36
|
|
31
37
|
#### Notebooks on most use cases
|
data/ReleasePolicy.md
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# Gem Release Policy
|
2
|
+
|
3
|
+
Applicable to Daru > 0.1.6
|
4
|
+
|
5
|
+
## Versioning
|
6
|
+
|
7
|
+
Daru follows semantic versioning whereby the version number is always in the form MAJOR.MINOR.PATCH
|
8
|
+
|
9
|
+
* Patch bump = Bug fixes
|
10
|
+
* Minor bump = New features but backwards compatible
|
11
|
+
* Major bump = API breaking changes
|
12
|
+
|
13
|
+
For Major and Minor bumps, release candidates should be released around 2 weeks prior to the bump and are indicated by MAJOR.MINOR.0.rc.
|
14
|
+
|
15
|
+
For more information see the full semantic versioning specification at http://semver.org/.
|
16
|
+
|
17
|
+
## Release Timing
|
18
|
+
|
19
|
+
Patch releases should be done after every fix of a major bug (as tagged in the github issue tracker).
|
20
|
+
Major releases should be kept to the minimum.
|
data/daru.gemspec
CHANGED
@@ -52,6 +52,9 @@ EOF
|
|
52
52
|
|
53
53
|
spec.add_runtime_dependency 'backports'
|
54
54
|
|
55
|
+
# it is required by NMatrix, yet we want to specify clearly which minimal version is OK
|
56
|
+
spec.add_runtime_dependency 'packable', '~> 1.3.9'
|
57
|
+
|
55
58
|
spec.add_development_dependency 'spreadsheet', '~> 1.1.1'
|
56
59
|
spec.add_development_dependency 'bundler', '~> 1.10'
|
57
60
|
spec.add_development_dependency 'rake', '~>10.5'
|
@@ -75,6 +78,7 @@ EOF
|
|
75
78
|
spec.add_development_dependency 'simplecov'
|
76
79
|
spec.add_development_dependency 'gruff'
|
77
80
|
spec.add_development_dependency 'webmock'
|
81
|
+
|
78
82
|
if RUBY_VERSION < '2.1.0'
|
79
83
|
spec.add_development_dependency 'nokogiri', '<= 1.6.8.1'
|
80
84
|
else
|
data/lib/daru.rb
CHANGED
@@ -105,6 +105,7 @@ require 'date'
|
|
105
105
|
require 'daru/version.rb'
|
106
106
|
|
107
107
|
require 'open-uri'
|
108
|
+
require 'backports/2.1.0/array/to_h'
|
108
109
|
|
109
110
|
require 'daru/index/index.rb'
|
110
111
|
require 'daru/index/multi_index.rb'
|
@@ -124,5 +125,3 @@ require 'daru/core/merge.rb'
|
|
124
125
|
|
125
126
|
require 'daru/date_time/offsets.rb'
|
126
127
|
require 'daru/date_time/index.rb'
|
127
|
-
|
128
|
-
require 'backports'
|
data/lib/daru/category.rb
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
module Daru
|
2
2
|
module Category # rubocop:disable Metrics/ModuleLength
|
3
|
+
UNDEFINED = Object.new.freeze
|
4
|
+
|
3
5
|
attr_accessor :base_category
|
4
6
|
attr_reader :index, :coding_scheme, :name
|
5
7
|
|
@@ -113,7 +115,7 @@ module Daru
|
|
113
115
|
end
|
114
116
|
|
115
117
|
# Associates a category to the vector.
|
116
|
-
# @param [Array]
|
118
|
+
# @param [Array] new_categories new categories to be associated
|
117
119
|
# @example
|
118
120
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
119
121
|
# dv.add_category :b
|
@@ -131,7 +133,10 @@ module Daru
|
|
131
133
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
132
134
|
# dv.count :a
|
133
135
|
# # => 2
|
134
|
-
|
136
|
+
# dv.count
|
137
|
+
# # => 5
|
138
|
+
def count category=UNDEFINED
|
139
|
+
return @cat_hash.values.map(&:size).inject(&:+) if category == UNDEFINED # count all
|
135
140
|
raise ArgumentError, "Invalid category #{category}" unless
|
136
141
|
categories.include?(category)
|
137
142
|
|
@@ -167,7 +172,7 @@ module Daru
|
|
167
172
|
end
|
168
173
|
|
169
174
|
# Returns vector for indexes/positions specified
|
170
|
-
# @param [Array]
|
175
|
+
# @param [Array] indexes for which values have to be retrieved
|
171
176
|
# @note Since it accepts both indexes and positions. In case of collision,
|
172
177
|
# arguement will be treated as index
|
173
178
|
# @return vector containing values specified at specified indexes/positions
|
@@ -196,7 +201,7 @@ module Daru
|
|
196
201
|
end
|
197
202
|
|
198
203
|
# Returns vector for positions specified.
|
199
|
-
# @param [Array]
|
204
|
+
# @param [Array] positions at which values are to be retrieved.
|
200
205
|
# @return vector containing values specified at specified positions
|
201
206
|
# @example
|
202
207
|
# dv = Daru::Vector.new [:a, 1, :a, 1, :c], type: :category
|
@@ -223,7 +228,7 @@ module Daru
|
|
223
228
|
|
224
229
|
# Modifies values at specified indexes/positions.
|
225
230
|
# @note In order to add a new category you need to associate it via #add_category
|
226
|
-
# @param [Array]
|
231
|
+
# @param [Array] indexes at which to modify value
|
227
232
|
# @param [object] val value to assign at specific indexes/positions
|
228
233
|
# @return modified vector
|
229
234
|
# @example
|
@@ -584,7 +589,7 @@ module Daru
|
|
584
589
|
alias :gteq :mteq
|
585
590
|
|
586
591
|
# For querying the data
|
587
|
-
# @param [object] arel like query syntax
|
592
|
+
# @param bool_array [object] arel like query syntax
|
588
593
|
# @return [Daru::Vector] Vector which makes the conditions true
|
589
594
|
# @example
|
590
595
|
# dv = Daru::Vector.new ['I', 'II', 'I', 'III', 'I', 'II'],
|
@@ -658,7 +663,7 @@ module Daru
|
|
658
663
|
end
|
659
664
|
|
660
665
|
# Check if any one of mentioned values occur in the vector
|
661
|
-
# @param [Array]
|
666
|
+
# @param [Array] values to check for
|
662
667
|
# @return [true, false] returns true if any one of specified values
|
663
668
|
# occur in the vector
|
664
669
|
# @example
|
@@ -670,7 +675,7 @@ module Daru
|
|
670
675
|
end
|
671
676
|
|
672
677
|
# Return a vector with specified values removed
|
673
|
-
# @param [Array]
|
678
|
+
# @param [Array] values to reject from resultant vector
|
674
679
|
# @return [Daru::Vector] vector with specified values removed
|
675
680
|
# @example
|
676
681
|
# dv = Daru::Vector.new [1, 2, nil, Float::NAN], type: :category
|
@@ -689,7 +694,7 @@ module Daru
|
|
689
694
|
end
|
690
695
|
|
691
696
|
# Count the number of values specified
|
692
|
-
# @param [Array]
|
697
|
+
# @param [Array] values to count for
|
693
698
|
# @return [Integer] the number of times the values mentioned occurs
|
694
699
|
# @example
|
695
700
|
# dv = Daru::Vector.new [1, 2, 1, 2, 3, 4, nil, nil]
|
@@ -702,7 +707,7 @@ module Daru
|
|
702
707
|
end
|
703
708
|
|
704
709
|
# Return indexes of values specified
|
705
|
-
# @param [Array]
|
710
|
+
# @param [Array] values to find indexes for
|
706
711
|
# @return [Array] array of indexes of values specified
|
707
712
|
# @example
|
708
713
|
# dv = Daru::Vector.new [1, 2, nil, Float::NAN], index: 11..14
|
data/lib/daru/core/group_by.rb
CHANGED
@@ -11,12 +11,14 @@ module Daru
|
|
11
11
|
end
|
12
12
|
end
|
13
13
|
|
14
|
-
TUPLE_SORTER = lambda do |
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
14
|
+
TUPLE_SORTER = lambda do |left, right|
|
15
|
+
return -1 unless right
|
16
|
+
return 1 unless left
|
17
|
+
|
18
|
+
left = left.compact
|
19
|
+
right = right.compact
|
20
|
+
return left <=> right || 0 if left.length == right.length
|
21
|
+
left.length <=> right.length
|
20
22
|
end
|
21
23
|
|
22
24
|
def initialize context, names
|
@@ -203,8 +205,8 @@ module Daru
|
|
203
205
|
|
204
206
|
# Iteratively applies a function to the values in a group and accumulates the result.
|
205
207
|
# @param init (nil) The initial value of the accumulator.
|
206
|
-
# @
|
207
|
-
#
|
208
|
+
# @yieldparam block [Proc] A proc or lambda that accepts two arguments. The first argument
|
209
|
+
# is the accumulated result. The second argument is a DataFrame row.
|
208
210
|
# @example Usage of reduce
|
209
211
|
# df = Daru::DataFrame.new({
|
210
212
|
# a: ['a','b'] * 3,
|
@@ -243,6 +245,47 @@ module Daru
|
|
243
245
|
@df.inspect
|
244
246
|
end
|
245
247
|
|
248
|
+
# Function to use for aggregating the data.
|
249
|
+
# `group_by` is using Daru::DataFrame#aggregate
|
250
|
+
#
|
251
|
+
# @param options [Hash] options for the columns you want in the resultant dataframe
|
252
|
+
#
|
253
|
+
# @return [Daru::DataFrame]
|
254
|
+
#
|
255
|
+
# @example
|
256
|
+
#
|
257
|
+
# df = Daru::DataFrame.new(
|
258
|
+
# name: ['Ram','Krishna','Ram','Krishna','Krishna'],
|
259
|
+
# visited: ['Hyderabad', 'Delhi', 'Mumbai', 'Raipur', 'Banglore'])
|
260
|
+
#
|
261
|
+
# => #<Daru::DataFrame(5x2)>
|
262
|
+
# name visited
|
263
|
+
# 0 Ram Hyderabad
|
264
|
+
# 1 Krishna Delhi
|
265
|
+
# 2 Ram Mumbai
|
266
|
+
# 3 Krishna Raipur
|
267
|
+
# 4 Krishna Banglore
|
268
|
+
#
|
269
|
+
# df.group_by(:name)
|
270
|
+
# => #<Daru::DataFrame(5x1)>
|
271
|
+
# visited
|
272
|
+
# Krishna 1 Delhi
|
273
|
+
# 3 Raipur
|
274
|
+
# 4 Banglore
|
275
|
+
# Ram 0 Hyderabad
|
276
|
+
# 2 Mumbai
|
277
|
+
#
|
278
|
+
# df.group_by(:name).aggregate(visited: -> (vec){vec.to_a.join(',')})
|
279
|
+
# => #<Daru::DataFrame(2x1)>
|
280
|
+
# visited
|
281
|
+
# Krishna Delhi,Raipur,Banglore
|
282
|
+
# Ram Hyderabad,Mumbai
|
283
|
+
#
|
284
|
+
def aggregate(options={})
|
285
|
+
@df.index = @df.index.remove_layer(@df.index.levels.size - 1)
|
286
|
+
@df.aggregate(options)
|
287
|
+
end
|
288
|
+
|
246
289
|
private
|
247
290
|
|
248
291
|
def init_groups_df tuples, names
|
data/lib/daru/dataframe.rb
CHANGED
@@ -84,7 +84,7 @@ module Daru
|
|
84
84
|
# Read a dataframe from AR::Relation
|
85
85
|
#
|
86
86
|
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
87
|
-
# @
|
87
|
+
# @param fields [Array] Field names to be loaded (optional)
|
88
88
|
#
|
89
89
|
# @return A dataframe containing the data loaded from the relation
|
90
90
|
#
|
@@ -277,6 +277,17 @@ module Daru
|
|
277
277
|
# Default to *true*.
|
278
278
|
#
|
279
279
|
# == Usage
|
280
|
+
#
|
281
|
+
# df = Daru::DataFrame.new
|
282
|
+
# # =>
|
283
|
+
# # <Daru::DataFrame(0x0)>
|
284
|
+
# # Creates an empty DataFrame with no rows or columns.
|
285
|
+
#
|
286
|
+
# df = Daru::DataFrame.new({}, order: [:a, :b])
|
287
|
+
# #<Daru::DataFrame(0x2)>
|
288
|
+
# a b
|
289
|
+
# # Creates a DataFrame with no rows and columns :a and :b
|
290
|
+
#
|
280
291
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
|
281
292
|
# index: [:a, :b, :c, :d], name: :spider_man)
|
282
293
|
#
|
@@ -329,7 +340,7 @@ module Daru
|
|
329
340
|
# # 1 4 14 44
|
330
341
|
# # 2 5 15 55
|
331
342
|
|
332
|
-
def initialize source, opts={} # rubocop:disable Metrics/MethodLength
|
343
|
+
def initialize source={}, opts={} # rubocop:disable Metrics/MethodLength
|
333
344
|
vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
|
334
345
|
@data = []
|
335
346
|
@name = opts[:name]
|
@@ -375,7 +386,7 @@ module Daru
|
|
375
386
|
end
|
376
387
|
|
377
388
|
# Retrieve rows by positions
|
378
|
-
# @param [Array<Integer>]
|
389
|
+
# @param [Array<Integer>] positions of rows to retrieve
|
379
390
|
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
380
391
|
# @example
|
381
392
|
# df = Daru::DataFrame.new({
|
@@ -405,7 +416,7 @@ module Daru
|
|
405
416
|
|
406
417
|
# Set rows by positions
|
407
418
|
# @param [Array<Integer>] positions positions of rows to set
|
408
|
-
# @
|
419
|
+
# @param [Array, Daru::Vector] vector vector to be assigned
|
409
420
|
# @example
|
410
421
|
# df = Daru::DataFrame.new({
|
411
422
|
# a: [1, 2, 3],
|
@@ -438,7 +449,7 @@ module Daru
|
|
438
449
|
end
|
439
450
|
|
440
451
|
# Retrieve vectors by positions
|
441
|
-
# @param [Array<Integer>]
|
452
|
+
# @param [Array<Integer>] positions of vectors to retrieve
|
442
453
|
# @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
|
443
454
|
# @example
|
444
455
|
# df = Daru::DataFrame.new({
|
@@ -522,7 +533,7 @@ module Daru
|
|
522
533
|
end
|
523
534
|
|
524
535
|
def add_row row, index=nil
|
525
|
-
self.row[index || @size] = row
|
536
|
+
self.row[*(index || @size)] = row
|
526
537
|
end
|
527
538
|
|
528
539
|
def add_vector n, vector
|
@@ -597,7 +608,7 @@ module Daru
|
|
597
608
|
|
598
609
|
# Returns a dataframe in which rows with any of the mentioned values
|
599
610
|
# are ignored.
|
600
|
-
# @param [Array]
|
611
|
+
# @param [Array] values to reject to form the new dataframe
|
601
612
|
# @return [Daru::DataFrame] Data Frame with only rows which don't
|
602
613
|
# contain the mentioned values
|
603
614
|
# @example
|
@@ -650,6 +661,88 @@ module Daru
|
|
650
661
|
self
|
651
662
|
end
|
652
663
|
|
664
|
+
# Rolling fillna
|
665
|
+
# replace all Float::NAN and nil values with the preceding or following value
|
666
|
+
#
|
667
|
+
# @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
|
668
|
+
#
|
669
|
+
# @example
|
670
|
+
# df = Daru::DataFrame.new({
|
671
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
672
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
|
673
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
674
|
+
# })
|
675
|
+
#
|
676
|
+
# => #<Daru::DataFrame(8x3)>
|
677
|
+
# a b c
|
678
|
+
# 0 1 a a
|
679
|
+
# 1 2 b NaN
|
680
|
+
# 2 3 nil 3
|
681
|
+
# 3 nil NaN 4
|
682
|
+
# 4 NaN nil 3
|
683
|
+
# 5 nil 3 5
|
684
|
+
# 6 1 5 nil
|
685
|
+
# 7 7 nil 7
|
686
|
+
#
|
687
|
+
# 2.3.3 :068 > df.rolling_fillna(:forward)
|
688
|
+
# => #<Daru::DataFrame(8x3)>
|
689
|
+
# a b c
|
690
|
+
# 0 1 a a
|
691
|
+
# 1 2 b a
|
692
|
+
# 2 3 b 3
|
693
|
+
# 3 3 b 4
|
694
|
+
# 4 3 b 3
|
695
|
+
# 5 3 3 5
|
696
|
+
# 6 1 5 5
|
697
|
+
# 7 7 5 7
|
698
|
+
#
|
699
|
+
def rolling_fillna!(direction=:forward)
|
700
|
+
@data.each { |vec| vec.rolling_fillna!(direction) }
|
701
|
+
end
|
702
|
+
|
703
|
+
def rolling_fillna(direction=:forward)
|
704
|
+
dup.rolling_fillna!(direction)
|
705
|
+
end
|
706
|
+
|
707
|
+
# Return unique rows by vector specified or all vectors
|
708
|
+
#
|
709
|
+
# @param vtrs [String, Symbol] vector name(s) that should be considered
|
710
|
+
#
|
711
|
+
# @example
|
712
|
+
#
|
713
|
+
# => #<Daru::DataFrame(6x2)>
|
714
|
+
# a b
|
715
|
+
# 0 1 a
|
716
|
+
# 1 2 b
|
717
|
+
# 2 3 c
|
718
|
+
# 3 4 d
|
719
|
+
# 2 3 c
|
720
|
+
# 3 4 f
|
721
|
+
#
|
722
|
+
# 2.3.3 :> df.uniq
|
723
|
+
# => #<Daru::DataFrame(5x2)>
|
724
|
+
# a b
|
725
|
+
# 0 1 a
|
726
|
+
# 1 2 b
|
727
|
+
# 2 3 c
|
728
|
+
# 3 4 d
|
729
|
+
# 3 4 f
|
730
|
+
#
|
731
|
+
# 2.3.3 :> df.uniq(:a)
|
732
|
+
# => #<Daru::DataFrame(4x2)>
|
733
|
+
# a b
|
734
|
+
# 0 1 a
|
735
|
+
# 1 2 b
|
736
|
+
# 2 3 c
|
737
|
+
# 3 4 d
|
738
|
+
#
|
739
|
+
def uniq(*vtrs)
|
740
|
+
vecs = vtrs.empty? ? vectors.map(&:to_s) : Array(vtrs)
|
741
|
+
grouped = group_by(vecs)
|
742
|
+
indexes = grouped.groups.values.map { |v| v[0] }.sort
|
743
|
+
row[*indexes]
|
744
|
+
end
|
745
|
+
|
653
746
|
# Iterate over each index of the DataFrame.
|
654
747
|
def each_index &block
|
655
748
|
return to_enum(:each_index) unless block_given?
|
@@ -1024,9 +1117,9 @@ module Daru
|
|
1024
1117
|
dup.tap { |df| df.keep_vector_if(&block) }
|
1025
1118
|
end
|
1026
1119
|
|
1027
|
-
# Test each row with one or more tests.
|
1028
|
-
#
|
1029
|
-
#
|
1120
|
+
# Test each row with one or more tests.
|
1121
|
+
# @param tests [Proc] Each test is a Proc with the form
|
1122
|
+
# *Proc.new {|row| row[:age] > 0}*
|
1030
1123
|
# The function returns an array with all errors.
|
1031
1124
|
#
|
1032
1125
|
# FIXME: description here is too sparse. As far as I can get,
|
@@ -1128,7 +1221,7 @@ module Daru
|
|
1128
1221
|
deprecate :flawed?, :include_values?, 2016, 10
|
1129
1222
|
|
1130
1223
|
# Check if any of given values occur in the data frame
|
1131
|
-
# @param [Array]
|
1224
|
+
# @param [Array] values to check for
|
1132
1225
|
# @return [true, false] true if any of the given values occur in the
|
1133
1226
|
# dataframe, false otherwise
|
1134
1227
|
# @example
|
@@ -1259,13 +1352,60 @@ module Daru
|
|
1259
1352
|
|
1260
1353
|
alias :last :tail
|
1261
1354
|
|
1262
|
-
#
|
1263
|
-
#
|
1264
|
-
|
1355
|
+
# Sum all numeric/specified vectors in the DataFrame.
|
1356
|
+
#
|
1357
|
+
# Returns a new vector containing the sum of all numeric
|
1358
|
+
# or specified vectors of the DataFrame. By default, if the vector
|
1359
|
+
# contains a nil, the sum is nil.
|
1360
|
+
# With :skipnil argument set to true, nil values are assumed to be
|
1361
|
+
# 0 (zero) and the sum vector is returned.
|
1362
|
+
#
|
1363
|
+
# @param args [Array] List of vectors to sum. Default is nil in which case
|
1364
|
+
# all numeric vectors are summed.
|
1365
|
+
#
|
1366
|
+
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
1367
|
+
#
|
1368
|
+
# @return Vector with sum of all vectors specified in the argument.
|
1369
|
+
# If vecs parameter is empty, sum all numeric vectors.
|
1370
|
+
#
|
1371
|
+
# @example
|
1372
|
+
# df = Daru::DataFrame.new({
|
1373
|
+
# a: [1, 2, nil],
|
1374
|
+
# b: [2, 1, 3],
|
1375
|
+
# c: [1, 1, 1]
|
1376
|
+
# })
|
1377
|
+
# => #<Daru::DataFrame(3x3)>
|
1378
|
+
# a b c
|
1379
|
+
# 0 1 2 1
|
1380
|
+
# 1 2 1 1
|
1381
|
+
# 2 nil 3 1
|
1382
|
+
# df.vector_sum [:a, :c]
|
1383
|
+
# => #<Daru::Vector(3)>
|
1384
|
+
# 0 2
|
1385
|
+
# 1 3
|
1386
|
+
# 2 nil
|
1387
|
+
# df.vector_sum
|
1388
|
+
# => #<Daru::Vector(3)>
|
1389
|
+
# 0 4
|
1390
|
+
# 1 4
|
1391
|
+
# 2 nil
|
1392
|
+
# df.vector_sum skipnil: true
|
1393
|
+
# => #<Daru::Vector(3)>
|
1394
|
+
# c
|
1395
|
+
# 0 4
|
1396
|
+
# 1 4
|
1397
|
+
# 2 4
|
1398
|
+
#
|
1399
|
+
def vector_sum(*args)
|
1400
|
+
defaults = {vecs: nil, skipnil: false}
|
1401
|
+
options = args.last.is_a?(::Hash) ? args.pop : {}
|
1402
|
+
options = defaults.merge(options)
|
1403
|
+
vecs = args[0] || options[:vecs]
|
1404
|
+
skipnil = args[1] || options[:skipnil]
|
1405
|
+
|
1265
1406
|
vecs ||= numeric_vectors
|
1266
1407
|
sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
|
1267
|
-
|
1268
|
-
vecs.inject(sum) { |memo, n| memo + self[n] }
|
1408
|
+
vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
|
1269
1409
|
end
|
1270
1410
|
|
1271
1411
|
# Calculate mean of the rows of the dataframe.
|
@@ -1427,7 +1567,7 @@ module Daru
|
|
1427
1567
|
|
1428
1568
|
# Reassign vectors with a new index of type Daru::Index or any of its subclasses.
|
1429
1569
|
#
|
1430
|
-
# @param [Daru::Index] idx The new index object on which the vectors are to
|
1570
|
+
# @param new_index [Daru::Index] The new index object on which the vectors are to
|
1431
1571
|
# be indexed. Must be of the same size as ncols.
|
1432
1572
|
# @example Reassigning vectors of a DataFrame
|
1433
1573
|
# df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
|
@@ -1513,9 +1653,9 @@ module Daru
|
|
1513
1653
|
# Sorts a dataframe (ascending/descending) in the given priority sequence of
|
1514
1654
|
# vectors, with or without a block.
|
1515
1655
|
#
|
1516
|
-
# @param
|
1656
|
+
# @param vector_order [Array] The order of vector names in which the DataFrame
|
1517
1657
|
# should be sorted.
|
1518
|
-
# @param [Hash] opts The options to sort with.
|
1658
|
+
# @param opts [Hash] The options to sort with.
|
1519
1659
|
# @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
|
1520
1660
|
# or descending order. Specify Array corresponding to *order* for multiple
|
1521
1661
|
# sort orders.
|
@@ -1684,12 +1824,11 @@ module Daru
|
|
1684
1824
|
|
1685
1825
|
new_fields = (@vectors.to_a + other_df.vectors.to_a)
|
1686
1826
|
new_fields = ArrayHelper.recode_repeated(new_fields)
|
1687
|
-
|
1688
1827
|
DataFrame.new({}, order: new_fields).tap do |df_new|
|
1689
1828
|
(0...nrows).each do |i|
|
1690
1829
|
df_new.add_row row[i].to_a + other_df.row[i].to_a
|
1691
1830
|
end
|
1692
|
-
|
1831
|
+
df_new.index = @index if @index == other_df.index
|
1693
1832
|
df_new.update
|
1694
1833
|
end
|
1695
1834
|
end
|
@@ -2035,7 +2174,7 @@ module Daru
|
|
2035
2174
|
end
|
2036
2175
|
|
2037
2176
|
# Converts the specified non category type vectors to category type vectors
|
2038
|
-
# @param [Array]
|
2177
|
+
# @param [Array] names of non category type vectors to be converted
|
2039
2178
|
# @return [Daru::DataFrame] data frame in which specified vectors have been
|
2040
2179
|
# converted to category type
|
2041
2180
|
# @example
|
@@ -2126,8 +2265,88 @@ module Daru
|
|
2126
2265
|
res
|
2127
2266
|
end
|
2128
2267
|
|
2268
|
+
# Function to use for aggregating the data.
|
2269
|
+
#
|
2270
|
+
# @param options [Hash] options for the columns you want in the resultant dataframe
|
2271
|
+
#
|
2272
|
+
# @return [Daru::DataFrame]
|
2273
|
+
#
|
2274
|
+
# @example
|
2275
|
+
# df = Daru::DataFrame.new(
|
2276
|
+
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
2277
|
+
# => #<Daru::DataFrame(5x2)>
|
2278
|
+
# col num
|
2279
|
+
# 0 a 52
|
2280
|
+
# 1 b 12
|
2281
|
+
# 2 c 7
|
2282
|
+
# 3 d 17
|
2283
|
+
# 4 e 1
|
2284
|
+
#
|
2285
|
+
# df.aggregate(num_100_times: ->(df) { df.num*100 })
|
2286
|
+
# => #<Daru::DataFrame(5x1)>
|
2287
|
+
# num_100_ti
|
2288
|
+
# 0 5200
|
2289
|
+
# 1 1200
|
2290
|
+
# 2 700
|
2291
|
+
# 3 1700
|
2292
|
+
# 4 100
|
2293
|
+
#
|
2294
|
+
# When we have duplicate index :
|
2295
|
+
#
|
2296
|
+
# idx = Daru::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
2297
|
+
# df = Daru::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
2298
|
+
# => #<Daru::DataFrame(5x1)>
|
2299
|
+
# num
|
2300
|
+
# a 52
|
2301
|
+
# b 12
|
2302
|
+
# a 7
|
2303
|
+
# a 17
|
2304
|
+
# c 1
|
2305
|
+
#
|
2306
|
+
# df.aggregate(num: :mean)
|
2307
|
+
# => #<Daru::DataFrame(3x1)>
|
2308
|
+
# num
|
2309
|
+
# a 25.3333333
|
2310
|
+
# b 12
|
2311
|
+
# c 1
|
2312
|
+
#
|
2313
|
+
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
2314
|
+
# internally.
|
2315
|
+
def aggregate(options={})
|
2316
|
+
colmn_value, index_tuples = aggregated_colmn_value(options)
|
2317
|
+
Daru::DataFrame.new(
|
2318
|
+
colmn_value, index: index_tuples, order: options.keys
|
2319
|
+
)
|
2320
|
+
end
|
2321
|
+
|
2129
2322
|
private
|
2130
2323
|
|
2324
|
+
# Do the `method` (`method` can be :sum, :mean, :std, :median, etc or
|
2325
|
+
# lambda), on the column.
|
2326
|
+
def apply_method_on_colmns colmn, index_tuples, method
|
2327
|
+
rows = []
|
2328
|
+
index_tuples.each do |indexes|
|
2329
|
+
# If single element then also make it vector.
|
2330
|
+
slice = Daru::Vector.new(Array(self[colmn][*indexes]))
|
2331
|
+
case method
|
2332
|
+
when Symbol
|
2333
|
+
rows << (slice.is_a?(Daru::Vector) ? slice.send(method) : slice)
|
2334
|
+
when Proc
|
2335
|
+
rows << method.call(slice)
|
2336
|
+
end
|
2337
|
+
end
|
2338
|
+
rows
|
2339
|
+
end
|
2340
|
+
|
2341
|
+
def apply_method_on_df index_tuples, method
|
2342
|
+
rows = []
|
2343
|
+
index_tuples.each do |indexes|
|
2344
|
+
slice = row[*indexes]
|
2345
|
+
rows << method.call(slice)
|
2346
|
+
end
|
2347
|
+
rows
|
2348
|
+
end
|
2349
|
+
|
2131
2350
|
def headers
|
2132
2351
|
Daru::Index.new(Array(index.name) + @vectors.to_a)
|
2133
2352
|
end
|
@@ -2224,9 +2443,7 @@ module Daru
|
|
2224
2443
|
rescue IndexError
|
2225
2444
|
raise IndexError, "Specified vector #{names.first} does not exist"
|
2226
2445
|
end
|
2227
|
-
|
2228
2446
|
return @data[pos] if pos.is_a?(Numeric)
|
2229
|
-
|
2230
2447
|
names = pos
|
2231
2448
|
end
|
2232
2449
|
|
@@ -2396,7 +2613,7 @@ module Daru
|
|
2396
2613
|
end
|
2397
2614
|
|
2398
2615
|
def create_vectors_index_with vectors, source
|
2399
|
-
vectors = source.keys
|
2616
|
+
vectors = source.keys if vectors.nil?
|
2400
2617
|
|
2401
2618
|
@vectors =
|
2402
2619
|
if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
|
@@ -2443,9 +2660,7 @@ module Daru
|
|
2443
2660
|
@index = Index.coerce(index || source[0].size)
|
2444
2661
|
@vectors = Index.coerce(vectors)
|
2445
2662
|
|
2446
|
-
|
2447
|
-
Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
|
2448
|
-
end
|
2663
|
+
update_data source, vectors
|
2449
2664
|
end
|
2450
2665
|
|
2451
2666
|
def initialize_from_array_of_vectors source, vectors, index, opts
|
@@ -2694,6 +2909,30 @@ module Daru
|
|
2694
2909
|
end
|
2695
2910
|
end
|
2696
2911
|
|
2912
|
+
def update_data source, vectors
|
2913
|
+
@data = @vectors.each_with_index.map do |_vec,idx|
|
2914
|
+
Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
|
2915
|
+
end
|
2916
|
+
end
|
2917
|
+
|
2918
|
+
def aggregated_colmn_value(options)
|
2919
|
+
colmn_value = []
|
2920
|
+
index_tuples = Array(@index).uniq
|
2921
|
+
options.keys.each do |vec|
|
2922
|
+
do_this_on_vec = options[vec]
|
2923
|
+
colmn_value << if @vectors.include?(vec)
|
2924
|
+
apply_method_on_colmns(
|
2925
|
+
vec, index_tuples, do_this_on_vec
|
2926
|
+
)
|
2927
|
+
else
|
2928
|
+
apply_method_on_df(
|
2929
|
+
index_tuples, do_this_on_vec
|
2930
|
+
)
|
2931
|
+
end
|
2932
|
+
end
|
2933
|
+
[colmn_value, index_tuples]
|
2934
|
+
end
|
2935
|
+
|
2697
2936
|
# coerce ranges, integers and array in appropriate ways
|
2698
2937
|
def coerce_positions *positions, size
|
2699
2938
|
if positions.size == 1
|