daru_lite 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +35 -33
  3. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  4. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  5. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  6. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  7. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  8. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  9. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  10. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  11. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  12. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  13. data/lib/daru_lite/data_frame/missable.rb +75 -0
  14. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  15. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  16. data/lib/daru_lite/data_frame/setable.rb +109 -0
  17. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  18. data/lib/daru_lite/dataframe.rb +138 -2353
  19. data/lib/daru_lite/index/index.rb +13 -0
  20. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  21. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  22. data/lib/daru_lite/vector/calculatable.rb +78 -0
  23. data/lib/daru_lite/vector/convertible.rb +77 -0
  24. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  25. data/lib/daru_lite/vector/fetchable.rb +175 -0
  26. data/lib/daru_lite/vector/filterable.rb +128 -0
  27. data/lib/daru_lite/vector/indexable.rb +77 -0
  28. data/lib/daru_lite/vector/iterable.rb +95 -0
  29. data/lib/daru_lite/vector/joinable.rb +17 -0
  30. data/lib/daru_lite/vector/missable.rb +124 -0
  31. data/lib/daru_lite/vector/queryable.rb +45 -0
  32. data/lib/daru_lite/vector/setable.rb +47 -0
  33. data/lib/daru_lite/vector/sortable.rb +113 -0
  34. data/lib/daru_lite/vector.rb +36 -932
  35. data/lib/daru_lite/version.rb +1 -1
  36. data/spec/data_frame/aggregatable_example.rb +65 -0
  37. data/spec/data_frame/buildable_example.rb +109 -0
  38. data/spec/data_frame/calculatable_example.rb +135 -0
  39. data/spec/data_frame/convertible_example.rb +180 -0
  40. data/spec/data_frame/duplicatable_example.rb +111 -0
  41. data/spec/data_frame/fetchable_example.rb +476 -0
  42. data/spec/data_frame/filterable_example.rb +250 -0
  43. data/spec/data_frame/indexable_example.rb +221 -0
  44. data/spec/data_frame/iterable_example.rb +465 -0
  45. data/spec/data_frame/joinable_example.rb +106 -0
  46. data/spec/data_frame/missable_example.rb +47 -0
  47. data/spec/data_frame/pivotable_example.rb +297 -0
  48. data/spec/data_frame/queryable_example.rb +92 -0
  49. data/spec/data_frame/setable_example.rb +482 -0
  50. data/spec/data_frame/sortable_example.rb +350 -0
  51. data/spec/dataframe_spec.rb +181 -3289
  52. data/spec/index/index_spec.rb +8 -0
  53. data/spec/vector/aggregatable_example.rb +27 -0
  54. data/spec/vector/calculatable_example.rb +82 -0
  55. data/spec/vector/convertible_example.rb +126 -0
  56. data/spec/vector/duplicatable_example.rb +48 -0
  57. data/spec/vector/fetchable_example.rb +463 -0
  58. data/spec/vector/filterable_example.rb +165 -0
  59. data/spec/vector/indexable_example.rb +201 -0
  60. data/spec/vector/iterable_example.rb +111 -0
  61. data/spec/vector/joinable_example.rb +25 -0
  62. data/spec/vector/missable_example.rb +88 -0
  63. data/spec/vector/queryable_example.rb +91 -0
  64. data/spec/vector/setable_example.rb +300 -0
  65. data/spec/vector/sortable_example.rb +242 -0
  66. data/spec/vector_spec.rb +111 -1805
  67. metadata +86 -2
@@ -1,10 +1,40 @@
1
1
  require 'daru_lite/accessors/dataframe_by_row'
2
+ require 'daru_lite/data_frame/aggregatable'
3
+ require 'daru_lite/data_frame/calculatable'
4
+ require 'daru_lite/data_frame/convertible'
5
+ require 'daru_lite/data_frame/duplicatable'
6
+ require 'daru_lite/data_frame/fetchable'
7
+ require 'daru_lite/data_frame/filterable'
8
+ require 'daru_lite/data_frame/indexable'
9
+ require 'daru_lite/data_frame/i_o_able'
10
+ require 'daru_lite/data_frame/iterable'
11
+ require 'daru_lite/data_frame/joinable'
12
+ require 'daru_lite/data_frame/missable'
13
+ require 'daru_lite/data_frame/pivotable'
14
+ require 'daru_lite/data_frame/setable'
15
+ require 'daru_lite/data_frame/sortable'
16
+ require 'daru_lite/data_frame/queryable'
2
17
  require 'daru_lite/maths/arithmetic/dataframe'
3
18
  require 'daru_lite/maths/statistics/dataframe'
4
19
  require 'daru_lite/io/io'
5
20
 
6
21
  module DaruLite
7
22
  class DataFrame # rubocop:disable Metrics/ClassLength
23
+ include DaruLite::DataFrame::Aggregatable
24
+ include DaruLite::DataFrame::Calculatable
25
+ include DaruLite::DataFrame::Convertible
26
+ include DaruLite::DataFrame::Duplicatable
27
+ include DaruLite::DataFrame::Fetchable
28
+ include DaruLite::DataFrame::Filterable
29
+ include DaruLite::DataFrame::Indexable
30
+ include DaruLite::DataFrame::Iterable
31
+ include DaruLite::DataFrame::IOAble
32
+ include DaruLite::DataFrame::Joinable
33
+ include DaruLite::DataFrame::Missable
34
+ include DaruLite::DataFrame::Pivotable
35
+ include DaruLite::DataFrame::Setable
36
+ include DaruLite::DataFrame::Sortable
37
+ include DaruLite::DataFrame::Queryable
8
38
  include DaruLite::Maths::Arithmetic::DataFrame
9
39
  include DaruLite::Maths::Statistics::DataFrame
10
40
 
@@ -13,109 +43,6 @@ module DaruLite
13
43
  extend Gem::Deprecate
14
44
 
15
45
  class << self
16
- # Load data from a CSV file. Specify an optional block to grab the CSV
17
- # object and pre-condition it (for example use the `convert` or
18
- # `header_convert` methods).
19
- #
20
- # == Arguments
21
- #
22
- # * path - Local path / Remote URL of the file to load specified as a String.
23
- #
24
- # == Options
25
- #
26
- # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
27
- # and uses those to eventually construct the resulting DataFrame.
28
- #
29
- # == Verbose Description
30
- #
31
- # You can specify all the options to the `.from_csv` function that you
32
- # do to the Ruby `CSV.read()` function, since this is what is used internally.
33
- #
34
- # For example, if the columns in your CSV file are separated by something
35
- # other than commas, you can use the `:col_sep` option. If you want to
36
- # convert numeric values to numbers and not keep them as strings, you can
37
- # use the `:converters` option and set it to `:numeric`.
38
- #
39
- # The `.from_csv` function uses the following defaults for reading CSV files
40
- # (that are passed into the `CSV.read()` function):
41
- #
42
- # {
43
- # :col_sep => ',',
44
- # :converters => :numeric
45
- # }
46
- def from_csv(path, opts = {}, &block)
47
- DaruLite::IO.from_csv path, opts, &block
48
- end
49
-
50
- # Read data from an Excel file into a DataFrame.
51
- #
52
- # == Arguments
53
- #
54
- # * path - Path of the file to be read.
55
- #
56
- # == Options
57
- #
58
- # *:worksheet_id - ID of the worksheet that is to be read.
59
- def from_excel(path, opts = {}, &block)
60
- DaruLite::IO.from_excel path, opts, &block
61
- end
62
-
63
- # Read a database query and return a DataFrame
64
- #
65
- # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
66
- # @param query [String] The query to be executed
67
- #
68
- # @return A dataframe containing the data resulting from the query
69
- #
70
- # USE:
71
- #
72
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
73
- # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
74
- #
75
- # #Alternatively
76
- #
77
- # require 'dbi'
78
- # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
79
- def from_sql(dbh, query)
80
- DaruLite::IO.from_sql dbh, query
81
- end
82
-
83
- # Read a dataframe from AR::Relation
84
- #
85
- # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
86
- # @param fields [Array] Field names to be loaded (optional)
87
- #
88
- # @return A dataframe containing the data loaded from the relation
89
- #
90
- # USE:
91
- #
92
- # # When Post model is defined as:
93
- # class Post < ActiveRecord::Base
94
- # scope :active, -> { where.not(published_at: nil) }
95
- # end
96
- #
97
- # # You can load active posts into a dataframe by:
98
- # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
99
- def from_activerecord(relation, *fields)
100
- DaruLite::IO.from_activerecord relation, *fields
101
- end
102
-
103
- # Read the database from a plaintext file. For this method to work,
104
- # the data should be present in a plain text file in columns. See
105
- # spec/fixtures/bank2.dat for an example.
106
- #
107
- # == Arguments
108
- #
109
- # * path - Path of the file to be read.
110
- # * fields - Vector names of the resulting database.
111
- #
112
- # == Usage
113
- #
114
- # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
115
- def from_plaintext(path, fields)
116
- DaruLite::IO.from_plaintext path, fields
117
- end
118
-
119
46
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
120
47
  # DaruLite::Vector objects.
121
48
  def rows(source, opts = {})
@@ -316,179 +243,6 @@ module DaruLite
316
243
  update
317
244
  end
318
245
 
319
- # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
320
- # Defaults to *:vector*. Use of this method is not recommended for accessing
321
- # rows. Use df.row[:a] for accessing row with index ':a'.
322
- def [](*names)
323
- axis = extract_axis(names, :vector)
324
- dispatch_to_axis axis, :access, *names
325
- end
326
-
327
- # Retrieve rows by positions
328
- # @param [Array<Integer>] positions of rows to retrieve
329
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
330
- # @example
331
- # df = DaruLite::DataFrame.new({
332
- # a: [1, 2, 3],
333
- # b: ['a', 'b', 'c']
334
- # })
335
- # df.row_at 1, 2
336
- # # => #<DaruLite::DataFrame(2x2)>
337
- # # a b
338
- # # 1 2 b
339
- # # 2 3 c
340
- def row_at(*positions)
341
- original_positions = positions
342
- positions = coerce_positions(*positions, nrows)
343
- validate_positions(*positions, nrows)
344
-
345
- if positions.is_a? Integer
346
- row = get_rows_for([positions])
347
- DaruLite::Vector.new row, index: @vectors
348
- else
349
- new_rows = get_rows_for(original_positions)
350
- DaruLite::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
351
- end
352
- end
353
-
354
- # Set rows by positions
355
- # @param [Array<Integer>] positions positions of rows to set
356
- # @param [Array, DaruLite::Vector] vector vector to be assigned
357
- # @example
358
- # df = DaruLite::DataFrame.new({
359
- # a: [1, 2, 3],
360
- # b: ['a', 'b', 'c']
361
- # })
362
- # df.set_row_at [0, 1], ['x', 'x']
363
- # df
364
- # #=> #<DaruLite::DataFrame(3x2)>
365
- # # a b
366
- # # 0 x x
367
- # # 1 x x
368
- # # 2 3 c
369
- def set_row_at(positions, vector)
370
- validate_positions(*positions, nrows)
371
- vector =
372
- if vector.is_a? DaruLite::Vector
373
- vector.reindex @vectors
374
- else
375
- DaruLite::Vector.new vector
376
- end
377
-
378
- raise SizeError, 'Vector length should match row length' if
379
- vector.size != @vectors.size
380
-
381
- @data.each_with_index do |vec, pos|
382
- vec.set_at(positions, vector.at(pos))
383
- end
384
- @index = @data[0].index
385
- set_size
386
- end
387
-
388
- # Retrieve vectors by positions
389
- # @param [Array<Integer>] positions of vectors to retrieve
390
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
391
- # @example
392
- # df = DaruLite::DataFrame.new({
393
- # a: [1, 2, 3],
394
- # b: ['a', 'b', 'c']
395
- # })
396
- # df.at 0
397
- # # => #<DaruLite::Vector(3)>
398
- # # a
399
- # # 0 1
400
- # # 1 2
401
- # # 2 3
402
- def at(*positions)
403
- if AXES.include? positions.last
404
- axis = positions.pop
405
- return row_at(*positions) if axis == :row
406
- end
407
-
408
- original_positions = positions
409
- positions = coerce_positions(*positions, ncols)
410
- validate_positions(*positions, ncols)
411
-
412
- if positions.is_a? Integer
413
- @data[positions].dup
414
- else
415
- DaruLite::DataFrame.new positions.map { |pos| @data[pos].dup },
416
- index: @index,
417
- order: @vectors.at(*original_positions),
418
- name: @name
419
- end
420
- end
421
-
422
- # Set vectors by positions
423
- # @param [Array<Integer>] positions positions of vectors to set
424
- # @param [Array, DaruLite::Vector] vector vector to be assigned
425
- # @example
426
- # df = DaruLite::DataFrame.new({
427
- # a: [1, 2, 3],
428
- # b: ['a', 'b', 'c']
429
- # })
430
- # df.set_at [0], ['x', 'y', 'z']
431
- # df
432
- # #=> #<DaruLite::DataFrame(3x2)>
433
- # # a b
434
- # # 0 x a
435
- # # 1 y b
436
- # # 2 z c
437
- def set_at(positions, vector)
438
- if positions.last == :row
439
- positions.pop
440
- return set_row_at(positions, vector)
441
- end
442
-
443
- validate_positions(*positions, ncols)
444
- vector =
445
- if vector.is_a? DaruLite::Vector
446
- vector.reindex @index
447
- else
448
- DaruLite::Vector.new vector
449
- end
450
-
451
- raise SizeError, 'Vector length should match index length' if
452
- vector.size != @index.size
453
-
454
- positions.each { |pos| @data[pos] = vector }
455
- end
456
-
457
- # Insert a new row/vector of the specified name or modify a previous row.
458
- # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
459
- # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
460
- #
461
- # In case a DaruLite::Vector is specified after the equality sign, the indexes
462
- # of the vector will be matched against the row/vector indexes of the DataFrame
463
- # before an insertion is performed. Unmatched indexes will be set to nil.
464
- def []=(*args)
465
- vector = args.pop
466
- axis = extract_axis(args)
467
- names = args
468
-
469
- dispatch_to_axis axis, :insert_or_modify, names, vector
470
- end
471
-
472
- def add_row(row, index = nil)
473
- self.row[*(index || @size)] = row
474
- end
475
-
476
- def add_vector(n, vector)
477
- self[n] = vector
478
- end
479
-
480
- def insert_vector(n, name, source)
481
- raise ArgumentError unless source.is_a? Array
482
-
483
- vector = DaruLite::Vector.new(source, index: @index, name: @name)
484
- @data << vector
485
- @vectors = @vectors.add name
486
- ordr = @vectors.dup.to_a
487
- elmnt = ordr.pop
488
- ordr.insert n, elmnt
489
- self.order = ordr
490
- end
491
-
492
246
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
493
247
  #
494
248
  # == Usage
@@ -498,1696 +252,177 @@ module DaruLite
498
252
  DaruLite::Accessors::DataFrameByRow.new(self)
499
253
  end
500
254
 
501
- # Extract a dataframe given row indexes or positions
502
- # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
503
- # @return [DaruLite::Dataframe]
504
- def get_sub_dataframe(keys, by_position: true)
505
- return DaruLite::DataFrame.new({}) if keys == []
506
-
507
- keys = @index.pos(*keys) unless by_position
508
-
509
- sub_df = row_at(*keys)
510
- sub_df = sub_df.to_df.transpose if sub_df.is_a?(DaruLite::Vector)
511
-
512
- sub_df
513
- end
514
-
515
- # Duplicate the DataFrame entirely.
516
- #
517
- # == Arguments
518
- #
519
- # * +vectors_to_dup+ - An Array specifying the names of Vectors to
520
- # be duplicated. Will duplicate the entire DataFrame if not specified.
521
- def dup(vectors_to_dup = nil)
522
- vectors_to_dup ||= @vectors.to_a
523
-
524
- src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
525
- new_order = DaruLite::Index.new(vectors_to_dup)
526
-
527
- DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
528
- end
529
-
530
- # Only clone the structure of the DataFrame.
531
- def clone_structure
532
- DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
533
- end
534
-
535
- # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
536
- # preserved.
537
- #
538
- # == Arguments
539
- #
540
- # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
541
- # a view of the whole data frame otherwise.
542
- def clone(*vectors_to_clone)
543
- vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
544
- vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
545
-
546
- h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
547
- DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
548
- end
549
-
550
- # Returns a 'shallow' copy of DataFrame if missing data is not present,
551
- # or a full copy of only valid data if missing data is present.
552
- def clone_only_valid
553
- if include_values?(*DaruLite::MISSING_VALUES)
554
- reject_values(*DaruLite::MISSING_VALUES)
555
- else
556
- clone
557
- end
558
- end
559
-
560
- # Creates a new duplicate dataframe containing only rows
561
- # without a single missing value.
562
- def dup_only_valid(vecs = nil)
563
- rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
564
- .inject(&:concat)
565
- .uniq
566
-
567
- row_indexes = @index.to_a
568
- (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
569
- end
570
- deprecate :dup_only_valid, :reject_values, 2016, 10
571
-
572
- # Returns a dataframe in which rows with any of the mentioned values
573
- # are ignored.
574
- # @param [Array] values to reject to form the new dataframe
575
- # @return [DaruLite::DataFrame] Data Frame with only rows which don't
576
- # contain the mentioned values
577
- # @example
578
- # df = DaruLite::DataFrame.new({
579
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
580
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
581
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
582
- # }, index: 11..18)
583
- # df.reject_values nil, Float::NAN
584
- # # => #<DaruLite::DataFrame(2x3)>
585
- # # a b c
586
- # # 11 1 a a
587
- # # 18 7 8 7
588
- def reject_values(*values)
589
- positions =
590
- size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
591
- # Handle the case when positions size is 1 and #row_at wouldn't return a df
592
- if positions.size == 1
593
- pos = positions.first
594
- row_at(pos..pos)
595
- else
596
- row_at(*positions)
597
- end
598
- end
599
-
600
- # Replace specified values with given value
601
- # @param [Array] old_values values to replace with new value
602
- # @param [object] new_value new value to replace with
603
- # @return [DaruLite::DataFrame] Data Frame itself with old values replaced
604
- # with new value
605
- # @example
606
- # df = DaruLite::DataFrame.new({
607
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
608
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
609
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
610
- # }, index: 11..18)
611
- # df.replace_values nil, Float::NAN
612
- # # => #<DaruLite::DataFrame(8x3)>
613
- # # a b c
614
- # # 11 1 a a
615
- # # 12 2 b NaN
616
- # # 13 3 NaN 3
617
- # # 14 NaN NaN 4
618
- # # 15 NaN NaN 3
619
- # # 16 NaN 3 5
620
- # # 17 1 5 NaN
621
- # # 18 7 8 7
622
- def replace_values(old_values, new_value)
623
- @data.each { |vec| vec.replace_values old_values, new_value }
624
- self
625
- end
626
-
627
- # Rolling fillna
628
- # replace all Float::NAN and NIL values with the preceding or following value
629
- #
630
- # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
631
- #
632
- # @example
633
- # df = DaruLite::DataFrame.new({
634
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
635
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
636
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
637
- # })
638
- #
639
- # => #<DaruLite::DataFrame(8x3)>
640
- # a b c
641
- # 0 1 a a
642
- # 1 2 b NaN
643
- # 2 3 nil 3
644
- # 3 nil NaN 4
645
- # 4 NaN nil 3
646
- # 5 nil 3 5
647
- # 6 1 5 nil
648
- # 7 7 nil 7
649
- #
650
- # 2.3.3 :068 > df.rolling_fillna(:forward)
651
- # => #<DaruLite::DataFrame(8x3)>
652
- # a b c
653
- # 0 1 a a
654
- # 1 2 b a
655
- # 2 3 b 3
656
- # 3 3 b 4
657
- # 4 3 b 3
658
- # 5 3 3 5
659
- # 6 1 5 5
660
- # 7 7 5 7
661
- #
662
- def rolling_fillna!(direction = :forward)
663
- @data.each { |vec| vec.rolling_fillna!(direction) }
664
- self
665
- end
666
-
667
- def rolling_fillna(direction = :forward)
668
- dup.rolling_fillna!(direction)
669
- end
670
-
671
- # Return unique rows by vector specified or all vectors
672
- #
673
- # @param vtrs [String][Symbol] vector name(s) that should be considered
674
- #
675
- # @example
676
- #
677
- # => #<DaruLite::DataFrame(6x2)>
678
- # a b
679
- # 0 1 a
680
- # 1 2 b
681
- # 2 3 c
682
- # 3 4 d
683
- # 2 3 c
684
- # 3 4 f
685
- #
686
- # 2.3.3 :> df.unique
687
- # => #<DaruLite::DataFrame(5x2)>
688
- # a b
689
- # 0 1 a
690
- # 1 2 b
691
- # 2 3 c
692
- # 3 4 d
693
- # 3 4 f
694
- #
695
- # 2.3.3 :> df.unique(:a)
696
- # => #<DaruLite::DataFrame(5x2)>
697
- # a b
698
- # 0 1 a
699
- # 1 2 b
700
- # 2 3 c
701
- # 3 4 d
702
- #
703
- def uniq(*vtrs)
704
- vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
705
- grouped = group_by(vecs)
706
- indexes = grouped.groups.values.map { |v| v[0] }.sort
707
- row[*indexes]
708
- end
709
-
710
- # Iterate over each index of the DataFrame.
711
- def each_index(&block)
712
- return to_enum(:each_index) unless block
713
-
714
- @index.each(&block)
715
-
716
- self
717
- end
718
-
719
- # Iterate over each vector
720
- def each_vector(&block)
721
- return to_enum(:each_vector) unless block
255
+ # Delete a vector
256
+ def delete_vector(vector)
257
+ raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
722
258
 
723
- @data.each(&block)
259
+ @data.delete_at @vectors[vector]
260
+ @vectors = DaruLite::Index.new @vectors.to_a - [vector]
724
261
 
725
262
  self
726
263
  end
727
264
 
728
- alias each_column each_vector
729
-
730
- # Iterate over each vector along with the name of the vector
731
- def each_vector_with_index
732
- return to_enum(:each_vector_with_index) unless block_given?
733
-
734
- @vectors.each do |vector|
735
- yield @data[@vectors[vector]], vector
736
- end
265
+ # Deletes a list of vectors
266
+ def delete_vectors(*vectors)
267
+ Array(vectors).each { |vec| delete_vector vec }
737
268
 
738
269
  self
739
270
  end
740
271
 
741
- alias each_column_with_index each_vector_with_index
742
-
743
- # Iterate over each row
744
- def each_row
745
- return to_enum(:each_row) unless block_given?
746
-
747
- @index.size.times do |pos|
748
- yield row_at(pos)
749
- end
750
-
751
- self
752
- end
272
+ # Delete a row
273
+ def delete_row(index)
274
+ idx = named_index_for index
753
275
 
754
- def each_row_with_index
755
- return to_enum(:each_row_with_index) unless block_given?
276
+ raise IndexError, "Index #{index} does not exist." unless @index.include? idx
756
277
 
757
- @index.each do |index|
758
- yield access_row(index), index
278
+ @index = DaruLite::Index.new(@index.to_a - [idx])
279
+ each_vector do |vector|
280
+ vector.delete_at idx
759
281
  end
760
282
 
761
- self
762
- end
763
-
764
- # Iterate over each row or vector of the DataFrame. Specify axis
765
- # by passing :vector or :row as the argument. Default to :vector.
766
- #
767
- # == Description
768
- #
769
- # `#each` works exactly like Array#each. The default mode for `each`
770
- # is to iterate over the columns of the DataFrame. To iterate over
771
- # rows you must pass the axis, i.e `:row` as an argument.
772
- #
773
- # == Arguments
774
- #
775
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
776
- # or :row. Default to :vector.
777
- def each(axis = :vector, &block)
778
- dispatch_to_axis axis, :each, &block
779
- end
780
-
781
- # Iterate over a row or vector and return results in a DaruLite::Vector.
782
- # Specify axis with :vector or :row. Default to :vector.
783
- #
784
- # == Description
785
- #
786
- # The #collect iterator works similar to #map, the only difference
787
- # being that it returns a DaruLite::Vector comprising of the results of
788
- # each block run. The resultant Vector has the same index as that
789
- # of the axis over which collect has iterated. It also accepts the
790
- # optional axis argument.
791
- #
792
- # == Arguments
793
- #
794
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
795
- # or :row. Default to :vector.
796
- def collect(axis = :vector, &block)
797
- dispatch_to_axis_pl axis, :collect, &block
283
+ set_size
798
284
  end
799
285
 
800
- # Map over each vector or row of the data frame according to
801
- # the argument specified. Will return an Array of the resulting
802
- # elements. To map over each row/vector and get a DataFrame,
803
- # see #recode.
804
- #
805
- # == Description
806
- #
807
- # The #map iterator works like Array#map. The value returned by
808
- # each run of the block is added to an Array and the Array is
809
- # returned. This method also accepts an axis argument, like #each.
810
- # The default is :vector.
811
- #
812
- # == Arguments
813
- #
814
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
815
- # Default to :vector.
816
- def map(axis = :vector, &block)
817
- dispatch_to_axis_pl axis, :map, &block
818
- end
286
+ # Delete a row based on its position
287
+ # More robust than #delete_row when working with a CategoricalIndex or when the
288
+ # Index includes integers
289
+ def delete_at_position(position)
290
+ raise IndexError, "Position #{position} does not exist." unless position < size
819
291
 
820
- # Destructive map. Modifies the DataFrame. Each run of the block
821
- # must return a DaruLite::Vector. You can specify the axis to map over
822
- # as the argument. Default to :vector.
823
- #
824
- # == Arguments
825
- #
826
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
827
- # Default to :vector.
828
- def map!(axis = :vector, &block)
829
- if %i[vector column].include?(axis)
830
- map_vectors!(&block)
831
- elsif axis == :row
832
- map_rows!(&block)
833
- end
834
- end
292
+ each_vector { |vector| vector.delete_at_position(position) }
293
+ @index = @index.delete_at(position)
835
294
 
836
- # Maps over the DataFrame and returns a DataFrame. Each run of the
837
- # block must return a DaruLite::Vector object. You can specify the axis
838
- # to map over. Default to :vector.
839
- #
840
- # == Description
841
- #
842
- # Recode works similarly to #map, but an important difference between
843
- # the two is that recode returns a modified DaruLite::DataFrame instead
844
- # of an Array. For this reason, #recode expects that every run of the
845
- # block to return a DaruLite::Vector.
846
- #
847
- # Just like map and each, recode also accepts an optional _axis_ argument.
848
- #
849
- # == Arguments
850
- #
851
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
852
- # Default to :vector.
853
- def recode(axis = :vector, &block)
854
- dispatch_to_axis_pl axis, :recode, &block
295
+ set_size
855
296
  end
856
297
 
857
- # Retain vectors or rows if the block returns a truthy value.
858
- #
859
- # == Description
860
- #
861
- # For filtering out certain rows/vectors based on their values,
862
- # use the #filter method. By default it iterates over vectors and
863
- # keeps those vectors for which the block returns true. It accepts
864
- # an optional axis argument which lets you specify whether you want
865
- # to iterate over vectors or rows.
866
- #
867
- # == Arguments
868
- #
869
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
870
- # Default to :vector.
871
- #
872
- # == Usage
873
- #
874
- # # Filter vectors
875
- #
876
- # df.filter do |vector|
877
- # vector.type == :numeric and vector.median < 50
878
- # end
879
- #
880
- # # Filter rows
298
+ # Creates a DataFrame with the random data, of n size.
299
+ # If n not given, uses original number of rows.
881
300
  #
882
- # df.filter(:row) do |row|
883
- # row[:a] + row[:d] < 100
884
- # end
885
- def filter(axis = :vector, &block)
886
- dispatch_to_axis_pl axis, :filter, &block
887
- end
888
-
889
- def recode_vectors
890
- block_given? or return to_enum(:recode_vectors)
891
-
892
- dup.tap do |df|
893
- df.each_vector_with_index do |v, i|
894
- df[*i] = should_be_vector!(yield(v))
895
- end
896
- end
897
- end
898
-
899
- def recode_rows
900
- block_given? or return to_enum(:recode_rows)
901
-
902
- dup.tap do |df|
903
- df.each_row_with_index do |r, i|
904
- df.row[i] = should_be_vector!(yield(r))
301
+ # @return {DaruLite::DataFrame}
302
+ def bootstrap(n = nil)
303
+ n ||= nrows
304
+ DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
305
+ n.times do
306
+ df_boot.add_row(row[rand(n)])
905
307
  end
308
+ df_boot.update
906
309
  end
907
310
  end
908
311
 
909
- # Map each vector and return an Array.
910
- def map_vectors(&block)
911
- return to_enum(:map_vectors) unless block
912
-
913
- @data.map(&block)
914
- end
915
-
916
- # Destructive form of #map_vectors
917
- def map_vectors!
918
- return to_enum(:map_vectors!) unless block_given?
919
-
920
- vectors.dup.each do |n|
921
- self[n] = should_be_vector!(yield(self[n]))
922
- end
923
-
924
- self
925
- end
926
-
927
- # Map vectors along with the index.
928
- def map_vectors_with_index(&block)
929
- return to_enum(:map_vectors_with_index) unless block
930
-
931
- each_vector_with_index.map(&block)
932
- end
933
-
934
- # Map each row
935
- def map_rows(&block)
936
- return to_enum(:map_rows) unless block
937
-
938
- each_row.map(&block)
939
- end
940
-
941
- def map_rows_with_index(&block)
942
- return to_enum(:map_rows_with_index) unless block
943
-
944
- each_row_with_index.map(&block)
945
- end
946
-
947
- def map_rows!
948
- return to_enum(:map_rows!) unless block_given?
949
-
950
- index.dup.each do |i|
951
- row[i] = should_be_vector!(yield(row[i]))
952
- end
953
-
954
- self
955
- end
956
-
957
- def apply_method(method, keys: nil, by_position: true)
958
- df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
959
-
960
- case method
961
- when Symbol then df.send(method)
962
- when Proc then method.call(df)
963
- when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
964
- else raise
965
- end
966
- end
967
- alias apply_method_on_sub_df apply_method
968
-
969
- # Retrieves a DaruLite::Vector, based on the result of calculation
970
- # performed on each row.
971
- def collect_rows(&block)
972
- return to_enum(:collect_rows) unless block
973
-
974
- DaruLite::Vector.new(each_row.map(&block), index: @index)
975
- end
976
-
977
- def collect_row_with_index(&block)
978
- return to_enum(:collect_row_with_index) unless block
979
-
980
- DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
981
- end
982
-
983
- # Retrieves a DaruLite::Vector, based on the result of calculation
984
- # performed on each vector.
985
- def collect_vectors(&block)
986
- return to_enum(:collect_vectors) unless block
987
-
988
- DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
989
- end
990
-
991
- def collect_vector_with_index(&block)
992
- return to_enum(:collect_vector_with_index) unless block
993
-
994
- DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
995
- end
996
-
997
- # Generate a matrix, based on vector names of the DataFrame.
998
- #
999
- # @return {::Matrix}
1000
- # :nocov:
1001
- # FIXME: Even not trying to cover this: I can't get, how it is expected
1002
- # to work.... -- zverok
1003
- def collect_matrix
1004
- return to_enum(:collect_matrix) unless block_given?
1005
-
1006
- vecs = vectors.to_a
1007
- rows = vecs.collect do |row|
1008
- vecs.collect do |col|
1009
- yield row, col
1010
- end
1011
- end
1012
-
1013
- Matrix.rows(rows)
1014
- end
1015
- # :nocov:
1016
-
1017
- # Delete a vector
1018
- def delete_vector(vector)
1019
- raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
1020
-
1021
- @data.delete_at @vectors[vector]
1022
- @vectors = DaruLite::Index.new @vectors.to_a - [vector]
1023
-
1024
- self
1025
- end
1026
-
1027
- # Deletes a list of vectors
1028
- def delete_vectors(*vectors)
1029
- Array(vectors).each { |vec| delete_vector vec }
1030
-
1031
- self
1032
- end
1033
-
1034
- # Delete a row
1035
- def delete_row(index)
1036
- idx = named_index_for index
1037
-
1038
- raise IndexError, "Index #{index} does not exist." unless @index.include? idx
1039
-
1040
- @index = DaruLite::Index.new(@index.to_a - [idx])
1041
- each_vector do |vector|
1042
- vector.delete_at idx
1043
- end
1044
-
1045
- set_size
1046
- end
1047
-
1048
- # Creates a DataFrame with the random data, of n size.
1049
- # If n not given, uses original number of rows.
1050
- #
1051
- # @return {DaruLite::DataFrame}
1052
- def bootstrap(n = nil)
1053
- n ||= nrows
1054
- DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
1055
- n.times do
1056
- df_boot.add_row(row[rand(n)])
1057
- end
1058
- df_boot.update
1059
- end
1060
- end
1061
-
1062
- def keep_row_if
1063
- @index
1064
- .reject { |idx| yield access_row(idx) }
1065
- .each { |idx| delete_row idx }
1066
- end
1067
-
1068
- def keep_vector_if
1069
- @vectors.each do |vector|
1070
- delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
1071
- end
1072
- end
1073
-
1074
- # creates a new vector with the data of a given field which the block returns true
1075
- def filter_vector(vec, &block)
1076
- DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
1077
- end
1078
-
1079
- # Iterates over each row and retains it in a new DataFrame if the block returns
1080
- # true for that row.
1081
- def filter_rows
1082
- return to_enum(:filter_rows) unless block_given?
1083
-
1084
- keep_rows = @index.map { |index| yield access_row(index) }
1085
-
1086
- where keep_rows
1087
- end
1088
-
1089
- # Iterates over each vector and retains it in a new DataFrame if the block returns
1090
- # true for that vector.
1091
- def filter_vectors(&block)
1092
- return to_enum(:filter_vectors) unless block
1093
-
1094
- dup.tap { |df| df.keep_vector_if(&block) }
1095
- end
1096
-
1097
- # Test each row with one or more tests.
1098
- # @param tests [Proc] Each test is a Proc with the form
1099
- # *Proc.new {|row| row[:age] > 0}*
1100
- # The function returns an array with all errors.
1101
- #
1102
- # FIXME: description here is too sparse. As far as I can get,
1103
- # it should tell something about that each test is [descr, fields, block],
1104
- # and that first value may be column name to output. - zverok, 2016-05-18
1105
- def verify(*tests)
1106
- id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
1107
-
1108
- each_row_with_index.map do |row, i|
1109
- tests.reject { |*_, block| block.call(row) }
1110
- .map { |test| verify_error_message row, test, id, i }
1111
- end.flatten
1112
- end
1113
-
1114
- # DSL for yielding each row and returning a DaruLite::Vector based on the
1115
- # value each run of the block returns.
1116
- #
1117
- # == Usage
1118
- #
1119
- # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
1120
- # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
1121
- # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
1122
- # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
1123
- # total = ds.vector_by_calculation { a + b + c }
1124
- # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
1125
- # # nil
1126
- # # 0 111
1127
- # # 1 222
1128
- # # 2 333
1129
- # # 3 444
1130
- # # 4 555
1131
- # # 5 666
1132
- # # 6 777
1133
- def vector_by_calculation(&block)
1134
- a = each_row.map { |r| r.instance_eval(&block) }
1135
-
1136
- DaruLite::Vector.new a, index: @index
1137
- end
1138
-
1139
- # Reorder the vectors in a dataframe
1140
- # @param [Array] order_array new order of the vectors
1141
- # @example
1142
- # df = DaruLite::DataFrame({
1143
- # a: [1, 2, 3],
1144
- # b: [4, 5, 6]
1145
- # }, order: [:a, :b])
1146
- # df.order = [:b, :a]
1147
- # df
1148
- # # => #<DaruLite::DataFrame(3x2)>
1149
- # # b a
1150
- # # 0 4 1
1151
- # # 1 5 2
1152
- # # 2 6 3
1153
- def order=(order_array)
1154
- raise ArgumentError, 'Invalid order' unless vectors.to_a.tally == order_array.tally
1155
-
1156
- initialize(to_h, order: order_array)
1157
- end
1158
-
1159
- # Return the dataframe with rotate vectors positions, the vector at position count is now
1160
- # the first vector of the dataframe.
1161
- # If only one vector in the dataframe, the dataframe is return without any change.
1162
- # @param count => Integer, the vector at position count will be the first vector of the dataframe.
1163
- # @example
1164
- # df = DaruLite::DataFrame({
1165
- # a: [1, 2, 3],
1166
- # b: [4, 5, 6],
1167
- # total: [5, 7, 9],
1168
- # })
1169
- # df.rotate_vectors(-1)
1170
- # df
1171
- # # => #<DaruLite::DataFrame(3x3)>
1172
- # # total b a
1173
- # # 0 5 4 1
1174
- # # 1 7 5 2
1175
- # # 2 9 6 3
1176
- def rotate_vectors(count = -1)
1177
- return self unless vectors.many?
1178
-
1179
- self.order = vectors.to_a.rotate(count)
1180
- self
1181
- end
1182
-
1183
- # Returns a vector, based on a string with a calculation based
1184
- # on vector.
1185
- #
1186
- # The calculation will be eval'ed, so you can put any variable
1187
- # or expression valid on ruby.
1188
- #
1189
- # For example:
1190
- # a = DaruLite::Vector.new [1,2]
1191
- # b = DaruLite::Vector.new [3,4]
1192
- # ds = DaruLite::DataFrame.new({:a => a,:b => b})
1193
- # ds.compute("a+b")
1194
- # => Vector [4,6]
1195
- def compute(text, &block)
1196
- return instance_eval(&block) if block
1197
-
1198
- instance_eval(text)
1199
- end
1200
-
1201
- # Return a vector with the number of missing values in each row.
1202
- #
1203
- # == Arguments
1204
- #
1205
- # * +missing_values+ - An Array of the values that should be
1206
- # treated as 'missing'. The default missing value is *nil*.
1207
- def missing_values_rows(missing_values = [nil])
1208
- number_of_missing = each_row.map do |row|
1209
- row.indexes(*missing_values).size
1210
- end
1211
-
1212
- DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
1213
- end
1214
-
1215
- # TODO: remove next version
1216
- alias vector_missing_values missing_values_rows
1217
-
1218
- def has_missing_data?
1219
- @data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
1220
- end
1221
- alias flawed? has_missing_data?
1222
- deprecate :has_missing_data?, :include_values?, 2016, 10
1223
- deprecate :flawed?, :include_values?, 2016, 10
1224
-
1225
- # Check if any of given values occur in the data frame
1226
- # @param [Array] values to check for
1227
- # @return [true, false] true if any of the given values occur in the
1228
- # dataframe, false otherwise
1229
- # @example
1230
- # df = DaruLite::DataFrame.new({
1231
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1232
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1233
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1234
- # }, index: 11..18)
1235
- # df.include_values? nil
1236
- # # => true
1237
- def include_values?(*values)
1238
- @data.any? { |vec| vec.include_values?(*values) }
1239
- end
1240
-
1241
312
  # Return a nested hash using vector names as keys and an array constructed of
1242
313
  # hashes with other values. If block provided, is used to provide the
1243
314
  # values, with parameters +row+ of dataset, +current+ last hash on
1244
315
  # hierarchy and +name+ of the key to include
1245
- def nest(*tree_keys, &block)
1246
- tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
1247
-
1248
- each_row.with_object({}) do |row, current|
1249
- # Create tree
1250
- *keys, last = tree_keys
1251
- current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
1252
- name = row[last]
1253
-
1254
- if block
1255
- current[name] = yield(row, current, name)
1256
- else
1257
- current[name] ||= []
1258
- current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
1259
- end
1260
- end
1261
- end
1262
-
1263
- def vector_count_characters(vecs = nil)
1264
- vecs ||= @vectors.to_a
1265
-
1266
- collect_rows do |row|
1267
- vecs.sum { |v| row[v].to_s.size }
1268
- end
1269
- end
1270
-
1271
- def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
1272
- self[name]
1273
- .split_by_separator(sep)
1274
- .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
1275
- end
1276
-
1277
- # Return the number of rows and columns of the DataFrame in an Array.
1278
- def shape
1279
- [nrows, ncols]
1280
- end
1281
-
1282
- # The number of rows
1283
- def nrows
1284
- @index.size
1285
- end
1286
-
1287
- # The number of vectors
1288
- def ncols
1289
- @vectors.size
1290
- end
1291
-
1292
- # Check if a vector is present
1293
- def has_vector?(vector)
1294
- @vectors.include? vector
1295
- end
1296
-
1297
- # Works like Array#any?.
1298
- #
1299
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1300
- # :row. A DaruLite::Vector object is yielded in the block.
1301
- # @example Using any?
1302
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1303
- # df.any?(:row) do |row|
1304
- # row[:a] < 3 and row[:b] == 'b'
1305
- # end #=> true
1306
- def any?(axis = :vector, &block)
1307
- if %i[vector column].include?(axis)
1308
- @data.any?(&block)
1309
- elsif axis == :row
1310
- each_row do |row|
1311
- return true if yield(row)
1312
- end
1313
- false
1314
- else
1315
- raise ArgumentError, "Unidentified axis #{axis}"
1316
- end
1317
- end
1318
-
1319
- # Works like Array#all?
1320
- #
1321
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1322
- # :row. A DaruLite::Vector object is yielded in the block.
1323
- # @example Using all?
1324
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1325
- # df.all?(:row) do |row|
1326
- # row[:a] < 10
1327
- # end #=> true
1328
- def all?(axis = :vector, &block)
1329
- if %i[vector column].include?(axis)
1330
- @data.all?(&block)
1331
- elsif axis == :row
1332
- each_row.all?(&block)
1333
- else
1334
- raise ArgumentError, "Unidentified axis #{axis}"
1335
- end
1336
- end
1337
-
1338
- # The first ten elements of the DataFrame
1339
- #
1340
- # @param [Fixnum] quantity (10) The number of elements to display from the top.
1341
- def head(quantity = 10)
1342
- row.at 0..(quantity - 1)
1343
- end
1344
-
1345
- alias first head
1346
-
1347
- # The last ten elements of the DataFrame
1348
- #
1349
- # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1350
- def tail(quantity = 10)
1351
- start = [-quantity, -size].max
1352
- row.at start..-1
1353
- end
1354
-
1355
- alias last tail
1356
-
1357
- # Sum all numeric/specified vectors in the DataFrame.
1358
- #
1359
- # Returns a new vector that's a containing a sum of all numeric
1360
- # or specified vectors of the DataFrame. By default, if the vector
1361
- # contains a nil, the sum is nil.
1362
- # With :skipnil argument set to true, nil values are assumed to be
1363
- # 0 (zero) and the sum vector is returned.
1364
- #
1365
- # @param args [Array] List of vectors to sum. Default is nil in which case
1366
- # all numeric vectors are summed.
1367
- #
1368
- # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1369
- #
1370
- # @return Vector with sum of all vectors specified in the argument.
1371
- # If vecs parameter is empty, sum all numeric vector.
1372
- #
1373
- # @example
1374
- # df = DaruLite::DataFrame.new({
1375
- # a: [1, 2, nil],
1376
- # b: [2, 1, 3],
1377
- # c: [1, 1, 1]
1378
- # })
1379
- # => #<DaruLite::DataFrame(3x3)>
1380
- # a b c
1381
- # 0 1 2 1
1382
- # 1 2 1 1
1383
- # 2 nil 3 1
1384
- # df.vector_sum [:a, :c]
1385
- # => #<DaruLite::Vector(3)>
1386
- # 0 2
1387
- # 1 3
1388
- # 2 nil
1389
- # df.vector_sum
1390
- # => #<DaruLite::Vector(3)>
1391
- # 0 4
1392
- # 1 4
1393
- # 2 nil
1394
- # df.vector_sum skipnil: true
1395
- # => #<DaruLite::Vector(3)>
1396
- # c
1397
- # 0 4
1398
- # 1 4
1399
- # 2 4
1400
- #
1401
- def vector_sum(*args)
1402
- defaults = { vecs: nil, skipnil: false }
1403
- options = args.last.is_a?(::Hash) ? args.pop : {}
1404
- options = defaults.merge(options)
1405
- vecs = args[0] || options[:vecs]
1406
- skipnil = args[1] || options[:skipnil]
1407
-
1408
- vecs ||= numeric_vectors
1409
- sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
1410
- vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
1411
- end
1412
-
1413
- # Calculate mean of the rows of the dataframe.
1414
- #
1415
- # == Arguments
1416
- #
1417
- # * +max_missing+ - The maximum number of elements in the row that can be
1418
- # zero for the mean calculation to happen. Default to 0.
1419
- def vector_mean(max_missing = 0)
1420
- # FIXME: in vector_sum we preserve created vector dtype, but
1421
- # here we are not. Is this by design or ...? - zverok, 2016-05-18
1422
- mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
1423
-
1424
- each_row_with_index.with_object(mean_vec) do |(row, i), memo|
1425
- memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
1426
- end
1427
- end
1428
-
1429
- # Group elements by vector to perform operations on them. Returns a
1430
- # DaruLite::Core::GroupBy object.See the DaruLite::Core::GroupBy docs for a detailed
1431
- # list of possible operations.
1432
- #
1433
- # == Arguments
1434
- #
1435
- # * vectors - An Array contatining names of vectors to group by.
1436
- #
1437
- # == Usage
1438
- #
1439
- # df = DaruLite::DataFrame.new({
1440
- # a: %w{foo bar foo bar foo bar foo foo},
1441
- # b: %w{one one two three two two one three},
1442
- # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1443
- # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1444
- # })
1445
- # df.group_by([:a,:b,:c]).groups
1446
- # #=> {["bar", "one", 2]=>[1],
1447
- # # ["bar", "three", 1]=>[3],
1448
- # # ["bar", "two", 6]=>[5],
1449
- # # ["foo", "one", 1]=>[0],
1450
- # # ["foo", "one", 3]=>[6],
1451
- # # ["foo", "three", 8]=>[7],
1452
- # # ["foo", "two", 3]=>[2, 4]}
1453
- def group_by(*vectors)
1454
- vectors.flatten!
1455
- missing = vectors - @vectors.to_a
1456
- raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
1457
-
1458
- vectors = [@vectors.first] if vectors.empty?
1459
-
1460
- DaruLite::Core::GroupBy.new(self, vectors)
1461
- end
1462
-
1463
- def reindex_vectors(new_vectors)
1464
- unless new_vectors.is_a?(DaruLite::Index)
1465
- raise ArgumentError, 'Must pass the new index of type Index or its ' \
1466
- "subclasses, not #{new_vectors.class}"
1467
- end
1468
-
1469
- cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1470
- new_vectors.each_with_object(cl) do |vec, memo|
1471
- memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
1472
- end
1473
- end
1474
-
1475
- def get_vector_anyways(v)
1476
- @vectors.include?(v) ? self[v].to_a : Array.new(size)
1477
- end
1478
-
1479
- # Concatenate another DataFrame along corresponding columns.
1480
- # If columns do not exist in both dataframes, they are filled with nils
1481
- def concat(other_df)
1482
- vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
1483
-
1484
- data = vectors.map do |v|
1485
- get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
1486
- end
1487
-
1488
- DaruLite::DataFrame.new(data, order: vectors)
1489
- end
1490
-
1491
- # Concatenates another DataFrame as #concat.
1492
- # Additionally it tries to preserve the index. If the indices contain
1493
- # common elements, #union will overwrite the according rows in the
1494
- # first dataframe.
1495
- def union(other_df)
1496
- index = (@index.to_a + other_df.index.to_a).uniq
1497
- df = row[*(@index.to_a - other_df.index.to_a)]
1498
-
1499
- df = df.concat(other_df)
1500
- df.index = DaruLite::Index.new(index)
1501
- df
1502
- end
1503
-
1504
- module SetSingleIndexStrategy
1505
- def self.uniq_size(df, col)
1506
- df[col].uniq.size
1507
- end
1508
-
1509
- def self.new_index(df, col)
1510
- DaruLite::Index.new(df[col].to_a)
1511
- end
1512
-
1513
- def self.delete_vector(df, col)
1514
- df.delete_vector(col)
1515
- end
1516
- end
1517
-
1518
- module SetCategoricalIndexStrategy
1519
- def self.new_index(df, col)
1520
- DaruLite::CategoricalIndex.new(df[col].to_a)
1521
- end
1522
-
1523
- def self.delete_vector(df, col)
1524
- df.delete_vector(col)
1525
- end
1526
- end
1527
-
1528
- module SetMultiIndexStrategy
1529
- def self.uniq_size(df, cols)
1530
- df[*cols].uniq.size
1531
- end
1532
-
1533
- def self.new_index(df, cols)
1534
- DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
1535
- mi.name = cols
1536
- end
1537
- end
1538
-
1539
- def self.delete_vector(df, cols)
1540
- df.delete_vectors(*cols)
1541
- end
1542
- end
1543
-
1544
- # Set a particular column as the new DF
1545
- def set_index(new_index_col, keep: false, categorical: false)
1546
- if categorical
1547
- strategy = SetCategoricalIndexStrategy
1548
- elsif new_index_col.respond_to?(:to_a)
1549
- strategy = SetMultiIndexStrategy
1550
- new_index_col = new_index_col.to_a
1551
- else
1552
- strategy = SetSingleIndexStrategy
1553
- end
1554
-
1555
- unless categorical
1556
- uniq_size = strategy.uniq_size(self, new_index_col)
1557
- raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
1558
- end
1559
-
1560
- self.index = strategy.new_index(self, new_index_col)
1561
- strategy.delete_vector(self, new_index_col) unless keep
1562
- self
1563
- end
1564
-
1565
- # Change the index of the DataFrame and preserve the labels of the previous
1566
- # indexing. New index can be DaruLite::Index or any of its subclasses.
1567
- #
1568
- # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
1569
- # @example Reindexing DataFrame
1570
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
1571
- # index: ['a','b','c','d'])
1572
- # #=>
1573
- # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1574
- # # a b
1575
- # # a 1 11
1576
- # # b 2 22
1577
- # # c 3 33
1578
- # # d 4 44
1579
- # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
1580
- # #=>
1581
- # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1582
- # # a b
1583
- # # b 2 22
1584
- # # 0 nil nil
1585
- # # a 1 11
1586
- # # g nil nil
1587
- def reindex(new_index)
1588
- unless new_index.is_a?(DaruLite::Index)
1589
- raise ArgumentError, 'Must pass the new index of type Index or its ' \
1590
- "subclasses, not #{new_index.class}"
1591
- end
1592
-
1593
- cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1594
- new_index.each_with_object(cl) do |idx, memo|
1595
- memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
1596
- end
1597
- end
1598
-
1599
- def reset_index
1600
- index_df = index.to_df
1601
- names = index.name
1602
- names = [names] unless names.instance_of?(Array)
1603
- new_vectors = names + vectors.to_a
1604
- self.index = index_df.index
1605
- names.each do |name|
1606
- self[name] = index_df[name]
1607
- end
1608
- self.order = new_vectors
1609
- self
1610
- end
1611
-
1612
- # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
1613
- #
1614
- # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
1615
- # are to be indexed.
1616
- # @example Reassigining index of a DataFrame
1617
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1618
- # df.index.to_a #=> [0,1,2,3]
1619
- #
1620
- # df.index = DaruLite::Index.new(['a','b','c','d'])
1621
- # df.index.to_a #=> ['a','b','c','d']
1622
- # df.row['a'].to_a #=> [1,11]
1623
- def index=(idx)
1624
- @index = Index.coerce idx
1625
- @data.each { |vec| vec.index = @index }
1626
-
1627
- self
1628
- end
1629
-
1630
- # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
1631
- #
1632
- # @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
1633
- # be indexed. Must of the same size as ncols.
1634
- # @example Reassigning vectors of a DataFrame
1635
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
1636
- # df.vectors.to_a #=> [:a, :b, :c]
1637
- #
1638
- # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
1639
- # df.vectors.to_a #=> [:foo, :bar, :baz]
1640
- def vectors=(new_index)
1641
- raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)
1642
-
1643
- if new_index.size != ncols
1644
- raise ArgumentError, "Specified index length #{new_index.size} not equal to" \
1645
- "dataframe size #{ncols}"
1646
- end
1647
-
1648
- @vectors = new_index
1649
- @data.zip(new_index.to_a).each do |vect, name|
1650
- vect.name = name
1651
- end
1652
- self
1653
- end
1654
-
1655
- # Renames the vectors
1656
- #
1657
- # == Arguments
1658
- #
1659
- # * name_map - A hash where the keys are the exising vector names and
1660
- # the values are the new names. If a vector is renamed
1661
- # to a vector name that is already in use, the existing
1662
- # one is overwritten.
1663
- #
1664
- # == Usage
1665
- #
1666
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1667
- # df.rename_vectors :a => :alpha, :c => :gamma
1668
- # df.vectors.to_a #=> [:alpha, :b, :gamma]
1669
- def rename_vectors(name_map)
1670
- existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
1671
- delete_vectors(*existing_targets)
1672
-
1673
- new_names = vectors.to_a.map { |v| name_map[v] || v }
1674
- self.vectors = DaruLite::Index.new new_names
1675
- end
1676
-
1677
- # Renames the vectors and returns itself
1678
- #
1679
- # == Arguments
1680
- #
1681
- # * name_map - A hash where the keys are the exising vector names and
1682
- # the values are the new names. If a vector is renamed
1683
- # to a vector name that is already in use, the existing
1684
- # one is overwritten.
1685
- #
1686
- # == Usage
1687
- #
1688
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1689
- # df.rename_vectors! :a => :alpha, :c => :gamma # df
1690
- def rename_vectors!(name_map)
1691
- rename_vectors(name_map)
1692
- self
1693
- end
1694
-
1695
- # Converts the vectors to a DaruLite::MultiIndex.
1696
- # The argument passed is used as the MultiIndex's top level
1697
- def add_level_to_vectors(top_level_label)
1698
- tuples = vectors.map { |label| [top_level_label, *label] }
1699
- self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
1700
- end
1701
-
1702
- # Return the indexes of all the numeric vectors. Will include vectors with nils
1703
- # alongwith numbers.
1704
- def numeric_vectors
1705
- # FIXME: Why _with_index ?..
1706
- each_vector_with_index
1707
- .select { |vec, _i| vec.numeric? }
1708
- .map(&:last)
1709
- end
1710
-
1711
- def numeric_vector_names
1712
- @vectors.select { |v| self[v].numeric? }
1713
- end
1714
-
1715
- # Return a DataFrame of only the numerical Vectors. If clone: false
1716
- # is specified as option, only a *view* of the Vectors will be
1717
- # returned. Defaults to clone: true.
1718
- def only_numerics(opts = {})
1719
- cln = opts[:clone] != false
1720
- arry = numeric_vectors.map { |v| self[v] }
1721
-
1722
- order = Index.new(numeric_vectors)
1723
- DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
1724
- end
1725
-
1726
- # Generate a summary of this DataFrame based on individual vectors in the DataFrame
1727
- # @return [String] String containing the summary of the DataFrame
1728
- def summary
1729
- summary = "= #{name}"
1730
- summary << "\n Number of rows: #{nrows}"
1731
- @vectors.each do |v|
1732
- summary << "\n Element:[#{v}]\n"
1733
- summary << self[v].summary(1)
1734
- end
1735
- summary
1736
- end
1737
-
1738
- # Sorts a dataframe (ascending/descending) in the given pripority sequence of
1739
- # vectors, with or without a block.
1740
- #
1741
- # @param vector_order [Array] The order of vector names in which the DataFrame
1742
- # should be sorted.
1743
- # @param opts [Hash] opts The options to sort with.
1744
- # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1745
- # or descending order. Specify Array corresponding to *order* for multiple
1746
- # sort orders.
1747
- # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1748
- # to be used for sorting, for each vector name in *order* as a hash of
1749
- # vector name and lambda expressions. In case a lambda for a vector is not
1750
- # specified, the default will be used.
1751
- # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1752
- # automatically or not when a block is provided.
1753
- # If set to True, nils will appear at top after sorting.
1754
- #
1755
- # @example Sort a dataframe with a vector sequence.
1756
- #
1757
- #
1758
- # df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1759
- #
1760
- # df.sort [:a, :b]
1761
- # # =>
1762
- # # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1763
- # # a b
1764
- # # 2 1 3
1765
- # # 0 1 5
1766
- # # 3 2 2
1767
- # # 1 2 4
1768
- # # 4 3 1
1769
- #
1770
- # @example Sort a dataframe without a block. Here nils will be handled automatically.
1771
- #
1772
- # df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1773
- #
1774
- # df.sort([:a])
1775
- # # =>
1776
- # # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1777
- # # a b
1778
- # # 1 nil 3
1779
- # # 3 nil 1
1780
- # # 0 -3 4
1781
- # # 2 -1 2
1782
- # # 4 5 4
1783
- #
1784
- # @example Sort a dataframe with a block with nils handled automatically.
1785
- #
1786
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1787
- #
1788
- # df.sort [:b], by: {b: lambda { |a| a.length } }
1789
- # # NoMethodError: undefined method `length' for nil:NilClass
1790
- # # from (pry):8:in `block in __pry__'
1791
- #
1792
- # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1793
- #
1794
- # # =>
1795
- # # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1796
- # # a b
1797
- # # 2 1 nil
1798
- # # 5 1 nil
1799
- # # 4 -1 x
1800
- # # 1 -1 aa
1801
- # # 0 nil aaa
1802
- # # 3 nil baaa
1803
- #
1804
- # @example Sort a dataframe with a block with nils handled manually.
1805
- #
1806
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1807
- #
1808
- # # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
1809
- # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1810
- #
1811
- # # =>
1812
- # #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1813
- # # a b
1814
- # # 4 -1 x
1815
- # # 1 -1 aa
1816
- # # 0 nil aaa
1817
- # # 3 nil baaa
1818
- # # 2 1 nil
1819
- # # 5 1 nil
1820
-
1821
- def sort!(vector_order, opts = {})
1822
- raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1823
-
1824
- # To enable sorting with categorical data,
1825
- # map categories to integers preserving their order
1826
- old = convert_categorical_vectors vector_order
1827
- block = sort_prepare_block vector_order, opts
1828
-
1829
- order = @index.size.times.sort(&block)
1830
- new_index = @index.reorder order
1831
-
1832
- # To reverse map mapping of categorical data to integers
1833
- restore_categorical_vectors old
1834
-
1835
- @data.each do |vector|
1836
- vector.reorder! order
1837
- end
1838
-
1839
- self.index = new_index
1840
-
1841
- self
1842
- end
1843
-
1844
- # Non-destructive version of #sort!
1845
- def sort(vector_order, opts = {})
1846
- dup.sort! vector_order, opts
1847
- end
1848
-
1849
- # Pivots a data frame on specified vectors and applies an aggregate function
1850
- # to quickly generate a summary.
1851
- #
1852
- # == Options
1853
- #
1854
- # +:index+ - Keys to group by on the pivot table row index. Pass vector names
1855
- # contained in an Array.
1856
- #
1857
- # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
1858
- # names contained in an Array.
1859
- #
1860
- # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
1861
- # use any of the statistics functions applicable on Vectors that can be found in
1862
- # the DaruLite::Statistics::Vector module.
1863
- #
1864
- # +:values+ - Columns to aggregate. Will consider all numeric columns not
1865
- # specified in *:index* or *:vectors*. Optional.
1866
- #
1867
- # == Usage
1868
- #
1869
- # df = DaruLite::DataFrame.new({
1870
- # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
1871
- # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
1872
- # c: ['small','large','large','small','small','large','small','large','small'],
1873
- # d: [1,2,2,3,3,4,5,6,7],
1874
- # e: [2,4,4,6,6,8,10,12,14]
1875
- # })
1876
- # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
1877
- #
1878
- # #=>
1879
- # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
1880
- # # [:e, :one] [:e, :two]
1881
- # # [:bar] 18 26
1882
- # # [:foo] 10 12
1883
- def pivot_table(opts = {})
1884
- raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?
1885
-
1886
- index = opts[:index]
1887
- vectors = opts[:vectors] || []
1888
- aggregate_function = opts[:agg] || :mean
1889
- values = prepare_pivot_values index, vectors, opts
1890
- raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1891
-
1892
- grouped = group_by(index)
1893
- return grouped.send(aggregate_function) if vectors.empty?
1894
-
1895
- super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
1896
-
1897
- pivot_dataframe super_hash
1898
- end
1899
-
1900
- # Merge vectors from two DataFrames. In case of name collision,
1901
- # the vectors names are changed to x_1, x_2 ....
1902
- #
1903
- # @return {DaruLite::DataFrame}
1904
- def merge(other_df)
1905
- unless nrows == other_df.nrows
1906
- raise ArgumentError,
1907
- "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
1908
- end
1909
-
1910
- new_fields = (@vectors.to_a + other_df.vectors.to_a)
1911
- new_fields = ArrayHelper.recode_repeated(new_fields)
1912
- DataFrame.new({}, order: new_fields).tap do |df_new|
1913
- (0...nrows).each do |i|
1914
- df_new.add_row row[i].to_a + other_df.row[i].to_a
1915
- end
1916
- df_new.index = @index if @index == other_df.index
1917
- df_new.update
1918
- end
1919
- end
1920
-
1921
- # Join 2 DataFrames with SQL style joins. Currently supports inner, left
1922
- # outer, right outer and full outer joins.
1923
- #
1924
- # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
1925
- # to be performed.
1926
- # @param [Hash] opts Options Hash
1927
- # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
1928
- # @option :on [Array] The columns on which the join is to be performed.
1929
- # Column names specified here must be common to both DataFrames.
1930
- # @option :indicator [Symbol] The name of a vector to add to the resultant
1931
- # dataframe that indicates whether the record was in the left (:left_only),
1932
- # right (:right_only), or both (:both) joining dataframes.
1933
- # @return [DaruLite::DataFrame]
1934
- # @example Inner Join
1935
- # left = DaruLite::DataFrame.new({
1936
- # :id => [1,2,3,4],
1937
- # :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
1938
- # })
1939
- # right = DaruLite::DataFrame.new({
1940
- # :id => [1,2,3,4],
1941
- # :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
1942
- # })
1943
- # left.join(right, how: :inner, on: [:name])
1944
- # #=>
1945
- # ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
1946
- # # id_1 name id_2
1947
- # # 0 1 Pirate 2
1948
- # # 1 3 Ninja 4
1949
- def join(other_df, opts = {})
1950
- DaruLite::Core::Merge.join(self, other_df, opts)
1951
- end
1952
-
1953
- # Creates a new dataset for one to many relations
1954
- # on a dataset, based on pattern of field names.
1955
- #
1956
- # for example, you have a survey for number of children
1957
- # with this structure:
1958
- # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1959
- # with
1960
- # ds.one_to_many([:id], "child_%v_%n")
1961
- # the field of first parameters will be copied verbatim
1962
- # to new dataset, and fields which match the second
1963
- # pattern will be added one case for each different %n.
1964
- #
1965
- # @example
1966
- # cases=[
1967
- # ['1','george','red',10,'blue',20,nil,nil],
1968
- # ['2','fred','green',15,'orange',30,'white',20],
1969
- # ['3','alfred',nil,nil,nil,nil,nil,nil]
1970
- # ]
1971
- # ds=DaruLite::DataFrame.rows(cases, order:
1972
- # [:id, :name,
1973
- # :car_color1, :car_value1,
1974
- # :car_color2, :car_value2,
1975
- # :car_color3, :car_value3])
1976
- # ds.one_to_many([:id],'car_%v%n').to_matrix
1977
- # #=> Matrix[
1978
- # # ["red", "1", 10],
1979
- # # ["blue", "1", 20],
1980
- # # ["green", "2", 15],
1981
- # # ["orange", "2", 30],
1982
- # # ["white", "2", 20]
1983
- # # ]
1984
- def one_to_many(parent_fields, pattern)
1985
- vars, numbers = one_to_many_components(pattern)
1986
-
1987
- DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
1988
- each_row do |row|
1989
- verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
1990
- numbers.each do |n|
1991
- generated = one_to_many_row row, n, vars, pattern
1992
- next if generated.values.all?(&:nil?)
1993
-
1994
- ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
1995
- end
1996
- end
1997
- ds.update
1998
- end
1999
- end
2000
-
2001
- def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
2002
- self[nm]
2003
- .split_by_separator(sep)
2004
- .each_with_index do |(k, v), i|
2005
- v.rename "#{nm}:#{k}"
2006
- self[:"#{nm}#{join}#{i + 1}"] = v
2007
- end
2008
- end
2009
-
2010
- # Create a SQL CREATE TABLE statement, based on a given Dataset
2011
- #
2012
- # == Arguments
2013
- #
2014
- # * table - String specifying name of the table that will be created in SQL.
2015
- # * charset - Character set. Default is "UTF8".
2016
- #
2017
- # @example
2018
- #
2019
- # ds = DaruLite::DataFrame.new({
2020
- # :id => DaruLite::Vector.new([1,2,3,4,5]),
2021
- # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
2022
- # })
2023
- # ds.create_sql('names')
2024
- # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
2025
- #
2026
- def create_sql(table, charset = 'UTF8')
2027
- sql = "CREATE TABLE #{table} ("
2028
- fields = vectors.to_a.collect do |f|
2029
- v = self[f]
2030
- "#{f} #{v.db_type}"
2031
- end
2032
-
2033
- sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
2034
- end
2035
-
2036
- # Returns the dataframe. This can be convenient when the user does not
2037
- # know whether the object is a vector or a dataframe.
2038
- # @return [self] the dataframe
2039
- def to_df
2040
- self
2041
- end
2042
-
2043
- # Convert all vectors of type *:numeric* into a Matrix.
2044
- def to_matrix
2045
- Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
2046
- end
2047
-
2048
- # Converts the DataFrame into an array of hashes where key is vector name
2049
- # and value is the corresponding element. The 0th index of the array contains
2050
- # the array of hashes while the 1st index contains the indexes of each row
2051
- # of the dataframe. Each element in the index array corresponds to its row
2052
- # in the array of hashes, which has the same index.
2053
- def to_a
2054
- [each_row.map(&:to_h), @index.to_a]
2055
- end
2056
-
2057
- # Convert to json. If no_index is true then the index will NOT be included
2058
- # in the JSON thus created.
2059
- def to_json(no_index = true)
2060
- if no_index
2061
- to_a[0].to_json
2062
- else
2063
- to_a.to_json
2064
- end
2065
- end
2066
-
2067
- # Converts DataFrame to a hash (explicit) with keys as vector names and values as
2068
- # the corresponding vectors.
2069
- def to_h
2070
- @vectors
2071
- .each_with_index
2072
- .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
2073
- end
316
+ def nest(*tree_keys, &block)
317
+ tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
2074
318
 
2075
- # Convert to html for IRuby.
2076
- def to_html(threshold = DaruLite.max_rows)
2077
- table_thead = to_html_thead
2078
- table_tbody = to_html_tbody(threshold)
2079
- path = if index.is_a?(MultiIndex)
2080
- File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
2081
- else
2082
- File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
2083
- end
2084
- ERB.new(File.read(path).strip).result(binding)
2085
- end
319
+ each_row.with_object({}) do |row, current|
320
+ # Create tree
321
+ *keys, last = tree_keys
322
+ current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
323
+ name = row[last]
2086
324
 
2087
- def to_html_thead
2088
- table_thead_path =
2089
- if index.is_a?(MultiIndex)
2090
- File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
325
+ if block
326
+ current[name] = yield(row, current, name)
2091
327
  else
2092
- File.expand_path('iruby/templates/dataframe_thead.html.erb', __dir__)
328
+ current[name] ||= []
329
+ current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
2093
330
  end
2094
- ERB.new(File.read(table_thead_path).strip).result(binding)
331
+ end
2095
332
  end
2096
333
 
2097
- def to_html_tbody(threshold = DaruLite.max_rows)
2098
- threshold ||= @size
2099
- table_tbody_path =
2100
- if index.is_a?(MultiIndex)
2101
- File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
2102
- else
2103
- File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
2104
- end
2105
- ERB.new(File.read(table_tbody_path).strip).result(binding)
334
+ def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
335
+ self[name]
336
+ .split_by_separator(sep)
337
+ .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
2106
338
  end
2107
339
 
2108
- def to_s
2109
- "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
340
+ # Return the number of rows and columns of the DataFrame in an Array.
341
+ def shape
342
+ [nrows, ncols]
2110
343
  end
2111
344
 
2112
- # Method for updating the metadata (i.e. missing value positions) of the
2113
- # vectors after assignment/deletion etc. are complete. This is provided so that
2114
- # time is not wasted in creating the metadata for the vector each time
2115
- # assignment/deletion of elements is done. Updating data this way is called
2116
- # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
2117
- def update
2118
- @data.each(&:update) if DaruLite.lazy_update
345
+ # The number of rows
346
+ def nrows
347
+ @index.size
2119
348
  end
2120
349
 
2121
- # Rename the DataFrame.
2122
- def rename(new_name)
2123
- @name = new_name
2124
- self
350
+ # The number of vectors
351
+ def ncols
352
+ @vectors.size
2125
353
  end
2126
354
 
2127
- alias name= rename
2128
-
2129
- # Write this DataFrame to a CSV file.
355
+ # Renames the vectors
2130
356
  #
2131
357
  # == Arguments
2132
358
  #
2133
- # * filename - Path of CSV file where the DataFrame is to be saved.
359
+ # * name_map - A hash where the keys are the existing vector names and
360
+ # the values are the new names. If a vector is renamed
361
+ # to a vector name that is already in use, the existing
362
+ # one is overwritten.
2134
363
  #
2135
- # == Options
364
+ # == Usage
2136
365
  #
2137
- # * convert_comma - If set to *true*, will convert any commas in any
2138
- # of the data to full stops ('.').
2139
- # All the options accepted by CSV.read() can also be passed into this
2140
- # function.
2141
- def write_csv(filename, opts = {})
2142
- DaruLite::IO.dataframe_write_csv self, filename, opts
2143
- end
366
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
367
+ # df.rename_vectors :a => :alpha, :c => :gamma
368
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
369
+ def rename_vectors(name_map)
370
+ existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
371
+ delete_vectors(*existing_targets)
2144
372
 
2145
- # Write this dataframe to an Excel Spreadsheet
2146
- #
2147
- # == Arguments
2148
- #
2149
- # * filename - The path of the file where the DataFrame should be written.
2150
- def write_excel(filename, opts = {})
2151
- DaruLite::IO.dataframe_write_excel self, filename, opts
373
+ new_names = vectors.to_a.map { |v| name_map[v] || v }
374
+ self.vectors = DaruLite::Index.new new_names
2152
375
  end
2153
376
 
2154
- # Insert each case of the Dataset on the selected table
377
+ # Renames the vectors and returns itself
2155
378
  #
2156
379
  # == Arguments
2157
380
  #
2158
- # * dbh - DBI database connection object.
2159
- # * query - Query string.
381
+ # * name_map - A hash where the keys are the existing vector names and
382
+ # the values are the new names. If a vector is renamed
383
+ # to a vector name that is already in use, the existing
384
+ # one is overwritten.
2160
385
  #
2161
386
  # == Usage
2162
387
  #
2163
- # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
2164
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
2165
- # ds.write_sql(dbh,"test")
2166
- def write_sql(dbh, table)
2167
- DaruLite::IO.dataframe_write_sql self, dbh, table
388
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
389
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
390
+ def rename_vectors!(name_map)
391
+ rename_vectors(name_map)
392
+ self
2168
393
  end
2169
394
 
2170
- # Use marshalling to save dataframe to a file.
2171
- def save(filename)
2172
- DaruLite::IO.save self, filename
395
+ # Converts the vectors to a DaruLite::MultiIndex.
396
+ # The argument passed is used as the MultiIndex's top level
397
+ def add_level_to_vectors(top_level_label)
398
+ tuples = vectors.map { |label| [top_level_label, *label] }
399
+ self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
2173
400
  end
2174
401
 
2175
- def _dump(_depth)
2176
- Marshal.dump(
2177
- data: @data,
2178
- index: @index.to_a,
2179
- order: @vectors.to_a,
2180
- name: @name
2181
- )
402
+ def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
403
+ self[nm]
404
+ .split_by_separator(sep)
405
+ .each_with_index do |(k, v), i|
406
+ v.rename "#{nm}:#{k}"
407
+ self[:"#{nm}#{join}#{i + 1}"] = v
408
+ end
409
+ end
410
+
411
+ # Method for updating the metadata (i.e. missing value positions) of the
412
+ # vectors after assignment/deletion etc. are complete. This is provided so that
413
+ # time is not wasted in creating the metadata for the vector each time
414
+ # assignment/deletion of elements is done. Updating data this way is called
415
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
416
+ def update
417
+ @data.each(&:update) if DaruLite.lazy_update
2182
418
  end
2183
419
 
2184
- def self._load(data)
2185
- h = Marshal.load data
2186
- DaruLite::DataFrame.new(h[:data],
2187
- index: h[:index],
2188
- order: h[:order],
2189
- name: h[:name])
420
+ # Rename the DataFrame.
421
+ def rename(new_name)
422
+ @name = new_name
423
+ self
2190
424
  end
425
+ alias name= rename
2191
426
 
2192
427
  # Transpose a DataFrame, tranposing elements and row, column indexing.
2193
428
  def transpose
@@ -2218,11 +453,6 @@ module DaruLite
2218
453
  )
2219
454
  end
2220
455
 
2221
- # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
2222
- def where(bool_array)
2223
- DaruLite::Core::Query.df_where self, bool_array
2224
- end
2225
-
2226
456
  def ==(other)
2227
457
  self.class == other.class &&
2228
458
  @size == other.size &&
@@ -2276,144 +506,6 @@ module DaruLite
2276
506
  order: all_vectors.map(&:name)
2277
507
  end
2278
508
 
2279
- # Split the dataframe into many dataframes based on category vector
2280
- # @param [object] cat_name name of category vector to split the dataframe
2281
- # @return [Array] array of dataframes split by category with category vector
2282
- # used to split not included
2283
- # @example
2284
- # df = DaruLite::DataFrame.new({
2285
- # a: [1, 2, 3],
2286
- # b: ['a', 'a', 'b']
2287
- # })
2288
- # df.to_category :b
2289
- # df.split_by_category :b
2290
- # # => [#<DaruLite::DataFrame: a (2x1)>
2291
- # # a
2292
- # # 0 1
2293
- # # 1 2,
2294
- # # #<DaruLite::DataFrame: b (1x1)>
2295
- # # a
2296
- # # 2 3]
2297
- def split_by_category(cat_name)
2298
- cat_dv = self[cat_name]
2299
- raise ArgumentError, "#{cat_name} is not a category vector" unless
2300
- cat_dv.category?
2301
-
2302
- cat_dv.categories.map do |cat|
2303
- where(cat_dv.eq cat)
2304
- .rename(cat)
2305
- .delete_vector cat_name
2306
- end
2307
- end
2308
-
2309
- # @param indexes [Array] index(s) at which row tuples are retrieved
2310
- # @return [Array] returns array of row tuples at given index(s)
2311
- # @example Using DaruLite::Index
2312
- # df = DaruLite::DataFrame.new({
2313
- # a: [1, 2, 3],
2314
- # b: ['a', 'a', 'b']
2315
- # })
2316
- #
2317
- # df.access_row_tuples_by_indexs(1,2)
2318
- # # => [[2, "a"], [3, "b"]]
2319
- #
2320
- # df.index = DaruLite::Index.new([:one,:two,:three])
2321
- # df.access_row_tuples_by_indexs(:one,:three)
2322
- # # => [[1, "a"], [3, "b"]]
2323
- #
2324
- # @example Using DaruLite::MultiIndex
2325
- # mi_idx = DaruLite::MultiIndex.from_tuples [
2326
- # [:a,:one,:bar],
2327
- # [:a,:one,:baz],
2328
- # [:b,:two,:bar],
2329
- # [:a,:two,:baz],
2330
- # ]
2331
- # df_mi = DaruLite::DataFrame.new({
2332
- # a: 1..4,
2333
- # b: 'a'..'d'
2334
- # }, index: mi_idx )
2335
- #
2336
- # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2337
- # # => [[3, "c"]]
2338
- # df_mi.access_row_tuples_by_indexs(:a)
2339
- # # => [[1, "a"], [2, "b"], [4, "d"]]
2340
- def access_row_tuples_by_indexs(*indexes)
2341
- return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
2342
- @index.is_a?(DaruLite::MultiIndex)
2343
-
2344
- positions = @index.pos(*indexes)
2345
- if positions.is_a? Numeric
2346
- row = get_rows_for([positions])
2347
- row.first.is_a?(Array) ? row : [row]
2348
- else
2349
- new_rows = get_rows_for(indexes, by_position: false)
2350
- indexes.map { |index| new_rows.map { |r| r[index] } }
2351
- end
2352
- end
2353
-
2354
- # Function to use for aggregating the data.
2355
- #
2356
- # @param options [Hash] options for column, you want in resultant dataframe
2357
- #
2358
- # @return [DaruLite::DataFrame]
2359
- #
2360
- # @example
2361
- # df = DaruLite::DataFrame.new(
2362
- # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2363
- # => #<DaruLite::DataFrame(5x2)>
2364
- # col num
2365
- # 0 a 52
2366
- # 1 b 12
2367
- # 2 c 7
2368
- # 3 d 17
2369
- # 4 e 1
2370
- #
2371
- # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2372
- # => #<DaruLite::DataFrame(5x1)>
2373
- # num_100_ti
2374
- # 0 5200
2375
- # 1 1200
2376
- # 2 700
2377
- # 3 1700
2378
- # 4 100
2379
- #
2380
- # When we have duplicate index :
2381
- #
2382
- # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
2383
- # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2384
- # => #<DaruLite::DataFrame(5x1)>
2385
- # num
2386
- # a 52
2387
- # b 12
2388
- # a 7
2389
- # a 17
2390
- # c 1
2391
- #
2392
- # df.aggregate(num: :mean)
2393
- # => #<DaruLite::DataFrame(3x1)>
2394
- # num
2395
- # a 25.3333333
2396
- # b 12
2397
- # c 1
2398
- #
2399
- # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2400
- # internally.
2401
- def aggregate(options = {}, multi_index_level = -1)
2402
- if block_given?
2403
- positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
2404
- else
2405
- positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
2406
- end
2407
-
2408
- colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
2409
-
2410
- DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
2411
- end
2412
-
2413
- def group_by_and_aggregate(*group_by_keys, **aggregation_map)
2414
- group_by(*group_by_keys).aggregate(aggregation_map)
2415
- end
2416
-
2417
509
  private
2418
510
 
2419
511
  def headers
@@ -2424,20 +516,6 @@ module DaruLite
2424
516
  index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
2425
517
  end
2426
518
 
2427
- def convert_categorical_vectors(names)
2428
- names.filter_map do |n|
2429
- next unless self[n].category?
2430
-
2431
- old = [n, self[n]]
2432
- self[n] = DaruLite::Vector.new(self[n].to_ints)
2433
- old
2434
- end
2435
- end
2436
-
2437
- def restore_categorical_vectors(old)
2438
- old.each { |name, vector| self[name] = vector }
2439
- end
2440
-
2441
519
  def recursive_product(dfs)
2442
520
  return dfs.first if dfs.size == 1
2443
521
 
@@ -2449,12 +527,6 @@ module DaruLite
2449
527
  end
2450
528
  end
2451
529
 
2452
- def should_be_vector!(val)
2453
- return val if val.is_a?(DaruLite::Vector)
2454
-
2455
- raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
2456
- end
2457
-
2458
530
  def dispatch_to_axis(axis, method, *args, &block)
2459
531
  if %i[vector column].include?(axis)
2460
532
  send(:"#{method}_vector", *args, &block)
@@ -2485,76 +557,6 @@ module DaruLite
2485
557
  end
2486
558
  end
2487
559
 
2488
- def access_vector(*names)
2489
- if names.first.is_a?(Range)
2490
- dup(@vectors.subset(names.first))
2491
- elsif @vectors.is_a?(MultiIndex)
2492
- access_vector_multi_index(*names)
2493
- else
2494
- access_vector_single_index(*names)
2495
- end
2496
- end
2497
-
2498
- def access_vector_multi_index(*names)
2499
- pos = @vectors[names]
2500
-
2501
- return @data[pos] if pos.is_a?(Integer)
2502
-
2503
- new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
2504
-
2505
- pos = pos.drop_left_level(names.size) if names.size < @vectors.width
2506
-
2507
- DaruLite::DataFrame.new(new_vectors, index: @index, order: pos)
2508
- end
2509
-
2510
- def access_vector_single_index(*names)
2511
- if names.count < 2
2512
- begin
2513
- pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
2514
- rescue IndexError
2515
- raise IndexError, "Specified vector #{names.first} does not exist"
2516
- end
2517
- return @data[pos] if pos.is_a?(Numeric)
2518
-
2519
- names = pos
2520
- end
2521
-
2522
- new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
2523
-
2524
- order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
2525
- DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
2526
- end
2527
-
2528
- def access_row(*indexes)
2529
- positions = @index.pos(*indexes)
2530
-
2531
- if positions.is_a? Numeric
2532
- row = get_rows_for([positions])
2533
- DaruLite::Vector.new row, index: @vectors, name: indexes.first
2534
- else
2535
- new_rows = get_rows_for(indexes, by_position: false)
2536
- DaruLite::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
2537
- end
2538
- end
2539
-
2540
- # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
2541
- # because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
2542
- # values (representing a row) or an array of Vectors (that can be seen as rows)
2543
- def get_rows_for(keys, by_position: true)
2544
- raise unless keys.is_a?(Array)
2545
-
2546
- if by_position
2547
- pos = keys
2548
- @data.map { |vector| vector.at(*pos) }
2549
- else
2550
- # TODO: for now (2018-07-27), it is different than using
2551
- # get_rows_for(@index.pos(*keys))
2552
- # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
2553
- indexes = keys
2554
- @data.map { |vec| vec[*indexes] }
2555
- end
2556
- end
2557
-
2558
560
  def insert_or_modify_vector(name, vector)
2559
561
  name = name[0] unless @vectors.is_a?(MultiIndex)
2560
562
 
@@ -2837,146 +839,6 @@ module DaruLite
2837
839
  end
2838
840
  end
2839
841
 
2840
- def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
2841
- # Create an array to be used for comparison of two rows in sorting
2842
- vector_locs
2843
- .zip(by_blocks, ascending, handle_nils)
2844
- .map do |vector_loc, by, asc, handle_nil|
2845
- value = @data[vector_loc].data[asc ? r1 : r2]
2846
-
2847
- if by
2848
- value = begin
2849
- by.call(value)
2850
- rescue StandardError
2851
- nil
2852
- end
2853
- end
2854
-
2855
- sort_handle_nils value, asc, handle_nil || !by
2856
- end
2857
- end
2858
-
2859
- def sort_handle_nils(value, asc, handle_nil)
2860
- if !handle_nil
2861
- value
2862
- elsif asc
2863
- [value.nil? ? 0 : 1, value]
2864
- else
2865
- [value.nil? ? 1 : 0, value]
2866
- end
2867
- end
2868
-
2869
- def sort_coerce_boolean(opts, symbol, default, size)
2870
- val = opts[symbol]
2871
- case val
2872
- when true, false
2873
- Array.new(size, val)
2874
- when nil
2875
- Array.new(size, default)
2876
- when Array
2877
- raise ArgumentError, "Specify same number of vector names and #{symbol}" if
2878
- size != val.size
2879
-
2880
- val
2881
- else
2882
- raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
2883
- end
2884
- end
2885
-
2886
- def sort_prepare_block(vector_order, opts)
2887
- ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
2888
- handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
2889
-
2890
- by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
2891
- vector_locs = vector_order.map { |v| @vectors[v] }
2892
-
2893
- lambda do |index1, index2|
2894
- # Build left and right array to compare two rows
2895
- left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
2896
- right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
2897
-
2898
- # Resolve conflict by Index if all attributes are same
2899
- left << index1
2900
- right << index2
2901
- left <=> right
2902
- end
2903
- end
2904
-
2905
- def verify_error_message(row, test, id, i)
2906
- description, fields, = test
2907
- values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
2908
- "#{i + 1} [#{row[id]}]: #{description}#{values}"
2909
- end
2910
-
2911
- def prepare_pivot_values(index, vectors, opts)
2912
- case opts[:values]
2913
- when nil # values not specified at all.
2914
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
2915
- when Array # multiple values specified.
2916
- opts[:values]
2917
- else # single value specified.
2918
- [opts[:values]]
2919
- end
2920
- end
2921
-
2922
- def make_pivot_hash(grouped, vectors, values, aggregate_function)
2923
- grouped.groups.transform_values { |_| {} }.tap do |super_hash|
2924
- values.each do |value|
2925
- grouped.groups.each do |group_name, row_numbers|
2926
- row_numbers.each do |num|
2927
- arry = [value, *vectors.map { |v| self[v][num] }]
2928
- sub_hash = super_hash[group_name]
2929
- sub_hash[arry] ||= []
2930
-
2931
- sub_hash[arry] << self[value][num]
2932
- end
2933
- end
2934
- end
2935
-
2936
- setup_pivot_aggregates super_hash, aggregate_function
2937
- end
2938
- end
2939
-
2940
- def setup_pivot_aggregates(super_hash, aggregate_function)
2941
- super_hash.each_value do |sub_hash|
2942
- sub_hash.each do |group_name, aggregates|
2943
- sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
2944
- end
2945
- end
2946
- end
2947
-
2948
- def pivot_dataframe(super_hash)
2949
- df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
2950
- df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
2951
-
2952
- DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
2953
- super_hash.each do |row_index, sub_h|
2954
- sub_h.each do |vector_index, val|
2955
- pivoted_dataframe[vector_index][row_index] = val
2956
- end
2957
- end
2958
- end
2959
- end
2960
-
2961
- def one_to_many_components(pattern)
2962
- re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
2963
-
2964
- vars, numbers =
2965
- @vectors
2966
- .map { |v| v.scan(re) }
2967
- .reject(&:empty?).flatten(1).transpose
2968
-
2969
- [vars.uniq, numbers.map(&:to_i).sort.uniq]
2970
- end
2971
-
2972
- def one_to_many_row(row, number, vars, pattern)
2973
- vars
2974
- .to_h do |v|
2975
- name = pattern.sub('%v', v).sub('%n', number.to_s)
2976
- [v, row[name]]
2977
- end
2978
- end
2979
-
2980
842
  # Raises IndexError when one of the positions is not a valid position
2981
843
  def validate_positions(*positions, size)
2982
844
  positions.each do |pos|
@@ -3001,82 +863,5 @@ module DaruLite
3001
863
  DaruLite::Vector.new(source[idx], index: @index, name: vectors[idx])
3002
864
  end
3003
865
  end
3004
-
3005
- def aggregate_by_positions_tuples(options, positions_tuples)
3006
- agg_over_vectors_only, options = cast_aggregation_options(options)
3007
-
3008
- if agg_over_vectors_only
3009
- options.map do |vect_name, method|
3010
- vect = self[vect_name]
3011
-
3012
- positions_tuples.map do |positions|
3013
- vect.apply_method_on_sub_vector(method, keys: positions)
3014
- end
3015
- end
3016
- else
3017
- methods = options.values
3018
-
3019
- # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
3020
- rows = positions_tuples.map do |positions|
3021
- apply_method_on_sub_df(methods, keys: positions)
3022
- end
3023
-
3024
- rows.transpose
3025
- end
3026
- end
3027
-
3028
- # convert operations over sub-vectors to operations over sub-dfs when it improves perf
3029
- # note: we don't always "cast" because aggregation over a single vector / a few vector is faster
3030
- # than aggregation over (sub-)dfs
3031
- def cast_aggregation_options(options)
3032
- vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
3033
-
3034
- over_vectors = true
3035
-
3036
- if non_vects.any?
3037
- options = options.clone
3038
-
3039
- vects.each do |name|
3040
- proc_on_vect = options[name].to_proc
3041
- options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
3042
- end
3043
-
3044
- over_vectors = false
3045
- end
3046
-
3047
- [over_vectors, options]
3048
- end
3049
-
3050
- def group_index_for_aggregation(index, multi_index_level = -1)
3051
- case index
3052
- when DaruLite::MultiIndex
3053
- groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
3054
-
3055
- new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
3056
- pos_tuples = groups_by_pos.values
3057
- when DaruLite::Index, DaruLite::CategoricalIndex
3058
- new_index = Array(index).uniq
3059
- pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
3060
- else raise
3061
- end
3062
-
3063
- [pos_tuples, new_index]
3064
- end
3065
-
3066
- # coerce ranges, integers and array in appropriate ways
3067
- def coerce_positions(*positions, size)
3068
- if positions.size == 1
3069
- case positions.first
3070
- when Integer
3071
- positions.first
3072
- when Range
3073
- size.times.to_a[positions.first]
3074
- else
3075
- raise ArgumentError, 'Unknown position type.'
3076
- end
3077
- else
3078
- positions
3079
- end
3080
- end
3081
866
  end
3082
867
  end