daru_lite 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +35 -33
  3. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  4. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  5. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  6. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  7. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  8. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  9. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  10. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  11. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  12. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  13. data/lib/daru_lite/data_frame/missable.rb +75 -0
  14. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  15. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  16. data/lib/daru_lite/data_frame/setable.rb +109 -0
  17. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  18. data/lib/daru_lite/dataframe.rb +138 -2353
  19. data/lib/daru_lite/index/index.rb +14 -1
  20. data/lib/daru_lite/index/multi_index.rb +9 -0
  21. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  22. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  23. data/lib/daru_lite/vector/calculatable.rb +78 -0
  24. data/lib/daru_lite/vector/convertible.rb +77 -0
  25. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  26. data/lib/daru_lite/vector/fetchable.rb +175 -0
  27. data/lib/daru_lite/vector/filterable.rb +128 -0
  28. data/lib/daru_lite/vector/indexable.rb +77 -0
  29. data/lib/daru_lite/vector/iterable.rb +95 -0
  30. data/lib/daru_lite/vector/joinable.rb +17 -0
  31. data/lib/daru_lite/vector/missable.rb +124 -0
  32. data/lib/daru_lite/vector/queryable.rb +45 -0
  33. data/lib/daru_lite/vector/setable.rb +47 -0
  34. data/lib/daru_lite/vector/sortable.rb +113 -0
  35. data/lib/daru_lite/vector.rb +36 -932
  36. data/lib/daru_lite/version.rb +1 -1
  37. data/spec/data_frame/aggregatable_example.rb +65 -0
  38. data/spec/data_frame/buildable_example.rb +109 -0
  39. data/spec/data_frame/calculatable_example.rb +135 -0
  40. data/spec/data_frame/convertible_example.rb +180 -0
  41. data/spec/data_frame/duplicatable_example.rb +111 -0
  42. data/spec/data_frame/fetchable_example.rb +476 -0
  43. data/spec/data_frame/filterable_example.rb +409 -0
  44. data/spec/data_frame/indexable_example.rb +221 -0
  45. data/spec/data_frame/iterable_example.rb +465 -0
  46. data/spec/data_frame/joinable_example.rb +106 -0
  47. data/spec/data_frame/missable_example.rb +47 -0
  48. data/spec/data_frame/pivotable_example.rb +297 -0
  49. data/spec/data_frame/queryable_example.rb +92 -0
  50. data/spec/data_frame/setable_example.rb +482 -0
  51. data/spec/data_frame/sortable_example.rb +350 -0
  52. data/spec/dataframe_spec.rb +181 -3289
  53. data/spec/index/categorical_index_spec.rb +27 -8
  54. data/spec/index/index_spec.rb +21 -0
  55. data/spec/index/multi_index_spec.rb +85 -76
  56. data/spec/vector/aggregatable_example.rb +27 -0
  57. data/spec/vector/calculatable_example.rb +82 -0
  58. data/spec/vector/convertible_example.rb +126 -0
  59. data/spec/vector/duplicatable_example.rb +48 -0
  60. data/spec/vector/fetchable_example.rb +463 -0
  61. data/spec/vector/filterable_example.rb +165 -0
  62. data/spec/vector/indexable_example.rb +201 -0
  63. data/spec/vector/iterable_example.rb +111 -0
  64. data/spec/vector/joinable_example.rb +25 -0
  65. data/spec/vector/missable_example.rb +88 -0
  66. data/spec/vector/queryable_example.rb +91 -0
  67. data/spec/vector/setable_example.rb +300 -0
  68. data/spec/vector/sortable_example.rb +242 -0
  69. data/spec/vector_spec.rb +111 -1805
  70. metadata +86 -2
@@ -1,10 +1,40 @@
1
1
  require 'daru_lite/accessors/dataframe_by_row'
2
+ require 'daru_lite/data_frame/aggregatable'
3
+ require 'daru_lite/data_frame/calculatable'
4
+ require 'daru_lite/data_frame/convertible'
5
+ require 'daru_lite/data_frame/duplicatable'
6
+ require 'daru_lite/data_frame/fetchable'
7
+ require 'daru_lite/data_frame/filterable'
8
+ require 'daru_lite/data_frame/indexable'
9
+ require 'daru_lite/data_frame/i_o_able'
10
+ require 'daru_lite/data_frame/iterable'
11
+ require 'daru_lite/data_frame/joinable'
12
+ require 'daru_lite/data_frame/missable'
13
+ require 'daru_lite/data_frame/pivotable'
14
+ require 'daru_lite/data_frame/setable'
15
+ require 'daru_lite/data_frame/sortable'
16
+ require 'daru_lite/data_frame/queryable'
2
17
  require 'daru_lite/maths/arithmetic/dataframe'
3
18
  require 'daru_lite/maths/statistics/dataframe'
4
19
  require 'daru_lite/io/io'
5
20
 
6
21
  module DaruLite
7
22
  class DataFrame # rubocop:disable Metrics/ClassLength
23
+ include DaruLite::DataFrame::Aggregatable
24
+ include DaruLite::DataFrame::Calculatable
25
+ include DaruLite::DataFrame::Convertible
26
+ include DaruLite::DataFrame::Duplicatable
27
+ include DaruLite::DataFrame::Fetchable
28
+ include DaruLite::DataFrame::Filterable
29
+ include DaruLite::DataFrame::Indexable
30
+ include DaruLite::DataFrame::Iterable
31
+ include DaruLite::DataFrame::IOAble
32
+ include DaruLite::DataFrame::Joinable
33
+ include DaruLite::DataFrame::Missable
34
+ include DaruLite::DataFrame::Pivotable
35
+ include DaruLite::DataFrame::Setable
36
+ include DaruLite::DataFrame::Sortable
37
+ include DaruLite::DataFrame::Queryable
8
38
  include DaruLite::Maths::Arithmetic::DataFrame
9
39
  include DaruLite::Maths::Statistics::DataFrame
10
40
 
@@ -13,109 +43,6 @@ module DaruLite
13
43
  extend Gem::Deprecate
14
44
 
15
45
  class << self
16
- # Load data from a CSV file. Specify an optional block to grab the CSV
17
- # object and pre-condition it (for example use the `convert` or
18
- # `header_convert` methods).
19
- #
20
- # == Arguments
21
- #
22
- # * path - Local path / Remote URL of the file to load specified as a String.
23
- #
24
- # == Options
25
- #
26
- # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
27
- # and uses those to eventually construct the resulting DataFrame.
28
- #
29
- # == Verbose Description
30
- #
31
- # You can specify all the options to the `.from_csv` function that you
32
- # do to the Ruby `CSV.read()` function, since this is what is used internally.
33
- #
34
- # For example, if the columns in your CSV file are separated by something
35
- # other than commas, you can use the `:col_sep` option. If you want to
36
- # convert numeric values to numbers and not keep them as strings, you can
37
- # use the `:converters` option and set it to `:numeric`.
38
- #
39
- # The `.from_csv` function uses the following defaults for reading CSV files
40
- # (that are passed into the `CSV.read()` function):
41
- #
42
- # {
43
- # :col_sep => ',',
44
- # :converters => :numeric
45
- # }
46
- def from_csv(path, opts = {}, &block)
47
- DaruLite::IO.from_csv path, opts, &block
48
- end
49
-
50
- # Read data from an Excel file into a DataFrame.
51
- #
52
- # == Arguments
53
- #
54
- # * path - Path of the file to be read.
55
- #
56
- # == Options
57
- #
58
- # *:worksheet_id - ID of the worksheet that is to be read.
59
- def from_excel(path, opts = {}, &block)
60
- DaruLite::IO.from_excel path, opts, &block
61
- end
62
-
63
- # Run a database query and return the result as a DataFrame
64
- #
65
- # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
66
- # @param query [String] The query to be executed
67
- #
68
- # @return A dataframe containing the data resulting from the query
69
- #
70
- # USE:
71
- #
72
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
73
- # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
74
- #
75
- # #Alternatively
76
- #
77
- # require 'dbi'
78
- # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
79
- def from_sql(dbh, query)
80
- DaruLite::IO.from_sql dbh, query
81
- end
82
-
83
- # Read a dataframe from AR::Relation
84
- #
85
- # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
86
- # @param fields [Array] Field names to be loaded (optional)
87
- #
88
- # @return A dataframe containing the data loaded from the relation
89
- #
90
- # USE:
91
- #
92
- # # When Post model is defined as:
93
- # class Post < ActiveRecord::Base
94
- # scope :active, -> { where.not(published_at: nil) }
95
- # end
96
- #
97
- # # You can load active posts into a dataframe by:
98
- # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
99
- def from_activerecord(relation, *fields)
100
- DaruLite::IO.from_activerecord relation, *fields
101
- end
102
-
103
- # Read the database from a plaintext file. For this method to work,
104
- # the data should be present in a plain text file in columns. See
105
- # spec/fixtures/bank2.dat for an example.
106
- #
107
- # == Arguments
108
- #
109
- # * path - Path of the file to be read.
110
- # * fields - Vector names of the resulting database.
111
- #
112
- # == Usage
113
- #
114
- # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
115
- def from_plaintext(path, fields)
116
- DaruLite::IO.from_plaintext path, fields
117
- end
118
-
119
46
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
120
47
  # DaruLite::Vector objects.
121
48
  def rows(source, opts = {})
@@ -316,179 +243,6 @@ module DaruLite
316
243
  update
317
244
  end
318
245
 
319
- # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
320
- # Defaults to *:vector*. Use of this method is not recommended for accessing
321
- # rows. Use df.row[:a] for accessing row with index ':a'.
322
- def [](*names)
323
- axis = extract_axis(names, :vector)
324
- dispatch_to_axis axis, :access, *names
325
- end
326
-
327
- # Retrieve rows by positions
328
- # @param [Array<Integer>] positions of rows to retrieve
329
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
330
- # @example
331
- # df = DaruLite::DataFrame.new({
332
- # a: [1, 2, 3],
333
- # b: ['a', 'b', 'c']
334
- # })
335
- # df.row_at 1, 2
336
- # # => #<DaruLite::DataFrame(2x2)>
337
- # # a b
338
- # # 1 2 b
339
- # # 2 3 c
340
- def row_at(*positions)
341
- original_positions = positions
342
- positions = coerce_positions(*positions, nrows)
343
- validate_positions(*positions, nrows)
344
-
345
- if positions.is_a? Integer
346
- row = get_rows_for([positions])
347
- DaruLite::Vector.new row, index: @vectors
348
- else
349
- new_rows = get_rows_for(original_positions)
350
- DaruLite::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
351
- end
352
- end
353
-
354
- # Set rows by positions
355
- # @param [Array<Integer>] positions positions of rows to set
356
- # @param [Array, DaruLite::Vector] vector vector to be assigned
357
- # @example
358
- # df = DaruLite::DataFrame.new({
359
- # a: [1, 2, 3],
360
- # b: ['a', 'b', 'c']
361
- # })
362
- # df.set_row_at [0, 1], ['x', 'x']
363
- # df
364
- # #=> #<DaruLite::DataFrame(3x2)>
365
- # # a b
366
- # # 0 x x
367
- # # 1 x x
368
- # # 2 3 c
369
- def set_row_at(positions, vector)
370
- validate_positions(*positions, nrows)
371
- vector =
372
- if vector.is_a? DaruLite::Vector
373
- vector.reindex @vectors
374
- else
375
- DaruLite::Vector.new vector
376
- end
377
-
378
- raise SizeError, 'Vector length should match row length' if
379
- vector.size != @vectors.size
380
-
381
- @data.each_with_index do |vec, pos|
382
- vec.set_at(positions, vector.at(pos))
383
- end
384
- @index = @data[0].index
385
- set_size
386
- end
387
-
388
- # Retrieve vectors by positions
389
- # @param [Array<Integer>] positions of vectors to retrieve
390
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
391
- # @example
392
- # df = DaruLite::DataFrame.new({
393
- # a: [1, 2, 3],
394
- # b: ['a', 'b', 'c']
395
- # })
396
- # df.at 0
397
- # # => #<DaruLite::Vector(3)>
398
- # # a
399
- # # 0 1
400
- # # 1 2
401
- # # 2 3
402
- def at(*positions)
403
- if AXES.include? positions.last
404
- axis = positions.pop
405
- return row_at(*positions) if axis == :row
406
- end
407
-
408
- original_positions = positions
409
- positions = coerce_positions(*positions, ncols)
410
- validate_positions(*positions, ncols)
411
-
412
- if positions.is_a? Integer
413
- @data[positions].dup
414
- else
415
- DaruLite::DataFrame.new positions.map { |pos| @data[pos].dup },
416
- index: @index,
417
- order: @vectors.at(*original_positions),
418
- name: @name
419
- end
420
- end
421
-
422
- # Set vectors by positions
423
- # @param [Array<Integer>] positions positions of vectors to set
424
- # @param [Array, DaruLite::Vector] vector vector to be assigned
425
- # @example
426
- # df = DaruLite::DataFrame.new({
427
- # a: [1, 2, 3],
428
- # b: ['a', 'b', 'c']
429
- # })
430
- # df.set_at [0], ['x', 'y', 'z']
431
- # df
432
- # #=> #<DaruLite::DataFrame(3x2)>
433
- # # a b
434
- # # 0 x a
435
- # # 1 y b
436
- # # 2 z c
437
- def set_at(positions, vector)
438
- if positions.last == :row
439
- positions.pop
440
- return set_row_at(positions, vector)
441
- end
442
-
443
- validate_positions(*positions, ncols)
444
- vector =
445
- if vector.is_a? DaruLite::Vector
446
- vector.reindex @index
447
- else
448
- DaruLite::Vector.new vector
449
- end
450
-
451
- raise SizeError, 'Vector length should match index length' if
452
- vector.size != @index.size
453
-
454
- positions.each { |pos| @data[pos] = vector }
455
- end
456
-
457
- # Insert a new row/vector of the specified name or modify a previous row.
458
- # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
459
- # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
460
- #
461
- # In case a DaruLite::Vector is specified after the equality sign, the indexes
462
- # of the vector will be matched against the row/vector indexes of the DataFrame
463
- # before an insertion is performed. Unmatched indexes will be set to nil.
464
- def []=(*args)
465
- vector = args.pop
466
- axis = extract_axis(args)
467
- names = args
468
-
469
- dispatch_to_axis axis, :insert_or_modify, names, vector
470
- end
471
-
472
- def add_row(row, index = nil)
473
- self.row[*(index || @size)] = row
474
- end
475
-
476
- def add_vector(n, vector)
477
- self[n] = vector
478
- end
479
-
480
- def insert_vector(n, name, source)
481
- raise ArgumentError unless source.is_a? Array
482
-
483
- vector = DaruLite::Vector.new(source, index: @index, name: @name)
484
- @data << vector
485
- @vectors = @vectors.add name
486
- ordr = @vectors.dup.to_a
487
- elmnt = ordr.pop
488
- ordr.insert n, elmnt
489
- self.order = ordr
490
- end
491
-
492
246
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
493
247
  #
494
248
  # == Usage
@@ -498,1696 +252,177 @@ module DaruLite
498
252
  DaruLite::Accessors::DataFrameByRow.new(self)
499
253
  end
500
254
 
501
- # Extract a dataframe given row indexes or positions
502
- # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
503
- # @return [DaruLite::DataFrame]
504
- def get_sub_dataframe(keys, by_position: true)
505
- return DaruLite::DataFrame.new({}) if keys == []
506
-
507
- keys = @index.pos(*keys) unless by_position
508
-
509
- sub_df = row_at(*keys)
510
- sub_df = sub_df.to_df.transpose if sub_df.is_a?(DaruLite::Vector)
511
-
512
- sub_df
513
- end
514
-
515
- # Duplicate the DataFrame entirely.
516
- #
517
- # == Arguments
518
- #
519
- # * +vectors_to_dup+ - An Array specifying the names of Vectors to
520
- # be duplicated. Will duplicate the entire DataFrame if not specified.
521
- def dup(vectors_to_dup = nil)
522
- vectors_to_dup ||= @vectors.to_a
523
-
524
- src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
525
- new_order = DaruLite::Index.new(vectors_to_dup)
526
-
527
- DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
528
- end
529
-
530
- # Only clone the structure of the DataFrame.
531
- def clone_structure
532
- DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
533
- end
534
-
535
- # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
536
- # preserved.
537
- #
538
- # == Arguments
539
- #
540
- # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
541
- # a view of the whole data frame otherwise.
542
- def clone(*vectors_to_clone)
543
- vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
544
- vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
545
-
546
- h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
547
- DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
548
- end
549
-
550
- # Returns a 'shallow' copy of DataFrame if missing data is not present,
551
- # or a full copy of only valid data if missing data is present.
552
- def clone_only_valid
553
- if include_values?(*DaruLite::MISSING_VALUES)
554
- reject_values(*DaruLite::MISSING_VALUES)
555
- else
556
- clone
557
- end
558
- end
559
-
560
- # Creates a new duplicate dataframe containing only rows
561
- # without a single missing value.
562
- def dup_only_valid(vecs = nil)
563
- rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
564
- .inject(&:concat)
565
- .uniq
566
-
567
- row_indexes = @index.to_a
568
- (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
569
- end
570
- deprecate :dup_only_valid, :reject_values, 2016, 10
571
-
572
- # Returns a dataframe in which rows with any of the mentioned values
573
- # are ignored.
574
- # @param [Array] values to reject to form the new dataframe
575
- # @return [DaruLite::DataFrame] Data Frame with only rows which don't
576
- # contain the mentioned values
577
- # @example
578
- # df = DaruLite::DataFrame.new({
579
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
580
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
581
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
582
- # }, index: 11..18)
583
- # df.reject_values nil, Float::NAN
584
- # # => #<DaruLite::DataFrame(2x3)>
585
- # # a b c
586
- # # 11 1 a a
587
- # # 18 7 8 7
588
- def reject_values(*values)
589
- positions =
590
- size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
591
- # Handle the case when positions size is 1 and #row_at wouldn't return a df
592
- if positions.size == 1
593
- pos = positions.first
594
- row_at(pos..pos)
595
- else
596
- row_at(*positions)
597
- end
598
- end
599
-
600
- # Replace specified values with given value
601
- # @param [Array] old_values values to replace with new value
602
- # @param [object] new_value new value to replace with
603
- # @return [DaruLite::DataFrame] Data Frame itself with old values replaced
604
- # with new value
605
- # @example
606
- # df = DaruLite::DataFrame.new({
607
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
608
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
609
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
610
- # }, index: 11..18)
611
- # df.replace_values nil, Float::NAN
612
- # # => #<DaruLite::DataFrame(8x3)>
613
- # # a b c
614
- # # 11 1 a a
615
- # # 12 2 b NaN
616
- # # 13 3 NaN 3
617
- # # 14 NaN NaN 4
618
- # # 15 NaN NaN 3
619
- # # 16 NaN 3 5
620
- # # 17 1 5 NaN
621
- # # 18 7 8 7
622
- def replace_values(old_values, new_value)
623
- @data.each { |vec| vec.replace_values old_values, new_value }
624
- self
625
- end
626
-
627
- # Rolling fillna
628
- # replace all Float::NAN and NIL values with the preceding or following value
629
- #
630
- # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
631
- #
632
- # @example
633
- # df = DaruLite::DataFrame.new({
634
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
635
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
636
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
637
- # })
638
- #
639
- # => #<DaruLite::DataFrame(8x3)>
640
- # a b c
641
- # 0 1 a a
642
- # 1 2 b NaN
643
- # 2 3 nil 3
644
- # 3 nil NaN 4
645
- # 4 NaN nil 3
646
- # 5 nil 3 5
647
- # 6 1 5 nil
648
- # 7 7 nil 7
649
- #
650
- # 2.3.3 :068 > df.rolling_fillna(:forward)
651
- # => #<DaruLite::DataFrame(8x3)>
652
- # a b c
653
- # 0 1 a a
654
- # 1 2 b a
655
- # 2 3 b 3
656
- # 3 3 b 4
657
- # 4 3 b 3
658
- # 5 3 3 5
659
- # 6 1 5 5
660
- # 7 7 5 7
661
- #
662
- def rolling_fillna!(direction = :forward)
663
- @data.each { |vec| vec.rolling_fillna!(direction) }
664
- self
665
- end
666
-
667
- def rolling_fillna(direction = :forward)
668
- dup.rolling_fillna!(direction)
669
- end
670
-
671
- # Return unique rows by vector specified or all vectors
672
- #
673
- # @param vtrs [String][Symbol] vector name(s) that should be considered
674
- #
675
- # @example
676
- #
677
- # => #<DaruLite::DataFrame(6x2)>
678
- # a b
679
- # 0 1 a
680
- # 1 2 b
681
- # 2 3 c
682
- # 3 4 d
683
- # 2 3 c
684
- # 3 4 f
685
- #
686
- # 2.3.3 :> df.uniq
687
- # => #<DaruLite::DataFrame(5x2)>
688
- # a b
689
- # 0 1 a
690
- # 1 2 b
691
- # 2 3 c
692
- # 3 4 d
693
- # 3 4 f
694
- #
695
- # 2.3.3 :> df.uniq(:a)
696
- # => #<DaruLite::DataFrame(4x2)>
697
- # a b
698
- # 0 1 a
699
- # 1 2 b
700
- # 2 3 c
701
- # 3 4 d
702
- #
703
- def uniq(*vtrs)
704
- vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
705
- grouped = group_by(vecs)
706
- indexes = grouped.groups.values.map { |v| v[0] }.sort
707
- row[*indexes]
708
- end
709
-
710
- # Iterate over each index of the DataFrame.
711
- def each_index(&block)
712
- return to_enum(:each_index) unless block
713
-
714
- @index.each(&block)
715
-
716
- self
717
- end
718
-
719
- # Iterate over each vector
720
- def each_vector(&block)
721
- return to_enum(:each_vector) unless block
255
+ # Delete a vector
256
+ def delete_vector(vector)
257
+ raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
722
258
 
723
- @data.each(&block)
259
+ @data.delete_at @vectors[vector]
260
+ @vectors = DaruLite::Index.new @vectors.to_a - [vector]
724
261
 
725
262
  self
726
263
  end
727
264
 
728
- alias each_column each_vector
729
-
730
- # Iterate over each vector along with the name of the vector
731
- def each_vector_with_index
732
- return to_enum(:each_vector_with_index) unless block_given?
733
-
734
- @vectors.each do |vector|
735
- yield @data[@vectors[vector]], vector
736
- end
265
+ # Deletes a list of vectors
266
+ def delete_vectors(*vectors)
267
+ Array(vectors).each { |vec| delete_vector vec }
737
268
 
738
269
  self
739
270
  end
740
271
 
741
- alias each_column_with_index each_vector_with_index
742
-
743
- # Iterate over each row
744
- def each_row
745
- return to_enum(:each_row) unless block_given?
746
-
747
- @index.size.times do |pos|
748
- yield row_at(pos)
749
- end
750
-
751
- self
752
- end
272
+ # Delete a row
273
+ def delete_row(index)
274
+ idx = named_index_for index
753
275
 
754
- def each_row_with_index
755
- return to_enum(:each_row_with_index) unless block_given?
276
+ raise IndexError, "Index #{index} does not exist." unless @index.include? idx
756
277
 
757
- @index.each do |index|
758
- yield access_row(index), index
278
+ @index = DaruLite::Index.new(@index.to_a - [idx])
279
+ each_vector do |vector|
280
+ vector.delete_at idx
759
281
  end
760
282
 
761
- self
762
- end
763
-
764
- # Iterate over each row or vector of the DataFrame. Specify axis
765
- # by passing :vector or :row as the argument. Default to :vector.
766
- #
767
- # == Description
768
- #
769
- # `#each` works exactly like Array#each. The default mode for `each`
770
- # is to iterate over the columns of the DataFrame. To iterate over
771
- # rows you must pass the axis, i.e `:row` as an argument.
772
- #
773
- # == Arguments
774
- #
775
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
776
- # or :row. Default to :vector.
777
- def each(axis = :vector, &block)
778
- dispatch_to_axis axis, :each, &block
779
- end
780
-
781
- # Iterate over a row or vector and return results in a DaruLite::Vector.
782
- # Specify axis with :vector or :row. Default to :vector.
783
- #
784
- # == Description
785
- #
786
- # The #collect iterator works similar to #map, the only difference
787
- # being that it returns a DaruLite::Vector comprising of the results of
788
- # each block run. The resultant Vector has the same index as that
789
- # of the axis over which collect has iterated. It also accepts the
790
- # optional axis argument.
791
- #
792
- # == Arguments
793
- #
794
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
795
- # or :row. Default to :vector.
796
- def collect(axis = :vector, &block)
797
- dispatch_to_axis_pl axis, :collect, &block
283
+ set_size
798
284
  end
799
285
 
800
- # Map over each vector or row of the data frame according to
801
- # the argument specified. Will return an Array of the resulting
802
- # elements. To map over each row/vector and get a DataFrame,
803
- # see #recode.
804
- #
805
- # == Description
806
- #
807
- # The #map iterator works like Array#map. The value returned by
808
- # each run of the block is added to an Array and the Array is
809
- # returned. This method also accepts an axis argument, like #each.
810
- # The default is :vector.
811
- #
812
- # == Arguments
813
- #
814
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
815
- # Default to :vector.
816
- def map(axis = :vector, &block)
817
- dispatch_to_axis_pl axis, :map, &block
818
- end
286
+ # Delete a row based on its position
287
+ # More robust than #delete_row when working with a CategoricalIndex or when the
288
+ # Index includes integers
289
+ def delete_at_position(position)
290
+ raise IndexError, "Position #{position} does not exist." unless position < size
819
291
 
820
- # Destructive map. Modifies the DataFrame. Each run of the block
821
- # must return a DaruLite::Vector. You can specify the axis to map over
822
- # as the argument. Default to :vector.
823
- #
824
- # == Arguments
825
- #
826
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
827
- # Default to :vector.
828
- def map!(axis = :vector, &block)
829
- if %i[vector column].include?(axis)
830
- map_vectors!(&block)
831
- elsif axis == :row
832
- map_rows!(&block)
833
- end
834
- end
292
+ @index = @index.delete_at(position)
293
+ each_vector { |vector| vector.delete_at_position(position) }
835
294
 
836
- # Maps over the DataFrame and returns a DataFrame. Each run of the
837
- # block must return a DaruLite::Vector object. You can specify the axis
838
- # to map over. Default to :vector.
839
- #
840
- # == Description
841
- #
842
- # Recode works similarly to #map, but an important difference between
843
- # the two is that recode returns a modified DaruLite::DataFrame instead
844
- # of an Array. For this reason, #recode expects that every run of the
845
- # block to return a DaruLite::Vector.
846
- #
847
- # Just like map and each, recode also accepts an optional _axis_ argument.
848
- #
849
- # == Arguments
850
- #
851
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
852
- # Default to :vector.
853
- def recode(axis = :vector, &block)
854
- dispatch_to_axis_pl axis, :recode, &block
295
+ set_size
855
296
  end
856
297
 
857
- # Retain vectors or rows if the block returns a truthy value.
858
- #
859
- # == Description
860
- #
861
- # For filtering out certain rows/vectors based on their values,
862
- # use the #filter method. By default it iterates over vectors and
863
- # keeps those vectors for which the block returns true. It accepts
864
- # an optional axis argument which lets you specify whether you want
865
- # to iterate over vectors or rows.
866
- #
867
- # == Arguments
868
- #
869
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
870
- # Default to :vector.
871
- #
872
- # == Usage
873
- #
874
- # # Filter vectors
875
- #
876
- # df.filter do |vector|
877
- # vector.type == :numeric and vector.median < 50
878
- # end
879
- #
880
- # # Filter rows
298
+ # Creates a DataFrame with the random data, of n size.
299
+ # If n not given, uses original number of rows.
881
300
  #
882
- # df.filter(:row) do |row|
883
- # row[:a] + row[:d] < 100
884
- # end
885
- def filter(axis = :vector, &block)
886
- dispatch_to_axis_pl axis, :filter, &block
887
- end
888
-
889
- def recode_vectors
890
- block_given? or return to_enum(:recode_vectors)
891
-
892
- dup.tap do |df|
893
- df.each_vector_with_index do |v, i|
894
- df[*i] = should_be_vector!(yield(v))
895
- end
896
- end
897
- end
898
-
899
- def recode_rows
900
- block_given? or return to_enum(:recode_rows)
901
-
902
- dup.tap do |df|
903
- df.each_row_with_index do |r, i|
904
- df.row[i] = should_be_vector!(yield(r))
301
+ # @return {DaruLite::DataFrame}
302
+ def bootstrap(n = nil)
303
+ n ||= nrows
304
+ DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
305
+ n.times do
306
+ df_boot.add_row(row[rand(n)])
905
307
  end
308
+ df_boot.update
906
309
  end
907
310
  end
908
311
 
909
- # Map each vector and return an Array.
910
- def map_vectors(&block)
911
- return to_enum(:map_vectors) unless block
912
-
913
- @data.map(&block)
914
- end
915
-
916
- # Destructive form of #map_vectors
917
- def map_vectors!
918
- return to_enum(:map_vectors!) unless block_given?
919
-
920
- vectors.dup.each do |n|
921
- self[n] = should_be_vector!(yield(self[n]))
922
- end
923
-
924
- self
925
- end
926
-
927
- # Map vectors along with the index.
928
- def map_vectors_with_index(&block)
929
- return to_enum(:map_vectors_with_index) unless block
930
-
931
- each_vector_with_index.map(&block)
932
- end
933
-
934
- # Map each row
935
- def map_rows(&block)
936
- return to_enum(:map_rows) unless block
937
-
938
- each_row.map(&block)
939
- end
940
-
941
- def map_rows_with_index(&block)
942
- return to_enum(:map_rows_with_index) unless block
943
-
944
- each_row_with_index.map(&block)
945
- end
946
-
947
- def map_rows!
948
- return to_enum(:map_rows!) unless block_given?
949
-
950
- index.dup.each do |i|
951
- row[i] = should_be_vector!(yield(row[i]))
952
- end
953
-
954
- self
955
- end
956
-
957
- def apply_method(method, keys: nil, by_position: true)
958
- df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
959
-
960
- case method
961
- when Symbol then df.send(method)
962
- when Proc then method.call(df)
963
- when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
964
- else raise
965
- end
966
- end
967
- alias apply_method_on_sub_df apply_method
968
-
969
- # Retrieves a DaruLite::Vector, based on the result of calculation
970
- # performed on each row.
971
- def collect_rows(&block)
972
- return to_enum(:collect_rows) unless block
973
-
974
- DaruLite::Vector.new(each_row.map(&block), index: @index)
975
- end
976
-
977
- def collect_row_with_index(&block)
978
- return to_enum(:collect_row_with_index) unless block
979
-
980
- DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
981
- end
982
-
983
- # Retrieves a DaruLite::Vector, based on the result of calculation
984
- # performed on each vector.
985
- def collect_vectors(&block)
986
- return to_enum(:collect_vectors) unless block
987
-
988
- DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
989
- end
990
-
991
- def collect_vector_with_index(&block)
992
- return to_enum(:collect_vector_with_index) unless block
993
-
994
- DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
995
- end
996
-
997
- # Generate a matrix, based on vector names of the DataFrame.
998
- #
999
- # @return {::Matrix}
1000
- # :nocov:
1001
- # FIXME: Even not trying to cover this: I can't get, how it is expected
1002
- # to work.... -- zverok
1003
- def collect_matrix
1004
- return to_enum(:collect_matrix) unless block_given?
1005
-
1006
- vecs = vectors.to_a
1007
- rows = vecs.collect do |row|
1008
- vecs.collect do |col|
1009
- yield row, col
1010
- end
1011
- end
1012
-
1013
- Matrix.rows(rows)
1014
- end
1015
- # :nocov:
1016
-
1017
- # Delete a vector
1018
- def delete_vector(vector)
1019
- raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
1020
-
1021
- @data.delete_at @vectors[vector]
1022
- @vectors = DaruLite::Index.new @vectors.to_a - [vector]
1023
-
1024
- self
1025
- end
1026
-
1027
- # Deletes a list of vectors
1028
- def delete_vectors(*vectors)
1029
- Array(vectors).each { |vec| delete_vector vec }
1030
-
1031
- self
1032
- end
1033
-
1034
- # Delete a row
1035
- def delete_row(index)
1036
- idx = named_index_for index
1037
-
1038
- raise IndexError, "Index #{index} does not exist." unless @index.include? idx
1039
-
1040
- @index = DaruLite::Index.new(@index.to_a - [idx])
1041
- each_vector do |vector|
1042
- vector.delete_at idx
1043
- end
1044
-
1045
- set_size
1046
- end
1047
-
1048
- # Creates a DataFrame with the random data, of n size.
1049
- # If n not given, uses original number of rows.
1050
- #
1051
- # @return {DaruLite::DataFrame}
1052
- def bootstrap(n = nil)
1053
- n ||= nrows
1054
- DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
1055
- n.times do
1056
- df_boot.add_row(row[rand(n)])
1057
- end
1058
- df_boot.update
1059
- end
1060
- end
1061
-
1062
- def keep_row_if
1063
- @index
1064
- .reject { |idx| yield access_row(idx) }
1065
- .each { |idx| delete_row idx }
1066
- end
1067
-
1068
- def keep_vector_if
1069
- @vectors.each do |vector|
1070
- delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
1071
- end
1072
- end
1073
-
1074
- # creates a new vector with the data of a given field which the block returns true
1075
- def filter_vector(vec, &block)
1076
- DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
1077
- end
1078
-
1079
- # Iterates over each row and retains it in a new DataFrame if the block returns
1080
- # true for that row.
1081
- def filter_rows
1082
- return to_enum(:filter_rows) unless block_given?
1083
-
1084
- keep_rows = @index.map { |index| yield access_row(index) }
1085
-
1086
- where keep_rows
1087
- end
1088
-
1089
- # Iterates over each vector and retains it in a new DataFrame if the block returns
1090
- # true for that vector.
1091
- def filter_vectors(&block)
1092
- return to_enum(:filter_vectors) unless block
1093
-
1094
- dup.tap { |df| df.keep_vector_if(&block) }
1095
- end
1096
-
1097
- # Test each row with one or more tests.
1098
- # @param tests [Proc] Each test is a Proc with the form
1099
- # *Proc.new {|row| row[:age] > 0}*
1100
- # The function returns an array with all errors.
1101
- #
1102
- # FIXME: description here is too sparse. As far as I can get,
1103
- # it should tell something about that each test is [descr, fields, block],
1104
- # and that first value may be column name to output. - zverok, 2016-05-18
1105
- def verify(*tests)
1106
- id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
1107
-
1108
- each_row_with_index.map do |row, i|
1109
- tests.reject { |*_, block| block.call(row) }
1110
- .map { |test| verify_error_message row, test, id, i }
1111
- end.flatten
1112
- end
1113
-
1114
- # DSL for yielding each row and returning a DaruLite::Vector based on the
1115
- # value each run of the block returns.
1116
- #
1117
- # == Usage
1118
- #
1119
- # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
1120
- # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
1121
- # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
1122
- # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
1123
- # total = ds.vector_by_calculation { a + b + c }
1124
- # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
1125
- # # nil
1126
- # # 0 111
1127
- # # 1 222
1128
- # # 2 333
1129
- # # 3 444
1130
- # # 4 555
1131
- # # 5 666
1132
- # # 6 777
1133
- def vector_by_calculation(&block)
1134
- a = each_row.map { |r| r.instance_eval(&block) }
1135
-
1136
- DaruLite::Vector.new a, index: @index
1137
- end
1138
-
1139
- # Reorder the vectors in a dataframe
1140
- # @param [Array] order_array new order of the vectors
1141
- # @example
1142
- # df = DaruLite::DataFrame({
1143
- # a: [1, 2, 3],
1144
- # b: [4, 5, 6]
1145
- # }, order: [:a, :b])
1146
- # df.order = [:b, :a]
1147
- # df
1148
- # # => #<DaruLite::DataFrame(3x2)>
1149
- # # b a
1150
- # # 0 4 1
1151
- # # 1 5 2
1152
- # # 2 6 3
1153
- def order=(order_array)
1154
- raise ArgumentError, 'Invalid order' unless vectors.to_a.tally == order_array.tally
1155
-
1156
- initialize(to_h, order: order_array)
1157
- end
1158
-
1159
- # Return the dataframe with rotate vectors positions, the vector at position count is now
1160
- # the first vector of the dataframe.
1161
- # If only one vector in the dataframe, the dataframe is return without any change.
1162
- # @param count => Integer, the vector at position count will be the first vector of the dataframe.
1163
- # @example
1164
- # df = DaruLite::DataFrame({
1165
- # a: [1, 2, 3],
1166
- # b: [4, 5, 6],
1167
- # total: [5, 7, 9],
1168
- # })
1169
- # df.rotate_vectors(-1)
1170
- # df
1171
- # # => #<DaruLite::DataFrame(3x3)>
1172
- # # total b a
1173
- # # 0 5 4 1
1174
- # # 1 7 5 2
1175
- # # 2 9 6 3
1176
- def rotate_vectors(count = -1)
1177
- return self unless vectors.many?
1178
-
1179
- self.order = vectors.to_a.rotate(count)
1180
- self
1181
- end
1182
-
1183
- # Returns a vector, based on a string with a calculation based
1184
- # on vector.
1185
- #
1186
- # The calculation will be eval'ed, so you can put any variable
1187
- # or expression valid on ruby.
1188
- #
1189
- # For example:
1190
- # a = DaruLite::Vector.new [1,2]
1191
- # b = DaruLite::Vector.new [3,4]
1192
- # ds = DaruLite::DataFrame.new({:a => a,:b => b})
1193
- # ds.compute("a+b")
1194
- # => Vector [4,6]
1195
- def compute(text, &block)
1196
- return instance_eval(&block) if block
1197
-
1198
- instance_eval(text)
1199
- end
1200
-
1201
- # Return a vector with the number of missing values in each row.
1202
- #
1203
- # == Arguments
1204
- #
1205
- # * +missing_values+ - An Array of the values that should be
1206
- # treated as 'missing'. The default missing value is *nil*.
1207
- def missing_values_rows(missing_values = [nil])
1208
- number_of_missing = each_row.map do |row|
1209
- row.indexes(*missing_values).size
1210
- end
1211
-
1212
- DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
1213
- end
1214
-
1215
- # TODO: remove next version
1216
- alias vector_missing_values missing_values_rows
1217
-
1218
- def has_missing_data?
1219
- @data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
1220
- end
1221
- alias flawed? has_missing_data?
1222
- deprecate :has_missing_data?, :include_values?, 2016, 10
1223
- deprecate :flawed?, :include_values?, 2016, 10
1224
-
1225
- # Check if any of given values occur in the data frame
1226
- # @param [Array] values to check for
1227
- # @return [true, false] true if any of the given values occur in the
1228
- # dataframe, false otherwise
1229
- # @example
1230
- # df = DaruLite::DataFrame.new({
1231
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1232
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1233
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1234
- # }, index: 11..18)
1235
- # df.include_values? nil
1236
- # # => true
1237
- def include_values?(*values)
1238
- @data.any? { |vec| vec.include_values?(*values) }
1239
- end
1240
-
1241
312
  # Return a nested hash using vector names as keys and an array constructed of
1242
313
  # hashes with other values. If block provided, is used to provide the
1243
314
  # values, with parameters +row+ of dataset, +current+ last hash on
1244
315
  # hierarchy and +name+ of the key to include
1245
- def nest(*tree_keys, &block)
1246
- tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
1247
-
1248
- each_row.with_object({}) do |row, current|
1249
- # Create tree
1250
- *keys, last = tree_keys
1251
- current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
1252
- name = row[last]
1253
-
1254
- if block
1255
- current[name] = yield(row, current, name)
1256
- else
1257
- current[name] ||= []
1258
- current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
1259
- end
1260
- end
1261
- end
1262
-
1263
- def vector_count_characters(vecs = nil)
1264
- vecs ||= @vectors.to_a
1265
-
1266
- collect_rows do |row|
1267
- vecs.sum { |v| row[v].to_s.size }
1268
- end
1269
- end
1270
-
1271
- def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
1272
- self[name]
1273
- .split_by_separator(sep)
1274
- .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
1275
- end
1276
-
1277
- # Return the number of rows and columns of the DataFrame in an Array.
1278
- def shape
1279
- [nrows, ncols]
1280
- end
1281
-
1282
- # The number of rows
1283
- def nrows
1284
- @index.size
1285
- end
1286
-
1287
- # The number of vectors
1288
- def ncols
1289
- @vectors.size
1290
- end
1291
-
1292
- # Check if a vector is present
1293
- def has_vector?(vector)
1294
- @vectors.include? vector
1295
- end
1296
-
1297
- # Works like Array#any?.
1298
- #
1299
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1300
- # :row. A DaruLite::Vector object is yielded in the block.
1301
- # @example Using any?
1302
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1303
- # df.any?(:row) do |row|
1304
- # row[:a] < 3 and row[:b] == 'b'
1305
- # end #=> true
1306
- def any?(axis = :vector, &block)
1307
- if %i[vector column].include?(axis)
1308
- @data.any?(&block)
1309
- elsif axis == :row
1310
- each_row do |row|
1311
- return true if yield(row)
1312
- end
1313
- false
1314
- else
1315
- raise ArgumentError, "Unidentified axis #{axis}"
1316
- end
1317
- end
1318
-
1319
- # Works like Array#all?
1320
- #
1321
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1322
- # :row. A DaruLite::Vector object is yielded in the block.
1323
- # @example Using all?
1324
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1325
- # df.all?(:row) do |row|
1326
- # row[:a] < 10
1327
- # end #=> true
1328
- def all?(axis = :vector, &block)
1329
- if %i[vector column].include?(axis)
1330
- @data.all?(&block)
1331
- elsif axis == :row
1332
- each_row.all?(&block)
1333
- else
1334
- raise ArgumentError, "Unidentified axis #{axis}"
1335
- end
1336
- end
1337
-
1338
- # The first ten elements of the DataFrame
1339
- #
1340
- # @param [Fixnum] quantity (10) The number of elements to display from the top.
1341
- def head(quantity = 10)
1342
- row.at 0..(quantity - 1)
1343
- end
1344
-
1345
- alias first head
1346
-
1347
- # The last ten elements of the DataFrame
1348
- #
1349
- # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1350
- def tail(quantity = 10)
1351
- start = [-quantity, -size].max
1352
- row.at start..-1
1353
- end
1354
-
1355
- alias last tail
1356
-
1357
- # Sum all numeric/specified vectors in the DataFrame.
1358
- #
1359
- # Returns a new vector that's a containing a sum of all numeric
1360
- # or specified vectors of the DataFrame. By default, if the vector
1361
- # contains a nil, the sum is nil.
1362
- # With :skipnil argument set to true, nil values are assumed to be
1363
- # 0 (zero) and the sum vector is returned.
1364
- #
1365
- # @param args [Array] List of vectors to sum. Default is nil in which case
1366
- # all numeric vectors are summed.
1367
- #
1368
- # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1369
- #
1370
- # @return Vector with sum of all vectors specified in the argument.
1371
- # If vecs parameter is empty, sum all numeric vector.
1372
- #
1373
- # @example
1374
- # df = DaruLite::DataFrame.new({
1375
- # a: [1, 2, nil],
1376
- # b: [2, 1, 3],
1377
- # c: [1, 1, 1]
1378
- # })
1379
- # => #<DaruLite::DataFrame(3x3)>
1380
- # a b c
1381
- # 0 1 2 1
1382
- # 1 2 1 1
1383
- # 2 nil 3 1
1384
- # df.vector_sum [:a, :c]
1385
- # => #<DaruLite::Vector(3)>
1386
- # 0 2
1387
- # 1 3
1388
- # 2 nil
1389
- # df.vector_sum
1390
- # => #<DaruLite::Vector(3)>
1391
- # 0 4
1392
- # 1 4
1393
- # 2 nil
1394
- # df.vector_sum skipnil: true
1395
- # => #<DaruLite::Vector(3)>
1396
- # c
1397
- # 0 4
1398
- # 1 4
1399
- # 2 4
1400
- #
1401
- def vector_sum(*args)
1402
- defaults = { vecs: nil, skipnil: false }
1403
- options = args.last.is_a?(::Hash) ? args.pop : {}
1404
- options = defaults.merge(options)
1405
- vecs = args[0] || options[:vecs]
1406
- skipnil = args[1] || options[:skipnil]
1407
-
1408
- vecs ||= numeric_vectors
1409
- sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
1410
- vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
1411
- end
1412
-
1413
- # Calculate mean of the rows of the dataframe.
1414
- #
1415
- # == Arguments
1416
- #
1417
- # * +max_missing+ - The maximum number of elements in the row that can be
1418
- # zero for the mean calculation to happen. Default to 0.
1419
- def vector_mean(max_missing = 0)
1420
- # FIXME: in vector_sum we preserve created vector dtype, but
1421
- # here we are not. Is this by design or ...? - zverok, 2016-05-18
1422
- mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
1423
-
1424
- each_row_with_index.with_object(mean_vec) do |(row, i), memo|
1425
- memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
1426
- end
1427
- end
1428
-
1429
- # Group elements by vector to perform operations on them. Returns a
1430
- # DaruLite::Core::GroupBy object.See the DaruLite::Core::GroupBy docs for a detailed
1431
- # list of possible operations.
1432
- #
1433
- # == Arguments
1434
- #
1435
- # * vectors - An Array contatining names of vectors to group by.
1436
- #
1437
- # == Usage
1438
- #
1439
- # df = DaruLite::DataFrame.new({
1440
- # a: %w{foo bar foo bar foo bar foo foo},
1441
- # b: %w{one one two three two two one three},
1442
- # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1443
- # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1444
- # })
1445
- # df.group_by([:a,:b,:c]).groups
1446
- # #=> {["bar", "one", 2]=>[1],
1447
- # # ["bar", "three", 1]=>[3],
1448
- # # ["bar", "two", 6]=>[5],
1449
- # # ["foo", "one", 1]=>[0],
1450
- # # ["foo", "one", 3]=>[6],
1451
- # # ["foo", "three", 8]=>[7],
1452
- # # ["foo", "two", 3]=>[2, 4]}
1453
- def group_by(*vectors)
1454
- vectors.flatten!
1455
- missing = vectors - @vectors.to_a
1456
- raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
1457
-
1458
- vectors = [@vectors.first] if vectors.empty?
1459
-
1460
- DaruLite::Core::GroupBy.new(self, vectors)
1461
- end
1462
-
1463
- def reindex_vectors(new_vectors)
1464
- unless new_vectors.is_a?(DaruLite::Index)
1465
- raise ArgumentError, 'Must pass the new index of type Index or its ' \
1466
- "subclasses, not #{new_vectors.class}"
1467
- end
1468
-
1469
- cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1470
- new_vectors.each_with_object(cl) do |vec, memo|
1471
- memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
1472
- end
1473
- end
1474
-
1475
- def get_vector_anyways(v)
1476
- @vectors.include?(v) ? self[v].to_a : Array.new(size)
1477
- end
1478
-
1479
- # Concatenate another DataFrame along corresponding columns.
1480
- # If columns do not exist in both dataframes, they are filled with nils
1481
- def concat(other_df)
1482
- vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
1483
-
1484
- data = vectors.map do |v|
1485
- get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
1486
- end
1487
-
1488
- DaruLite::DataFrame.new(data, order: vectors)
1489
- end
1490
-
1491
- # Concatenates another DataFrame as #concat.
1492
- # Additionally it tries to preserve the index. If the indices contain
1493
- # common elements, #union will overwrite the according rows in the
1494
- # first dataframe.
1495
- def union(other_df)
1496
- index = (@index.to_a + other_df.index.to_a).uniq
1497
- df = row[*(@index.to_a - other_df.index.to_a)]
1498
-
1499
- df = df.concat(other_df)
1500
- df.index = DaruLite::Index.new(index)
1501
- df
1502
- end
1503
-
1504
- module SetSingleIndexStrategy
1505
- def self.uniq_size(df, col)
1506
- df[col].uniq.size
1507
- end
1508
-
1509
- def self.new_index(df, col)
1510
- DaruLite::Index.new(df[col].to_a)
1511
- end
1512
-
1513
- def self.delete_vector(df, col)
1514
- df.delete_vector(col)
1515
- end
1516
- end
1517
-
1518
- module SetCategoricalIndexStrategy
1519
- def self.new_index(df, col)
1520
- DaruLite::CategoricalIndex.new(df[col].to_a)
1521
- end
1522
-
1523
- def self.delete_vector(df, col)
1524
- df.delete_vector(col)
1525
- end
1526
- end
1527
-
1528
- module SetMultiIndexStrategy
1529
- def self.uniq_size(df, cols)
1530
- df[*cols].uniq.size
1531
- end
1532
-
1533
- def self.new_index(df, cols)
1534
- DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
1535
- mi.name = cols
1536
- end
1537
- end
1538
-
1539
- def self.delete_vector(df, cols)
1540
- df.delete_vectors(*cols)
1541
- end
1542
- end
1543
-
1544
- # Set a particular column as the new DF
1545
- def set_index(new_index_col, keep: false, categorical: false)
1546
- if categorical
1547
- strategy = SetCategoricalIndexStrategy
1548
- elsif new_index_col.respond_to?(:to_a)
1549
- strategy = SetMultiIndexStrategy
1550
- new_index_col = new_index_col.to_a
1551
- else
1552
- strategy = SetSingleIndexStrategy
1553
- end
1554
-
1555
- unless categorical
1556
- uniq_size = strategy.uniq_size(self, new_index_col)
1557
- raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
1558
- end
1559
-
1560
- self.index = strategy.new_index(self, new_index_col)
1561
- strategy.delete_vector(self, new_index_col) unless keep
1562
- self
1563
- end
1564
-
1565
- # Change the index of the DataFrame and preserve the labels of the previous
1566
- # indexing. New index can be DaruLite::Index or any of its subclasses.
1567
- #
1568
- # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
1569
- # @example Reindexing DataFrame
1570
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
1571
- # index: ['a','b','c','d'])
1572
- # #=>
1573
- # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1574
- # # a b
1575
- # # a 1 11
1576
- # # b 2 22
1577
- # # c 3 33
1578
- # # d 4 44
1579
- # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
1580
- # #=>
1581
- # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1582
- # # a b
1583
- # # b 2 22
1584
- # # 0 nil nil
1585
- # # a 1 11
1586
- # # g nil nil
1587
- def reindex(new_index)
1588
- unless new_index.is_a?(DaruLite::Index)
1589
- raise ArgumentError, 'Must pass the new index of type Index or its ' \
1590
- "subclasses, not #{new_index.class}"
1591
- end
1592
-
1593
- cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1594
- new_index.each_with_object(cl) do |idx, memo|
1595
- memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
1596
- end
1597
- end
1598
-
1599
- def reset_index
1600
- index_df = index.to_df
1601
- names = index.name
1602
- names = [names] unless names.instance_of?(Array)
1603
- new_vectors = names + vectors.to_a
1604
- self.index = index_df.index
1605
- names.each do |name|
1606
- self[name] = index_df[name]
1607
- end
1608
- self.order = new_vectors
1609
- self
1610
- end
1611
-
1612
- # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
1613
- #
1614
- # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
1615
- # are to be indexed.
1616
- # @example Reassigining index of a DataFrame
1617
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1618
- # df.index.to_a #=> [0,1,2,3]
1619
- #
1620
- # df.index = DaruLite::Index.new(['a','b','c','d'])
1621
- # df.index.to_a #=> ['a','b','c','d']
1622
- # df.row['a'].to_a #=> [1,11]
1623
- def index=(idx)
1624
- @index = Index.coerce idx
1625
- @data.each { |vec| vec.index = @index }
1626
-
1627
- self
1628
- end
1629
-
1630
- # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
1631
- #
1632
- # @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
1633
- # be indexed. Must of the same size as ncols.
1634
- # @example Reassigning vectors of a DataFrame
1635
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
1636
- # df.vectors.to_a #=> [:a, :b, :c]
1637
- #
1638
- # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
1639
- # df.vectors.to_a #=> [:foo, :bar, :baz]
1640
- def vectors=(new_index)
1641
- raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)
1642
-
1643
- if new_index.size != ncols
1644
- raise ArgumentError, "Specified index length #{new_index.size} not equal to" \
1645
- "dataframe size #{ncols}"
1646
- end
1647
-
1648
- @vectors = new_index
1649
- @data.zip(new_index.to_a).each do |vect, name|
1650
- vect.name = name
1651
- end
1652
- self
1653
- end
1654
-
1655
- # Renames the vectors
1656
- #
1657
- # == Arguments
1658
- #
1659
- # * name_map - A hash where the keys are the exising vector names and
1660
- # the values are the new names. If a vector is renamed
1661
- # to a vector name that is already in use, the existing
1662
- # one is overwritten.
1663
- #
1664
- # == Usage
1665
- #
1666
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1667
- # df.rename_vectors :a => :alpha, :c => :gamma
1668
- # df.vectors.to_a #=> [:alpha, :b, :gamma]
1669
- def rename_vectors(name_map)
1670
- existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
1671
- delete_vectors(*existing_targets)
1672
-
1673
- new_names = vectors.to_a.map { |v| name_map[v] || v }
1674
- self.vectors = DaruLite::Index.new new_names
1675
- end
1676
-
1677
- # Renames the vectors and returns itself
1678
- #
1679
- # == Arguments
1680
- #
1681
- # * name_map - A hash where the keys are the exising vector names and
1682
- # the values are the new names. If a vector is renamed
1683
- # to a vector name that is already in use, the existing
1684
- # one is overwritten.
1685
- #
1686
- # == Usage
1687
- #
1688
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1689
- # df.rename_vectors! :a => :alpha, :c => :gamma # df
1690
- def rename_vectors!(name_map)
1691
- rename_vectors(name_map)
1692
- self
1693
- end
1694
-
1695
- # Converts the vectors to a DaruLite::MultiIndex.
1696
- # The argument passed is used as the MultiIndex's top level
1697
- def add_level_to_vectors(top_level_label)
1698
- tuples = vectors.map { |label| [top_level_label, *label] }
1699
- self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
1700
- end
1701
-
1702
- # Return the indexes of all the numeric vectors. Will include vectors with nils
1703
- # alongwith numbers.
1704
- def numeric_vectors
1705
- # FIXME: Why _with_index ?..
1706
- each_vector_with_index
1707
- .select { |vec, _i| vec.numeric? }
1708
- .map(&:last)
1709
- end
1710
-
1711
- def numeric_vector_names
1712
- @vectors.select { |v| self[v].numeric? }
1713
- end
1714
-
1715
- # Return a DataFrame of only the numerical Vectors. If clone: false
1716
- # is specified as option, only a *view* of the Vectors will be
1717
- # returned. Defaults to clone: true.
1718
- def only_numerics(opts = {})
1719
- cln = opts[:clone] != false
1720
- arry = numeric_vectors.map { |v| self[v] }
1721
-
1722
- order = Index.new(numeric_vectors)
1723
- DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
1724
- end
1725
-
1726
- # Generate a summary of this DataFrame based on individual vectors in the DataFrame
1727
- # @return [String] String containing the summary of the DataFrame
1728
- def summary
1729
- summary = "= #{name}"
1730
- summary << "\n Number of rows: #{nrows}"
1731
- @vectors.each do |v|
1732
- summary << "\n Element:[#{v}]\n"
1733
- summary << self[v].summary(1)
1734
- end
1735
- summary
1736
- end
1737
-
1738
- # Sorts a dataframe (ascending/descending) in the given pripority sequence of
1739
- # vectors, with or without a block.
1740
- #
1741
- # @param vector_order [Array] The order of vector names in which the DataFrame
1742
- # should be sorted.
1743
- # @param opts [Hash] opts The options to sort with.
1744
- # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1745
- # or descending order. Specify Array corresponding to *order* for multiple
1746
- # sort orders.
1747
- # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1748
- # to be used for sorting, for each vector name in *order* as a hash of
1749
- # vector name and lambda expressions. In case a lambda for a vector is not
1750
- # specified, the default will be used.
1751
- # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1752
- # automatically or not when a block is provided.
1753
- # If set to True, nils will appear at top after sorting.
1754
- #
1755
- # @example Sort a dataframe with a vector sequence.
1756
- #
1757
- #
1758
- # df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1759
- #
1760
- # df.sort [:a, :b]
1761
- # # =>
1762
- # # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1763
- # # a b
1764
- # # 2 1 3
1765
- # # 0 1 5
1766
- # # 3 2 2
1767
- # # 1 2 4
1768
- # # 4 3 1
1769
- #
1770
- # @example Sort a dataframe without a block. Here nils will be handled automatically.
1771
- #
1772
- # df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1773
- #
1774
- # df.sort([:a])
1775
- # # =>
1776
- # # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1777
- # # a b
1778
- # # 1 nil 3
1779
- # # 3 nil 1
1780
- # # 0 -3 4
1781
- # # 2 -1 2
1782
- # # 4 5 4
1783
- #
1784
- # @example Sort a dataframe with a block with nils handled automatically.
1785
- #
1786
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1787
- #
1788
- # df.sort [:b], by: {b: lambda { |a| a.length } }
1789
- # # NoMethodError: undefined method `length' for nil:NilClass
1790
- # # from (pry):8:in `block in __pry__'
1791
- #
1792
- # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1793
- #
1794
- # # =>
1795
- # # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1796
- # # a b
1797
- # # 2 1 nil
1798
- # # 5 1 nil
1799
- # # 4 -1 x
1800
- # # 1 -1 aa
1801
- # # 0 nil aaa
1802
- # # 3 nil baaa
1803
- #
1804
- # @example Sort a dataframe with a block with nils handled manually.
1805
- #
1806
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1807
- #
1808
- # # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
1809
- # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1810
- #
1811
- # # =>
1812
- # #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1813
- # # a b
1814
- # # 4 -1 x
1815
- # # 1 -1 aa
1816
- # # 0 nil aaa
1817
- # # 3 nil baaa
1818
- # # 2 1 nil
1819
- # # 5 1 nil
1820
-
1821
- def sort!(vector_order, opts = {})
1822
- raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1823
-
1824
- # To enable sorting with categorical data,
1825
- # map categories to integers preserving their order
1826
- old = convert_categorical_vectors vector_order
1827
- block = sort_prepare_block vector_order, opts
1828
-
1829
- order = @index.size.times.sort(&block)
1830
- new_index = @index.reorder order
1831
-
1832
- # To reverse map mapping of categorical data to integers
1833
- restore_categorical_vectors old
1834
-
1835
- @data.each do |vector|
1836
- vector.reorder! order
1837
- end
1838
-
1839
- self.index = new_index
1840
-
1841
- self
1842
- end
1843
-
1844
- # Non-destructive version of #sort!
1845
- def sort(vector_order, opts = {})
1846
- dup.sort! vector_order, opts
1847
- end
1848
-
1849
- # Pivots a data frame on specified vectors and applies an aggregate function
1850
- # to quickly generate a summary.
1851
- #
1852
- # == Options
1853
- #
1854
- # +:index+ - Keys to group by on the pivot table row index. Pass vector names
1855
- # contained in an Array.
1856
- #
1857
- # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
1858
- # names contained in an Array.
1859
- #
1860
- # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
1861
- # use any of the statistics functions applicable on Vectors that can be found in
1862
- # the DaruLite::Statistics::Vector module.
1863
- #
1864
- # +:values+ - Columns to aggregate. Will consider all numeric columns not
1865
- # specified in *:index* or *:vectors*. Optional.
1866
- #
1867
- # == Usage
1868
- #
1869
- # df = DaruLite::DataFrame.new({
1870
- # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
1871
- # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
1872
- # c: ['small','large','large','small','small','large','small','large','small'],
1873
- # d: [1,2,2,3,3,4,5,6,7],
1874
- # e: [2,4,4,6,6,8,10,12,14]
1875
- # })
1876
- # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
1877
- #
1878
- # #=>
1879
- # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
1880
- # # [:e, :one] [:e, :two]
1881
- # # [:bar] 18 26
1882
- # # [:foo] 10 12
1883
- def pivot_table(opts = {})
1884
- raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?
1885
-
1886
- index = opts[:index]
1887
- vectors = opts[:vectors] || []
1888
- aggregate_function = opts[:agg] || :mean
1889
- values = prepare_pivot_values index, vectors, opts
1890
- raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1891
-
1892
- grouped = group_by(index)
1893
- return grouped.send(aggregate_function) if vectors.empty?
1894
-
1895
- super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
1896
-
1897
- pivot_dataframe super_hash
1898
- end
1899
-
1900
- # Merge vectors from two DataFrames. In case of name collision,
1901
- # the vectors names are changed to x_1, x_2 ....
1902
- #
1903
- # @return {DaruLite::DataFrame}
1904
- def merge(other_df)
1905
- unless nrows == other_df.nrows
1906
- raise ArgumentError,
1907
- "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
1908
- end
1909
-
1910
- new_fields = (@vectors.to_a + other_df.vectors.to_a)
1911
- new_fields = ArrayHelper.recode_repeated(new_fields)
1912
- DataFrame.new({}, order: new_fields).tap do |df_new|
1913
- (0...nrows).each do |i|
1914
- df_new.add_row row[i].to_a + other_df.row[i].to_a
1915
- end
1916
- df_new.index = @index if @index == other_df.index
1917
- df_new.update
1918
- end
1919
- end
1920
-
1921
- # Join 2 DataFrames with SQL style joins. Currently supports inner, left
1922
- # outer, right outer and full outer joins.
1923
- #
1924
- # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
1925
- # to be performed.
1926
- # @param [Hash] opts Options Hash
1927
- # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
1928
- # @option :on [Array] The columns on which the join is to be performed.
1929
- # Column names specified here must be common to both DataFrames.
1930
- # @option :indicator [Symbol] The name of a vector to add to the resultant
1931
- # dataframe that indicates whether the record was in the left (:left_only),
1932
- # right (:right_only), or both (:both) joining dataframes.
1933
- # @return [DaruLite::DataFrame]
1934
- # @example Inner Join
1935
- # left = DaruLite::DataFrame.new({
1936
- # :id => [1,2,3,4],
1937
- # :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
1938
- # })
1939
- # right = DaruLite::DataFrame.new({
1940
- # :id => [1,2,3,4],
1941
- # :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
1942
- # })
1943
- # left.join(right, how: :inner, on: [:name])
1944
- # #=>
1945
- # ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
1946
- # # id_1 name id_2
1947
- # # 0 1 Pirate 2
1948
- # # 1 3 Ninja 4
1949
- def join(other_df, opts = {})
1950
- DaruLite::Core::Merge.join(self, other_df, opts)
1951
- end
1952
-
1953
- # Creates a new dataset for one to many relations
1954
- # on a dataset, based on pattern of field names.
1955
- #
1956
- # for example, you have a survey for number of children
1957
- # with this structure:
1958
- # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1959
- # with
1960
- # ds.one_to_many([:id], "child_%v_%n")
1961
- # the field of first parameters will be copied verbatim
1962
- # to new dataset, and fields which responds to second
1963
- # pattern will be added one case for each different %n.
1964
- #
1965
- # @example
1966
- # cases=[
1967
- # ['1','george','red',10,'blue',20,nil,nil],
1968
- # ['2','fred','green',15,'orange',30,'white',20],
1969
- # ['3','alfred',nil,nil,nil,nil,nil,nil]
1970
- # ]
1971
- # ds=DaruLite::DataFrame.rows(cases, order:
1972
- # [:id, :name,
1973
- # :car_color1, :car_value1,
1974
- # :car_color2, :car_value2,
1975
- # :car_color3, :car_value3])
1976
- # ds.one_to_many([:id],'car_%v%n').to_matrix
1977
- # #=> Matrix[
1978
- # # ["red", "1", 10],
1979
- # # ["blue", "1", 20],
1980
- # # ["green", "2", 15],
1981
- # # ["orange", "2", 30],
1982
- # # ["white", "2", 20]
1983
- # # ]
1984
- def one_to_many(parent_fields, pattern)
1985
- vars, numbers = one_to_many_components(pattern)
1986
-
1987
- DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
1988
- each_row do |row|
1989
- verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
1990
- numbers.each do |n|
1991
- generated = one_to_many_row row, n, vars, pattern
1992
- next if generated.values.all?(&:nil?)
1993
-
1994
- ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
1995
- end
1996
- end
1997
- ds.update
1998
- end
1999
- end
2000
-
2001
- def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
2002
- self[nm]
2003
- .split_by_separator(sep)
2004
- .each_with_index do |(k, v), i|
2005
- v.rename "#{nm}:#{k}"
2006
- self[:"#{nm}#{join}#{i + 1}"] = v
2007
- end
2008
- end
2009
-
2010
- # Create a sql, based on a given Dataset
2011
- #
2012
- # == Arguments
2013
- #
2014
- # * table - String specifying name of the table that will be created in SQL.
2015
- # * charset - Character set. Default is "UTF8".
2016
- #
2017
- # @example
2018
- #
2019
- # ds = DaruLite::DataFrame.new({
2020
- # :id => DaruLite::Vector.new([1,2,3,4,5]),
2021
- # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
2022
- # })
2023
- # ds.create_sql('names')
2024
- # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
2025
- #
2026
- def create_sql(table, charset = 'UTF8')
2027
- sql = "CREATE TABLE #{table} ("
2028
- fields = vectors.to_a.collect do |f|
2029
- v = self[f]
2030
- "#{f} #{v.db_type}"
2031
- end
2032
-
2033
- sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
2034
- end
2035
-
2036
- # Returns the dataframe. This can be convenient when the user does not
2037
- # know whether the object is a vector or a dataframe.
2038
- # @return [self] the dataframe
2039
- def to_df
2040
- self
2041
- end
2042
-
2043
- # Convert all vectors of type *:numeric* into a Matrix.
2044
- def to_matrix
2045
- Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
2046
- end
2047
-
2048
- # Converts the DataFrame into an array of hashes where key is vector name
2049
- # and value is the corresponding element. The 0th index of the array contains
2050
- # the array of hashes while the 1st index contains the indexes of each row
2051
- # of the dataframe. Each element in the index array corresponds to its row
2052
- # in the array of hashes, which has the same index.
2053
- def to_a
2054
- [each_row.map(&:to_h), @index.to_a]
2055
- end
2056
-
2057
- # Convert to json. If no_index is true then the index will NOT be included
2058
- # in the JSON thus created.
2059
- def to_json(no_index = true)
2060
- if no_index
2061
- to_a[0].to_json
2062
- else
2063
- to_a.to_json
2064
- end
2065
- end
2066
-
2067
- # Converts DataFrame to a hash (explicit) with keys as vector names and values as
2068
- # the corresponding vectors.
2069
- def to_h
2070
- @vectors
2071
- .each_with_index
2072
- .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
2073
- end
316
+ def nest(*tree_keys, &block)
317
+ tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
2074
318
 
2075
- # Convert to html for IRuby.
2076
- def to_html(threshold = DaruLite.max_rows)
2077
- table_thead = to_html_thead
2078
- table_tbody = to_html_tbody(threshold)
2079
- path = if index.is_a?(MultiIndex)
2080
- File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
2081
- else
2082
- File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
2083
- end
2084
- ERB.new(File.read(path).strip).result(binding)
2085
- end
319
+ each_row.with_object({}) do |row, current|
320
+ # Create tree
321
+ *keys, last = tree_keys
322
+ current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
323
+ name = row[last]
2086
324
 
2087
- def to_html_thead
2088
- table_thead_path =
2089
- if index.is_a?(MultiIndex)
2090
- File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
325
+ if block
326
+ current[name] = yield(row, current, name)
2091
327
  else
2092
- File.expand_path('iruby/templates/dataframe_thead.html.erb', __dir__)
328
+ current[name] ||= []
329
+ current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
2093
330
  end
2094
- ERB.new(File.read(table_thead_path).strip).result(binding)
331
+ end
2095
332
  end
2096
333
 
2097
- def to_html_tbody(threshold = DaruLite.max_rows)
2098
- threshold ||= @size
2099
- table_tbody_path =
2100
- if index.is_a?(MultiIndex)
2101
- File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
2102
- else
2103
- File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
2104
- end
2105
- ERB.new(File.read(table_tbody_path).strip).result(binding)
334
+ def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
335
+ self[name]
336
+ .split_by_separator(sep)
337
+ .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
2106
338
  end
2107
339
 
2108
- def to_s
2109
- "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
340
+ # Return the number of rows and columns of the DataFrame in an Array.
341
+ def shape
342
+ [nrows, ncols]
2110
343
  end
2111
344
 
2112
- # Method for updating the metadata (i.e. missing value positions) of the
2113
- # vectors after assignment/deletion etc. are complete. This is provided so that
2114
- # time is not wasted in creating the metadata for the vector each time
2115
- # assignment/deletion of elements is done. Updating data this way is called
2116
- # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
2117
- def update
2118
- @data.each(&:update) if DaruLite.lazy_update
345
+ # The number of rows
346
+ def nrows
347
+ @index.size
2119
348
  end
2120
349
 
2121
- # Rename the DataFrame.
2122
- def rename(new_name)
2123
- @name = new_name
2124
- self
350
+ # The number of vectors
351
+ def ncols
352
+ @vectors.size
2125
353
  end
2126
354
 
2127
- alias name= rename
2128
-
2129
- # Write this DataFrame to a CSV file.
355
+ # Renames the vectors
2130
356
  #
2131
357
  # == Arguments
2132
358
  #
2133
- # * filename - Path of CSV file where the DataFrame is to be saved.
359
+ # * name_map - A hash where the keys are the existing vector names and
360
+ # the values are the new names. If a vector is renamed
361
+ # to a vector name that is already in use, the existing
362
+ # one is overwritten.
2134
363
  #
2135
- # == Options
364
+ # == Usage
2136
365
  #
2137
- # * convert_comma - If set to *true*, will convert any commas in any
2138
- # of the data to full stops ('.').
2139
- # All the options accepted by CSV.read() can also be passed into this
2140
- # function.
2141
- def write_csv(filename, opts = {})
2142
- DaruLite::IO.dataframe_write_csv self, filename, opts
2143
- end
366
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
367
+ # df.rename_vectors :a => :alpha, :c => :gamma
368
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
369
+ def rename_vectors(name_map)
370
+ existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
371
+ delete_vectors(*existing_targets)
2144
372
 
2145
- # Write this dataframe to an Excel Spreadsheet
2146
- #
2147
- # == Arguments
2148
- #
2149
- # * filename - The path of the file where the DataFrame should be written.
2150
- def write_excel(filename, opts = {})
2151
- DaruLite::IO.dataframe_write_excel self, filename, opts
373
+ new_names = vectors.to_a.map { |v| name_map[v] || v }
374
+ self.vectors = DaruLite::Index.new new_names
2152
375
  end
2153
376
 
2154
- # Insert each case of the Dataset on the selected table
377
+ # Renames the vectors and returns itself
2155
378
  #
2156
379
  # == Arguments
2157
380
  #
2158
- # * dbh - DBI database connection object.
2159
- # * query - Query string.
381
+ # * name_map - A hash where the keys are the existing vector names and
382
+ # the values are the new names. If a vector is renamed
383
+ # to a vector name that is already in use, the existing
384
+ # one is overwritten.
2160
385
  #
2161
386
  # == Usage
2162
387
  #
2163
- # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
2164
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
2165
- # ds.write_sql(dbh,"test")
2166
- def write_sql(dbh, table)
2167
- DaruLite::IO.dataframe_write_sql self, dbh, table
388
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
389
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
390
+ def rename_vectors!(name_map)
391
+ rename_vectors(name_map)
392
+ self
2168
393
  end
2169
394
 
2170
- # Use marshalling to save dataframe to a file.
2171
- def save(filename)
2172
- DaruLite::IO.save self, filename
395
+ # Converts the vectors to a DaruLite::MultiIndex.
396
+ # The argument passed is used as the MultiIndex's top level
397
+ def add_level_to_vectors(top_level_label)
398
+ tuples = vectors.map { |label| [top_level_label, *label] }
399
+ self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
2173
400
  end
2174
401
 
2175
- def _dump(_depth)
2176
- Marshal.dump(
2177
- data: @data,
2178
- index: @index.to_a,
2179
- order: @vectors.to_a,
2180
- name: @name
2181
- )
402
+ def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
403
+ self[nm]
404
+ .split_by_separator(sep)
405
+ .each_with_index do |(k, v), i|
406
+ v.rename "#{nm}:#{k}"
407
+ self[:"#{nm}#{join}#{i + 1}"] = v
408
+ end
409
+ end
410
+
411
+ # Method for updating the metadata (i.e. missing value positions) of the
412
+ # vectors after assignment/deletion etc. are complete. This is provided so that
413
+ # time is not wasted in creating the metadata for the vector each time
414
+ # assignment/deletion of elements is done. Updating data this way is called
415
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
416
+ def update
417
+ @data.each(&:update) if DaruLite.lazy_update
2182
418
  end
2183
419
 
2184
- def self._load(data)
2185
- h = Marshal.load data
2186
- DaruLite::DataFrame.new(h[:data],
2187
- index: h[:index],
2188
- order: h[:order],
2189
- name: h[:name])
420
+ # Rename the DataFrame.
421
+ def rename(new_name)
422
+ @name = new_name
423
+ self
2190
424
  end
425
+ alias name= rename
2191
426
 
2192
427
  # Transpose a DataFrame, tranposing elements and row, column indexing.
2193
428
  def transpose
@@ -2218,11 +453,6 @@ module DaruLite
2218
453
  )
2219
454
  end
2220
455
 
2221
- # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
2222
- def where(bool_array)
2223
- DaruLite::Core::Query.df_where self, bool_array
2224
- end
2225
-
2226
456
  def ==(other)
2227
457
  self.class == other.class &&
2228
458
  @size == other.size &&
@@ -2276,144 +506,6 @@ module DaruLite
2276
506
  order: all_vectors.map(&:name)
2277
507
  end
2278
508
 
2279
- # Split the dataframe into many dataframes based on category vector
2280
- # @param [object] cat_name name of category vector to split the dataframe
2281
- # @return [Array] array of dataframes split by category with category vector
2282
- # used to split not included
2283
- # @example
2284
- # df = DaruLite::DataFrame.new({
2285
- # a: [1, 2, 3],
2286
- # b: ['a', 'a', 'b']
2287
- # })
2288
- # df.to_category :b
2289
- # df.split_by_category :b
2290
- # # => [#<DaruLite::DataFrame: a (2x1)>
2291
- # # a
2292
- # # 0 1
2293
- # # 1 2,
2294
- # # #<DaruLite::DataFrame: b (1x1)>
2295
- # # a
2296
- # # 2 3]
2297
- def split_by_category(cat_name)
2298
- cat_dv = self[cat_name]
2299
- raise ArgumentError, "#{cat_name} is not a category vector" unless
2300
- cat_dv.category?
2301
-
2302
- cat_dv.categories.map do |cat|
2303
- where(cat_dv.eq cat)
2304
- .rename(cat)
2305
- .delete_vector cat_name
2306
- end
2307
- end
2308
-
2309
- # @param indexes [Array] index(s) at which row tuples are retrieved
2310
- # @return [Array] returns array of row tuples at given index(s)
2311
- # @example Using DaruLite::Index
2312
- # df = DaruLite::DataFrame.new({
2313
- # a: [1, 2, 3],
2314
- # b: ['a', 'a', 'b']
2315
- # })
2316
- #
2317
- # df.access_row_tuples_by_indexs(1,2)
2318
- # # => [[2, "a"], [3, "b"]]
2319
- #
2320
- # df.index = DaruLite::Index.new([:one,:two,:three])
2321
- # df.access_row_tuples_by_indexs(:one,:three)
2322
- # # => [[1, "a"], [3, "b"]]
2323
- #
2324
- # @example Using DaruLite::MultiIndex
2325
- # mi_idx = DaruLite::MultiIndex.from_tuples [
2326
- # [:a,:one,:bar],
2327
- # [:a,:one,:baz],
2328
- # [:b,:two,:bar],
2329
- # [:a,:two,:baz],
2330
- # ]
2331
- # df_mi = DaruLite::DataFrame.new({
2332
- # a: 1..4,
2333
- # b: 'a'..'d'
2334
- # }, index: mi_idx )
2335
- #
2336
- # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2337
- # # => [[3, "c"]]
2338
- # df_mi.access_row_tuples_by_indexs(:a)
2339
- # # => [[1, "a"], [2, "b"], [4, "d"]]
2340
- def access_row_tuples_by_indexs(*indexes)
2341
- return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
2342
- @index.is_a?(DaruLite::MultiIndex)
2343
-
2344
- positions = @index.pos(*indexes)
2345
- if positions.is_a? Numeric
2346
- row = get_rows_for([positions])
2347
- row.first.is_a?(Array) ? row : [row]
2348
- else
2349
- new_rows = get_rows_for(indexes, by_position: false)
2350
- indexes.map { |index| new_rows.map { |r| r[index] } }
2351
- end
2352
- end
2353
-
2354
- # Function to use for aggregating the data.
2355
- #
2356
- # @param options [Hash] options for column, you want in resultant dataframe
2357
- #
2358
- # @return [DaruLite::DataFrame]
2359
- #
2360
- # @example
2361
- # df = DaruLite::DataFrame.new(
2362
- # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2363
- # => #<DaruLite::DataFrame(5x2)>
2364
- # col num
2365
- # 0 a 52
2366
- # 1 b 12
2367
- # 2 c 7
2368
- # 3 d 17
2369
- # 4 e 1
2370
- #
2371
- # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2372
- # => #<DaruLite::DataFrame(5x1)>
2373
- # num_100_ti
2374
- # 0 5200
2375
- # 1 1200
2376
- # 2 700
2377
- # 3 1700
2378
- # 4 100
2379
- #
2380
- # When we have duplicate index :
2381
- #
2382
- # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
2383
- # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2384
- # => #<DaruLite::DataFrame(5x1)>
2385
- # num
2386
- # a 52
2387
- # b 12
2388
- # a 7
2389
- # a 17
2390
- # c 1
2391
- #
2392
- # df.aggregate(num: :mean)
2393
- # => #<DaruLite::DataFrame(3x1)>
2394
- # num
2395
- # a 25.3333333
2396
- # b 12
2397
- # c 1
2398
- #
2399
- # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2400
- # internally.
2401
- def aggregate(options = {}, multi_index_level = -1)
2402
- if block_given?
2403
- positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
2404
- else
2405
- positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
2406
- end
2407
-
2408
- colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
2409
-
2410
- DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
2411
- end
2412
-
2413
- def group_by_and_aggregate(*group_by_keys, **aggregation_map)
2414
- group_by(*group_by_keys).aggregate(aggregation_map)
2415
- end
2416
-
2417
509
  private
2418
510
 
2419
511
  def headers
@@ -2424,20 +516,6 @@ module DaruLite
2424
516
  index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
2425
517
  end
2426
518
 
2427
- def convert_categorical_vectors(names)
2428
- names.filter_map do |n|
2429
- next unless self[n].category?
2430
-
2431
- old = [n, self[n]]
2432
- self[n] = DaruLite::Vector.new(self[n].to_ints)
2433
- old
2434
- end
2435
- end
2436
-
2437
- def restore_categorical_vectors(old)
2438
- old.each { |name, vector| self[name] = vector }
2439
- end
2440
-
2441
519
  def recursive_product(dfs)
2442
520
  return dfs.first if dfs.size == 1
2443
521
 
@@ -2449,12 +527,6 @@ module DaruLite
2449
527
  end
2450
528
  end
2451
529
 
2452
- def should_be_vector!(val)
2453
- return val if val.is_a?(DaruLite::Vector)
2454
-
2455
- raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
2456
- end
2457
-
2458
530
  def dispatch_to_axis(axis, method, *args, &block)
2459
531
  if %i[vector column].include?(axis)
2460
532
  send(:"#{method}_vector", *args, &block)
@@ -2485,76 +557,6 @@ module DaruLite
2485
557
  end
2486
558
  end
2487
559
 
2488
- def access_vector(*names)
2489
- if names.first.is_a?(Range)
2490
- dup(@vectors.subset(names.first))
2491
- elsif @vectors.is_a?(MultiIndex)
2492
- access_vector_multi_index(*names)
2493
- else
2494
- access_vector_single_index(*names)
2495
- end
2496
- end
2497
-
2498
- def access_vector_multi_index(*names)
2499
- pos = @vectors[names]
2500
-
2501
- return @data[pos] if pos.is_a?(Integer)
2502
-
2503
- new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
2504
-
2505
- pos = pos.drop_left_level(names.size) if names.size < @vectors.width
2506
-
2507
- DaruLite::DataFrame.new(new_vectors, index: @index, order: pos)
2508
- end
2509
-
2510
- def access_vector_single_index(*names)
2511
- if names.count < 2
2512
- begin
2513
- pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
2514
- rescue IndexError
2515
- raise IndexError, "Specified vector #{names.first} does not exist"
2516
- end
2517
- return @data[pos] if pos.is_a?(Numeric)
2518
-
2519
- names = pos
2520
- end
2521
-
2522
- new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
2523
-
2524
- order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
2525
- DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
2526
- end
2527
-
2528
- def access_row(*indexes)
2529
- positions = @index.pos(*indexes)
2530
-
2531
- if positions.is_a? Numeric
2532
- row = get_rows_for([positions])
2533
- DaruLite::Vector.new row, index: @vectors, name: indexes.first
2534
- else
2535
- new_rows = get_rows_for(indexes, by_position: false)
2536
- DaruLite::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
2537
- end
2538
- end
2539
-
2540
- # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
2541
- # because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
2542
- # values (representing a row) or an array of Vectors (that can be seen as rows)
2543
- def get_rows_for(keys, by_position: true)
2544
- raise unless keys.is_a?(Array)
2545
-
2546
- if by_position
2547
- pos = keys
2548
- @data.map { |vector| vector.at(*pos) }
2549
- else
2550
- # TODO: for now (2018-07-27), it is different than using
2551
- # get_rows_for(@index.pos(*keys))
2552
- # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
2553
- indexes = keys
2554
- @data.map { |vec| vec[*indexes] }
2555
- end
2556
- end
2557
-
2558
560
  def insert_or_modify_vector(name, vector)
2559
561
  name = name[0] unless @vectors.is_a?(MultiIndex)
2560
562
 
@@ -2837,146 +839,6 @@ module DaruLite
2837
839
  end
2838
840
  end
2839
841
 
2840
- def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
2841
- # Create an array to be used for comparison of two rows in sorting
2842
- vector_locs
2843
- .zip(by_blocks, ascending, handle_nils)
2844
- .map do |vector_loc, by, asc, handle_nil|
2845
- value = @data[vector_loc].data[asc ? r1 : r2]
2846
-
2847
- if by
2848
- value = begin
2849
- by.call(value)
2850
- rescue StandardError
2851
- nil
2852
- end
2853
- end
2854
-
2855
- sort_handle_nils value, asc, handle_nil || !by
2856
- end
2857
- end
2858
-
2859
- def sort_handle_nils(value, asc, handle_nil)
2860
- if !handle_nil
2861
- value
2862
- elsif asc
2863
- [value.nil? ? 0 : 1, value]
2864
- else
2865
- [value.nil? ? 1 : 0, value]
2866
- end
2867
- end
2868
-
2869
- def sort_coerce_boolean(opts, symbol, default, size)
2870
- val = opts[symbol]
2871
- case val
2872
- when true, false
2873
- Array.new(size, val)
2874
- when nil
2875
- Array.new(size, default)
2876
- when Array
2877
- raise ArgumentError, "Specify same number of vector names and #{symbol}" if
2878
- size != val.size
2879
-
2880
- val
2881
- else
2882
- raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
2883
- end
2884
- end
2885
-
2886
- def sort_prepare_block(vector_order, opts)
2887
- ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
2888
- handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
2889
-
2890
- by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
2891
- vector_locs = vector_order.map { |v| @vectors[v] }
2892
-
2893
- lambda do |index1, index2|
2894
- # Build left and right array to compare two rows
2895
- left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
2896
- right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
2897
-
2898
- # Resolve conflict by Index if all attributes are same
2899
- left << index1
2900
- right << index2
2901
- left <=> right
2902
- end
2903
- end
2904
-
2905
- def verify_error_message(row, test, id, i)
2906
- description, fields, = test
2907
- values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
2908
- "#{i + 1} [#{row[id]}]: #{description}#{values}"
2909
- end
2910
-
2911
- def prepare_pivot_values(index, vectors, opts)
2912
- case opts[:values]
2913
- when nil # values not specified at all.
2914
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
2915
- when Array # multiple values specified.
2916
- opts[:values]
2917
- else # single value specified.
2918
- [opts[:values]]
2919
- end
2920
- end
2921
-
2922
- def make_pivot_hash(grouped, vectors, values, aggregate_function)
2923
- grouped.groups.transform_values { |_| {} }.tap do |super_hash|
2924
- values.each do |value|
2925
- grouped.groups.each do |group_name, row_numbers|
2926
- row_numbers.each do |num|
2927
- arry = [value, *vectors.map { |v| self[v][num] }]
2928
- sub_hash = super_hash[group_name]
2929
- sub_hash[arry] ||= []
2930
-
2931
- sub_hash[arry] << self[value][num]
2932
- end
2933
- end
2934
- end
2935
-
2936
- setup_pivot_aggregates super_hash, aggregate_function
2937
- end
2938
- end
2939
-
2940
- def setup_pivot_aggregates(super_hash, aggregate_function)
2941
- super_hash.each_value do |sub_hash|
2942
- sub_hash.each do |group_name, aggregates|
2943
- sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
2944
- end
2945
- end
2946
- end
2947
-
2948
- def pivot_dataframe(super_hash)
2949
- df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
2950
- df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
2951
-
2952
- DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
2953
- super_hash.each do |row_index, sub_h|
2954
- sub_h.each do |vector_index, val|
2955
- pivoted_dataframe[vector_index][row_index] = val
2956
- end
2957
- end
2958
- end
2959
- end
2960
-
2961
- def one_to_many_components(pattern)
2962
- re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
2963
-
2964
- vars, numbers =
2965
- @vectors
2966
- .map { |v| v.scan(re) }
2967
- .reject(&:empty?).flatten(1).transpose
2968
-
2969
- [vars.uniq, numbers.map(&:to_i).sort.uniq]
2970
- end
2971
-
2972
- def one_to_many_row(row, number, vars, pattern)
2973
- vars
2974
- .to_h do |v|
2975
- name = pattern.sub('%v', v).sub('%n', number.to_s)
2976
- [v, row[name]]
2977
- end
2978
- end
2979
-
2980
842
  # Raises IndexError when one of the positions is not a valid position
2981
843
  def validate_positions(*positions, size)
2982
844
  positions.each do |pos|
@@ -3001,82 +863,5 @@ module DaruLite
3001
863
  DaruLite::Vector.new(source[idx], index: @index, name: vectors[idx])
3002
864
  end
3003
865
  end
3004
-
3005
- def aggregate_by_positions_tuples(options, positions_tuples)
3006
- agg_over_vectors_only, options = cast_aggregation_options(options)
3007
-
3008
- if agg_over_vectors_only
3009
- options.map do |vect_name, method|
3010
- vect = self[vect_name]
3011
-
3012
- positions_tuples.map do |positions|
3013
- vect.apply_method_on_sub_vector(method, keys: positions)
3014
- end
3015
- end
3016
- else
3017
- methods = options.values
3018
-
3019
- # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
3020
- rows = positions_tuples.map do |positions|
3021
- apply_method_on_sub_df(methods, keys: positions)
3022
- end
3023
-
3024
- rows.transpose
3025
- end
3026
- end
3027
-
3028
- # convert operations over sub-vectors to operations over sub-dfs when it improves perf
3029
- # note: we don't always "cast" because aggregation over a single vector / a few vector is faster
3030
- # than aggregation over (sub-)dfs
3031
- def cast_aggregation_options(options)
3032
- vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
3033
-
3034
- over_vectors = true
3035
-
3036
- if non_vects.any?
3037
- options = options.clone
3038
-
3039
- vects.each do |name|
3040
- proc_on_vect = options[name].to_proc
3041
- options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
3042
- end
3043
-
3044
- over_vectors = false
3045
- end
3046
-
3047
- [over_vectors, options]
3048
- end
3049
-
3050
- def group_index_for_aggregation(index, multi_index_level = -1)
3051
- case index
3052
- when DaruLite::MultiIndex
3053
- groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
3054
-
3055
- new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
3056
- pos_tuples = groups_by_pos.values
3057
- when DaruLite::Index, DaruLite::CategoricalIndex
3058
- new_index = Array(index).uniq
3059
- pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
3060
- else raise
3061
- end
3062
-
3063
- [pos_tuples, new_index]
3064
- end
3065
-
3066
- # coerce ranges, integers and array in appropriate ways
3067
- def coerce_positions(*positions, size)
3068
- if positions.size == 1
3069
- case positions.first
3070
- when Integer
3071
- positions.first
3072
- when Range
3073
- size.times.to_a[positions.first]
3074
- else
3075
- raise ArgumentError, 'Unknown position type.'
3076
- end
3077
- else
3078
- positions
3079
- end
3080
- end
3081
866
  end
3082
867
  end