daru_lite 0.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
  3. data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. data/.github/workflows/ci.yml +20 -0
  5. data/.rubocop_todo.yml +35 -33
  6. data/README.md +19 -115
  7. data/daru_lite.gemspec +1 -0
  8. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  9. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  10. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  11. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  12. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  13. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  14. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  15. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  16. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  17. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  18. data/lib/daru_lite/data_frame/missable.rb +75 -0
  19. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  20. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  21. data/lib/daru_lite/data_frame/setable.rb +109 -0
  22. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  23. data/lib/daru_lite/dataframe.rb +142 -2355
  24. data/lib/daru_lite/index/index.rb +13 -0
  25. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  26. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  27. data/lib/daru_lite/vector/calculatable.rb +78 -0
  28. data/lib/daru_lite/vector/convertible.rb +77 -0
  29. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  30. data/lib/daru_lite/vector/fetchable.rb +175 -0
  31. data/lib/daru_lite/vector/filterable.rb +128 -0
  32. data/lib/daru_lite/vector/indexable.rb +77 -0
  33. data/lib/daru_lite/vector/iterable.rb +95 -0
  34. data/lib/daru_lite/vector/joinable.rb +17 -0
  35. data/lib/daru_lite/vector/missable.rb +124 -0
  36. data/lib/daru_lite/vector/queryable.rb +45 -0
  37. data/lib/daru_lite/vector/setable.rb +47 -0
  38. data/lib/daru_lite/vector/sortable.rb +113 -0
  39. data/lib/daru_lite/vector.rb +36 -932
  40. data/lib/daru_lite/version.rb +1 -1
  41. data/spec/data_frame/aggregatable_example.rb +65 -0
  42. data/spec/data_frame/buildable_example.rb +109 -0
  43. data/spec/data_frame/calculatable_example.rb +135 -0
  44. data/spec/data_frame/convertible_example.rb +180 -0
  45. data/spec/data_frame/duplicatable_example.rb +111 -0
  46. data/spec/data_frame/fetchable_example.rb +476 -0
  47. data/spec/data_frame/filterable_example.rb +250 -0
  48. data/spec/data_frame/indexable_example.rb +221 -0
  49. data/spec/data_frame/iterable_example.rb +465 -0
  50. data/spec/data_frame/joinable_example.rb +106 -0
  51. data/spec/data_frame/missable_example.rb +47 -0
  52. data/spec/data_frame/pivotable_example.rb +297 -0
  53. data/spec/data_frame/queryable_example.rb +92 -0
  54. data/spec/data_frame/setable_example.rb +482 -0
  55. data/spec/data_frame/sortable_example.rb +350 -0
  56. data/spec/dataframe_spec.rb +181 -3243
  57. data/spec/index/index_spec.rb +8 -0
  58. data/spec/vector/aggregatable_example.rb +27 -0
  59. data/spec/vector/calculatable_example.rb +82 -0
  60. data/spec/vector/convertible_example.rb +126 -0
  61. data/spec/vector/duplicatable_example.rb +48 -0
  62. data/spec/vector/fetchable_example.rb +463 -0
  63. data/spec/vector/filterable_example.rb +165 -0
  64. data/spec/vector/indexable_example.rb +201 -0
  65. data/spec/vector/iterable_example.rb +111 -0
  66. data/spec/vector/joinable_example.rb +25 -0
  67. data/spec/vector/missable_example.rb +88 -0
  68. data/spec/vector/queryable_example.rb +91 -0
  69. data/spec/vector/setable_example.rb +300 -0
  70. data/spec/vector/sortable_example.rb +242 -0
  71. data/spec/vector_spec.rb +111 -1805
  72. metadata +102 -3
  73. data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -1,10 +1,40 @@
1
1
  require 'daru_lite/accessors/dataframe_by_row'
2
+ require 'daru_lite/data_frame/aggregatable'
3
+ require 'daru_lite/data_frame/calculatable'
4
+ require 'daru_lite/data_frame/convertible'
5
+ require 'daru_lite/data_frame/duplicatable'
6
+ require 'daru_lite/data_frame/fetchable'
7
+ require 'daru_lite/data_frame/filterable'
8
+ require 'daru_lite/data_frame/indexable'
9
+ require 'daru_lite/data_frame/i_o_able'
10
+ require 'daru_lite/data_frame/iterable'
11
+ require 'daru_lite/data_frame/joinable'
12
+ require 'daru_lite/data_frame/missable'
13
+ require 'daru_lite/data_frame/pivotable'
14
+ require 'daru_lite/data_frame/setable'
15
+ require 'daru_lite/data_frame/sortable'
16
+ require 'daru_lite/data_frame/queryable'
2
17
  require 'daru_lite/maths/arithmetic/dataframe'
3
18
  require 'daru_lite/maths/statistics/dataframe'
4
19
  require 'daru_lite/io/io'
5
20
 
6
21
  module DaruLite
7
22
  class DataFrame # rubocop:disable Metrics/ClassLength
23
+ include DaruLite::DataFrame::Aggregatable
24
+ include DaruLite::DataFrame::Calculatable
25
+ include DaruLite::DataFrame::Convertible
26
+ include DaruLite::DataFrame::Duplicatable
27
+ include DaruLite::DataFrame::Fetchable
28
+ include DaruLite::DataFrame::Filterable
29
+ include DaruLite::DataFrame::Indexable
30
+ include DaruLite::DataFrame::Iterable
31
+ include DaruLite::DataFrame::IOAble
32
+ include DaruLite::DataFrame::Joinable
33
+ include DaruLite::DataFrame::Missable
34
+ include DaruLite::DataFrame::Pivotable
35
+ include DaruLite::DataFrame::Setable
36
+ include DaruLite::DataFrame::Sortable
37
+ include DaruLite::DataFrame::Queryable
8
38
  include DaruLite::Maths::Arithmetic::DataFrame
9
39
  include DaruLite::Maths::Statistics::DataFrame
10
40
 
@@ -13,109 +43,6 @@ module DaruLite
13
43
  extend Gem::Deprecate
14
44
 
15
45
  class << self
16
- # Load data from a CSV file. Specify an optional block to grab the CSV
17
- # object and pre-condition it (for example use the `convert` or
18
- # `header_convert` methods).
19
- #
20
- # == Arguments
21
- #
22
- # * path - Local path / Remote URL of the file to load specified as a String.
23
- #
24
- # == Options
25
- #
26
- # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
27
- # and uses those to eventually construct the resulting DataFrame.
28
- #
29
- # == Verbose Description
30
- #
31
- # You can specify all the options to the `.from_csv` function that you
32
- # do to the Ruby `CSV.read()` function, since this is what is used internally.
33
- #
34
- # For example, if the columns in your CSV file are separated by something
35
- # other than commas, you can use the `:col_sep` option. If you want to
36
- # convert numeric values to numbers and not keep them as strings, you can
37
- # use the `:converters` option and set it to `:numeric`.
38
- #
39
- # The `.from_csv` function uses the following defaults for reading CSV files
40
- # (that are passed into the `CSV.read()` function):
41
- #
42
- # {
43
- # :col_sep => ',',
44
- # :converters => :numeric
45
- # }
46
- def from_csv(path, opts = {}, &block)
47
- DaruLite::IO.from_csv path, opts, &block
48
- end
49
-
50
- # Read data from an Excel file into a DataFrame.
51
- #
52
- # == Arguments
53
- #
54
- # * path - Path of the file to be read.
55
- #
56
- # == Options
57
- #
58
- # * :worksheet_id - ID of the worksheet that is to be read.
59
- def from_excel(path, opts = {}, &block)
60
- DaruLite::IO.from_excel path, opts, &block
61
- end
62
-
63
- # Read a database query and returns a Dataset
64
- #
65
- # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
66
- # @param query [String] The query to be executed
67
- #
68
- # @return A dataframe containing the data resulting from the query
69
- #
70
- # USE:
71
- #
72
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
73
- # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
74
- #
75
- # #Alternatively
76
- #
77
- # require 'dbi'
78
- # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
79
- def from_sql(dbh, query)
80
- DaruLite::IO.from_sql dbh, query
81
- end
82
-
83
- # Read a dataframe from AR::Relation
84
- #
85
- # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
86
- # @param fields [Array] Field names to be loaded (optional)
87
- #
88
- # @return A dataframe containing the data loaded from the relation
89
- #
90
- # USE:
91
- #
92
- # # When Post model is defined as:
93
- # class Post < ActiveRecord::Base
94
- # scope :active, -> { where.not(published_at: nil) }
95
- # end
96
- #
97
- # # You can load active posts into a dataframe by:
98
- # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
99
- def from_activerecord(relation, *fields)
100
- DaruLite::IO.from_activerecord relation, *fields
101
- end
102
-
103
- # Read the database from a plaintext file. For this method to work,
104
- # the data should be present in a plain text file in columns. See
105
- # spec/fixtures/bank2.dat for an example.
106
- #
107
- # == Arguments
108
- #
109
- # * path - Path of the file to be read.
110
- # * fields - Vector names of the resulting database.
111
- #
112
- # == Usage
113
- #
114
- # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
115
- def from_plaintext(path, fields)
116
- DaruLite::IO.from_plaintext path, fields
117
- end
118
-
119
46
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
120
47
  # DaruLite::Vector objects.
121
48
  def rows(source, opts = {})
@@ -316,179 +243,6 @@ module DaruLite
316
243
  update
317
244
  end
318
245
 
319
- # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
320
- # Defaults to *:vector*. Use of this method is not recommended for accessing
321
- # rows. Use df.row[:a] for accessing row with index ':a'.
322
- def [](*names)
323
- axis = extract_axis(names, :vector)
324
- dispatch_to_axis axis, :access, *names
325
- end
326
-
327
- # Retrieve rows by positions
328
- # @param [Array<Integer>] positions of rows to retrieve
329
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
330
- # @example
331
- # df = DaruLite::DataFrame.new({
332
- # a: [1, 2, 3],
333
- # b: ['a', 'b', 'c']
334
- # })
335
- # df.row_at 1, 2
336
- # # => #<DaruLite::DataFrame(2x2)>
337
- # # a b
338
- # # 1 2 b
339
- # # 2 3 c
340
- def row_at(*positions)
341
- original_positions = positions
342
- positions = coerce_positions(*positions, nrows)
343
- validate_positions(*positions, nrows)
344
-
345
- if positions.is_a? Integer
346
- row = get_rows_for([positions])
347
- DaruLite::Vector.new row, index: @vectors
348
- else
349
- new_rows = get_rows_for(original_positions)
350
- DaruLite::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
351
- end
352
- end
353
-
354
- # Set rows by positions
355
- # @param [Array<Integer>] positions positions of rows to set
356
- # @param [Array, DaruLite::Vector] vector vector to be assigned
357
- # @example
358
- # df = DaruLite::DataFrame.new({
359
- # a: [1, 2, 3],
360
- # b: ['a', 'b', 'c']
361
- # })
362
- # df.set_row_at [0, 1], ['x', 'x']
363
- # df
364
- # #=> #<DaruLite::DataFrame(3x2)>
365
- # # a b
366
- # # 0 x x
367
- # # 1 x x
368
- # # 2 3 c
369
- def set_row_at(positions, vector)
370
- validate_positions(*positions, nrows)
371
- vector =
372
- if vector.is_a? DaruLite::Vector
373
- vector.reindex @vectors
374
- else
375
- DaruLite::Vector.new vector
376
- end
377
-
378
- raise SizeError, 'Vector length should match row length' if
379
- vector.size != @vectors.size
380
-
381
- @data.each_with_index do |vec, pos|
382
- vec.set_at(positions, vector.at(pos))
383
- end
384
- @index = @data[0].index
385
- set_size
386
- end
387
-
388
- # Retrieve vectors by positions
389
- # @param [Array<Integer>] positions of vectors to retrieve
390
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
391
- # @example
392
- # df = DaruLite::DataFrame.new({
393
- # a: [1, 2, 3],
394
- # b: ['a', 'b', 'c']
395
- # })
396
- # df.at 0
397
- # # => #<DaruLite::Vector(3)>
398
- # # a
399
- # # 0 1
400
- # # 1 2
401
- # # 2 3
402
- def at(*positions)
403
- if AXES.include? positions.last
404
- axis = positions.pop
405
- return row_at(*positions) if axis == :row
406
- end
407
-
408
- original_positions = positions
409
- positions = coerce_positions(*positions, ncols)
410
- validate_positions(*positions, ncols)
411
-
412
- if positions.is_a? Integer
413
- @data[positions].dup
414
- else
415
- DaruLite::DataFrame.new positions.map { |pos| @data[pos].dup },
416
- index: @index,
417
- order: @vectors.at(*original_positions),
418
- name: @name
419
- end
420
- end
421
-
422
- # Set vectors by positions
423
- # @param [Array<Integer>] positions positions of vectors to set
424
- # @param [Array, DaruLite::Vector] vector vector to be assigned
425
- # @example
426
- # df = DaruLite::DataFrame.new({
427
- # a: [1, 2, 3],
428
- # b: ['a', 'b', 'c']
429
- # })
430
- # df.set_at [0], ['x', 'y', 'z']
431
- # df
432
- # #=> #<DaruLite::DataFrame(3x2)>
433
- # # a b
434
- # # 0 x a
435
- # # 1 y b
436
- # # 2 z c
437
- def set_at(positions, vector)
438
- if positions.last == :row
439
- positions.pop
440
- return set_row_at(positions, vector)
441
- end
442
-
443
- validate_positions(*positions, ncols)
444
- vector =
445
- if vector.is_a? DaruLite::Vector
446
- vector.reindex @index
447
- else
448
- DaruLite::Vector.new vector
449
- end
450
-
451
- raise SizeError, 'Vector length should match index length' if
452
- vector.size != @index.size
453
-
454
- positions.each { |pos| @data[pos] = vector }
455
- end
456
-
457
- # Insert a new row/vector of the specified name or modify a previous row.
458
- # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
459
- # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
460
- #
461
- # In case a DaruLite::Vector is specified after the equality sign, the indexes
462
- # of the vector will be matched against the row/vector indexes of the DataFrame
463
- # before an insertion is performed. Unmatched indexes will be set to nil.
464
- def []=(*args)
465
- vector = args.pop
466
- axis = extract_axis(args)
467
- names = args
468
-
469
- dispatch_to_axis axis, :insert_or_modify, names, vector
470
- end
471
-
472
- def add_row(row, index = nil)
473
- self.row[*(index || @size)] = row
474
- end
475
-
476
- def add_vector(n, vector)
477
- self[n] = vector
478
- end
479
-
480
- def insert_vector(n, name, source)
481
- raise ArgumentError unless source.is_a? Array
482
-
483
- vector = DaruLite::Vector.new(source, index: @index, name: @name)
484
- @data << vector
485
- @vectors = @vectors.add name
486
- ordr = @vectors.dup.to_a
487
- elmnt = ordr.pop
488
- ordr.insert n, elmnt
489
- self.order = ordr
490
- end
491
-
492
246
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
493
247
  #
494
248
  # == Usage
@@ -498,1697 +252,177 @@ module DaruLite
498
252
  DaruLite::Accessors::DataFrameByRow.new(self)
499
253
  end
500
254
 
501
- # Extract a dataframe given row indexes or positions
502
- # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
503
- # @return [DaruLite::Dataframe]
504
- def get_sub_dataframe(keys, by_position: true)
505
- return DaruLite::DataFrame.new({}) if keys == []
506
-
507
- keys = @index.pos(*keys) unless by_position
508
-
509
- sub_df = row_at(*keys)
510
- sub_df = sub_df.to_df.transpose if sub_df.is_a?(DaruLite::Vector)
511
-
512
- sub_df
513
- end
514
-
515
- # Duplicate the DataFrame entirely.
516
- #
517
- # == Arguments
518
- #
519
- # * +vectors_to_dup+ - An Array specifying the names of Vectors to
520
- # be duplicated. Will duplicate the entire DataFrame if not specified.
521
- def dup(vectors_to_dup = nil)
522
- vectors_to_dup ||= @vectors.to_a
523
-
524
- src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
525
- new_order = DaruLite::Index.new(vectors_to_dup)
526
-
527
- DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
528
- end
529
-
530
- # Only clone the structure of the DataFrame.
531
- def clone_structure
532
- DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
533
- end
534
-
535
- # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
536
- # preserved.
537
- #
538
- # == Arguments
539
- #
540
- # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
541
- # a view of the whole data frame otherwise.
542
- def clone(*vectors_to_clone)
543
- vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
544
- vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
545
-
546
- h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
547
- DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
548
- end
549
-
550
- # Returns a 'shallow' copy of DataFrame if missing data is not present,
551
- # or a full copy of only valid data if missing data is present.
552
- def clone_only_valid
553
- if include_values?(*DaruLite::MISSING_VALUES)
554
- reject_values(*DaruLite::MISSING_VALUES)
555
- else
556
- clone
557
- end
558
- end
559
-
560
- # Creates a new duplicate dataframe containing only rows
561
- # without a single missing value.
562
- def dup_only_valid(vecs = nil)
563
- rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
564
- .inject(&:concat)
565
- .uniq
566
-
567
- row_indexes = @index.to_a
568
- (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
569
- end
570
- deprecate :dup_only_valid, :reject_values, 2016, 10
571
-
572
- # Returns a dataframe in which rows with any of the mentioned values
573
- # are ignored.
574
- # @param [Array] values to reject to form the new dataframe
575
- # @return [DaruLite::DataFrame] Data Frame with only rows which don't
576
- # contain the mentioned values
577
- # @example
578
- # df = DaruLite::DataFrame.new({
579
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
580
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
581
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
582
- # }, index: 11..18)
583
- # df.reject_values nil, Float::NAN
584
- # # => #<DaruLite::DataFrame(2x3)>
585
- # # a b c
586
- # # 11 1 a a
587
- # # 18 7 8 7
588
- def reject_values(*values)
589
- positions =
590
- size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
591
- # Handle the case when positions size is 1 and #row_at wouldn't return a df
592
- if positions.size == 1
593
- pos = positions.first
594
- row_at(pos..pos)
595
- else
596
- row_at(*positions)
597
- end
598
- end
599
-
600
- # Replace specified values with given value
601
- # @param [Array] old_values values to replace with new value
602
- # @param [object] new_value new value to replace with
603
- # @return [DaruLite::DataFrame] Data Frame itself with old values replaced
604
- # with new value
605
- # @example
606
- # df = DaruLite::DataFrame.new({
607
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
608
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
609
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
610
- # }, index: 11..18)
611
- # df.replace_values nil, Float::NAN
612
- # # => #<DaruLite::DataFrame(8x3)>
613
- # # a b c
614
- # # 11 1 a a
615
- # # 12 2 b NaN
616
- # # 13 3 NaN 3
617
- # # 14 NaN NaN 4
618
- # # 15 NaN NaN 3
619
- # # 16 NaN 3 5
620
- # # 17 1 5 NaN
621
- # # 18 7 8 7
622
- def replace_values(old_values, new_value)
623
- @data.each { |vec| vec.replace_values old_values, new_value }
624
- self
625
- end
626
-
627
- # Rolling fillna
628
- # replace all Float::NAN and nil values with the preceding or following value
629
- #
630
- # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
631
- #
632
- # @example
633
- # df = DaruLite::DataFrame.new({
634
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
635
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
636
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
637
- # })
638
- #
639
- # => #<DaruLite::DataFrame(8x3)>
640
- # a b c
641
- # 0 1 a a
642
- # 1 2 b NaN
643
- # 2 3 nil 3
644
- # 3 nil NaN 4
645
- # 4 NaN nil 3
646
- # 5 nil 3 5
647
- # 6 1 5 nil
648
- # 7 7 nil 7
649
- #
650
- # 2.3.3 :068 > df.rolling_fillna(:forward)
651
- # => #<DaruLite::DataFrame(8x3)>
652
- # a b c
653
- # 0 1 a a
654
- # 1 2 b a
655
- # 2 3 b 3
656
- # 3 3 b 4
657
- # 4 3 b 3
658
- # 5 3 3 5
659
- # 6 1 5 5
660
- # 7 7 5 7
661
- #
662
- def rolling_fillna!(direction = :forward)
663
- @data.each { |vec| vec.rolling_fillna!(direction) }
664
- self
665
- end
666
-
667
- def rolling_fillna(direction = :forward)
668
- dup.rolling_fillna!(direction)
669
- end
670
-
671
- # Return unique rows by vector specified or all vectors
672
- #
673
- # @param vtrs [String][Symbol] vector name(s) that should be considered
674
- #
675
- # @example
676
- #
677
- # => #<DaruLite::DataFrame(6x2)>
678
- # a b
679
- # 0 1 a
680
- # 1 2 b
681
- # 2 3 c
682
- # 3 4 d
683
- # 2 3 c
684
- # 3 4 f
685
- #
686
- # 2.3.3 :> df.uniq
687
- # => #<DaruLite::DataFrame(5x2)>
688
- # a b
689
- # 0 1 a
690
- # 1 2 b
691
- # 2 3 c
692
- # 3 4 d
693
- # 3 4 f
694
- #
695
- # 2.3.3 :> df.uniq(:a)
696
- # => #<DaruLite::DataFrame(4x2)>
697
- # a b
698
- # 0 1 a
699
- # 1 2 b
700
- # 2 3 c
701
- # 3 4 d
702
- #
703
- def uniq(*vtrs)
704
- vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
705
- grouped = group_by(vecs)
706
- indexes = grouped.groups.values.map { |v| v[0] }.sort
707
- row[*indexes]
708
- end
709
-
710
- # Iterate over each index of the DataFrame.
711
- def each_index(&block)
712
- return to_enum(:each_index) unless block
713
-
714
- @index.each(&block)
715
-
716
- self
717
- end
718
-
719
- # Iterate over each vector
720
- def each_vector(&block)
721
- return to_enum(:each_vector) unless block
255
+ # Delete a vector
256
+ def delete_vector(vector)
257
+ raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
722
258
 
723
- @data.each(&block)
259
+ @data.delete_at @vectors[vector]
260
+ @vectors = DaruLite::Index.new @vectors.to_a - [vector]
724
261
 
725
262
  self
726
263
  end
727
264
 
728
- alias each_column each_vector
729
-
730
- # Iterate over each vector along with the name of the vector
731
- def each_vector_with_index
732
- return to_enum(:each_vector_with_index) unless block_given?
733
-
734
- @vectors.each do |vector|
735
- yield @data[@vectors[vector]], vector
736
- end
265
+ # Deletes a list of vectors
266
+ def delete_vectors(*vectors)
267
+ Array(vectors).each { |vec| delete_vector vec }
737
268
 
738
269
  self
739
270
  end
740
271
 
741
- alias each_column_with_index each_vector_with_index
742
-
743
- # Iterate over each row
744
- def each_row
745
- return to_enum(:each_row) unless block_given?
746
-
747
- @index.size.times do |pos|
748
- yield row_at(pos)
749
- end
750
-
751
- self
752
- end
272
+ # Delete a row
273
+ def delete_row(index)
274
+ idx = named_index_for index
753
275
 
754
- def each_row_with_index
755
- return to_enum(:each_row_with_index) unless block_given?
276
+ raise IndexError, "Index #{index} does not exist." unless @index.include? idx
756
277
 
757
- @index.each do |index|
758
- yield access_row(index), index
278
+ @index = DaruLite::Index.new(@index.to_a - [idx])
279
+ each_vector do |vector|
280
+ vector.delete_at idx
759
281
  end
760
282
 
761
- self
762
- end
763
-
764
- # Iterate over each row or vector of the DataFrame. Specify axis
765
- # by passing :vector or :row as the argument. Default to :vector.
766
- #
767
- # == Description
768
- #
769
- # `#each` works exactly like Array#each. The default mode for `each`
770
- # is to iterate over the columns of the DataFrame. To iterate over
771
- # rows you must pass the axis, i.e `:row` as an argument.
772
- #
773
- # == Arguments
774
- #
775
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
776
- # or :row. Default to :vector.
777
- def each(axis = :vector, &block)
778
- dispatch_to_axis axis, :each, &block
779
- end
780
-
781
- # Iterate over a row or vector and return results in a DaruLite::Vector.
782
- # Specify axis with :vector or :row. Default to :vector.
783
- #
784
- # == Description
785
- #
786
- # The #collect iterator works similar to #map, the only difference
787
- # being that it returns a DaruLite::Vector comprising of the results of
788
- # each block run. The resultant Vector has the same index as that
789
- # of the axis over which collect has iterated. It also accepts the
790
- # optional axis argument.
791
- #
792
- # == Arguments
793
- #
794
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
795
- # or :row. Default to :vector.
796
- def collect(axis = :vector, &block)
797
- dispatch_to_axis_pl axis, :collect, &block
283
+ set_size
798
284
  end
799
285
 
800
- # Map over each vector or row of the data frame according to
801
- # the argument specified. Will return an Array of the resulting
802
- # elements. To map over each row/vector and get a DataFrame,
803
- # see #recode.
804
- #
805
- # == Description
806
- #
807
- # The #map iterator works like Array#map. The value returned by
808
- # each run of the block is added to an Array and the Array is
809
- # returned. This method also accepts an axis argument, like #each.
810
- # The default is :vector.
811
- #
812
- # == Arguments
813
- #
814
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
815
- # Default to :vector.
816
- def map(axis = :vector, &block)
817
- dispatch_to_axis_pl axis, :map, &block
818
- end
286
+ # Delete a row based on its position
287
+ # More robust than #delete_row when working with a CategoricalIndex or when the
288
+ # Index includes integers
289
+ def delete_at_position(position)
290
+ raise IndexError, "Position #{position} does not exist." unless position < size
819
291
 
820
- # Destructive map. Modifies the DataFrame. Each run of the block
821
- # must return a DaruLite::Vector. You can specify the axis to map over
822
- # as the argument. Default to :vector.
823
- #
824
- # == Arguments
825
- #
826
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
827
- # Default to :vector.
828
- def map!(axis = :vector, &block)
829
- if %i[vector column].include?(axis)
830
- map_vectors!(&block)
831
- elsif axis == :row
832
- map_rows!(&block)
833
- end
834
- end
292
+ each_vector { |vector| vector.delete_at_position(position) }
293
+ @index = @index.delete_at(position)
835
294
 
836
- # Maps over the DataFrame and returns a DataFrame. Each run of the
837
- # block must return a DaruLite::Vector object. You can specify the axis
838
- # to map over. Default to :vector.
839
- #
840
- # == Description
841
- #
842
- # Recode works similarly to #map, but an important difference between
843
- # the two is that recode returns a modified DaruLite::DataFrame instead
844
- # of an Array. For this reason, #recode expects that every run of the
845
- # block to return a DaruLite::Vector.
846
- #
847
- # Just like map and each, recode also accepts an optional _axis_ argument.
848
- #
849
- # == Arguments
850
- #
851
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
852
- # Default to :vector.
853
- def recode(axis = :vector, &block)
854
- dispatch_to_axis_pl axis, :recode, &block
295
+ set_size
855
296
  end
856
297
 
857
- # Retain vectors or rows if the block returns a truthy value.
858
- #
859
- # == Description
860
- #
861
- # For filtering out certain rows/vectors based on their values,
862
- # use the #filter method. By default it iterates over vectors and
863
- # keeps those vectors for which the block returns true. It accepts
864
- # an optional axis argument which lets you specify whether you want
865
- # to iterate over vectors or rows.
866
- #
867
- # == Arguments
868
- #
869
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
870
- # Default to :vector.
871
- #
872
- # == Usage
873
- #
874
- # # Filter vectors
875
- #
876
- # df.filter do |vector|
877
- # vector.type == :numeric and vector.median < 50
878
- # end
879
- #
880
- # # Filter rows
298
+ # Creates a DataFrame with the random data, of n size.
299
+ # If n not given, uses original number of rows.
881
300
  #
882
- # df.filter(:row) do |row|
883
- # row[:a] + row[:d] < 100
884
- # end
885
- def filter(axis = :vector, &block)
886
- dispatch_to_axis_pl axis, :filter, &block
887
- end
888
-
889
- def recode_vectors
890
- block_given? or return to_enum(:recode_vectors)
891
-
892
- dup.tap do |df|
893
- df.each_vector_with_index do |v, i|
894
- df[*i] = should_be_vector!(yield(v))
895
- end
896
- end
897
- end
898
-
899
- def recode_rows
900
- block_given? or return to_enum(:recode_rows)
901
-
902
- dup.tap do |df|
903
- df.each_row_with_index do |r, i|
904
- df.row[i] = should_be_vector!(yield(r))
301
+ # @return {DaruLite::DataFrame}
302
+ def bootstrap(n = nil)
303
+ n ||= nrows
304
+ DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
305
+ n.times do
306
+ df_boot.add_row(row[rand(n)])
905
307
  end
308
+ df_boot.update
906
309
  end
907
310
  end
908
311
 
909
- # Map each vector and return an Array.
910
- def map_vectors(&block)
911
- return to_enum(:map_vectors) unless block
912
-
913
- @data.map(&block)
914
- end
915
-
916
- # Destructive form of #map_vectors
917
- def map_vectors!
918
- return to_enum(:map_vectors!) unless block_given?
919
-
920
- vectors.dup.each do |n|
921
- self[n] = should_be_vector!(yield(self[n]))
922
- end
923
-
924
- self
925
- end
926
-
927
- # Map vectors along with the index.
928
- def map_vectors_with_index(&block)
929
- return to_enum(:map_vectors_with_index) unless block
930
-
931
- each_vector_with_index.map(&block)
932
- end
933
-
934
- # Map each row
935
- def map_rows(&block)
936
- return to_enum(:map_rows) unless block
937
-
938
- each_row.map(&block)
939
- end
940
-
941
- def map_rows_with_index(&block)
942
- return to_enum(:map_rows_with_index) unless block
943
-
944
- each_row_with_index.map(&block)
945
- end
946
-
947
- def map_rows!
948
- return to_enum(:map_rows!) unless block_given?
949
-
950
- index.dup.each do |i|
951
- row[i] = should_be_vector!(yield(row[i]))
952
- end
953
-
954
- self
955
- end
956
-
957
- def apply_method(method, keys: nil, by_position: true)
958
- df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
959
-
960
- case method
961
- when Symbol then df.send(method)
962
- when Proc then method.call(df)
963
- when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
964
- else raise
965
- end
966
- end
967
- alias apply_method_on_sub_df apply_method
968
-
969
- # Retrieves a DaruLite::Vector, based on the result of calculation
970
- # performed on each row.
971
- def collect_rows(&block)
972
- return to_enum(:collect_rows) unless block
973
-
974
- DaruLite::Vector.new(each_row.map(&block), index: @index)
975
- end
976
-
977
- def collect_row_with_index(&block)
978
- return to_enum(:collect_row_with_index) unless block
979
-
980
- DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
981
- end
982
-
983
- # Retrieves a DaruLite::Vector, based on the result of calculation
984
- # performed on each vector.
985
- def collect_vectors(&block)
986
- return to_enum(:collect_vectors) unless block
987
-
988
- DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
989
- end
990
-
991
- def collect_vector_with_index(&block)
992
- return to_enum(:collect_vector_with_index) unless block
993
-
994
- DaruLite::Vector.new(each_vector_with_index.map(&block), index: @vectors)
995
- end
996
-
997
- # Generate a matrix, based on vector names of the DataFrame.
998
- #
999
- # @return {::Matrix}
1000
- # :nocov:
1001
- # FIXME: Even not trying to cover this: I can't get, how it is expected
1002
- # to work.... -- zverok
1003
- def collect_matrix
1004
- return to_enum(:collect_matrix) unless block_given?
1005
-
1006
- vecs = vectors.to_a
1007
- rows = vecs.collect do |row|
1008
- vecs.collect do |col|
1009
- yield row, col
1010
- end
1011
- end
1012
-
1013
- Matrix.rows(rows)
1014
- end
1015
- # :nocov:
1016
-
1017
- # Delete a vector
1018
- def delete_vector(vector)
1019
- raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
1020
-
1021
- @data.delete_at @vectors[vector]
1022
- @vectors = DaruLite::Index.new @vectors.to_a - [vector]
1023
-
1024
- self
1025
- end
1026
-
1027
- # Deletes a list of vectors
1028
- def delete_vectors(*vectors)
1029
- Array(vectors).each { |vec| delete_vector vec }
1030
-
1031
- self
1032
- end
1033
-
1034
- # Delete a row
1035
- def delete_row(index)
1036
- idx = named_index_for index
1037
-
1038
- raise IndexError, "Index #{index} does not exist." unless @index.include? idx
1039
-
1040
- @index = DaruLite::Index.new(@index.to_a - [idx])
1041
- each_vector do |vector|
1042
- vector.delete_at idx
1043
- end
1044
-
1045
- set_size
1046
- end
1047
-
1048
- # Creates a DataFrame with the random data, of n size.
1049
- # If n not given, uses original number of rows.
1050
- #
1051
- # @return {DaruLite::DataFrame}
1052
- def bootstrap(n = nil)
1053
- n ||= nrows
1054
- DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
1055
- n.times do
1056
- df_boot.add_row(row[rand(n)])
1057
- end
1058
- df_boot.update
1059
- end
1060
- end
1061
-
1062
- def keep_row_if
1063
- @index
1064
- .reject { |idx| yield access_row(idx) }
1065
- .each { |idx| delete_row idx }
1066
- end
1067
-
1068
- def keep_vector_if
1069
- @vectors.each do |vector|
1070
- delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
1071
- end
1072
- end
1073
-
1074
- # creates a new vector with the data of a given field which the block returns true
1075
- def filter_vector(vec, &block)
1076
- DaruLite::Vector.new(each_row.select(&block).map { |row| row[vec] })
1077
- end
1078
-
1079
- # Iterates over each row and retains it in a new DataFrame if the block returns
1080
- # true for that row.
1081
- def filter_rows
1082
- return to_enum(:filter_rows) unless block_given?
1083
-
1084
- keep_rows = @index.map { |index| yield access_row(index) }
1085
-
1086
- where keep_rows
1087
- end
1088
-
1089
- # Iterates over each vector and retains it in a new DataFrame if the block returns
1090
- # true for that vector.
1091
- def filter_vectors(&block)
1092
- return to_enum(:filter_vectors) unless block
1093
-
1094
- dup.tap { |df| df.keep_vector_if(&block) }
1095
- end
1096
-
1097
- # Test each row with one or more tests.
1098
- # @param tests [Proc] Each test is a Proc with the form
1099
- # *Proc.new {|row| row[:age] > 0}*
1100
- # The function returns an array with all errors.
1101
- #
1102
- # FIXME: description here is too sparse. As far as I can get,
1103
- # it should tell something about that each test is [descr, fields, block],
1104
- # and that first value may be column name to output. - zverok, 2016-05-18
1105
- def verify(*tests)
1106
- id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first
1107
-
1108
- each_row_with_index.map do |row, i|
1109
- tests.reject { |*_, block| block.call(row) }
1110
- .map { |test| verify_error_message row, test, id, i }
1111
- end.flatten
1112
- end
1113
-
1114
- # DSL for yielding each row and returning a DaruLite::Vector based on the
1115
- # value each run of the block returns.
1116
- #
1117
- # == Usage
1118
- #
1119
- # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
1120
- # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
1121
- # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
1122
- # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
1123
- # total = ds.vector_by_calculation { a + b + c }
1124
- # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
1125
- # # nil
1126
- # # 0 111
1127
- # # 1 222
1128
- # # 2 333
1129
- # # 3 444
1130
- # # 4 555
1131
- # # 5 666
1132
- # # 6 777
1133
- def vector_by_calculation(&block)
1134
- a = each_row.map { |r| r.instance_eval(&block) }
1135
-
1136
- DaruLite::Vector.new a, index: @index
1137
- end
1138
-
1139
- # Reorder the vectors in a dataframe
1140
- # @param [Array] order_array new order of the vectors
1141
- # @example
1142
- # df = DaruLite::DataFrame({
1143
- # a: [1, 2, 3],
1144
- # b: [4, 5, 6]
1145
- # }, order: [:a, :b])
1146
- # df.order = [:b, :a]
1147
- # df
1148
- # # => #<DaruLite::DataFrame(3x2)>
1149
- # # b a
1150
- # # 0 4 1
1151
- # # 1 5 2
1152
- # # 2 6 3
1153
- def order=(order_array)
1154
- raise ArgumentError, 'Invalid order' unless
1155
- order_array.sort == vectors.to_a.sort
1156
-
1157
- initialize(to_h, order: order_array)
1158
- end
1159
-
1160
- # Return the dataframe with rotate vectors positions, the vector at position count is now
1161
- # the first vector of the dataframe.
1162
- # If only one vector in the dataframe, the dataframe is return without any change.
1163
- # @param count => Integer, the vector at position count will be the first vector of the dataframe.
1164
- # @example
1165
- # df = DaruLite::DataFrame({
1166
- # a: [1, 2, 3],
1167
- # b: [4, 5, 6],
1168
- # total: [5, 7, 9],
1169
- # })
1170
- # df.rotate_vectors(-1)
1171
- # df
1172
- # # => #<DaruLite::DataFrame(3x3)>
1173
- # # total b a
1174
- # # 0 5 4 1
1175
- # # 1 7 5 2
1176
- # # 2 9 6 3
1177
- def rotate_vectors(count = -1)
1178
- return self unless vectors.many?
1179
-
1180
- self.order = vectors.to_a.rotate(count)
1181
- self
1182
- end
1183
-
1184
- # Returns a vector, based on a string with a calculation based
1185
- # on vector.
1186
- #
1187
- # The calculation will be eval'ed, so you can put any variable
1188
- # or expression valid on ruby.
1189
- #
1190
- # For example:
1191
- # a = DaruLite::Vector.new [1,2]
1192
- # b = DaruLite::Vector.new [3,4]
1193
- # ds = DaruLite::DataFrame.new({:a => a,:b => b})
1194
- # ds.compute("a+b")
1195
- # => Vector [4,6]
1196
- def compute(text, &block)
1197
- return instance_eval(&block) if block
1198
-
1199
- instance_eval(text)
1200
- end
1201
-
1202
- # Return a vector with the number of missing values in each row.
1203
- #
1204
- # == Arguments
1205
- #
1206
- # * +missing_values+ - An Array of the values that should be
1207
- # treated as 'missing'. The default missing value is *nil*.
1208
- def missing_values_rows(missing_values = [nil])
1209
- number_of_missing = each_row.map do |row|
1210
- row.indexes(*missing_values).size
1211
- end
1212
-
1213
- DaruLite::Vector.new number_of_missing, index: @index, name: "#{@name}_missing_rows"
1214
- end
1215
-
1216
- # TODO: remove next version
1217
- alias vector_missing_values missing_values_rows
1218
-
1219
- def has_missing_data?
1220
- @data.any? { |vec| vec.include_values?(*DaruLite::MISSING_VALUES) }
1221
- end
1222
- alias flawed? has_missing_data?
1223
- deprecate :has_missing_data?, :include_values?, 2016, 10
1224
- deprecate :flawed?, :include_values?, 2016, 10
1225
-
1226
- # Check if any of given values occur in the data frame
1227
- # @param [Array] values to check for
1228
- # @return [true, false] true if any of the given values occur in the
1229
- # dataframe, false otherwise
1230
- # @example
1231
- # df = DaruLite::DataFrame.new({
1232
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1233
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1234
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1235
- # }, index: 11..18)
1236
- # df.include_values? nil
1237
- # # => true
1238
- def include_values?(*values)
1239
- @data.any? { |vec| vec.include_values?(*values) }
1240
- end
1241
-
1242
312
  # Return a nested hash using vector names as keys and an array constructed of
1243
313
  # hashes with other values. If block provided, is used to provide the
1244
314
  # values, with parameters +row+ of dataset, +current+ last hash on
1245
315
  # hierarchy and +name+ of the key to include
1246
- def nest(*tree_keys, &block)
1247
- tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
1248
-
1249
- each_row.with_object({}) do |row, current|
1250
- # Create tree
1251
- *keys, last = tree_keys
1252
- current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
1253
- name = row[last]
1254
-
1255
- if block
1256
- current[name] = yield(row, current, name)
1257
- else
1258
- current[name] ||= []
1259
- current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
1260
- end
1261
- end
1262
- end
1263
-
1264
- def vector_count_characters(vecs = nil)
1265
- vecs ||= @vectors.to_a
1266
-
1267
- collect_rows do |row|
1268
- vecs.sum { |v| row[v].to_s.size }
1269
- end
1270
- end
1271
-
1272
- def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
1273
- self[name]
1274
- .split_by_separator(sep)
1275
- .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
1276
- end
1277
-
1278
- # Return the number of rows and columns of the DataFrame in an Array.
1279
- def shape
1280
- [nrows, ncols]
1281
- end
1282
-
1283
- # The number of rows
1284
- def nrows
1285
- @index.size
1286
- end
1287
-
1288
- # The number of vectors
1289
- def ncols
1290
- @vectors.size
1291
- end
1292
-
1293
- # Check if a vector is present
1294
- def has_vector?(vector)
1295
- @vectors.include? vector
1296
- end
1297
-
1298
- # Works like Array#any?.
1299
- #
1300
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1301
- # :row. A DaruLite::Vector object is yielded in the block.
1302
- # @example Using any?
1303
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1304
- # df.any?(:row) do |row|
1305
- # row[:a] < 3 and row[:b] == 'b'
1306
- # end #=> true
1307
- def any?(axis = :vector, &block)
1308
- if %i[vector column].include?(axis)
1309
- @data.any?(&block)
1310
- elsif axis == :row
1311
- each_row do |row|
1312
- return true if yield(row)
1313
- end
1314
- false
1315
- else
1316
- raise ArgumentError, "Unidentified axis #{axis}"
1317
- end
1318
- end
1319
-
1320
- # Works like Array#all?
1321
- #
1322
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1323
- # :row. A DaruLite::Vector object is yielded in the block.
1324
- # @example Using all?
1325
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1326
- # df.all?(:row) do |row|
1327
- # row[:a] < 10
1328
- # end #=> true
1329
- def all?(axis = :vector, &block)
1330
- if %i[vector column].include?(axis)
1331
- @data.all?(&block)
1332
- elsif axis == :row
1333
- each_row.all?(&block)
1334
- else
1335
- raise ArgumentError, "Unidentified axis #{axis}"
1336
- end
1337
- end
1338
-
1339
- # The first ten elements of the DataFrame
1340
- #
1341
- # @param [Fixnum] quantity (10) The number of elements to display from the top.
1342
- def head(quantity = 10)
1343
- row.at 0..(quantity - 1)
1344
- end
1345
-
1346
- alias first head
1347
-
1348
- # The last ten elements of the DataFrame
1349
- #
1350
- # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1351
- def tail(quantity = 10)
1352
- start = [-quantity, -size].max
1353
- row.at start..-1
1354
- end
1355
-
1356
- alias last tail
1357
-
1358
- # Sum all numeric/specified vectors in the DataFrame.
1359
- #
1360
- # Returns a new vector that's a containing a sum of all numeric
1361
- # or specified vectors of the DataFrame. By default, if the vector
1362
- # contains a nil, the sum is nil.
1363
- # With :skipnil argument set to true, nil values are assumed to be
1364
- # 0 (zero) and the sum vector is returned.
1365
- #
1366
- # @param args [Array] List of vectors to sum. Default is nil in which case
1367
- # all numeric vectors are summed.
1368
- #
1369
- # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1370
- #
1371
- # @return Vector with sum of all vectors specified in the argument.
1372
- # If vecs parameter is empty, sum all numeric vector.
1373
- #
1374
- # @example
1375
- # df = DaruLite::DataFrame.new({
1376
- # a: [1, 2, nil],
1377
- # b: [2, 1, 3],
1378
- # c: [1, 1, 1]
1379
- # })
1380
- # => #<DaruLite::DataFrame(3x3)>
1381
- # a b c
1382
- # 0 1 2 1
1383
- # 1 2 1 1
1384
- # 2 nil 3 1
1385
- # df.vector_sum [:a, :c]
1386
- # => #<DaruLite::Vector(3)>
1387
- # 0 2
1388
- # 1 3
1389
- # 2 nil
1390
- # df.vector_sum
1391
- # => #<DaruLite::Vector(3)>
1392
- # 0 4
1393
- # 1 4
1394
- # 2 nil
1395
- # df.vector_sum skipnil: true
1396
- # => #<DaruLite::Vector(3)>
1397
- # c
1398
- # 0 4
1399
- # 1 4
1400
- # 2 4
1401
- #
1402
- def vector_sum(*args)
1403
- defaults = { vecs: nil, skipnil: false }
1404
- options = args.last.is_a?(::Hash) ? args.pop : {}
1405
- options = defaults.merge(options)
1406
- vecs = args[0] || options[:vecs]
1407
- skipnil = args[1] || options[:skipnil]
1408
-
1409
- vecs ||= numeric_vectors
1410
- sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
1411
- vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
1412
- end
1413
-
1414
- # Calculate mean of the rows of the dataframe.
1415
- #
1416
- # == Arguments
1417
- #
1418
- # * +max_missing+ - The maximum number of elements in the row that can be
1419
- # zero for the mean calculation to happen. Default to 0.
1420
- def vector_mean(max_missing = 0)
1421
- # FIXME: in vector_sum we preserve created vector dtype, but
1422
- # here we are not. Is this by design or ...? - zverok, 2016-05-18
1423
- mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
1424
-
1425
- each_row_with_index.with_object(mean_vec) do |(row, i), memo|
1426
- memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
1427
- end
1428
- end
1429
-
1430
- # Group elements by vector to perform operations on them. Returns a
1431
- # DaruLite::Core::GroupBy object.See the DaruLite::Core::GroupBy docs for a detailed
1432
- # list of possible operations.
1433
- #
1434
- # == Arguments
1435
- #
1436
- # * vectors - An Array contatining names of vectors to group by.
1437
- #
1438
- # == Usage
1439
- #
1440
- # df = DaruLite::DataFrame.new({
1441
- # a: %w{foo bar foo bar foo bar foo foo},
1442
- # b: %w{one one two three two two one three},
1443
- # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1444
- # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1445
- # })
1446
- # df.group_by([:a,:b,:c]).groups
1447
- # #=> {["bar", "one", 2]=>[1],
1448
- # # ["bar", "three", 1]=>[3],
1449
- # # ["bar", "two", 6]=>[5],
1450
- # # ["foo", "one", 1]=>[0],
1451
- # # ["foo", "one", 3]=>[6],
1452
- # # ["foo", "three", 8]=>[7],
1453
- # # ["foo", "two", 3]=>[2, 4]}
1454
- def group_by(*vectors)
1455
- vectors.flatten!
1456
- missing = vectors - @vectors.to_a
1457
- raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
1458
-
1459
- vectors = [@vectors.first] if vectors.empty?
1460
-
1461
- DaruLite::Core::GroupBy.new(self, vectors)
1462
- end
1463
-
1464
- def reindex_vectors(new_vectors)
1465
- unless new_vectors.is_a?(DaruLite::Index)
1466
- raise ArgumentError, 'Must pass the new index of type Index or its ' \
1467
- "subclasses, not #{new_vectors.class}"
1468
- end
1469
-
1470
- cl = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
1471
- new_vectors.each_with_object(cl) do |vec, memo|
1472
- memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
1473
- end
1474
- end
1475
-
1476
- def get_vector_anyways(v)
1477
- @vectors.include?(v) ? self[v].to_a : Array.new(size)
1478
- end
1479
-
1480
- # Concatenate another DataFrame along corresponding columns.
1481
- # If columns do not exist in both dataframes, they are filled with nils
1482
- def concat(other_df)
1483
- vectors = (@vectors.to_a + other_df.vectors.to_a).uniq
1484
-
1485
- data = vectors.map do |v|
1486
- get_vector_anyways(v).dup.concat(other_df.get_vector_anyways(v))
1487
- end
1488
-
1489
- DaruLite::DataFrame.new(data, order: vectors)
1490
- end
1491
-
1492
- # Concatenates another DataFrame as #concat.
1493
- # Additionally it tries to preserve the index. If the indices contain
1494
- # common elements, #union will overwrite the according rows in the
1495
- # first dataframe.
1496
- def union(other_df)
1497
- index = (@index.to_a + other_df.index.to_a).uniq
1498
- df = row[*(@index.to_a - other_df.index.to_a)]
1499
-
1500
- df = df.concat(other_df)
1501
- df.index = DaruLite::Index.new(index)
1502
- df
1503
- end
1504
-
1505
- module SetSingleIndexStrategy
1506
- def self.uniq_size(df, col)
1507
- df[col].uniq.size
1508
- end
1509
-
1510
- def self.new_index(df, col)
1511
- DaruLite::Index.new(df[col].to_a)
1512
- end
1513
-
1514
- def self.delete_vector(df, col)
1515
- df.delete_vector(col)
1516
- end
1517
- end
1518
-
1519
- module SetCategoricalIndexStrategy
1520
- def self.new_index(df, col)
1521
- DaruLite::CategoricalIndex.new(df[col].to_a)
1522
- end
1523
-
1524
- def self.delete_vector(df, col)
1525
- df.delete_vector(col)
1526
- end
1527
- end
1528
-
1529
- module SetMultiIndexStrategy
1530
- def self.uniq_size(df, cols)
1531
- df[*cols].uniq.size
1532
- end
1533
-
1534
- def self.new_index(df, cols)
1535
- DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
1536
- mi.name = cols
1537
- end
1538
- end
1539
-
1540
- def self.delete_vector(df, cols)
1541
- df.delete_vectors(*cols)
1542
- end
1543
- end
1544
-
1545
- # Set a particular column as the new DF
1546
- def set_index(new_index_col, keep: false, categorical: false)
1547
- if categorical
1548
- strategy = SetCategoricalIndexStrategy
1549
- elsif new_index_col.respond_to?(:to_a)
1550
- strategy = SetMultiIndexStrategy
1551
- new_index_col = new_index_col.to_a
1552
- else
1553
- strategy = SetSingleIndexStrategy
1554
- end
1555
-
1556
- unless categorical
1557
- uniq_size = strategy.uniq_size(self, new_index_col)
1558
- raise ArgumentError, 'All elements in new index must be unique.' if @size != uniq_size
1559
- end
1560
-
1561
- self.index = strategy.new_index(self, new_index_col)
1562
- strategy.delete_vector(self, new_index_col) unless keep
1563
- self
1564
- end
1565
-
1566
- # Change the index of the DataFrame and preserve the labels of the previous
1567
- # indexing. New index can be DaruLite::Index or any of its subclasses.
1568
- #
1569
- # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
1570
- # @example Reindexing DataFrame
1571
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
1572
- # index: ['a','b','c','d'])
1573
- # #=>
1574
- # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1575
- # # a b
1576
- # # a 1 11
1577
- # # b 2 22
1578
- # # c 3 33
1579
- # # d 4 44
1580
- # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
1581
- # #=>
1582
- # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1583
- # # a b
1584
- # # b 2 22
1585
- # # 0 nil nil
1586
- # # a 1 11
1587
- # # g nil nil
1588
- def reindex(new_index)
1589
- unless new_index.is_a?(DaruLite::Index)
1590
- raise ArgumentError, 'Must pass the new index of type Index or its ' \
1591
- "subclasses, not #{new_index.class}"
1592
- end
1593
-
1594
- cl = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
1595
- new_index.each_with_object(cl) do |idx, memo|
1596
- memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
1597
- end
1598
- end
1599
-
1600
- def reset_index
1601
- index_df = index.to_df
1602
- names = index.name
1603
- names = [names] unless names.instance_of?(Array)
1604
- new_vectors = names + vectors.to_a
1605
- self.index = index_df.index
1606
- names.each do |name|
1607
- self[name] = index_df[name]
1608
- end
1609
- self.order = new_vectors
1610
- self
1611
- end
1612
-
1613
- # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
1614
- #
1615
- # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
1616
- # are to be indexed.
1617
- # @example Reassigining index of a DataFrame
1618
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1619
- # df.index.to_a #=> [0,1,2,3]
1620
- #
1621
- # df.index = DaruLite::Index.new(['a','b','c','d'])
1622
- # df.index.to_a #=> ['a','b','c','d']
1623
- # df.row['a'].to_a #=> [1,11]
1624
- def index=(idx)
1625
- @index = Index.coerce idx
1626
- @data.each { |vec| vec.index = @index }
1627
-
1628
- self
1629
- end
1630
-
1631
- # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
1632
- #
1633
- # @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
1634
- # be indexed. Must of the same size as ncols.
1635
- # @example Reassigning vectors of a DataFrame
1636
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
1637
- # df.vectors.to_a #=> [:a, :b, :c]
1638
- #
1639
- # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
1640
- # df.vectors.to_a #=> [:foo, :bar, :baz]
1641
- def vectors=(new_index)
1642
- raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)
1643
-
1644
- if new_index.size != ncols
1645
- raise ArgumentError, "Specified index length #{new_index.size} not equal to" \
1646
- "dataframe size #{ncols}"
1647
- end
1648
-
1649
- @vectors = new_index
1650
- @data.zip(new_index.to_a).each do |vect, name|
1651
- vect.name = name
1652
- end
1653
- self
1654
- end
1655
-
1656
- # Renames the vectors
1657
- #
1658
- # == Arguments
1659
- #
1660
- # * name_map - A hash where the keys are the exising vector names and
1661
- # the values are the new names. If a vector is renamed
1662
- # to a vector name that is already in use, the existing
1663
- # one is overwritten.
1664
- #
1665
- # == Usage
1666
- #
1667
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1668
- # df.rename_vectors :a => :alpha, :c => :gamma
1669
- # df.vectors.to_a #=> [:alpha, :b, :gamma]
1670
- def rename_vectors(name_map)
1671
- existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
1672
- delete_vectors(*existing_targets)
1673
-
1674
- new_names = vectors.to_a.map { |v| name_map[v] || v }
1675
- self.vectors = DaruLite::Index.new new_names
1676
- end
1677
-
1678
- # Renames the vectors and returns itself
1679
- #
1680
- # == Arguments
1681
- #
1682
- # * name_map - A hash where the keys are the exising vector names and
1683
- # the values are the new names. If a vector is renamed
1684
- # to a vector name that is already in use, the existing
1685
- # one is overwritten.
1686
- #
1687
- # == Usage
1688
- #
1689
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1690
- # df.rename_vectors! :a => :alpha, :c => :gamma # df
1691
- def rename_vectors!(name_map)
1692
- rename_vectors(name_map)
1693
- self
1694
- end
1695
-
1696
- # Converts the vectors to a DaruLite::MultiIndex.
1697
- # The argument passed is used as the MultiIndex's top level
1698
- def add_level_to_vectors(top_level_label)
1699
- tuples = vectors.map { |label| [top_level_label, *label] }
1700
- self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
1701
- end
1702
-
1703
- # Return the indexes of all the numeric vectors. Will include vectors with nils
1704
- # alongwith numbers.
1705
- def numeric_vectors
1706
- # FIXME: Why _with_index ?..
1707
- each_vector_with_index
1708
- .select { |vec, _i| vec.numeric? }
1709
- .map(&:last)
1710
- end
1711
-
1712
- def numeric_vector_names
1713
- @vectors.select { |v| self[v].numeric? }
1714
- end
1715
-
1716
- # Return a DataFrame of only the numerical Vectors. If clone: false
1717
- # is specified as option, only a *view* of the Vectors will be
1718
- # returned. Defaults to clone: true.
1719
- def only_numerics(opts = {})
1720
- cln = opts[:clone] != false
1721
- arry = numeric_vectors.map { |v| self[v] }
1722
-
1723
- order = Index.new(numeric_vectors)
1724
- DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
1725
- end
1726
-
1727
- # Generate a summary of this DataFrame based on individual vectors in the DataFrame
1728
- # @return [String] String containing the summary of the DataFrame
1729
- def summary
1730
- summary = "= #{name}"
1731
- summary << "\n Number of rows: #{nrows}"
1732
- @vectors.each do |v|
1733
- summary << "\n Element:[#{v}]\n"
1734
- summary << self[v].summary(1)
1735
- end
1736
- summary
1737
- end
1738
-
1739
- # Sorts a dataframe (ascending/descending) in the given pripority sequence of
1740
- # vectors, with or without a block.
1741
- #
1742
- # @param vector_order [Array] The order of vector names in which the DataFrame
1743
- # should be sorted.
1744
- # @param opts [Hash] opts The options to sort with.
1745
- # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1746
- # or descending order. Specify Array corresponding to *order* for multiple
1747
- # sort orders.
1748
- # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1749
- # to be used for sorting, for each vector name in *order* as a hash of
1750
- # vector name and lambda expressions. In case a lambda for a vector is not
1751
- # specified, the default will be used.
1752
- # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1753
- # automatically or not when a block is provided.
1754
- # If set to True, nils will appear at top after sorting.
1755
- #
1756
- # @example Sort a dataframe with a vector sequence.
1757
- #
1758
- #
1759
- # df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1760
- #
1761
- # df.sort [:a, :b]
1762
- # # =>
1763
- # # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1764
- # # a b
1765
- # # 2 1 3
1766
- # # 0 1 5
1767
- # # 3 2 2
1768
- # # 1 2 4
1769
- # # 4 3 1
1770
- #
1771
- # @example Sort a dataframe without a block. Here nils will be handled automatically.
1772
- #
1773
- # df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1774
- #
1775
- # df.sort([:a])
1776
- # # =>
1777
- # # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1778
- # # a b
1779
- # # 1 nil 3
1780
- # # 3 nil 1
1781
- # # 0 -3 4
1782
- # # 2 -1 2
1783
- # # 4 5 4
1784
- #
1785
- # @example Sort a dataframe with a block with nils handled automatically.
1786
- #
1787
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1788
- #
1789
- # df.sort [:b], by: {b: lambda { |a| a.length } }
1790
- # # NoMethodError: undefined method `length' for nil:NilClass
1791
- # # from (pry):8:in `block in __pry__'
1792
- #
1793
- # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1794
- #
1795
- # # =>
1796
- # # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1797
- # # a b
1798
- # # 2 1 nil
1799
- # # 5 1 nil
1800
- # # 4 -1 x
1801
- # # 1 -1 aa
1802
- # # 0 nil aaa
1803
- # # 3 nil baaa
1804
- #
1805
- # @example Sort a dataframe with a block with nils handled manually.
1806
- #
1807
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1808
- #
1809
- # # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
1810
- # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1811
- #
1812
- # # =>
1813
- # #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1814
- # # a b
1815
- # # 4 -1 x
1816
- # # 1 -1 aa
1817
- # # 0 nil aaa
1818
- # # 3 nil baaa
1819
- # # 2 1 nil
1820
- # # 5 1 nil
1821
-
1822
- def sort!(vector_order, opts = {})
1823
- raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?
1824
-
1825
- # To enable sorting with categorical data,
1826
- # map categories to integers preserving their order
1827
- old = convert_categorical_vectors vector_order
1828
- block = sort_prepare_block vector_order, opts
1829
-
1830
- order = @index.size.times.sort(&block)
1831
- new_index = @index.reorder order
1832
-
1833
- # To reverse map mapping of categorical data to integers
1834
- restore_categorical_vectors old
1835
-
1836
- @data.each do |vector|
1837
- vector.reorder! order
1838
- end
1839
-
1840
- self.index = new_index
1841
-
1842
- self
1843
- end
1844
-
1845
- # Non-destructive version of #sort!
1846
- def sort(vector_order, opts = {})
1847
- dup.sort! vector_order, opts
1848
- end
1849
-
1850
- # Pivots a data frame on specified vectors and applies an aggregate function
1851
- # to quickly generate a summary.
1852
- #
1853
- # == Options
1854
- #
1855
- # +:index+ - Keys to group by on the pivot table row index. Pass vector names
1856
- # contained in an Array.
1857
- #
1858
- # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
1859
- # names contained in an Array.
1860
- #
1861
- # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
1862
- # use any of the statistics functions applicable on Vectors that can be found in
1863
- # the DaruLite::Statistics::Vector module.
1864
- #
1865
- # +:values+ - Columns to aggregate. Will consider all numeric columns not
1866
- # specified in *:index* or *:vectors*. Optional.
1867
- #
1868
- # == Usage
1869
- #
1870
- # df = DaruLite::DataFrame.new({
1871
- # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
1872
- # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
1873
- # c: ['small','large','large','small','small','large','small','large','small'],
1874
- # d: [1,2,2,3,3,4,5,6,7],
1875
- # e: [2,4,4,6,6,8,10,12,14]
1876
- # })
1877
- # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
1878
- #
1879
- # #=>
1880
- # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
1881
- # # [:e, :one] [:e, :two]
1882
- # # [:bar] 18 26
1883
- # # [:foo] 10 12
1884
- def pivot_table(opts = {})
1885
- raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?
1886
-
1887
- index = opts[:index]
1888
- vectors = opts[:vectors] || []
1889
- aggregate_function = opts[:agg] || :mean
1890
- values = prepare_pivot_values index, vectors, opts
1891
- raise IndexError, 'No numeric vectors to aggregate' if values.empty?
1892
-
1893
- grouped = group_by(index)
1894
- return grouped.send(aggregate_function) if vectors.empty?
1895
-
1896
- super_hash = make_pivot_hash grouped, vectors, values, aggregate_function
1897
-
1898
- pivot_dataframe super_hash
1899
- end
1900
-
1901
- # Merge vectors from two DataFrames. In case of name collision,
1902
- # the vectors names are changed to x_1, x_2 ....
1903
- #
1904
- # @return {DaruLite::DataFrame}
1905
- def merge(other_df)
1906
- unless nrows == other_df.nrows
1907
- raise ArgumentError,
1908
- "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
1909
- end
1910
-
1911
- new_fields = (@vectors.to_a + other_df.vectors.to_a)
1912
- new_fields = ArrayHelper.recode_repeated(new_fields)
1913
- DataFrame.new({}, order: new_fields).tap do |df_new|
1914
- (0...nrows).each do |i|
1915
- df_new.add_row row[i].to_a + other_df.row[i].to_a
1916
- end
1917
- df_new.index = @index if @index == other_df.index
1918
- df_new.update
1919
- end
1920
- end
1921
-
1922
- # Join 2 DataFrames with SQL style joins. Currently supports inner, left
1923
- # outer, right outer and full outer joins.
1924
- #
1925
- # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
1926
- # to be performed.
1927
- # @param [Hash] opts Options Hash
1928
- # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
1929
- # @option :on [Array] The columns on which the join is to be performed.
1930
- # Column names specified here must be common to both DataFrames.
1931
- # @option :indicator [Symbol] The name of a vector to add to the resultant
1932
- # dataframe that indicates whether the record was in the left (:left_only),
1933
- # right (:right_only), or both (:both) joining dataframes.
1934
- # @return [DaruLite::DataFrame]
1935
- # @example Inner Join
1936
- # left = DaruLite::DataFrame.new({
1937
- # :id => [1,2,3,4],
1938
- # :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
1939
- # })
1940
- # right = DaruLite::DataFrame.new({
1941
- # :id => [1,2,3,4],
1942
- # :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
1943
- # })
1944
- # left.join(right, how: :inner, on: [:name])
1945
- # #=>
1946
- # ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
1947
- # # id_1 name id_2
1948
- # # 0 1 Pirate 2
1949
- # # 1 3 Ninja 4
1950
- def join(other_df, opts = {})
1951
- DaruLite::Core::Merge.join(self, other_df, opts)
1952
- end
1953
-
1954
- # Creates a new dataset for one to many relations
1955
- # on a dataset, based on pattern of field names.
1956
- #
1957
- # for example, you have a survey for number of children
1958
- # with this structure:
1959
- # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1960
- # with
1961
- # ds.one_to_many([:id], "child_%v_%n"
1962
- # the field of first parameters will be copied verbatim
1963
- # to new dataset, and fields which responds to second
1964
- # pattern will be added one case for each different %n.
1965
- #
1966
- # @example
1967
- # cases=[
1968
- # ['1','george','red',10,'blue',20,nil,nil],
1969
- # ['2','fred','green',15,'orange',30,'white',20],
1970
- # ['3','alfred',nil,nil,nil,nil,nil,nil]
1971
- # ]
1972
- # ds=DaruLite::DataFrame.rows(cases, order:
1973
- # [:id, :name,
1974
- # :car_color1, :car_value1,
1975
- # :car_color2, :car_value2,
1976
- # :car_color3, :car_value3])
1977
- # ds.one_to_many([:id],'car_%v%n').to_matrix
1978
- # #=> Matrix[
1979
- # # ["red", "1", 10],
1980
- # # ["blue", "1", 20],
1981
- # # ["green", "2", 15],
1982
- # # ["orange", "2", 30],
1983
- # # ["white", "2", 20]
1984
- # # ]
1985
- def one_to_many(parent_fields, pattern)
1986
- vars, numbers = one_to_many_components(pattern)
1987
-
1988
- DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
1989
- each_row do |row|
1990
- verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
1991
- numbers.each do |n|
1992
- generated = one_to_many_row row, n, vars, pattern
1993
- next if generated.values.all?(&:nil?)
1994
-
1995
- ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
1996
- end
1997
- end
1998
- ds.update
1999
- end
2000
- end
2001
-
2002
- def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
2003
- self[nm]
2004
- .split_by_separator(sep)
2005
- .each_with_index do |(k, v), i|
2006
- v.rename "#{nm}:#{k}"
2007
- self[:"#{nm}#{join}#{i + 1}"] = v
2008
- end
2009
- end
2010
-
2011
- # Create a sql, basen on a given Dataset
2012
- #
2013
- # == Arguments
2014
- #
2015
- # * table - String specifying name of the table that will created in SQL.
2016
- # * charset - Character set. Default is "UTF8".
2017
- #
2018
- # @example
2019
- #
2020
- # ds = DaruLite::DataFrame.new({
2021
- # :id => DaruLite::Vector.new([1,2,3,4,5]),
2022
- # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
2023
- # })
2024
- # ds.create_sql('names')
2025
- # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
2026
- #
2027
- def create_sql(table, charset = 'UTF8')
2028
- sql = "CREATE TABLE #{table} ("
2029
- fields = vectors.to_a.collect do |f|
2030
- v = self[f]
2031
- "#{f} #{v.db_type}"
2032
- end
2033
-
2034
- sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
2035
- end
2036
-
2037
- # Returns the dataframe. This can be convenient when the user does not
2038
- # know whether the object is a vector or a dataframe.
2039
- # @return [self] the dataframe
2040
- def to_df
2041
- self
2042
- end
2043
-
2044
- # Convert all vectors of type *:numeric* into a Matrix.
2045
- def to_matrix
2046
- Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
2047
- end
2048
-
2049
- # Converts the DataFrame into an array of hashes where key is vector name
2050
- # and value is the corresponding element. The 0th index of the array contains
2051
- # the array of hashes while the 1th index contains the indexes of each row
2052
- # of the dataframe. Each element in the index array corresponds to its row
2053
- # in the array of hashes, which has the same index.
2054
- def to_a
2055
- [each_row.map(&:to_h), @index.to_a]
2056
- end
2057
-
2058
- # Convert to json. If no_index is false then the index will NOT be included
2059
- # in the JSON thus created.
2060
- def to_json(no_index = true)
2061
- if no_index
2062
- to_a[0].to_json
2063
- else
2064
- to_a.to_json
2065
- end
2066
- end
2067
-
2068
- # Converts DataFrame to a hash (explicit) with keys as vector names and values as
2069
- # the corresponding vectors.
2070
- def to_h
2071
- @vectors
2072
- .each_with_index
2073
- .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
2074
- end
316
+ def nest(*tree_keys, &block)
317
+ tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
2075
318
 
2076
- # Convert to html for IRuby.
2077
- def to_html(threshold = DaruLite.max_rows)
2078
- table_thead = to_html_thead
2079
- table_tbody = to_html_tbody(threshold)
2080
- path = if index.is_a?(MultiIndex)
2081
- File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
2082
- else
2083
- File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
2084
- end
2085
- ERB.new(File.read(path).strip).result(binding)
2086
- end
319
+ each_row.with_object({}) do |row, current|
320
+ # Create tree
321
+ *keys, last = tree_keys
322
+ current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
323
+ name = row[last]
2087
324
 
2088
- def to_html_thead
2089
- table_thead_path =
2090
- if index.is_a?(MultiIndex)
2091
- File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
325
+ if block
326
+ current[name] = yield(row, current, name)
2092
327
  else
2093
- File.expand_path('iruby/templates/dataframe_thead.html.erb', __dir__)
328
+ current[name] ||= []
329
+ current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
2094
330
  end
2095
- ERB.new(File.read(table_thead_path).strip).result(binding)
331
+ end
2096
332
  end
2097
333
 
2098
- def to_html_tbody(threshold = DaruLite.max_rows)
2099
- threshold ||= @size
2100
- table_tbody_path =
2101
- if index.is_a?(MultiIndex)
2102
- File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
2103
- else
2104
- File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
2105
- end
2106
- ERB.new(File.read(table_tbody_path).strip).result(binding)
334
+ def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
335
+ self[name]
336
+ .split_by_separator(sep)
337
+ .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
2107
338
  end
2108
339
 
2109
- def to_s
2110
- "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
340
+ # Return the number of rows and columns of the DataFrame in an Array.
341
+ def shape
342
+ [nrows, ncols]
2111
343
  end
2112
344
 
2113
- # Method for updating the metadata (i.e. missing value positions) of the
2114
- # after assingment/deletion etc. are complete. This is provided so that
2115
- # time is not wasted in creating the metadata for the vector each time
2116
- # assignment/deletion of elements is done. Updating data this way is called
2117
- # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
2118
- def update
2119
- @data.each(&:update) if DaruLite.lazy_update
345
+ # The number of rows
346
+ def nrows
347
+ @index.size
2120
348
  end
2121
349
 
2122
- # Rename the DataFrame.
2123
- def rename(new_name)
2124
- @name = new_name
2125
- self
350
+ # The number of vectors
351
+ def ncols
352
+ @vectors.size
2126
353
  end
2127
354
 
2128
- alias name= rename
2129
-
2130
- # Write this DataFrame to a CSV file.
355
+ # Renames the vectors
2131
356
  #
2132
357
  # == Arguments
2133
358
  #
2134
- # * filename - Path of CSV file where the DataFrame is to be saved.
359
+ # * name_map - A hash where the keys are the exising vector names and
360
+ # the values are the new names. If a vector is renamed
361
+ # to a vector name that is already in use, the existing
362
+ # one is overwritten.
2135
363
  #
2136
- # == Options
364
+ # == Usage
2137
365
  #
2138
- # * convert_comma - If set to *true*, will convert any commas in any
2139
- # of the data to full stops ('.').
2140
- # All the options accepted by CSV.read() can also be passed into this
2141
- # function.
2142
- def write_csv(filename, opts = {})
2143
- DaruLite::IO.dataframe_write_csv self, filename, opts
2144
- end
366
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
367
+ # df.rename_vectors :a => :alpha, :c => :gamma
368
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
369
+ def rename_vectors(name_map)
370
+ existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
371
+ delete_vectors(*existing_targets)
2145
372
 
2146
- # Write this dataframe to an Excel Spreadsheet
2147
- #
2148
- # == Arguments
2149
- #
2150
- # * filename - The path of the file where the DataFrame should be written.
2151
- def write_excel(filename, opts = {})
2152
- DaruLite::IO.dataframe_write_excel self, filename, opts
373
+ new_names = vectors.to_a.map { |v| name_map[v] || v }
374
+ self.vectors = DaruLite::Index.new new_names
2153
375
  end
2154
376
 
2155
- # Insert each case of the Dataset on the selected table
377
+ # Renames the vectors and returns itself
2156
378
  #
2157
379
  # == Arguments
2158
380
  #
2159
- # * dbh - DBI database connection object.
2160
- # * query - Query string.
381
+ # * name_map - A hash where the keys are the exising vector names and
382
+ # the values are the new names. If a vector is renamed
383
+ # to a vector name that is already in use, the existing
384
+ # one is overwritten.
2161
385
  #
2162
386
  # == Usage
2163
387
  #
2164
- # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
2165
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
2166
- # ds.write_sql(dbh,"test")
2167
- def write_sql(dbh, table)
2168
- DaruLite::IO.dataframe_write_sql self, dbh, table
388
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
389
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
390
+ def rename_vectors!(name_map)
391
+ rename_vectors(name_map)
392
+ self
2169
393
  end
2170
394
 
2171
- # Use marshalling to save dataframe to a file.
2172
- def save(filename)
2173
- DaruLite::IO.save self, filename
395
+ # Converts the vectors to a DaruLite::MultiIndex.
396
+ # The argument passed is used as the MultiIndex's top level
397
+ def add_level_to_vectors(top_level_label)
398
+ tuples = vectors.map { |label| [top_level_label, *label] }
399
+ self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
2174
400
  end
2175
401
 
2176
- def _dump(_depth)
2177
- Marshal.dump(
2178
- data: @data,
2179
- index: @index.to_a,
2180
- order: @vectors.to_a,
2181
- name: @name
2182
- )
402
+ def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
403
+ self[nm]
404
+ .split_by_separator(sep)
405
+ .each_with_index do |(k, v), i|
406
+ v.rename "#{nm}:#{k}"
407
+ self[:"#{nm}#{join}#{i + 1}"] = v
408
+ end
409
+ end
410
+
411
+ # Method for updating the metadata (i.e. missing value positions) of the
412
+ # after assingment/deletion etc. are complete. This is provided so that
413
+ # time is not wasted in creating the metadata for the vector each time
414
+ # assignment/deletion of elements is done. Updating data this way is called
415
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
416
+ def update
417
+ @data.each(&:update) if DaruLite.lazy_update
2183
418
  end
2184
419
 
2185
- def self._load(data)
2186
- h = Marshal.load data
2187
- DaruLite::DataFrame.new(h[:data],
2188
- index: h[:index],
2189
- order: h[:order],
2190
- name: h[:name])
420
+ # Rename the DataFrame.
421
+ def rename(new_name)
422
+ @name = new_name
423
+ self
2191
424
  end
425
+ alias name= rename
2192
426
 
2193
427
  # Transpose a DataFrame, tranposing elements and row, column indexing.
2194
428
  def transpose
@@ -2204,7 +438,10 @@ module DaruLite
2204
438
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
2205
439
  def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
2206
440
  name_part = @name ? ": #{@name} " : ''
2207
- spacing = [headers.to_a.map(&:length).max, spacing].max
441
+ spacing = [
442
+ headers.to_a.map { |header| header.try(:length) || header.to_s.length }.max,
443
+ spacing
444
+ ].max
2208
445
 
2209
446
  "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}" +
2210
447
  Formatters::Table.format(
@@ -2216,11 +453,6 @@ module DaruLite
2216
453
  )
2217
454
  end
2218
455
 
2219
- # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
2220
- def where(bool_array)
2221
- DaruLite::Core::Query.df_where self, bool_array
2222
- end
2223
-
2224
456
  def ==(other)
2225
457
  self.class == other.class &&
2226
458
  @size == other.size &&
@@ -2274,144 +506,6 @@ module DaruLite
2274
506
  order: all_vectors.map(&:name)
2275
507
  end
2276
508
 
2277
- # Split the dataframe into many dataframes based on category vector
2278
- # @param [object] cat_name name of category vector to split the dataframe
2279
- # @return [Array] array of dataframes split by category with category vector
2280
- # used to split not included
2281
- # @example
2282
- # df = DaruLite::DataFrame.new({
2283
- # a: [1, 2, 3],
2284
- # b: ['a', 'a', 'b']
2285
- # })
2286
- # df.to_category :b
2287
- # df.split_by_category :b
2288
- # # => [#<DaruLite::DataFrame: a (2x1)>
2289
- # # a
2290
- # # 0 1
2291
- # # 1 2,
2292
- # # #<DaruLite::DataFrame: b (1x1)>
2293
- # # a
2294
- # # 2 3]
2295
- def split_by_category(cat_name)
2296
- cat_dv = self[cat_name]
2297
- raise ArgumentError, "#{cat_name} is not a category vector" unless
2298
- cat_dv.category?
2299
-
2300
- cat_dv.categories.map do |cat|
2301
- where(cat_dv.eq cat)
2302
- .rename(cat)
2303
- .delete_vector cat_name
2304
- end
2305
- end
2306
-
2307
- # @param indexes [Array] index(s) at which row tuples are retrieved
2308
- # @return [Array] returns array of row tuples at given index(s)
2309
- # @example Using DaruLite::Index
2310
- # df = DaruLite::DataFrame.new({
2311
- # a: [1, 2, 3],
2312
- # b: ['a', 'a', 'b']
2313
- # })
2314
- #
2315
- # df.access_row_tuples_by_indexs(1,2)
2316
- # # => [[2, "a"], [3, "b"]]
2317
- #
2318
- # df.index = DaruLite::Index.new([:one,:two,:three])
2319
- # df.access_row_tuples_by_indexs(:one,:three)
2320
- # # => [[1, "a"], [3, "b"]]
2321
- #
2322
- # @example Using DaruLite::MultiIndex
2323
- # mi_idx = DaruLite::MultiIndex.from_tuples [
2324
- # [:a,:one,:bar],
2325
- # [:a,:one,:baz],
2326
- # [:b,:two,:bar],
2327
- # [:a,:two,:baz],
2328
- # ]
2329
- # df_mi = DaruLite::DataFrame.new({
2330
- # a: 1..4,
2331
- # b: 'a'..'d'
2332
- # }, index: mi_idx )
2333
- #
2334
- # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2335
- # # => [[3, "c"]]
2336
- # df_mi.access_row_tuples_by_indexs(:a)
2337
- # # => [[1, "a"], [2, "b"], [4, "d"]]
2338
- def access_row_tuples_by_indexs(*indexes)
2339
- return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
2340
- @index.is_a?(DaruLite::MultiIndex)
2341
-
2342
- positions = @index.pos(*indexes)
2343
- if positions.is_a? Numeric
2344
- row = get_rows_for([positions])
2345
- row.first.is_a?(Array) ? row : [row]
2346
- else
2347
- new_rows = get_rows_for(indexes, by_position: false)
2348
- indexes.map { |index| new_rows.map { |r| r[index] } }
2349
- end
2350
- end
2351
-
2352
- # Function to use for aggregating the data.
2353
- #
2354
- # @param options [Hash] options for column, you want in resultant dataframe
2355
- #
2356
- # @return [DaruLite::DataFrame]
2357
- #
2358
- # @example
2359
- # df = DaruLite::DataFrame.new(
2360
- # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2361
- # => #<DaruLite::DataFrame(5x2)>
2362
- # col num
2363
- # 0 a 52
2364
- # 1 b 12
2365
- # 2 c 7
2366
- # 3 d 17
2367
- # 4 e 1
2368
- #
2369
- # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2370
- # => #<DaruLite::DataFrame(5x1)>
2371
- # num_100_ti
2372
- # 0 5200
2373
- # 1 1200
2374
- # 2 700
2375
- # 3 1700
2376
- # 4 100
2377
- #
2378
- # When we have duplicate index :
2379
- #
2380
- # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
2381
- # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2382
- # => #<DaruLite::DataFrame(5x1)>
2383
- # num
2384
- # a 52
2385
- # b 12
2386
- # a 7
2387
- # a 17
2388
- # c 1
2389
- #
2390
- # df.aggregate(num: :mean)
2391
- # => #<DaruLite::DataFrame(3x1)>
2392
- # num
2393
- # a 25.3333333
2394
- # b 12
2395
- # c 1
2396
- #
2397
- # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2398
- # internally.
2399
- def aggregate(options = {}, multi_index_level = -1)
2400
- if block_given?
2401
- positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
2402
- else
2403
- positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
2404
- end
2405
-
2406
- colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
2407
-
2408
- DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
2409
- end
2410
-
2411
- def group_by_and_aggregate(*group_by_keys, **aggregation_map)
2412
- group_by(*group_by_keys).aggregate(aggregation_map)
2413
- end
2414
-
2415
509
  private
2416
510
 
2417
511
  def headers
@@ -2422,20 +516,6 @@ module DaruLite
2422
516
  index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
2423
517
  end
2424
518
 
2425
- def convert_categorical_vectors(names)
2426
- names.filter_map do |n|
2427
- next unless self[n].category?
2428
-
2429
- old = [n, self[n]]
2430
- self[n] = DaruLite::Vector.new(self[n].to_ints)
2431
- old
2432
- end
2433
- end
2434
-
2435
- def restore_categorical_vectors(old)
2436
- old.each { |name, vector| self[name] = vector }
2437
- end
2438
-
2439
519
  def recursive_product(dfs)
2440
520
  return dfs.first if dfs.size == 1
2441
521
 
@@ -2447,12 +527,6 @@ module DaruLite
2447
527
  end
2448
528
  end
2449
529
 
2450
- def should_be_vector!(val)
2451
- return val if val.is_a?(DaruLite::Vector)
2452
-
2453
- raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
2454
- end
2455
-
2456
530
  def dispatch_to_axis(axis, method, *args, &block)
2457
531
  if %i[vector column].include?(axis)
2458
532
  send(:"#{method}_vector", *args, &block)
@@ -2483,76 +557,6 @@ module DaruLite
2483
557
  end
2484
558
  end
2485
559
 
2486
- def access_vector(*names)
2487
- if names.first.is_a?(Range)
2488
- dup(@vectors.subset(names.first))
2489
- elsif @vectors.is_a?(MultiIndex)
2490
- access_vector_multi_index(*names)
2491
- else
2492
- access_vector_single_index(*names)
2493
- end
2494
- end
2495
-
2496
- def access_vector_multi_index(*names)
2497
- pos = @vectors[names]
2498
-
2499
- return @data[pos] if pos.is_a?(Integer)
2500
-
2501
- new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
2502
-
2503
- pos = pos.drop_left_level(names.size) if names.size < @vectors.width
2504
-
2505
- DaruLite::DataFrame.new(new_vectors, index: @index, order: pos)
2506
- end
2507
-
2508
- def access_vector_single_index(*names)
2509
- if names.count < 2
2510
- begin
2511
- pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
2512
- rescue IndexError
2513
- raise IndexError, "Specified vector #{names.first} does not exist"
2514
- end
2515
- return @data[pos] if pos.is_a?(Numeric)
2516
-
2517
- names = pos
2518
- end
2519
-
2520
- new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
2521
-
2522
- order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
2523
- DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
2524
- end
2525
-
2526
- def access_row(*indexes)
2527
- positions = @index.pos(*indexes)
2528
-
2529
- if positions.is_a? Numeric
2530
- row = get_rows_for([positions])
2531
- DaruLite::Vector.new row, index: @vectors, name: indexes.first
2532
- else
2533
- new_rows = get_rows_for(indexes, by_position: false)
2534
- DaruLite::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
2535
- end
2536
- end
2537
-
2538
- # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position if false)
2539
- # because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
2540
- # values (representing a row) or an array of Vectors (that can be seen as rows)
2541
- def get_rows_for(keys, by_position: true)
2542
- raise unless keys.is_a?(Array)
2543
-
2544
- if by_position
2545
- pos = keys
2546
- @data.map { |vector| vector.at(*pos) }
2547
- else
2548
- # TODO: for now (2018-07-27), it is different than using
2549
- # get_rows_for(@index.pos(*keys))
2550
- # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
2551
- indexes = keys
2552
- @data.map { |vec| vec[*indexes] }
2553
- end
2554
- end
2555
-
2556
560
  def insert_or_modify_vector(name, vector)
2557
561
  name = name[0] unless @vectors.is_a?(MultiIndex)
2558
562
 
@@ -2835,146 +839,6 @@ module DaruLite
2835
839
  end
2836
840
  end
2837
841
 
2838
- def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
2839
- # Create an array to be used for comparison of two rows in sorting
2840
- vector_locs
2841
- .zip(by_blocks, ascending, handle_nils)
2842
- .map do |vector_loc, by, asc, handle_nil|
2843
- value = @data[vector_loc].data[asc ? r1 : r2]
2844
-
2845
- if by
2846
- value = begin
2847
- by.call(value)
2848
- rescue StandardError
2849
- nil
2850
- end
2851
- end
2852
-
2853
- sort_handle_nils value, asc, handle_nil || !by
2854
- end
2855
- end
2856
-
2857
- def sort_handle_nils(value, asc, handle_nil)
2858
- if !handle_nil
2859
- value
2860
- elsif asc
2861
- [value.nil? ? 0 : 1, value]
2862
- else
2863
- [value.nil? ? 1 : 0, value]
2864
- end
2865
- end
2866
-
2867
- def sort_coerce_boolean(opts, symbol, default, size)
2868
- val = opts[symbol]
2869
- case val
2870
- when true, false
2871
- Array.new(size, val)
2872
- when nil
2873
- Array.new(size, default)
2874
- when Array
2875
- raise ArgumentError, "Specify same number of vector names and #{symbol}" if
2876
- size != val.size
2877
-
2878
- val
2879
- else
2880
- raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
2881
- end
2882
- end
2883
-
2884
- def sort_prepare_block(vector_order, opts)
2885
- ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
2886
- handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
2887
-
2888
- by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
2889
- vector_locs = vector_order.map { |v| @vectors[v] }
2890
-
2891
- lambda do |index1, index2|
2892
- # Build left and right array to compare two rows
2893
- left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
2894
- right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
2895
-
2896
- # Resolve conflict by Index if all attributes are same
2897
- left << index1
2898
- right << index2
2899
- left <=> right
2900
- end
2901
- end
2902
-
2903
- def verify_error_message(row, test, id, i)
2904
- description, fields, = test
2905
- values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
2906
- "#{i + 1} [#{row[id]}]: #{description}#{values}"
2907
- end
2908
-
2909
- def prepare_pivot_values(index, vectors, opts)
2910
- case opts[:values]
2911
- when nil # values not specified at all.
2912
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
2913
- when Array # multiple values specified.
2914
- opts[:values]
2915
- else # single value specified.
2916
- [opts[:values]]
2917
- end
2918
- end
2919
-
2920
- def make_pivot_hash(grouped, vectors, values, aggregate_function)
2921
- grouped.groups.transform_values { |_| {} }.tap do |super_hash|
2922
- values.each do |value|
2923
- grouped.groups.each do |group_name, row_numbers|
2924
- row_numbers.each do |num|
2925
- arry = [value, *vectors.map { |v| self[v][num] }]
2926
- sub_hash = super_hash[group_name]
2927
- sub_hash[arry] ||= []
2928
-
2929
- sub_hash[arry] << self[value][num]
2930
- end
2931
- end
2932
- end
2933
-
2934
- setup_pivot_aggregates super_hash, aggregate_function
2935
- end
2936
- end
2937
-
2938
- def setup_pivot_aggregates(super_hash, aggregate_function)
2939
- super_hash.each_value do |sub_hash|
2940
- sub_hash.each do |group_name, aggregates|
2941
- sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
2942
- end
2943
- end
2944
- end
2945
-
2946
- def pivot_dataframe(super_hash)
2947
- df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
2948
- df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
2949
-
2950
- DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
2951
- super_hash.each do |row_index, sub_h|
2952
- sub_h.each do |vector_index, val|
2953
- pivoted_dataframe[vector_index][row_index] = val
2954
- end
2955
- end
2956
- end
2957
- end
2958
-
2959
- def one_to_many_components(pattern)
2960
- re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
2961
-
2962
- vars, numbers =
2963
- @vectors
2964
- .map { |v| v.scan(re) }
2965
- .reject(&:empty?).flatten(1).transpose
2966
-
2967
- [vars.uniq, numbers.map(&:to_i).sort.uniq]
2968
- end
2969
-
2970
- def one_to_many_row(row, number, vars, pattern)
2971
- vars
2972
- .to_h do |v|
2973
- name = pattern.sub('%v', v).sub('%n', number.to_s)
2974
- [v, row[name]]
2975
- end
2976
- end
2977
-
2978
842
  # Raises IndexError when one of the positions is not a valid position
2979
843
  def validate_positions(*positions, size)
2980
844
  positions.each do |pos|
@@ -2999,82 +863,5 @@ module DaruLite
2999
863
  DaruLite::Vector.new(source[idx], index: @index, name: vectors[idx])
3000
864
  end
3001
865
  end
3002
-
3003
- def aggregate_by_positions_tuples(options, positions_tuples)
3004
- agg_over_vectors_only, options = cast_aggregation_options(options)
3005
-
3006
- if agg_over_vectors_only
3007
- options.map do |vect_name, method|
3008
- vect = self[vect_name]
3009
-
3010
- positions_tuples.map do |positions|
3011
- vect.apply_method_on_sub_vector(method, keys: positions)
3012
- end
3013
- end
3014
- else
3015
- methods = options.values
3016
-
3017
- # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
3018
- rows = positions_tuples.map do |positions|
3019
- apply_method_on_sub_df(methods, keys: positions)
3020
- end
3021
-
3022
- rows.transpose
3023
- end
3024
- end
3025
-
3026
- # convert operations over sub-vectors to operations over sub-dfs when it improves perf
3027
- # note: we don't always "cast" because aggregation over a single vector / a few vector is faster
3028
- # than aggregation over (sub-)dfs
3029
- def cast_aggregation_options(options)
3030
- vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
3031
-
3032
- over_vectors = true
3033
-
3034
- if non_vects.any?
3035
- options = options.clone
3036
-
3037
- vects.each do |name|
3038
- proc_on_vect = options[name].to_proc
3039
- options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
3040
- end
3041
-
3042
- over_vectors = false
3043
- end
3044
-
3045
- [over_vectors, options]
3046
- end
3047
-
3048
- def group_index_for_aggregation(index, multi_index_level = -1)
3049
- case index
3050
- when DaruLite::MultiIndex
3051
- groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
3052
-
3053
- new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
3054
- pos_tuples = groups_by_pos.values
3055
- when DaruLite::Index, DaruLite::CategoricalIndex
3056
- new_index = Array(index).uniq
3057
- pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
3058
- else raise
3059
- end
3060
-
3061
- [pos_tuples, new_index]
3062
- end
3063
-
3064
- # coerce ranges, integers and array in appropriate ways
3065
- def coerce_positions(*positions, size)
3066
- if positions.size == 1
3067
- case positions.first
3068
- when Integer
3069
- positions.first
3070
- when Range
3071
- size.times.to_a[positions.first]
3072
- else
3073
- raise ArgumentError, 'Unknown position type.'
3074
- end
3075
- else
3076
- positions
3077
- end
3078
- end
3079
866
  end
3080
867
  end