daru_lite 0.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
  3. data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. data/.github/workflows/ci.yml +20 -0
  5. data/.rubocop_todo.yml +35 -33
  6. data/README.md +19 -115
  7. data/daru_lite.gemspec +1 -0
  8. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  9. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  10. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  11. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  12. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  13. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  14. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  15. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  16. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  17. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  18. data/lib/daru_lite/data_frame/missable.rb +75 -0
  19. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  20. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  21. data/lib/daru_lite/data_frame/setable.rb +109 -0
  22. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  23. data/lib/daru_lite/dataframe.rb +142 -2355
  24. data/lib/daru_lite/index/index.rb +13 -0
  25. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  26. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  27. data/lib/daru_lite/vector/calculatable.rb +78 -0
  28. data/lib/daru_lite/vector/convertible.rb +77 -0
  29. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  30. data/lib/daru_lite/vector/fetchable.rb +175 -0
  31. data/lib/daru_lite/vector/filterable.rb +128 -0
  32. data/lib/daru_lite/vector/indexable.rb +77 -0
  33. data/lib/daru_lite/vector/iterable.rb +95 -0
  34. data/lib/daru_lite/vector/joinable.rb +17 -0
  35. data/lib/daru_lite/vector/missable.rb +124 -0
  36. data/lib/daru_lite/vector/queryable.rb +45 -0
  37. data/lib/daru_lite/vector/setable.rb +47 -0
  38. data/lib/daru_lite/vector/sortable.rb +113 -0
  39. data/lib/daru_lite/vector.rb +36 -932
  40. data/lib/daru_lite/version.rb +1 -1
  41. data/spec/data_frame/aggregatable_example.rb +65 -0
  42. data/spec/data_frame/buildable_example.rb +109 -0
  43. data/spec/data_frame/calculatable_example.rb +135 -0
  44. data/spec/data_frame/convertible_example.rb +180 -0
  45. data/spec/data_frame/duplicatable_example.rb +111 -0
  46. data/spec/data_frame/fetchable_example.rb +476 -0
  47. data/spec/data_frame/filterable_example.rb +250 -0
  48. data/spec/data_frame/indexable_example.rb +221 -0
  49. data/spec/data_frame/iterable_example.rb +465 -0
  50. data/spec/data_frame/joinable_example.rb +106 -0
  51. data/spec/data_frame/missable_example.rb +47 -0
  52. data/spec/data_frame/pivotable_example.rb +297 -0
  53. data/spec/data_frame/queryable_example.rb +92 -0
  54. data/spec/data_frame/setable_example.rb +482 -0
  55. data/spec/data_frame/sortable_example.rb +350 -0
  56. data/spec/dataframe_spec.rb +181 -3243
  57. data/spec/index/index_spec.rb +8 -0
  58. data/spec/vector/aggregatable_example.rb +27 -0
  59. data/spec/vector/calculatable_example.rb +82 -0
  60. data/spec/vector/convertible_example.rb +126 -0
  61. data/spec/vector/duplicatable_example.rb +48 -0
  62. data/spec/vector/fetchable_example.rb +463 -0
  63. data/spec/vector/filterable_example.rb +165 -0
  64. data/spec/vector/indexable_example.rb +201 -0
  65. data/spec/vector/iterable_example.rb +111 -0
  66. data/spec/vector/joinable_example.rb +25 -0
  67. data/spec/vector/missable_example.rb +88 -0
  68. data/spec/vector/queryable_example.rb +91 -0
  69. data/spec/vector/setable_example.rb +300 -0
  70. data/spec/vector/sortable_example.rb +242 -0
  71. data/spec/vector_spec.rb +111 -1805
  72. metadata +102 -3
  73. data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -1,10 +1,40 @@
1
1
  require 'daru_lite/accessors/dataframe_by_row'
2
+ require 'daru_lite/data_frame/aggregatable'
3
+ require 'daru_lite/data_frame/calculatable'
4
+ require 'daru_lite/data_frame/convertible'
5
+ require 'daru_lite/data_frame/duplicatable'
6
+ require 'daru_lite/data_frame/fetchable'
7
+ require 'daru_lite/data_frame/filterable'
8
+ require 'daru_lite/data_frame/indexable'
9
+ require 'daru_lite/data_frame/i_o_able'
10
+ require 'daru_lite/data_frame/iterable'
11
+ require 'daru_lite/data_frame/joinable'
12
+ require 'daru_lite/data_frame/missable'
13
+ require 'daru_lite/data_frame/pivotable'
14
+ require 'daru_lite/data_frame/setable'
15
+ require 'daru_lite/data_frame/sortable'
16
+ require 'daru_lite/data_frame/queryable'
2
17
  require 'daru_lite/maths/arithmetic/dataframe'
3
18
  require 'daru_lite/maths/statistics/dataframe'
4
19
  require 'daru_lite/io/io'
5
20
 
6
21
  module DaruLite
7
22
  class DataFrame # rubocop:disable Metrics/ClassLength
23
+ include DaruLite::DataFrame::Aggregatable
24
+ include DaruLite::DataFrame::Calculatable
25
+ include DaruLite::DataFrame::Convertible
26
+ include DaruLite::DataFrame::Duplicatable
27
+ include DaruLite::DataFrame::Fetchable
28
+ include DaruLite::DataFrame::Filterable
29
+ include DaruLite::DataFrame::Indexable
30
+ include DaruLite::DataFrame::Iterable
31
+ include DaruLite::DataFrame::IOAble
32
+ include DaruLite::DataFrame::Joinable
33
+ include DaruLite::DataFrame::Missable
34
+ include DaruLite::DataFrame::Pivotable
35
+ include DaruLite::DataFrame::Setable
36
+ include DaruLite::DataFrame::Sortable
37
+ include DaruLite::DataFrame::Queryable
8
38
  include DaruLite::Maths::Arithmetic::DataFrame
9
39
  include DaruLite::Maths::Statistics::DataFrame
10
40
 
@@ -13,109 +43,6 @@ module DaruLite
13
43
  extend Gem::Deprecate
14
44
 
15
45
  class << self
16
- # Load data from a CSV file. Specify an optional block to grab the CSV
17
- # object and pre-condition it (for example use the `convert` or
18
- # `header_convert` methods).
19
- #
20
- # == Arguments
21
- #
22
- # * path - Local path / Remote URL of the file to load specified as a String.
23
- #
24
- # == Options
25
- #
26
- # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
27
- # and uses those to eventually construct the resulting DataFrame.
28
- #
29
- # == Verbose Description
30
- #
31
- # You can specify all the options to the `.from_csv` function that you
32
- # do to the Ruby `CSV.read()` function, since this is what is used internally.
33
- #
34
- # For example, if the columns in your CSV file are separated by something
35
- # other that commas, you can use the `:col_sep` option. If you want to
36
- # convert numeric values to numbers and not keep them as strings, you can
37
- # use the `:converters` option and set it to `:numeric`.
38
- #
39
- # The `.from_csv` function uses the following defaults for reading CSV files
40
- # (that are passed into the `CSV.read()` function):
41
- #
42
- # {
43
- # :col_sep => ',',
44
- # :converters => :numeric
45
- # }
46
- def from_csv(path, opts = {}, &block)
47
- DaruLite::IO.from_csv path, opts, &block
48
- end
49
-
50
- # Read data from an Excel file into a DataFrame.
51
- #
52
- # == Arguments
53
- #
54
- # * path - Path of the file to be read.
55
- #
56
- # == Options
57
- #
58
- # *:worksheet_id - ID of the worksheet that is to be read.
59
- def from_excel(path, opts = {}, &block)
60
- DaruLite::IO.from_excel path, opts, &block
61
- end
62
-
63
- # Read a database query and returns a Dataset
64
- #
65
- # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
66
- # @param query [String] The query to be executed
67
- #
68
- # @return A dataframe containing the data resulting from the query
69
- #
70
- # USE:
71
- #
72
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
73
- # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
74
- #
75
- # #Alternatively
76
- #
77
- # require 'dbi'
78
- # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
79
- def from_sql(dbh, query)
80
- DaruLite::IO.from_sql dbh, query
81
- end
82
-
83
- # Read a dataframe from AR::Relation
84
- #
85
- # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
86
- # @param fields [Array] Field names to be loaded (optional)
87
- #
88
- # @return A dataframe containing the data loaded from the relation
89
- #
90
- # USE:
91
- #
92
- # # When Post model is defined as:
93
- # class Post < ActiveRecord::Base
94
- # scope :active, -> { where.not(published_at: nil) }
95
- # end
96
- #
97
- # # You can load active posts into a dataframe by:
98
- # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
99
- def from_activerecord(relation, *fields)
100
- DaruLite::IO.from_activerecord relation, *fields
101
- end
102
-
103
- # Read the database from a plaintext file. For this method to work,
104
- # the data should be present in a plain text file in columns. See
105
- # spec/fixtures/bank2.dat for an example.
106
- #
107
- # == Arguments
108
- #
109
- # * path - Path of the file to be read.
110
- # * fields - Vector names of the resulting database.
111
- #
112
- # == Usage
113
- #
114
- # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
115
- def from_plaintext(path, fields)
116
- DaruLite::IO.from_plaintext path, fields
117
- end
118
-
119
46
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
120
47
  # DaruLite::Vector objects.
121
48
  def rows(source, opts = {})
@@ -316,179 +243,6 @@ module DaruLite
316
243
  update
317
244
  end
318
245
 
319
- # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
320
- # Defaults to *:vector*. Use of this method is not recommended for accessing
321
- # rows. Use df.row[:a] for accessing row with index ':a'.
322
- def [](*names)
323
- axis = extract_axis(names, :vector)
324
- dispatch_to_axis axis, :access, *names
325
- end
326
-
327
- # Retrive rows by positions
328
- # @param [Array<Integer>] positions of rows to retrive
329
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
330
- # @example
331
- # df = DaruLite::DataFrame.new({
332
- # a: [1, 2, 3],
333
- # b: ['a', 'b', 'c']
334
- # })
335
- # df.row_at 1, 2
336
- # # => #<DaruLite::DataFrame(2x2)>
337
- # # a b
338
- # # 1 2 b
339
- # # 2 3 c
340
- def row_at(*positions)
341
- original_positions = positions
342
- positions = coerce_positions(*positions, nrows)
343
- validate_positions(*positions, nrows)
344
-
345
- if positions.is_a? Integer
346
- row = get_rows_for([positions])
347
- DaruLite::Vector.new row, index: @vectors
348
- else
349
- new_rows = get_rows_for(original_positions)
350
- DaruLite::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
351
- end
352
- end
353
-
354
- # Set rows by positions
355
- # @param [Array<Integer>] positions positions of rows to set
356
- # @param [Array, DaruLite::Vector] vector vector to be assigned
357
- # @example
358
- # df = DaruLite::DataFrame.new({
359
- # a: [1, 2, 3],
360
- # b: ['a', 'b', 'c']
361
- # })
362
- # df.set_row_at [0, 1], ['x', 'x']
363
- # df
364
- # #=> #<DaruLite::DataFrame(3x2)>
365
- # # a b
366
- # # 0 x x
367
- # # 1 x x
368
- # # 2 3 c
369
- def set_row_at(positions, vector)
370
- validate_positions(*positions, nrows)
371
- vector =
372
- if vector.is_a? DaruLite::Vector
373
- vector.reindex @vectors
374
- else
375
- DaruLite::Vector.new vector
376
- end
377
-
378
- raise SizeError, 'Vector length should match row length' if
379
- vector.size != @vectors.size
380
-
381
- @data.each_with_index do |vec, pos|
382
- vec.set_at(positions, vector.at(pos))
383
- end
384
- @index = @data[0].index
385
- set_size
386
- end
387
-
388
- # Retrive vectors by positions
389
- # @param [Array<Integer>] positions of vectors to retrive
390
- # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
391
- # @example
392
- # df = DaruLite::DataFrame.new({
393
- # a: [1, 2, 3],
394
- # b: ['a', 'b', 'c']
395
- # })
396
- # df.at 0
397
- # # => #<DaruLite::Vector(3)>
398
- # # a
399
- # # 0 1
400
- # # 1 2
401
- # # 2 3
402
- def at(*positions)
403
- if AXES.include? positions.last
404
- axis = positions.pop
405
- return row_at(*positions) if axis == :row
406
- end
407
-
408
- original_positions = positions
409
- positions = coerce_positions(*positions, ncols)
410
- validate_positions(*positions, ncols)
411
-
412
- if positions.is_a? Integer
413
- @data[positions].dup
414
- else
415
- DaruLite::DataFrame.new positions.map { |pos| @data[pos].dup },
416
- index: @index,
417
- order: @vectors.at(*original_positions),
418
- name: @name
419
- end
420
- end
421
-
422
- # Set vectors by positions
423
- # @param [Array<Integer>] positions positions of vectors to set
424
- # @param [Array, DaruLite::Vector] vector vector to be assigned
425
- # @example
426
- # df = DaruLite::DataFrame.new({
427
- # a: [1, 2, 3],
428
- # b: ['a', 'b', 'c']
429
- # })
430
- # df.set_at [0], ['x', 'y', 'z']
431
- # df
432
- # #=> #<DaruLite::DataFrame(3x2)>
433
- # # a b
434
- # # 0 x a
435
- # # 1 y b
436
- # # 2 z c
437
- def set_at(positions, vector)
438
- if positions.last == :row
439
- positions.pop
440
- return set_row_at(positions, vector)
441
- end
442
-
443
- validate_positions(*positions, ncols)
444
- vector =
445
- if vector.is_a? DaruLite::Vector
446
- vector.reindex @index
447
- else
448
- DaruLite::Vector.new vector
449
- end
450
-
451
- raise SizeError, 'Vector length should match index length' if
452
- vector.size != @index.size
453
-
454
- positions.each { |pos| @data[pos] = vector }
455
- end
456
-
457
- # Insert a new row/vector of the specified name or modify a previous row.
458
- # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
459
- # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
460
- #
461
- # In case a DaruLite::Vector is specified after the equality the sign, the indexes
462
- # of the vector will be matched against the row/vector indexes of the DataFrame
463
- # before an insertion is performed. Unmatched indexes will be set to nil.
464
- def []=(*args)
465
- vector = args.pop
466
- axis = extract_axis(args)
467
- names = args
468
-
469
- dispatch_to_axis axis, :insert_or_modify, names, vector
470
- end
471
-
472
- def add_row(row, index = nil)
473
- self.row[*(index || @size)] = row
474
- end
475
-
476
- def add_vector(n, vector)
477
- self[n] = vector
478
- end
479
-
480
- def insert_vector(n, name, source)
481
- raise ArgumentError unless source.is_a? Array
482
-
483
- vector = DaruLite::Vector.new(source, index: @index, name: @name)
484
- @data << vector
485
- @vectors = @vectors.add name
486
- ordr = @vectors.dup.to_a
487
- elmnt = ordr.pop
488
- ordr.insert n, elmnt
489
- self.order = ordr
490
- end
491
-
492
246
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
493
247
  #
494
248
  # == Usage
@@ -498,1697 +252,177 @@ module DaruLite
498
252
  DaruLite::Accessors::DataFrameByRow.new(self)
499
253
  end
500
254
 
501
- # Extract a dataframe given row indexes or positions
502
- # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position if false)
503
- # @return [DaruLite::Dataframe]
504
- def get_sub_dataframe(keys, by_position: true)
505
- return DaruLite::DataFrame.new({}) if keys == []
506
-
507
- keys = @index.pos(*keys) unless by_position
508
-
509
- sub_df = row_at(*keys)
510
- sub_df = sub_df.to_df.transpose if sub_df.is_a?(DaruLite::Vector)
511
-
512
- sub_df
513
- end
514
-
515
- # Duplicate the DataFrame entirely.
516
- #
517
- # == Arguments
518
- #
519
- # * +vectors_to_dup+ - An Array specifying the names of Vectors to
520
- # be duplicated. Will duplicate the entire DataFrame if not specified.
521
- def dup(vectors_to_dup = nil)
522
- vectors_to_dup ||= @vectors.to_a
523
-
524
- src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
525
- new_order = DaruLite::Index.new(vectors_to_dup)
526
-
527
- DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
528
- end
529
-
530
- # Only clone the structure of the DataFrame.
531
- def clone_structure
532
- DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
533
- end
534
-
535
- # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
536
- # preserved.
537
- #
538
- # == Arguments
539
- #
540
- # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
541
- # a view of the whole data frame otherwise.
542
- def clone(*vectors_to_clone)
543
- vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
544
- vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
545
-
546
- h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
547
- DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
548
- end
549
-
550
- # Returns a 'shallow' copy of DataFrame if missing data is not present,
551
- # or a full copy of only valid data if missing data is present.
552
- def clone_only_valid
553
- if include_values?(*DaruLite::MISSING_VALUES)
554
- reject_values(*DaruLite::MISSING_VALUES)
555
- else
556
- clone
557
- end
558
- end
559
-
560
- # Creates a new duplicate dataframe containing only rows
561
- # without a single missing value.
562
- def dup_only_valid(vecs = nil)
563
- rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
564
- .inject(&:concat)
565
- .uniq
566
-
567
- row_indexes = @index.to_a
568
- (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
569
- end
570
- deprecate :dup_only_valid, :reject_values, 2016, 10
571
-
572
- # Returns a dataframe in which rows with any of the mentioned values
573
- # are ignored.
574
- # @param [Array] values to reject to form the new dataframe
575
- # @return [DaruLite::DataFrame] Data Frame with only rows which doesn't
576
- # contain the mentioned values
577
- # @example
578
- # df = DaruLite::DataFrame.new({
579
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
580
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
581
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
582
- # }, index: 11..18)
583
- # df.reject_values nil, Float::NAN
584
- # # => #<DaruLite::DataFrame(2x3)>
585
- # # a b c
586
- # # 11 1 a a
587
- # # 18 7 8 7
588
- def reject_values(*values)
589
- positions =
590
- size.times.to_a - @data.flat_map { |vec| vec.positions(*values) }
591
- # Handle the case when positions size is 1 and #row_at wouldn't return a df
592
- if positions.size == 1
593
- pos = positions.first
594
- row_at(pos..pos)
595
- else
596
- row_at(*positions)
597
- end
598
- end
599
-
600
- # Replace specified values with given value
601
- # @param [Array] old_values values to replace with new value
602
- # @param [object] new_value new value to replace with
603
- # @return [DaruLite::DataFrame] Data Frame itself with old values replace
604
- # with new value
605
- # @example
606
- # df = DaruLite::DataFrame.new({
607
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
608
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
609
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
610
- # }, index: 11..18)
611
- # df.replace_values nil, Float::NAN
612
- # # => #<DaruLite::DataFrame(8x3)>
613
- # # a b c
614
- # # 11 1 a a
615
- # # 12 2 b NaN
616
- # # 13 3 NaN 3
617
- # # 14 NaN NaN 4
618
- # # 15 NaN NaN 3
619
- # # 16 NaN 3 5
620
- # # 17 1 5 NaN
621
- # # 18 7 8 7
622
- def replace_values(old_values, new_value)
623
- @data.each { |vec| vec.replace_values old_values, new_value }
624
- self
625
- end
626
-
627
- # Rolling fillna
628
- # replace all Float::NAN and NIL values with the preceeding or following value
629
- #
630
- # @param direction [Symbol] (:forward, :backward) whether replacement value is preceeding or following
631
- #
632
- # @example
633
- # df = DaruLite::DataFrame.new({
634
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
635
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
636
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
637
- # })
638
- #
639
- # => #<DaruLite::DataFrame(8x3)>
640
- # a b c
641
- # 0 1 a a
642
- # 1 2 b NaN
643
- # 2 3 nil 3
644
- # 3 nil NaN 4
645
- # 4 NaN nil 3
646
- # 5 nil 3 5
647
- # 6 1 5 nil
648
- # 7 7 nil 7
649
- #
650
- # 2.3.3 :068 > df.rolling_fillna(:forward)
651
- # => #<DaruLite::DataFrame(8x3)>
652
- # a b c
653
- # 0 1 a a
654
- # 1 2 b a
655
- # 2 3 b 3
656
- # 3 3 b 4
657
- # 4 3 b 3
658
- # 5 3 3 5
659
- # 6 1 5 5
660
- # 7 7 5 7
661
- #
662
- def rolling_fillna!(direction = :forward)
663
- @data.each { |vec| vec.rolling_fillna!(direction) }
664
- self
665
- end
666
-
667
- def rolling_fillna(direction = :forward)
668
- dup.rolling_fillna!(direction)
669
- end
670
-
671
- # Return unique rows by vector specified or all vectors
672
- #
673
- # @param vtrs [String][Symbol] vector names(s) that should be considered
674
- #
675
- # @example
676
- #
677
- # => #<DaruLite::DataFrame(6x2)>
678
- # a b
679
- # 0 1 a
680
- # 1 2 b
681
- # 2 3 c
682
- # 3 4 d
683
- # 2 3 c
684
- # 3 4 f
685
- #
686
- # 2.3.3 :> df.unique
687
- # => #<DaruLite::DataFrame(5x2)>
688
- # a b
689
- # 0 1 a
690
- # 1 2 b
691
- # 2 3 c
692
- # 3 4 d
693
- # 3 4 f
694
- #
695
- # 2.3.3 :> df.unique(:a)
696
- # => #<DaruLite::DataFrame(5x2)>
697
- # a b
698
- # 0 1 a
699
- # 1 2 b
700
- # 2 3 c
701
- # 3 4 d
702
- #
703
- def uniq(*vtrs)
704
- vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
705
- grouped = group_by(vecs)
706
- indexes = grouped.groups.values.map { |v| v[0] }.sort
707
- row[*indexes]
708
- end
709
-
710
- # Iterate over each index of the DataFrame.
711
- def each_index(&block)
712
- return to_enum(:each_index) unless block
713
-
714
- @index.each(&block)
715
-
716
- self
717
- end
718
-
719
- # Iterate over each vector
720
- def each_vector(&block)
721
- return to_enum(:each_vector) unless block
255
+ # Delete a vector
256
+ def delete_vector(vector)
257
+ raise IndexError, "Vector #{vector} does not exist." unless @vectors.include?(vector)
722
258
 
723
- @data.each(&block)
259
+ @data.delete_at @vectors[vector]
260
+ @vectors = DaruLite::Index.new @vectors.to_a - [vector]
724
261
 
725
262
  self
726
263
  end
727
264
 
728
- alias each_column each_vector
729
-
730
- # Iterate over each vector alongwith the name of the vector
731
- def each_vector_with_index
732
- return to_enum(:each_vector_with_index) unless block_given?
733
-
734
- @vectors.each do |vector|
735
- yield @data[@vectors[vector]], vector
736
- end
265
+ # Deletes a list of vectors
266
+ def delete_vectors(*vectors)
267
+ Array(vectors).each { |vec| delete_vector vec }
737
268
 
738
269
  self
739
270
  end
740
271
 
741
- alias each_column_with_index each_vector_with_index
742
-
743
- # Iterate over each row
744
- def each_row
745
- return to_enum(:each_row) unless block_given?
746
-
747
- @index.size.times do |pos|
748
- yield row_at(pos)
749
- end
750
-
751
- self
752
- end
272
+ # Delete a row
273
+ def delete_row(index)
274
+ idx = named_index_for index
753
275
 
754
- def each_row_with_index
755
- return to_enum(:each_row_with_index) unless block_given?
276
+ raise IndexError, "Index #{index} does not exist." unless @index.include? idx
756
277
 
757
- @index.each do |index|
758
- yield access_row(index), index
278
+ @index = DaruLite::Index.new(@index.to_a - [idx])
279
+ each_vector do |vector|
280
+ vector.delete_at idx
759
281
  end
760
282
 
761
- self
762
- end
763
-
764
- # Iterate over each row or vector of the DataFrame. Specify axis
765
- # by passing :vector or :row as the argument. Default to :vector.
766
- #
767
- # == Description
768
- #
769
- # `#each` works exactly like Array#each. The default mode for `each`
770
- # is to iterate over the columns of the DataFrame. To iterate over
771
- # rows you must pass the axis, i.e `:row` as an argument.
772
- #
773
- # == Arguments
774
- #
775
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
776
- # or :row. Default to :vector.
777
- def each(axis = :vector, &block)
778
- dispatch_to_axis axis, :each, &block
779
- end
780
-
781
- # Iterate over a row or vector and return results in a DaruLite::Vector.
782
- # Specify axis with :vector or :row. Default to :vector.
783
- #
784
- # == Description
785
- #
786
- # The #collect iterator works similar to #map, the only difference
787
- # being that it returns a DaruLite::Vector comprising of the results of
788
- # each block run. The resultant Vector has the same index as that
789
- # of the axis over which collect has iterated. It also accepts the
790
- # optional axis argument.
791
- #
792
- # == Arguments
793
- #
794
- # * +axis+ - The axis to iterate over. Can be :vector (or :column)
795
- # or :row. Default to :vector.
796
- def collect(axis = :vector, &block)
797
- dispatch_to_axis_pl axis, :collect, &block
283
+ set_size
798
284
  end
799
285
 
800
- # Map over each vector or row of the data frame according to
801
- # the argument specified. Will return an Array of the resulting
802
- # elements. To map over each row/vector and get a DataFrame,
803
- # see #recode.
804
- #
805
- # == Description
806
- #
807
- # The #map iterator works like Array#map. The value returned by
808
- # each run of the block is added to an Array and the Array is
809
- # returned. This method also accepts an axis argument, like #each.
810
- # The default is :vector.
811
- #
812
- # == Arguments
813
- #
814
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
815
- # Default to :vector.
816
- def map(axis = :vector, &block)
817
- dispatch_to_axis_pl axis, :map, &block
818
- end
286
+ # Delete a row based on its position
287
+ # More robust than #delete_row when working with a CategoricalIndex or when the
288
+ # Index includes integers
289
+ def delete_at_position(position)
290
+ raise IndexError, "Position #{position} does not exist." unless position < size
819
291
 
820
- # Destructive map. Modifies the DataFrame. Each run of the block
821
- # must return a DaruLite::Vector. You can specify the axis to map over
822
- # as the argument. Default to :vector.
823
- #
824
- # == Arguments
825
- #
826
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
827
- # Default to :vector.
828
- def map!(axis = :vector, &block)
829
- if %i[vector column].include?(axis)
830
- map_vectors!(&block)
831
- elsif axis == :row
832
- map_rows!(&block)
833
- end
834
- end
292
+ each_vector { |vector| vector.delete_at_position(position) }
293
+ @index = @index.delete_at(position)
835
294
 
836
- # Maps over the DataFrame and returns a DataFrame. Each run of the
837
- # block must return a DaruLite::Vector object. You can specify the axis
838
- # to map over. Default to :vector.
839
- #
840
- # == Description
841
- #
842
- # Recode works similarly to #map, but an important difference between
843
- # the two is that recode returns a modified DaruLite::DataFrame instead
844
- # of an Array. For this reason, #recode expects that every run of the
845
- # block to return a DaruLite::Vector.
846
- #
847
- # Just like map and each, recode also accepts an optional _axis_ argument.
848
- #
849
- # == Arguments
850
- #
851
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
852
- # Default to :vector.
853
- def recode(axis = :vector, &block)
854
- dispatch_to_axis_pl axis, :recode, &block
295
+ set_size
855
296
  end
856
297
 
857
- # Retain vectors or rows if the block returns a truthy value.
858
- #
859
- # == Description
860
- #
861
- # For filtering out certain rows/vectors based on their values,
862
- # use the #filter method. By default it iterates over vectors and
863
- # keeps those vectors for which the block returns true. It accepts
864
- # an optional axis argument which lets you specify whether you want
865
- # to iterate over vectors or rows.
866
- #
867
- # == Arguments
868
- #
869
- # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
870
- # Default to :vector.
871
- #
872
- # == Usage
873
- #
874
- # # Filter vectors
875
- #
876
- # df.filter do |vector|
877
- # vector.type == :numeric and vector.median < 50
878
- # end
879
- #
880
- # # Filter rows
298
+ # Creates a DataFrame with the random data, of n size.
299
+ # If n not given, uses original number of rows.
881
300
  #
882
- # df.filter(:row) do |row|
883
- # row[:a] + row[:d] < 100
884
- # end
885
- def filter(axis = :vector, &block)
886
- dispatch_to_axis_pl axis, :filter, &block
887
- end
888
-
889
- def recode_vectors
890
- block_given? or return to_enum(:recode_vectors)
891
-
892
- dup.tap do |df|
893
- df.each_vector_with_index do |v, i|
894
- df[*i] = should_be_vector!(yield(v))
895
- end
896
- end
897
- end
898
-
899
- def recode_rows
900
- block_given? or return to_enum(:recode_rows)
901
-
902
- dup.tap do |df|
903
- df.each_row_with_index do |r, i|
904
- df.row[i] = should_be_vector!(yield(r))
301
+ # @return {DaruLite::DataFrame}
302
+ def bootstrap(n = nil)
303
+ n ||= nrows
304
+ DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
305
+ n.times do
306
+ df_boot.add_row(row[rand(n)])
905
307
  end
308
+ df_boot.update
906
309
  end
907
310
  end
908
311
 
909
- # Map each vector and return an Array.
910
- def map_vectors(&block)
911
- return to_enum(:map_vectors) unless block
912
-
913
- @data.map(&block)
914
- end
915
-
916
- # Destructive form of #map_vectors
917
- def map_vectors!
918
- return to_enum(:map_vectors!) unless block_given?
919
-
920
- vectors.dup.each do |n|
921
- self[n] = should_be_vector!(yield(self[n]))
922
- end
923
-
924
- self
925
- end
926
-
927
- # Map vectors alongwith the index.
928
- def map_vectors_with_index(&block)
929
- return to_enum(:map_vectors_with_index) unless block
930
-
931
- each_vector_with_index.map(&block)
932
- end
933
-
934
- # Map each row
935
- def map_rows(&block)
936
- return to_enum(:map_rows) unless block
937
-
938
- each_row.map(&block)
939
- end
940
-
941
- def map_rows_with_index(&block)
942
- return to_enum(:map_rows_with_index) unless block
943
-
944
- each_row_with_index.map(&block)
945
- end
946
-
947
- def map_rows!
948
- return to_enum(:map_rows!) unless block_given?
949
-
950
- index.dup.each do |i|
951
- row[i] = should_be_vector!(yield(row[i]))
952
- end
953
-
954
- self
955
- end
956
-
957
- def apply_method(method, keys: nil, by_position: true)
958
- df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
959
-
960
- case method
961
- when Symbol then df.send(method)
962
- when Proc then method.call(df)
963
- when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
964
- else raise
965
- end
966
- end
967
- alias apply_method_on_sub_df apply_method
968
-
969
- # Retrieves a DaruLite::Vector, based on the result of calculation
970
- # performed on each row.
971
- def collect_rows(&block)
972
- return to_enum(:collect_rows) unless block
973
-
974
- DaruLite::Vector.new(each_row.map(&block), index: @index)
975
- end
976
-
977
- def collect_row_with_index(&block)
978
- return to_enum(:collect_row_with_index) unless block
979
-
980
- DaruLite::Vector.new(each_row_with_index.map(&block), index: @index)
981
- end
982
-
983
- # Retrives a DaruLite::Vector, based on the result of calculation
984
- # performed on each vector.
985
- def collect_vectors(&block)
986
- return to_enum(:collect_vectors) unless block
987
-
988
- DaruLite::Vector.new(each_vector.map(&block), index: @vectors)
989
- end
990
-
991
# Like #collect_vectors, but iterates via each_vector_with_index.
# Without a block an Enumerator for this method is returned.
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) if block.nil?

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
996
-
997
# Builds a square ::Matrix whose cell (r, c) is the block's value for the
# pair of vector names (r, c). The block is yielded two vector names.
# Without a block an Enumerator for this method is returned.
#
# @return {::Matrix}
# :nocov:
# FIXME: Even not trying to cover this: I can't get, how it is expected
# to work.... -- zverok
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end
# :nocov:
1016
-
1017
# Removes the named vector from this DataFrame and returns self.
#
# @raise [IndexError] when no vector with that name exists.
def delete_vector(vector)
  unless @vectors.include?(vector)
    raise IndexError, "Vector #{vector} does not exist."
  end

  position = @vectors[vector]
  @data.delete_at(position)

  remaining = @vectors.to_a - [vector]
  @vectors = DaruLite::Index.new(remaining)

  self
end
1026
-
1027
# Removes each of the given vectors via #delete_vector and returns self.
def delete_vectors(*vectors)
  Array(vectors).each do |name|
    delete_vector(name)
  end

  self
end
1033
-
1034
# Removes the row identified by +index+ from the index and from every vector,
# then refreshes the cached size via set_size.
#
# @raise [IndexError] when the resolved label is not part of the index.
def delete_row(index)
  label = named_index_for(index)

  unless @index.include?(label)
    raise IndexError, "Index #{index} does not exist."
  end

  @index = DaruLite::Index.new(@index.to_a - [label])
  each_vector { |vec| vec.delete_at(label) }

  set_size
end
1047
-
1048
# Creates a DataFrame of +n+ rows drawn at random (with replacement) from
# this DataFrame. If +n+ is not given, the original number of rows is used.
#
# @param n [Integer, nil] number of rows to draw.
# @return {DaruLite::DataFrame}
def bootstrap(n = nil)
  n ||= nrows
  DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
    n.times do
      # Draw from the rows that actually exist: bounding the draw by +n+
      # over-runs the frame when n > nrows and ignores trailing rows when
      # n < nrows.
      df_boot.add_row(row[rand(nrows)])
    end
    df_boot.update
  end
end
1061
-
1062
# Deletes, in place, every row for which the block returns a falsy value.
def keep_row_if
  rejected = @index.reject { |label| yield access_row(label) }
  rejected.each { |label| delete_row(label) }
end
1067
-
1068
# Deletes, in place, every vector for which the block returns a falsy value.
# The block receives the vector and its name.
def keep_vector_if
  @vectors.each do |name|
    keep = yield(@data[@vectors[name]], name)
    delete_vector(name) unless keep
  end
end
1073
-
1074
# Creates a new vector holding the values of field +vec+ for the rows on
# which the block returns true.
def filter_vector(vec, &block)
  matching_rows = each_row.select(&block)
  values = matching_rows.map { |row| row[vec] }
  DaruLite::Vector.new(values)
end
1078
-
1079
# Evaluates the block for every row and passes the resulting list of
# block values to #where, which selects the rows to retain.
# Without a block an Enumerator for this method is returned.
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  mask = @index.map do |label|
    yield access_row(label)
  end

  where(mask)
end
1088
-
1089
# Returns a copy of this DataFrame that keeps only the vectors for which
# the block returns true. Without a block an Enumerator is returned.
def filter_vectors(&block)
  return to_enum(:filter_vectors) if block.nil?

  copy = dup
  copy.keep_vector_if(&block)
  copy
end
1096
-
1097
# Test each row with one or more tests.
# @param tests [Proc] Each test is a Proc with the form
#   *Proc.new {|row| row[:age] > 0}*
# The function returns an array with all errors.
#
# If the first argument is a Symbol it is removed from the test list and
# used as the id passed to verify_error_message; otherwise the first
# vector name is used.
#
# FIXME: description here is too sparse. As far as I can get,
# it should tell something about that each test is [descr, fields, block],
# and that first value may be column name to output. - zverok, 2016-05-18
def verify(*tests)
  id = if tests.first.is_a?(Symbol)
         tests.shift
       else
         @vectors.first
       end

  messages_per_row = each_row_with_index.map do |row, i|
    # The last element of each test is the callable check.
    failed = tests.reject { |*_rest, check| check.call(row) }
    failed.map { |test| verify_error_message(row, test, id, i) }
  end

  messages_per_row.flatten
end
1113
-
1114
- # DSL for yielding each row and returning a DaruLite::Vector based on the
1115
- # value each run of the block returns.
1116
- #
1117
- # == Usage
1118
- #
1119
- # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
1120
- # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
1121
- # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
1122
- # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
1123
- # total = ds.vector_by_calculation { a + b + c }
1124
- # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
1125
- # # nil
1126
- # # 0 111
1127
- # # 1 222
1128
- # # 2 333
1129
- # # 3 444
1130
- # # 4 555
1131
- # # 5 666
1132
- # # 6 777
1133
def vector_by_calculation(&block)
  # The block is evaluated with each row as +self+.
  values = each_row.map do |current_row|
    current_row.instance_eval(&block)
  end

  DaruLite::Vector.new(values, index: @index)
end
1138
-
1139
- # Reorder the vectors in a dataframe
1140
- # @param [Array] order_array new order of the vectors
1141
- # @example
1142
- # df = DaruLite::DataFrame({
1143
- # a: [1, 2, 3],
1144
- # b: [4, 5, 6]
1145
- # }, order: [:a, :b])
1146
- # df.order = [:b, :a]
1147
- # df
1148
- # # => #<DaruLite::DataFrame(3x2)>
1149
- # # b a
1150
- # # 0 4 1
1151
- # # 1 5 2
1152
- # # 2 6 3
1153
def order=(order_array)
  # The new order must be a permutation of the current vector names.
  unless order_array.sort == vectors.to_a.sort
    raise ArgumentError, 'Invalid order'
  end

  # Re-run initialization on the same data with the requested vector order.
  initialize(to_h, order: order_array)
end
1159
-
1160
- # Return the dataframe with rotate vectors positions, the vector at position count is now
1161
- # the first vector of the dataframe.
1162
- # If there is only one vector in the dataframe, the dataframe is returned without any change.
1163
- # @param count => Integer, the vector at position count will be the first vector of the dataframe.
1164
- # @example
1165
- # df = DaruLite::DataFrame({
1166
- # a: [1, 2, 3],
1167
- # b: [4, 5, 6],
1168
- # total: [5, 7, 9],
1169
- # })
1170
- # df.rotate_vectors(-1)
1171
- # df
1172
- # # => #<DaruLite::DataFrame(3x3)>
1173
- # # total b a
1174
- # # 0 5 4 1
1175
- # # 1 7 5 2
1176
- # # 2 9 6 3
1177
def rotate_vectors(count = -1)
  # Reordering only happens when +vectors.many?+ is true.
  if vectors.many?
    rotated = vectors.to_a.rotate(count)
    self.order = rotated
  end

  self
end
1183
-
1184
- # Returns a vector, based on a string with a calculation based
1185
- # on vector.
1186
- #
1187
- # The calculation will be eval'ed, so you can put any variable
1188
- # or expression valid on ruby.
1189
- #
1190
- # For example:
1191
- # a = DaruLite::Vector.new [1,2]
1192
- # b = DaruLite::Vector.new [3,4]
1193
- # ds = DaruLite::DataFrame.new({:a => a,:b => b})
1194
- # ds.compute("a+b")
1195
- # => Vector [4,6]
1196
# @param text [String, nil] Ruby source evaluated with this object as self;
#   ignored when a block is given, hence optional.
# @yield evaluated with this object as self, in preference to +text+.
# @return the value of the evaluated block or string.
# @raise [ArgumentError] when neither +text+ nor a block is supplied.
def compute(text = nil, &block)
  return instance_eval(&block) if block

  raise ArgumentError, 'Either a calculation string or a block is required' if text.nil?

  # NOTE(review): +text+ is eval'ed as arbitrary Ruby code; callers must
  # never pass untrusted input here.
  instance_eval(text)
end
1201
-
1202
# Return a vector with the number of missing values in each row.
#
# == Arguments
#
# * +missing_values+ - An Array of the values that should be
#   treated as 'missing'. The default missing value is *nil*.
def missing_values_rows(missing_values = [nil])
  counts = each_row.map { |current_row| current_row.indexes(*missing_values).size }

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end

# TODO: remove next version
alias vector_missing_values missing_values_rows
1218
-
1219
# Reports whether any vector contains one of DaruLite::MISSING_VALUES.
# Marked deprecated below in favour of #include_values?.
def has_missing_data?
  include_values?(*DaruLite::MISSING_VALUES)
end
alias flawed? has_missing_data?
deprecate :has_missing_data?, :include_values?, 2016, 10
deprecate :flawed?, :include_values?, 2016, 10
1225
-
1226
- # Check if any of given values occur in the data frame
1227
- # @param [Array] values to check for
1228
- # @return [true, false] true if any of the given values occur in the
1229
- # dataframe, false otherwise
1230
- # @example
1231
- # df = DaruLite::DataFrame.new({
1232
- # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1233
- # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1234
- # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1235
- # }, index: 11..18)
1236
- # df.include_values? nil
1237
- # # => true
1238
def include_values?(*values)
  # True as soon as one vector reports containing any of the values.
  @data.any? do |column|
    column.include_values?(*values)
  end
end
1241
-
1242
312
  # Return a nested hash using vector names as keys and an array constructed of
1243
313
  # hashes with other values. If block provided, is used to provide the
1244
314
  # values, with parameters +row+ of dataset, +current+ last hash on
1245
315
  # hierarchy and +name+ of the key to include
1246
def nest(*tree_keys, &block)
  # Accept the keys either as separate arguments or as a single Array.
  tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
  *branch_keys, leaf_key = tree_keys

  each_row.with_object({}) do |row, tree|
    # Walk (creating as needed) one nested hash per branch key value.
    node = branch_keys.reduce(tree) { |level, key| level[row[key]] ||= {} }
    name = row[leaf_key]

    if block
      node[name] = yield(row, node, name)
    else
      node[name] ||= []
      node[name].push(row.to_h.reject { |key, _value| tree_keys.include?(key) })
    end
  end
end
1263
-
1264
# Returns, per row, the total length of the string form of the values in
# +vecs+ (all vectors when +vecs+ is not given).
def vector_count_characters(vecs = nil)
  names = vecs || @vectors.to_a

  collect_rows do |row|
    names.sum { |name| row[name].to_s.size }
  end
end
1271
-
1272
# Splits vector +name+ with split_by_separator(sep) and stores each
# resulting vector under the Symbol "<name><join><key>".
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
  parts = self[name].split_by_separator(sep)

  parts.each do |key, vector|
    self[:"#{name}#{join}#{key}"] = vector
  end
end
1277
-
1278
# Returns [number of rows, number of vectors] as a two-element Array.
def shape
  [@index.size, @vectors.size]
end
1282
-
1283
# Number of rows, i.e. the size of the row index.
def nrows
  @index.size
end
1287
-
1288
# Number of vectors, i.e. the size of the vectors index.
def ncols
  @vectors.size
end
1292
-
1293
# True when a vector with the given name is present.
def has_vector?(vector)
  @vectors.include?(vector)
end
1297
-
1298
- # Works like Array#any?.
1299
- #
1300
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1301
- # :row. A DaruLite::Vector object is yielded in the block.
1302
- # @example Using any?
1303
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1304
- # df.any?(:row) do |row|
1305
- # row[:a] < 3 and row[:b] == 'b'
1306
- # end #=> true
1307
def any?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    # Stop at the first row the block accepts.
    each_row { |row| return true if yield(row) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1319
-
1320
- # Works like Array#all?
1321
- #
1322
- # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1323
- # :row. A DaruLite::Vector object is yielded in the block.
1324
- # @example Using all?
1325
- # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1326
- # df.all?(:row) do |row|
1327
- # row[:a] < 10
1328
- # end #=> true
1329
def all?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    each_row.all?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1338
-
1339
# The first +quantity+ rows of the DataFrame (ten by default).
#
# @param [Integer] quantity (10) The number of elements to display from the top.
#   Zero selects no rows; a negative value drops that many rows from the end.
def head(quantity = 10)
  # An exclusive range keeps head(0) empty; the inclusive form 0..(quantity - 1)
  # degenerates to 0..-1 (every row) when quantity is zero.
  row.at(0...quantity)
end

alias first head
1347
-
1348
# The last +quantity+ rows of the DataFrame (ten by default).
#
# @param [Integer] quantity (10) The number of elements to display from the bottom.
#   Zero selects no rows.
def tail(quantity = 10)
  # Without this guard quantity == 0 yields start == 0 and the range 0..-1,
  # which selects every row instead of none.
  return row.at(0...0) if quantity.zero?

  # Clamp so that asking for more rows than exist starts at the first row.
  start = [-quantity, -size].max
  row.at(start..-1)
end

alias last tail
1357
-
1358
- # Sum all numeric/specified vectors in the DataFrame.
1359
- #
1360
- # Returns a new vector that's a containing a sum of all numeric
1361
- # or specified vectors of the DataFrame. By default, if the vector
1362
- # contains a nil, the sum is nil.
1363
- # With :skipnil argument set to true, nil values are assumed to be
1364
- # 0 (zero) and the sum vector is returned.
1365
- #
1366
- # @param args [Array] List of vectors to sum. Default is nil in which case
1367
- # all numeric vectors are summed.
1368
- #
1369
- # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1370
- #
1371
- # @return Vector with sum of all vectors specified in the argument.
1372
- # If vecs parameter is empty, sum all numeric vector.
1373
- #
1374
- # @example
1375
- # df = DaruLite::DataFrame.new({
1376
- # a: [1, 2, nil],
1377
- # b: [2, 1, 3],
1378
- # c: [1, 1, 1]
1379
- # })
1380
- # => #<DaruLite::DataFrame(3x3)>
1381
- # a b c
1382
- # 0 1 2 1
1383
- # 1 2 1 1
1384
- # 2 nil 3 1
1385
- # df.vector_sum [:a, :c]
1386
- # => #<DaruLite::Vector(3)>
1387
- # 0 2
1388
- # 1 3
1389
- # 2 nil
1390
- # df.vector_sum
1391
- # => #<DaruLite::Vector(3)>
1392
- # 0 4
1393
- # 1 4
1394
- # 2 nil
1395
- # df.vector_sum skipnil: true
1396
- # => #<DaruLite::Vector(3)>
1397
- # c
1398
- # 0 4
1399
- # 1 4
1400
- # 2 4
1401
- #
1402
def vector_sum(*args)
  # A trailing Hash carries the options; positional arguments win over it.
  options = { vecs: nil, skipnil: false }
  options = options.merge(args.pop) if args.last.is_a?(::Hash)

  vecs = args[0] || options[:vecs] || numeric_vectors
  skipnil = args[1] || options[:skipnil]

  # Start from a vector of zeros and fold every selected vector into it.
  zeros = DaruLite::Vector.new([0] * @size, index: @index, name: @name, dtype: @dtype)
  vecs.reduce(zeros) do |running_total, name|
    self[name].add(running_total, skipnil: skipnil)
  end
end
1413
-
1414
# Calculate mean of the rows of the dataframe.
#
# == Arguments
#
# * +max_missing+ - The maximum number of elements in the row that may be
#   one of DaruLite::MISSING_VALUES for the mean to be calculated; rows
#   exceeding it get nil. Default to 0.
def vector_mean(max_missing = 0)
  # FIXME: in vector_sum we preserve created vector dtype, but
  # here we are not. Is this by design or ...? - zverok, 2016-05-18
  means = DaruLite::Vector.new([0] * @size, index: @index, name: "mean_#{@name}")

  each_row_with_index.each do |row, label|
    missing_count = row.indexes(*DaruLite::MISSING_VALUES).size
    means[label] = missing_count > max_missing ? nil : row.mean
  end

  means
end
1429
-
1430
- # Group elements by vector to perform operations on them. Returns a
1431
- # DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
1432
- # list of possible operations.
1433
- #
1434
- # == Arguments
1435
- #
1436
- # * vectors - An Array containing names of vectors to group by.
1437
- #
1438
- # == Usage
1439
- #
1440
- # df = DaruLite::DataFrame.new({
1441
- # a: %w{foo bar foo bar foo bar foo foo},
1442
- # b: %w{one one two three two two one three},
1443
- # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1444
- # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1445
- # })
1446
- # df.group_by([:a,:b,:c]).groups
1447
- # #=> {["bar", "one", 2]=>[1],
1448
- # # ["bar", "three", 1]=>[3],
1449
- # # ["bar", "two", 6]=>[5],
1450
- # # ["foo", "one", 1]=>[0],
1451
- # # ["foo", "one", 3]=>[6],
1452
- # # ["foo", "three", 8]=>[7],
1453
- # # ["foo", "two", 3]=>[2, 4]}
1454
def group_by(*vectors)
  names = vectors.flatten

  unknown = names - @vectors.to_a
  unless unknown.empty?
    raise ArgumentError, "Vector(s) missing: #{unknown.join(', ')}"
  end

  # With no names given, group by the first vector.
  names = [@vectors.first] if names.empty?

  DaruLite::Core::GroupBy.new(self, names)
end
1463
-
1464
# Returns a new DataFrame whose vectors follow +new_vectors+. Vectors that
# exist here are copied over; unknown names are filled with nils.
#
# @raise [ArgumentError] unless +new_vectors+ is a DaruLite::Index.
def reindex_vectors(new_vectors)
  unless new_vectors.is_a?(DaruLite::Index)
    raise ArgumentError,
          "Must pass the new index of type Index or its subclasses, not #{new_vectors.class}"
  end

  DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name).tap do |reindexed|
    new_vectors.each do |name|
      reindexed[name] = @vectors.include?(name) ? self[name] : Array.new(nrows)
    end
  end
end
1475
-
1476
# Returns vector +v+ as an Array, or an Array of nils of length +size+
# when no such vector exists.
def get_vector_anyways(v)
  if @vectors.include?(v)
    self[v].to_a
  else
    Array.new(size)
  end
end
1479
-
1480
# Concatenate another DataFrame along corresponding columns.
# If columns do not exist in both dataframes, they are filled with nils
def concat(other_df)
  names = @vectors.to_a | other_df.vectors.to_a

  columns = names.map do |name|
    get_vector_anyways(name) + other_df.get_vector_anyways(name)
  end

  DaruLite::DataFrame.new(columns, order: names)
end
1491
-
1492
# Concatenates another DataFrame as #concat.
# Additionally it tries to preserve the index. If the indices contain
# common elements, #union will overwrite the according rows in the
# first dataframe.
def union(other_df)
  other_labels = other_df.index.to_a
  kept_labels = @index.to_a - other_labels

  df = row[*kept_labels].concat(other_df)

  # Rows are laid out as (rows kept from self) followed by (rows of other),
  # so the labels must follow that same order; deriving them from
  # (self + other).uniq mislabels rows when a shared label is not at the
  # end of self's index.
  df.index = DaruLite::Index.new(kept_labels + other_labels)
  df
end
1504
-
1505
# Helpers used by #set_index when a single column becomes the index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values in column +col+.
    def uniq_size(df, col)
      df[col].uniq.size
    end

    # Index built from the values of column +col+.
    def new_index(df, col)
      DaruLite::Index.new(df[col].to_a)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
1518
-
1519
# Helpers used by #set_index when a categorical index is requested.
module SetCategoricalIndexStrategy
  class << self
    # CategoricalIndex built from the values of column +col+.
    def new_index(df, col)
      DaruLite::CategoricalIndex.new(df[col].to_a)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
1528
-
1529
# Helpers used by #set_index when several columns become a MultiIndex.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries in the selection df[*cols].
    def uniq_size(df, cols)
      df[*cols].uniq.size
    end

    # MultiIndex built from the selected columns and named after them.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes all of +cols+ from +df+.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
1544
-
1545
# Uses one column (or several, for a MultiIndex) as the new row index.
#
# @param new_index_col a vector name, or anything responding to #to_a
#   listing several vector names.
# @param keep [Boolean] when false the source column(s) are deleted.
# @param categorical [Boolean] build a CategoricalIndex; this skips the
#   uniqueness check.
# @raise [ArgumentError] when the values are not unique (non-categorical).
# @return self
def set_index(new_index_col, keep: false, categorical: false)
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      new_index_col = new_index_col.to_a
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end

  if !categorical && @size != strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = strategy.new_index(self, new_index_col)
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
1565
-
1566
- # Change the index of the DataFrame and preserve the labels of the previous
1567
- # indexing. New index can be DaruLite::Index or any of its subclasses.
1568
- #
1569
- # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
1570
- # @example Reindexing DataFrame
1571
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
1572
- # index: ['a','b','c','d'])
1573
- # #=>
1574
- # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1575
- # # a b
1576
- # # a 1 11
1577
- # # b 2 22
1578
- # # c 3 33
1579
- # # d 4 44
1580
- # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
1581
- # #=>
1582
- # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1583
- # # a b
1584
- # # b 2 22
1585
- # # 0 nil nil
1586
- # # a 1 11
1587
- # # g nil nil
1588
def reindex(new_index)
  unless new_index.is_a?(DaruLite::Index)
    raise ArgumentError,
          "Must pass the new index of type Index or its subclasses, not #{new_index.class}"
  end

  DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name).tap do |reindexed|
    new_index.each do |label|
      # Labels unknown to the current index get a row of nils.
      reindexed.row[label] = @index.include?(label) ? row[label] : Array.new(ncols)
    end
  end
end
1599
-
1600
# Moves the current index into regular vectors, placed before the existing
# vectors and named after index.name, and installs the index of
# index.to_df as the new row index. Returns self.
def reset_index
  index_frame = index.to_df
  index_names = index.name
  index_names = [index_names] unless index_names.instance_of?(Array)

  # Capture the target order before the new vectors are added.
  target_order = index_names + vectors.to_a

  self.index = index_frame.index
  index_names.each { |name| self[name] = index_frame[name] }
  self.order = target_order

  self
end
1612
-
1613
- # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
1614
- #
1615
- # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
1616
- # are to be indexed.
1617
- # @example Reassigining index of a DataFrame
1618
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1619
- # df.index.to_a #=> [0,1,2,3]
1620
- #
1621
- # df.index = DaruLite::Index.new(['a','b','c','d'])
1622
- # df.index.to_a #=> ['a','b','c','d']
1623
- # df.row['a'].to_a #=> [1,11]
1624
def index=(idx)
  @index = Index.coerce(idx)

  # Every vector shares the same coerced index object.
  @data.each do |column|
    column.index = @index
  end

  self
end
1630
-
1631
- # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
1632
- #
1633
- # @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
1634
- # be indexed. Must of the same size as ncols.
1635
- # @example Reassigning vectors of a DataFrame
1636
- # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
1637
- # df.vectors.to_a #=> [:a, :b, :c]
1638
- #
1639
- # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
1640
- # df.vectors.to_a #=> [:foo, :bar, :baz]
1641
# @raise [ArgumentError] when +new_index+ is not a DaruLite::Index or its
#   size differs from the number of vectors.
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The trailing space matters: the two literals are concatenated verbatim.
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  # Keep each vector's own name in step with its new label.
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
1655
-
1656
- # Renames the vectors
1657
- #
1658
- # == Arguments
1659
- #
1660
- # * name_map - A hash where the keys are the existing vector names and
1661
- # the values are the new names. If a vector is renamed
1662
- # to a vector name that is already in use, the existing
1663
- # one is overwritten.
1664
- #
1665
- # == Usage
1666
- #
1667
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1668
- # df.rename_vectors :a => :alpha, :c => :gamma
1669
- # df.vectors.to_a #=> [:alpha, :b, :gamma]
1670
def rename_vectors(name_map)
  # Existing vectors whose names are about to be taken over are removed first.
  effective_renames = name_map.reject { |from, to| from == to }
  clobbered = effective_renames.values & vectors.to_a
  delete_vectors(*clobbered)

  renamed = vectors.to_a.map { |current| name_map[current] || current }
  self.vectors = DaruLite::Index.new(renamed)
end
1677
-
1678
- # Renames the vectors and returns itself
1679
- #
1680
- # == Arguments
1681
- #
1682
- # * name_map - A hash where the keys are the existing vector names and
1683
- # the values are the new names. If a vector is renamed
1684
- # to a vector name that is already in use, the existing
1685
- # one is overwritten.
1686
- #
1687
- # == Usage
1688
- #
1689
- # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1690
- # df.rename_vectors! :a => :alpha, :c => :gamma # df
1691
def rename_vectors!(name_map)
  # Same as #rename_vectors, but always hands back the DataFrame itself.
  tap { rename_vectors(name_map) }
end
1695
-
1696
# Converts the vectors to a DaruLite::MultiIndex.
# The argument passed is used as the MultiIndex's top level
def add_level_to_vectors(top_level_label)
  tuples = vectors.map do |label|
    [top_level_label, *label]
  end

  self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
end
1702
-
1703
# Return the indexes of all the numeric vectors. Will include vectors with nils
# along with numbers.
def numeric_vectors
  # FIXME: Why _with_index ?..
  each_vector_with_index.each_with_object([]) do |(vec, name), numeric_names|
    numeric_names << name if vec.numeric?
  end
end
1711
-
1712
# Names of the vectors that report themselves as numeric.
def numeric_vector_names
  @vectors.select do |name|
    self[name].numeric?
  end
end
1715
-
1716
# Return a DataFrame of only the numerical Vectors. If clone: false
# is specified as option, only a *view* of the Vectors will be
# returned. Defaults to clone: true.
#
# @param opts [Hash] only the :clone key is read.
def only_numerics(opts = {})
  cln = opts[:clone] != false

  # Scan for numeric vectors once and reuse the names for both the data
  # and the order of the resulting frame.
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  order = Index.new(names)
  DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
1726
-
1727
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
# @return [String] String containing the summary of the DataFrame
def summary
  report = "= #{name}"
  report << "\n Number of rows: #{nrows}"

  @vectors.each_with_object(report) do |vector_name, text|
    text << "\n Element:[#{vector_name}]\n"
    text << self[vector_name].summary(1)
  end
end
1738
-
1739
- # Sorts a dataframe (ascending/descending) in the given priority sequence of
1740
- # vectors, with or without a block.
1741
- #
1742
- # @param vector_order [Array] The order of vector names in which the DataFrame
1743
- # should be sorted.
1744
- # @param opts [Hash] opts The options to sort with.
1745
- # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1746
- # or descending order. Specify Array corresponding to *order* for multiple
1747
- # sort orders.
1748
- # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1749
- # to be used for sorting, for each vector name in *order* as a hash of
1750
- # vector name and lambda expressions. In case a lambda for a vector is not
1751
- # specified, the default will be used.
1752
- # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1753
- # automatically or not when a block is provided.
1754
- # If set to True, nils will appear at top after sorting.
1755
- #
1756
- # @example Sort a dataframe with a vector sequence.
1757
- #
1758
- #
1759
- # df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1760
- #
1761
- # df.sort [:a, :b]
1762
- # # =>
1763
- # # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1764
- # # a b
1765
- # # 2 1 3
1766
- # # 0 1 5
1767
- # # 3 2 2
1768
- # # 1 2 4
1769
- # # 4 3 1
1770
- #
1771
- # @example Sort a dataframe without a block. Here nils will be handled automatically.
1772
- #
1773
- # df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1774
- #
1775
- # df.sort([:a])
1776
- # # =>
1777
- # # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1778
- # # a b
1779
- # # 1 nil 3
1780
- # # 3 nil 1
1781
- # # 0 -3 4
1782
- # # 2 -1 2
1783
- # # 4 5 4
1784
- #
1785
- # @example Sort a dataframe with a block with nils handled automatically.
1786
- #
1787
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1788
- #
1789
- # df.sort [:b], by: {b: lambda { |a| a.length } }
1790
- # # NoMethodError: undefined method `length' for nil:NilClass
1791
- # # from (pry):8:in `block in __pry__'
1792
- #
1793
- # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1794
- #
1795
- # # =>
1796
- # # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1797
- # # a b
1798
- # # 2 1 nil
1799
- # # 5 1 nil
1800
- # # 4 -1 x
1801
- # # 1 -1 aa
1802
- # # 0 nil aaa
1803
- # # 3 nil baaa
1804
- #
1805
- # @example Sort a dataframe with a block with nils handled manually.
1806
- #
1807
- # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1808
- #
1809
- # # To print nils at the bottom one can use lambda { |a| (a.nil?)[1]:[0,a.length] }
1810
- # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1811
- #
1812
- # # =>
1813
- # #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1814
- # # a b
1815
- # # 4 -1 x
1816
- # # 1 -1 aa
1817
- # # 0 nil aaa
1818
- # # 3 nil baaa
1819
- # # 2 1 nil
1820
- # # 5 1 nil
1821
-
1822
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data,
  # map categories to integers preserving their order
  saved_categoricals = convert_categorical_vectors(vector_order)
  comparator = sort_prepare_block(vector_order, opts)

  positions = @index.size.times.sort(&comparator)
  reordered_index = @index.reorder(positions)

  # To reverse map mapping of categorical data to integers
  restore_categorical_vectors(saved_categoricals)

  @data.each { |column| column.reorder!(positions) }

  self.index = reordered_index

  self
end
1844
-
1845
# Non-destructive version of #sort!: sorts a duplicate and returns it.
def sort(vector_order, opts = {})
  copy = dup
  copy.sort!(vector_order, opts)
end
1849
-
1850
- # Pivots a data frame on specified vectors and applies an aggregate function
1851
- # to quickly generate a summary.
1852
- #
1853
- # == Options
1854
- #
1855
- # +:index+ - Keys to group by on the pivot table row index. Pass vector names
1856
- # contained in an Array.
1857
- #
1858
- # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
1859
- # names contained in an Array.
1860
- #
1861
- # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
1862
- # use any of the statistics functions applicable on Vectors that can be found in
1863
- # the DaruLite::Statistics::Vector module.
1864
- #
1865
- # +:values+ - Columns to aggregate. Will consider all numeric columns not
1866
- # specified in *:index* or *:vectors*. Optional.
1867
- #
1868
- # == Usage
1869
- #
1870
- # df = DaruLite::DataFrame.new({
1871
- # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
1872
- # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
1873
- # c: ['small','large','large','small','small','large','small','large','small'],
1874
- # d: [1,2,2,3,3,4,5,6,7],
1875
- # e: [2,4,4,6,6,8,10,12,14]
1876
- # })
1877
- # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
1878
- #
1879
- # #=>
1880
- # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
1881
- # # [:e, :one] [:e, :two]
1882
- # # [:bar] 18 26
1883
- # # [:foo] 10 12
1884
def pivot_table(opts = {})
  raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?

  row_keys = opts[:index]
  column_keys = opts[:vectors] || []
  aggregator = opts[:agg] || :mean

  values = prepare_pivot_values(row_keys, column_keys, opts)
  raise IndexError, 'No numeric vectors to aggregate' if values.empty?

  grouped = group_by(row_keys)
  # Without column keys the aggregated grouping is the result.
  return grouped.send(aggregator) if column_keys.empty?

  pivot_hash = make_pivot_hash(grouped, column_keys, values, aggregator)
  pivot_dataframe(pivot_hash)
end
1900
-
1901
# Merge vectors from two DataFrames. Repeated vector names are recoded
# through ArrayHelper.recode_repeated. The row index is carried over only
# when both frames have equal indexes.
#
# @raise [ArgumentError] when the two frames differ in number of rows.
# @return {DaruLite::DataFrame}
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)

  nrows.times do |position|
    merged.add_row(row[position].to_a + other_df.row[position].to_a)
  end

  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
1921
-
1922
- # Join 2 DataFrames with SQL style joins. Currently supports inner, left
1923
- # outer, right outer and full outer joins.
1924
- #
1925
- # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
1926
- # to be performed.
1927
- # @param [Hash] opts Options Hash
1928
- # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
1929
- # @option :on [Array] The columns on which the join is to be performed.
1930
- # Column names specified here must be common to both DataFrames.
1931
- # @option :indicator [Symbol] The name of a vector to add to the resultant
1932
- # dataframe that indicates whether the record was in the left (:left_only),
1933
- # right (:right_only), or both (:both) joining dataframes.
1934
- # @return [DaruLite::DataFrame]
1935
- # @example Inner Join
1936
- # left = DaruLite::DataFrame.new({
1937
- # :id => [1,2,3,4],
1938
- # :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
1939
- # })
1940
- # right = DaruLite::DataFrame.new({
1941
- # :id => [1,2,3,4],
1942
- # :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
1943
- # })
1944
- # left.join(right, how: :inner, on: [:name])
1945
- # #=>
1946
- # ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
1947
- # # id_1 name id_2
1948
- # # 0 1 Pirate 2
1949
- # # 1 3 Ninja 4
1950
- def join(other_df, opts = {})
1951
- DaruLite::Core::Merge.join(self, other_df, opts)
1952
- end
1953
-
1954
- # Creates a new dataset for one to many relations
1955
- # on a dataset, based on pattern of field names.
1956
- #
1957
- # for example, you have a survey for number of children
1958
- # with this structure:
1959
- # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1960
- # with
1961
- # ds.one_to_many([:id], "child_%v_%n")
1962
- # the field of first parameters will be copied verbatim
1963
- # to new dataset, and fields which responds to second
1964
- # pattern will be added one case for each different %n.
1965
- #
1966
- # @example
1967
- # cases=[
1968
- # ['1','george','red',10,'blue',20,nil,nil],
1969
- # ['2','fred','green',15,'orange',30,'white',20],
1970
- # ['3','alfred',nil,nil,nil,nil,nil,nil]
1971
- # ]
1972
- # ds=DaruLite::DataFrame.rows(cases, order:
1973
- # [:id, :name,
1974
- # :car_color1, :car_value1,
1975
- # :car_color2, :car_value2,
1976
- # :car_color3, :car_value3])
1977
- # ds.one_to_many([:id],'car_%v%n').to_matrix
1978
- # #=> Matrix[
1979
- # # ["red", "1", 10],
1980
- # # ["blue", "1", 20],
1981
- # # ["green", "2", 15],
1982
- # # ["orange", "2", 30],
1983
- # # ["white", "2", 20]
1984
- # # ]
1985
- def one_to_many(parent_fields, pattern)
1986
- vars, numbers = one_to_many_components(pattern)
1987
-
1988
- DataFrame.new([], order: [*parent_fields, '_col_id', *vars]).tap do |ds|
1989
- each_row do |row|
1990
- verbatim = parent_fields.map { |f| [f, row[f]] }.to_h
1991
- numbers.each do |n|
1992
- generated = one_to_many_row row, n, vars, pattern
1993
- next if generated.values.all?(&:nil?)
1994
-
1995
- ds.add_row(verbatim.merge(generated).merge('_col_id' => n))
1996
- end
1997
- end
1998
- ds.update
1999
- end
2000
- end
2001
-
2002
- def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
2003
- self[nm]
2004
- .split_by_separator(sep)
2005
- .each_with_index do |(k, v), i|
2006
- v.rename "#{nm}:#{k}"
2007
- self[:"#{nm}#{join}#{i + 1}"] = v
2008
- end
2009
- end
2010
-
2011
- # Create a SQL CREATE TABLE statement, based on a given Dataset
2012
- #
2013
- # == Arguments
2014
- #
2015
- # * table - String specifying name of the table that will be created in SQL.
2016
- # * charset - Character set. Default is "UTF8".
2017
- #
2018
- # @example
2019
- #
2020
- # ds = DaruLite::DataFrame.new({
2021
- # :id => DaruLite::Vector.new([1,2,3,4,5]),
2022
- # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
2023
- # })
2024
- # ds.create_sql('names')
2025
- # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
2026
- #
2027
- def create_sql(table, charset = 'UTF8')
2028
- sql = "CREATE TABLE #{table} ("
2029
- fields = vectors.to_a.collect do |f|
2030
- v = self[f]
2031
- "#{f} #{v.db_type}"
2032
- end
2033
-
2034
- sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
2035
- end
2036
-
2037
- # Returns the dataframe. This can be convenient when the user does not
2038
- # know whether the object is a vector or a dataframe.
2039
- # @return [self] the dataframe
2040
- def to_df
2041
- self
2042
- end
2043
-
2044
- # Convert all vectors of type *:numeric* into a Matrix.
2045
- def to_matrix
2046
- Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
2047
- end
2048
-
2049
- # Converts the DataFrame into an array of hashes where key is vector name
2050
- # and value is the corresponding element. The 0th index of the array contains
2051
- # the array of hashes while the 1st index contains the indexes of each row
2052
- # of the dataframe. Each element in the index array corresponds to its row
2053
- # in the array of hashes, which has the same index.
2054
- def to_a
2055
- [each_row.map(&:to_h), @index.to_a]
2056
- end
2057
-
2058
- # Convert to json. If no_index is true (the default) then the index will NOT be included
2059
- # in the JSON thus created.
2060
- def to_json(no_index = true)
2061
- if no_index
2062
- to_a[0].to_json
2063
- else
2064
- to_a.to_json
2065
- end
2066
- end
2067
-
2068
- # Converts DataFrame to a hash (explicit) with keys as vector names and values as
2069
- # the corresponding vectors.
2070
- def to_h
2071
- @vectors
2072
- .each_with_index
2073
- .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
2074
- end
316
+ def nest(*tree_keys, &block)
317
+ tree_keys = tree_keys[0] if tree_keys[0].is_a? Array
2075
318
 
2076
- # Convert to html for IRuby.
2077
- def to_html(threshold = DaruLite.max_rows)
2078
- table_thead = to_html_thead
2079
- table_tbody = to_html_tbody(threshold)
2080
- path = if index.is_a?(MultiIndex)
2081
- File.expand_path('iruby/templates/dataframe_mi.html.erb', __dir__)
2082
- else
2083
- File.expand_path('iruby/templates/dataframe.html.erb', __dir__)
2084
- end
2085
- ERB.new(File.read(path).strip).result(binding)
2086
- end
319
+ each_row.with_object({}) do |row, current|
320
+ # Create tree
321
+ *keys, last = tree_keys
322
+ current = keys.inject(current) { |c, f| c[row[f]] ||= {} }
323
+ name = row[last]
2087
324
 
2088
- def to_html_thead
2089
- table_thead_path =
2090
- if index.is_a?(MultiIndex)
2091
- File.expand_path('iruby/templates/dataframe_mi_thead.html.erb', __dir__)
325
+ if block
326
+ current[name] = yield(row, current, name)
2092
327
  else
2093
- File.expand_path('iruby/templates/dataframe_thead.html.erb', __dir__)
328
+ current[name] ||= []
329
+ current[name].push(row.to_h.delete_if { |key, _value| tree_keys.include? key })
2094
330
  end
2095
- ERB.new(File.read(table_thead_path).strip).result(binding)
331
+ end
2096
332
  end
2097
333
 
2098
- def to_html_tbody(threshold = DaruLite.max_rows)
2099
- threshold ||= @size
2100
- table_tbody_path =
2101
- if index.is_a?(MultiIndex)
2102
- File.expand_path('iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
2103
- else
2104
- File.expand_path('iruby/templates/dataframe_tbody.html.erb', __dir__)
2105
- end
2106
- ERB.new(File.read(table_tbody_path).strip).result(binding)
334
+ def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
335
+ self[name]
336
+ .split_by_separator(sep)
337
+ .each { |k, v| self[:"#{name}#{join}#{k}"] = v }
2107
338
  end
2108
339
 
2109
- def to_s
2110
- "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
340
+ # Return the number of rows and columns of the DataFrame in an Array.
341
+ def shape
342
+ [nrows, ncols]
2111
343
  end
2112
344
 
2113
- # Method for updating the metadata (i.e. missing value positions) of the
2114
- # vectors after assignment/deletion etc. are complete. This is provided so that
2115
- # time is not wasted in creating the metadata for the vector each time
2116
- # assignment/deletion of elements is done. Updating data this way is called
2117
- # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
2118
- def update
2119
- @data.each(&:update) if DaruLite.lazy_update
345
+ # The number of rows
346
+ def nrows
347
+ @index.size
2120
348
  end
2121
349
 
2122
- # Rename the DataFrame.
2123
- def rename(new_name)
2124
- @name = new_name
2125
- self
350
+ # The number of vectors
351
+ def ncols
352
+ @vectors.size
2126
353
  end
2127
354
 
2128
- alias name= rename
2129
-
2130
- # Write this DataFrame to a CSV file.
355
+ # Renames the vectors
2131
356
  #
2132
357
  # == Arguments
2133
358
  #
2134
- # * filename - Path of CSV file where the DataFrame is to be saved.
359
+ # * name_map - A hash where the keys are the existing vector names and
360
+ # the values are the new names. If a vector is renamed
361
+ # to a vector name that is already in use, the existing
362
+ # one is overwritten.
2135
363
  #
2136
- # == Options
364
+ # == Usage
2137
365
  #
2138
- # * convert_comma - If set to *true*, will convert any commas in any
2139
- # of the data to full stops ('.').
2140
- # All the options accepted by CSV.read() can also be passed into this
2141
- # function.
2142
- def write_csv(filename, opts = {})
2143
- DaruLite::IO.dataframe_write_csv self, filename, opts
2144
- end
366
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
367
+ # df.rename_vectors :a => :alpha, :c => :gamma
368
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
369
+ def rename_vectors(name_map)
370
+ existing_targets = name_map.reject { |k, v| k == v }.values & vectors.to_a
371
+ delete_vectors(*existing_targets)
2145
372
 
2146
- # Write this dataframe to an Excel Spreadsheet
2147
- #
2148
- # == Arguments
2149
- #
2150
- # * filename - The path of the file where the DataFrame should be written.
2151
- def write_excel(filename, opts = {})
2152
- DaruLite::IO.dataframe_write_excel self, filename, opts
373
+ new_names = vectors.to_a.map { |v| name_map[v] || v }
374
+ self.vectors = DaruLite::Index.new new_names
2153
375
  end
2154
376
 
2155
- # Insert each case of the Dataset on the selected table
377
+ # Renames the vectors and returns itself
2156
378
  #
2157
379
  # == Arguments
2158
380
  #
2159
- # * dbh - DBI database connection object.
2160
- # * query - Query string.
381
+ # * name_map - A hash where the keys are the existing vector names and
382
+ # the values are the new names. If a vector is renamed
383
+ # to a vector name that is already in use, the existing
384
+ # one is overwritten.
2161
385
  #
2162
386
  # == Usage
2163
387
  #
2164
- # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
2165
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
2166
- # ds.write_sql(dbh,"test")
2167
- def write_sql(dbh, table)
2168
- DaruLite::IO.dataframe_write_sql self, dbh, table
388
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
389
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
390
+ def rename_vectors!(name_map)
391
+ rename_vectors(name_map)
392
+ self
2169
393
  end
2170
394
 
2171
- # Use marshalling to save dataframe to a file.
2172
- def save(filename)
2173
- DaruLite::IO.save self, filename
395
+ # Converts the vectors to a DaruLite::MultiIndex.
396
+ # The argument passed is used as the MultiIndex's top level
397
+ def add_level_to_vectors(top_level_label)
398
+ tuples = vectors.map { |label| [top_level_label, *label] }
399
+ self.vectors = DaruLite::MultiIndex.from_tuples(tuples)
2174
400
  end
2175
401
 
2176
- def _dump(_depth)
2177
- Marshal.dump(
2178
- data: @data,
2179
- index: @index.to_a,
2180
- order: @vectors.to_a,
2181
- name: @name
2182
- )
402
+ def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
403
+ self[nm]
404
+ .split_by_separator(sep)
405
+ .each_with_index do |(k, v), i|
406
+ v.rename "#{nm}:#{k}"
407
+ self[:"#{nm}#{join}#{i + 1}"] = v
408
+ end
409
+ end
410
+
411
+ # Method for updating the metadata (i.e. missing value positions) of the
412
+ # vectors after assignment/deletion etc. are complete. This is provided so that
413
+ # time is not wasted in creating the metadata for the vector each time
414
+ # assignment/deletion of elements is done. Updating data this way is called
415
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
416
+ def update
417
+ @data.each(&:update) if DaruLite.lazy_update
2183
418
  end
2184
419
 
2185
- def self._load(data)
2186
- h = Marshal.load data
2187
- DaruLite::DataFrame.new(h[:data],
2188
- index: h[:index],
2189
- order: h[:order],
2190
- name: h[:name])
420
+ # Rename the DataFrame.
421
+ def rename(new_name)
422
+ @name = new_name
423
+ self
2191
424
  end
425
+ alias name= rename
2192
426
 
2193
427
  # Transpose a DataFrame, tranposing elements and row, column indexing.
2194
428
  def transpose
@@ -2204,7 +438,10 @@ module DaruLite
2204
438
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
2205
439
  def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
2206
440
  name_part = @name ? ": #{@name} " : ''
2207
- spacing = [headers.to_a.map(&:length).max, spacing].max
441
+ spacing = [
442
+ headers.to_a.map { |header| header.try(:length) || header.to_s.length }.max,
443
+ spacing
444
+ ].max
2208
445
 
2209
446
  "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}" +
2210
447
  Formatters::Table.format(
@@ -2216,11 +453,6 @@ module DaruLite
2216
453
  )
2217
454
  end
2218
455
 
2219
- # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
2220
- def where(bool_array)
2221
- DaruLite::Core::Query.df_where self, bool_array
2222
- end
2223
-
2224
456
  def ==(other)
2225
457
  self.class == other.class &&
2226
458
  @size == other.size &&
@@ -2274,144 +506,6 @@ module DaruLite
2274
506
  order: all_vectors.map(&:name)
2275
507
  end
2276
508
 
2277
- # Split the dataframe into many dataframes based on category vector
2278
- # @param [object] cat_name name of category vector to split the dataframe
2279
- # @return [Array] array of dataframes split by category with category vector
2280
- # used to split not included
2281
- # @example
2282
- # df = DaruLite::DataFrame.new({
2283
- # a: [1, 2, 3],
2284
- # b: ['a', 'a', 'b']
2285
- # })
2286
- # df.to_category :b
2287
- # df.split_by_category :b
2288
- # # => [#<DaruLite::DataFrame: a (2x1)>
2289
- # # a
2290
- # # 0 1
2291
- # # 1 2,
2292
- # # #<DaruLite::DataFrame: b (1x1)>
2293
- # # a
2294
- # # 2 3]
2295
- def split_by_category(cat_name)
2296
- cat_dv = self[cat_name]
2297
- raise ArgumentError, "#{cat_name} is not a category vector" unless
2298
- cat_dv.category?
2299
-
2300
- cat_dv.categories.map do |cat|
2301
- where(cat_dv.eq cat)
2302
- .rename(cat)
2303
- .delete_vector cat_name
2304
- end
2305
- end
2306
-
2307
- # @param indexes [Array] index(s) at which row tuples are retrieved
2308
- # @return [Array] returns array of row tuples at given index(s)
2309
- # @example Using DaruLite::Index
2310
- # df = DaruLite::DataFrame.new({
2311
- # a: [1, 2, 3],
2312
- # b: ['a', 'a', 'b']
2313
- # })
2314
- #
2315
- # df.access_row_tuples_by_indexs(1,2)
2316
- # # => [[2, "a"], [3, "b"]]
2317
- #
2318
- # df.index = DaruLite::Index.new([:one,:two,:three])
2319
- # df.access_row_tuples_by_indexs(:one,:three)
2320
- # # => [[1, "a"], [3, "b"]]
2321
- #
2322
- # @example Using DaruLite::MultiIndex
2323
- # mi_idx = DaruLite::MultiIndex.from_tuples [
2324
- # [:a,:one,:bar],
2325
- # [:a,:one,:baz],
2326
- # [:b,:two,:bar],
2327
- # [:a,:two,:baz],
2328
- # ]
2329
- # df_mi = DaruLite::DataFrame.new({
2330
- # a: 1..4,
2331
- # b: 'a'..'d'
2332
- # }, index: mi_idx )
2333
- #
2334
- # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2335
- # # => [[3, "c"]]
2336
- # df_mi.access_row_tuples_by_indexs(:a)
2337
- # # => [[1, "a"], [2, "b"], [4, "d"]]
2338
- def access_row_tuples_by_indexs(*indexes)
2339
- return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
2340
- @index.is_a?(DaruLite::MultiIndex)
2341
-
2342
- positions = @index.pos(*indexes)
2343
- if positions.is_a? Numeric
2344
- row = get_rows_for([positions])
2345
- row.first.is_a?(Array) ? row : [row]
2346
- else
2347
- new_rows = get_rows_for(indexes, by_position: false)
2348
- indexes.map { |index| new_rows.map { |r| r[index] } }
2349
- end
2350
- end
2351
-
2352
- # Function to use for aggregating the data.
2353
- #
2354
- # @param options [Hash] options for column, you want in resultant dataframe
2355
- #
2356
- # @return [DaruLite::DataFrame]
2357
- #
2358
- # @example
2359
- # df = DaruLite::DataFrame.new(
2360
- # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2361
- # => #<DaruLite::DataFrame(5x2)>
2362
- # col num
2363
- # 0 a 52
2364
- # 1 b 12
2365
- # 2 c 7
2366
- # 3 d 17
2367
- # 4 e 1
2368
- #
2369
- # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2370
- # => #<DaruLite::DataFrame(5x1)>
2371
- # num_100_ti
2372
- # 0 5200
2373
- # 1 1200
2374
- # 2 700
2375
- # 3 1700
2376
- # 4 100
2377
- #
2378
- # When we have duplicate index :
2379
- #
2380
- # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
2381
- # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2382
- # => #<DaruLite::DataFrame(5x1)>
2383
- # num
2384
- # a 52
2385
- # b 12
2386
- # a 7
2387
- # a 17
2388
- # c 1
2389
- #
2390
- # df.aggregate(num: :mean)
2391
- # => #<DaruLite::DataFrame(3x1)>
2392
- # num
2393
- # a 25.3333333
2394
- # b 12
2395
- # c 1
2396
- #
2397
- # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2398
- # internally.
2399
- def aggregate(options = {}, multi_index_level = -1)
2400
- if block_given?
2401
- positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
2402
- else
2403
- positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
2404
- end
2405
-
2406
- colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
2407
-
2408
- DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
2409
- end
2410
-
2411
- def group_by_and_aggregate(*group_by_keys, **aggregation_map)
2412
- group_by(*group_by_keys).aggregate(aggregation_map)
2413
- end
2414
-
2415
509
  private
2416
510
 
2417
511
  def headers
@@ -2422,20 +516,6 @@ module DaruLite
2422
516
  index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
2423
517
  end
2424
518
 
2425
- def convert_categorical_vectors(names)
2426
- names.filter_map do |n|
2427
- next unless self[n].category?
2428
-
2429
- old = [n, self[n]]
2430
- self[n] = DaruLite::Vector.new(self[n].to_ints)
2431
- old
2432
- end
2433
- end
2434
-
2435
- def restore_categorical_vectors(old)
2436
- old.each { |name, vector| self[name] = vector }
2437
- end
2438
-
2439
519
  def recursive_product(dfs)
2440
520
  return dfs.first if dfs.size == 1
2441
521
 
@@ -2447,12 +527,6 @@ module DaruLite
2447
527
  end
2448
528
  end
2449
529
 
2450
- def should_be_vector!(val)
2451
- return val if val.is_a?(DaruLite::Vector)
2452
-
2453
- raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
2454
- end
2455
-
2456
530
  def dispatch_to_axis(axis, method, *args, &block)
2457
531
  if %i[vector column].include?(axis)
2458
532
  send(:"#{method}_vector", *args, &block)
@@ -2483,76 +557,6 @@ module DaruLite
2483
557
  end
2484
558
  end
2485
559
 
2486
- def access_vector(*names)
2487
- if names.first.is_a?(Range)
2488
- dup(@vectors.subset(names.first))
2489
- elsif @vectors.is_a?(MultiIndex)
2490
- access_vector_multi_index(*names)
2491
- else
2492
- access_vector_single_index(*names)
2493
- end
2494
- end
2495
-
2496
- def access_vector_multi_index(*names)
2497
- pos = @vectors[names]
2498
-
2499
- return @data[pos] if pos.is_a?(Integer)
2500
-
2501
- new_vectors = pos.map { |tuple| @data[@vectors[tuple]] }
2502
-
2503
- pos = pos.drop_left_level(names.size) if names.size < @vectors.width
2504
-
2505
- DaruLite::DataFrame.new(new_vectors, index: @index, order: pos)
2506
- end
2507
-
2508
- def access_vector_single_index(*names)
2509
- if names.count < 2
2510
- begin
2511
- pos = @vectors.is_a?(DaruLite::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
2512
- rescue IndexError
2513
- raise IndexError, "Specified vector #{names.first} does not exist"
2514
- end
2515
- return @data[pos] if pos.is_a?(Numeric)
2516
-
2517
- names = pos
2518
- end
2519
-
2520
- new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
2521
-
2522
- order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
2523
- DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
2524
- end
2525
-
2526
- def access_row(*indexes)
2527
- positions = @index.pos(*indexes)
2528
-
2529
- if positions.is_a? Numeric
2530
- row = get_rows_for([positions])
2531
- DaruLite::Vector.new row, index: @vectors, name: indexes.first
2532
- else
2533
- new_rows = get_rows_for(indexes, by_position: false)
2534
- DaruLite::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
2535
- end
2536
- end
2537
-
2538
- # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
2539
- # because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
2540
- # values (representing a row) or an array of Vectors (that can be seen as rows)
2541
- def get_rows_for(keys, by_position: true)
2542
- raise unless keys.is_a?(Array)
2543
-
2544
- if by_position
2545
- pos = keys
2546
- @data.map { |vector| vector.at(*pos) }
2547
- else
2548
- # TODO: for now (2018-07-27), it is different than using
2549
- # get_rows_for(@index.pos(*keys))
2550
- # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
2551
- indexes = keys
2552
- @data.map { |vec| vec[*indexes] }
2553
- end
2554
- end
2555
-
2556
560
  def insert_or_modify_vector(name, vector)
2557
561
  name = name[0] unless @vectors.is_a?(MultiIndex)
2558
562
 
@@ -2835,146 +839,6 @@ module DaruLite
2835
839
  end
2836
840
  end
2837
841
 
2838
- def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
2839
- # Create an array to be used for comparison of two rows in sorting
2840
- vector_locs
2841
- .zip(by_blocks, ascending, handle_nils)
2842
- .map do |vector_loc, by, asc, handle_nil|
2843
- value = @data[vector_loc].data[asc ? r1 : r2]
2844
-
2845
- if by
2846
- value = begin
2847
- by.call(value)
2848
- rescue StandardError
2849
- nil
2850
- end
2851
- end
2852
-
2853
- sort_handle_nils value, asc, handle_nil || !by
2854
- end
2855
- end
2856
-
2857
- def sort_handle_nils(value, asc, handle_nil)
2858
- if !handle_nil
2859
- value
2860
- elsif asc
2861
- [value.nil? ? 0 : 1, value]
2862
- else
2863
- [value.nil? ? 1 : 0, value]
2864
- end
2865
- end
2866
-
2867
- def sort_coerce_boolean(opts, symbol, default, size)
2868
- val = opts[symbol]
2869
- case val
2870
- when true, false
2871
- Array.new(size, val)
2872
- when nil
2873
- Array.new(size, default)
2874
- when Array
2875
- raise ArgumentError, "Specify same number of vector names and #{symbol}" if
2876
- size != val.size
2877
-
2878
- val
2879
- else
2880
- raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
2881
- end
2882
- end
2883
-
2884
- def sort_prepare_block(vector_order, opts)
2885
- ascending = sort_coerce_boolean opts, :ascending, true, vector_order.size
2886
- handle_nils = sort_coerce_boolean opts, :handle_nils, false, vector_order.size
2887
-
2888
- by_blocks = vector_order.map { |v| (opts[:by] || {})[v] }
2889
- vector_locs = vector_order.map { |v| @vectors[v] }
2890
-
2891
- lambda do |index1, index2|
2892
- # Build left and right array to compare two rows
2893
- left = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index1, index2
2894
- right = sort_build_row vector_locs, by_blocks, ascending, handle_nils, index2, index1
2895
-
2896
- # Resolve conflict by Index if all attributes are same
2897
- left << index1
2898
- right << index2
2899
- left <=> right
2900
- end
2901
- end
2902
-
2903
- def verify_error_message(row, test, id, i)
2904
- description, fields, = test
2905
- values = fields.empty? ? '' : " (#{fields.collect { |k| "#{k}=#{row[k]}" }.join(', ')})"
2906
- "#{i + 1} [#{row[id]}]: #{description}#{values}"
2907
- end
2908
-
2909
- def prepare_pivot_values(index, vectors, opts)
2910
- case opts[:values]
2911
- when nil # values not specified at all.
2912
- (@vectors.to_a - (index | vectors)) & numeric_vector_names
2913
- when Array # multiple values specified.
2914
- opts[:values]
2915
- else # single value specified.
2916
- [opts[:values]]
2917
- end
2918
- end
2919
-
2920
- def make_pivot_hash(grouped, vectors, values, aggregate_function)
2921
- grouped.groups.transform_values { |_| {} }.tap do |super_hash|
2922
- values.each do |value|
2923
- grouped.groups.each do |group_name, row_numbers|
2924
- row_numbers.each do |num|
2925
- arry = [value, *vectors.map { |v| self[v][num] }]
2926
- sub_hash = super_hash[group_name]
2927
- sub_hash[arry] ||= []
2928
-
2929
- sub_hash[arry] << self[value][num]
2930
- end
2931
- end
2932
- end
2933
-
2934
- setup_pivot_aggregates super_hash, aggregate_function
2935
- end
2936
- end
2937
-
2938
- def setup_pivot_aggregates(super_hash, aggregate_function)
2939
- super_hash.each_value do |sub_hash|
2940
- sub_hash.each do |group_name, aggregates|
2941
- sub_hash[group_name] = DaruLite::Vector.new(aggregates).send(aggregate_function)
2942
- end
2943
- end
2944
- end
2945
-
2946
- def pivot_dataframe(super_hash)
2947
- df_index = DaruLite::MultiIndex.from_tuples super_hash.keys
2948
- df_vectors = DaruLite::MultiIndex.from_tuples super_hash.values.flat_map(&:keys).uniq
2949
-
2950
- DaruLite::DataFrame.new({}, index: df_index, order: df_vectors).tap do |pivoted_dataframe|
2951
- super_hash.each do |row_index, sub_h|
2952
- sub_h.each do |vector_index, val|
2953
- pivoted_dataframe[vector_index][row_index] = val
2954
- end
2955
- end
2956
- end
2957
- end
2958
-
2959
- def one_to_many_components(pattern)
2960
- re = Regexp.new pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
2961
-
2962
- vars, numbers =
2963
- @vectors
2964
- .map { |v| v.scan(re) }
2965
- .reject(&:empty?).flatten(1).transpose
2966
-
2967
- [vars.uniq, numbers.map(&:to_i).sort.uniq]
2968
- end
2969
-
2970
- def one_to_many_row(row, number, vars, pattern)
2971
- vars
2972
- .to_h do |v|
2973
- name = pattern.sub('%v', v).sub('%n', number.to_s)
2974
- [v, row[name]]
2975
- end
2976
- end
2977
-
2978
842
  # Raises IndexError when one of the positions is not a valid position
2979
843
  def validate_positions(*positions, size)
2980
844
  positions.each do |pos|
@@ -2999,82 +863,5 @@ module DaruLite
2999
863
  DaruLite::Vector.new(source[idx], index: @index, name: vectors[idx])
3000
864
  end
3001
865
  end
3002
-
3003
- def aggregate_by_positions_tuples(options, positions_tuples)
3004
- agg_over_vectors_only, options = cast_aggregation_options(options)
3005
-
3006
- if agg_over_vectors_only
3007
- options.map do |vect_name, method|
3008
- vect = self[vect_name]
3009
-
3010
- positions_tuples.map do |positions|
3011
- vect.apply_method_on_sub_vector(method, keys: positions)
3012
- end
3013
- end
3014
- else
3015
- methods = options.values
3016
-
3017
- # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
3018
- rows = positions_tuples.map do |positions|
3019
- apply_method_on_sub_df(methods, keys: positions)
3020
- end
3021
-
3022
- rows.transpose
3023
- end
3024
- end
3025
-
3026
- # convert operations over sub-vectors to operations over sub-dfs when it improves perf
3027
- # note: we don't always "cast" because aggregation over a single vector / a few vector is faster
3028
- # than aggregation over (sub-)dfs
3029
- def cast_aggregation_options(options)
3030
- vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
3031
-
3032
- over_vectors = true
3033
-
3034
- if non_vects.any?
3035
- options = options.clone
3036
-
3037
- vects.each do |name|
3038
- proc_on_vect = options[name].to_proc
3039
- options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
3040
- end
3041
-
3042
- over_vectors = false
3043
- end
3044
-
3045
- [over_vectors, options]
3046
- end
3047
-
3048
- def group_index_for_aggregation(index, multi_index_level = -1)
3049
- case index
3050
- when DaruLite::MultiIndex
3051
- groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
3052
-
3053
- new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
3054
- pos_tuples = groups_by_pos.values
3055
- when DaruLite::Index, DaruLite::CategoricalIndex
3056
- new_index = Array(index).uniq
3057
- pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
3058
- else raise
3059
- end
3060
-
3061
- [pos_tuples, new_index]
3062
- end
3063
-
3064
- # coerce ranges, integers and array in appropriate ways
3065
- def coerce_positions(*positions, size)
3066
- if positions.size == 1
3067
- case positions.first
3068
- when Integer
3069
- positions.first
3070
- when Range
3071
- size.times.to_a[positions.first]
3072
- else
3073
- raise ArgumentError, 'Unknown position type.'
3074
- end
3075
- else
3076
- positions
3077
- end
3078
- end
3079
866
  end
3080
867
  end