daru_lite 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +35 -33
  3. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  4. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  5. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  6. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  7. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  8. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  9. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  10. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  11. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  12. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  13. data/lib/daru_lite/data_frame/missable.rb +75 -0
  14. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  15. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  16. data/lib/daru_lite/data_frame/setable.rb +109 -0
  17. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  18. data/lib/daru_lite/dataframe.rb +138 -2353
  19. data/lib/daru_lite/index/index.rb +14 -1
  20. data/lib/daru_lite/index/multi_index.rb +9 -0
  21. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  22. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  23. data/lib/daru_lite/vector/calculatable.rb +78 -0
  24. data/lib/daru_lite/vector/convertible.rb +77 -0
  25. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  26. data/lib/daru_lite/vector/fetchable.rb +175 -0
  27. data/lib/daru_lite/vector/filterable.rb +128 -0
  28. data/lib/daru_lite/vector/indexable.rb +77 -0
  29. data/lib/daru_lite/vector/iterable.rb +95 -0
  30. data/lib/daru_lite/vector/joinable.rb +17 -0
  31. data/lib/daru_lite/vector/missable.rb +124 -0
  32. data/lib/daru_lite/vector/queryable.rb +45 -0
  33. data/lib/daru_lite/vector/setable.rb +47 -0
  34. data/lib/daru_lite/vector/sortable.rb +113 -0
  35. data/lib/daru_lite/vector.rb +36 -932
  36. data/lib/daru_lite/version.rb +1 -1
  37. data/spec/data_frame/aggregatable_example.rb +65 -0
  38. data/spec/data_frame/buildable_example.rb +109 -0
  39. data/spec/data_frame/calculatable_example.rb +135 -0
  40. data/spec/data_frame/convertible_example.rb +180 -0
  41. data/spec/data_frame/duplicatable_example.rb +111 -0
  42. data/spec/data_frame/fetchable_example.rb +476 -0
  43. data/spec/data_frame/filterable_example.rb +409 -0
  44. data/spec/data_frame/indexable_example.rb +221 -0
  45. data/spec/data_frame/iterable_example.rb +465 -0
  46. data/spec/data_frame/joinable_example.rb +106 -0
  47. data/spec/data_frame/missable_example.rb +47 -0
  48. data/spec/data_frame/pivotable_example.rb +297 -0
  49. data/spec/data_frame/queryable_example.rb +92 -0
  50. data/spec/data_frame/setable_example.rb +482 -0
  51. data/spec/data_frame/sortable_example.rb +350 -0
  52. data/spec/dataframe_spec.rb +181 -3289
  53. data/spec/index/categorical_index_spec.rb +27 -8
  54. data/spec/index/index_spec.rb +21 -0
  55. data/spec/index/multi_index_spec.rb +85 -76
  56. data/spec/vector/aggregatable_example.rb +27 -0
  57. data/spec/vector/calculatable_example.rb +82 -0
  58. data/spec/vector/convertible_example.rb +126 -0
  59. data/spec/vector/duplicatable_example.rb +48 -0
  60. data/spec/vector/fetchable_example.rb +463 -0
  61. data/spec/vector/filterable_example.rb +165 -0
  62. data/spec/vector/indexable_example.rb +201 -0
  63. data/spec/vector/iterable_example.rb +111 -0
  64. data/spec/vector/joinable_example.rb +25 -0
  65. data/spec/vector/missable_example.rb +88 -0
  66. data/spec/vector/queryable_example.rb +91 -0
  67. data/spec/vector/setable_example.rb +300 -0
  68. data/spec/vector/sortable_example.rb +242 -0
  69. data/spec/vector_spec.rb +111 -1805
  70. metadata +86 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4f11fb1ea9fb6c3f43f5f8267338ed4cc81fde2f60bcc82fd6eec6e30b5d87d5
4
- data.tar.gz: e8a833062efdf095672da6684d3971a5e1eafa482b658ae7436061ad4d9ff1c7
3
+ metadata.gz: fe2a3da2352dd68322a62e048c7870ad34f243a31bcd0cf18725b15583e4c5e8
4
+ data.tar.gz: 889d0714e1188240f43dc741f2b58b0ce3fe8731a07b39ee4d366f54ce301f6b
5
5
  SHA512:
6
- metadata.gz: 25dc59ffa9fb012693cf7bd611e19d134d4f003ee3ff5c642a574cb4bfebc5001a8aacd8b42982001b585233b370017e440c62be9dc6389f371423edc36c058b
7
- data.tar.gz: f7373e8a86deb1f766cfb56f759cce92accfad2b571dac138d6d2eb54c2bb746b9cb6a11854891ba4137740208310837cb06edef4680cc2d4319260b12a1c917
6
+ metadata.gz: 370f64692790e1661642c2e40ec4b56baa8fd8ac000be5623293013b169b7d96c0e5d5fe122801bbea40945761710b0b94c148fae27c67a42ab1825683416876
7
+ data.tar.gz: ebb89b2307548a0deaeda39e8e72a899a0f4e20c0e5a5ee16515c7aaea5ac95e8f7da3dc752b80a7dba64ec2f4e19717ba3703c06ee0793111c36da968ed5b02
data/.rubocop_todo.yml CHANGED
@@ -1,11 +1,19 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2022-08-16 13:20:50 UTC using RuboCop version 1.35.0.
3
+ # on 2024-03-03 13:59:21 UTC using RuboCop version 1.60.2.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
+ # Offense count: 1
10
+ # This cop supports safe autocorrection (--autocorrect).
11
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
12
+ # SupportedStyles: aligned, indented
13
+ Layout/LineEndStringConcatenationIndentation:
14
+ Exclude:
15
+ - 'lib/daru_lite/data_frame/indexable.rb'
16
+
9
17
  # Offense count: 1
10
18
  # Configuration parameters: AllowComments.
11
19
  Lint/EmptyClass:
@@ -13,6 +21,7 @@ Lint/EmptyClass:
13
21
  - 'lib/daru_lite/accessors/mdarray_wrapper.rb'
14
22
 
15
23
  # Offense count: 5
24
+ # Configuration parameters: AllowedParentClasses.
16
25
  Lint/MissingSuper:
17
26
  Exclude:
18
27
  - 'lib/daru_lite/date_time/offsets.rb'
@@ -20,61 +29,50 @@ Lint/MissingSuper:
20
29
  - 'lib/daru_lite/index/index.rb'
21
30
  - 'lib/daru_lite/index/multi_index.rb'
22
31
 
23
- # Offense count: 6
32
+ # Offense count: 5
33
+ # This cop supports safe autocorrection (--autocorrect).
24
34
  # Configuration parameters: CheckForMethodsWithNoSideEffects.
25
35
  Lint/Void:
26
36
  Exclude:
27
37
  - 'lib/daru_lite/category.rb'
28
- - 'lib/daru_lite/dataframe.rb'
38
+ - 'lib/daru_lite/data_frame/indexable.rb'
29
39
  - 'lib/daru_lite/vector.rb'
30
40
 
31
- # Offense count: 40
32
- # Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredMethods, CountRepeatedAttributes.
41
+ # Offense count: 41
42
+ # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
33
43
  Metrics/AbcSize:
34
44
  Max: 34
35
45
 
36
- # Offense count: 3
46
+ # Offense count: 5
37
47
  # Configuration parameters: CountComments, CountAsOne.
38
48
  Metrics/ClassLength:
39
- Max: 189
49
+ Max: 188
40
50
 
41
51
  # Offense count: 6
42
- # Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredMethods.
52
+ # Configuration parameters: AllowedMethods, AllowedPatterns.
43
53
  Metrics/CyclomaticComplexity:
44
54
  Max: 9
45
55
 
46
- # Offense count: 61
47
- # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, AllowedMethods, AllowedPatterns, IgnoredMethods.
56
+ # Offense count: 60
57
+ # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
48
58
  Metrics/MethodLength:
49
59
  Max: 15
50
60
 
51
- # Offense count: 2
61
+ # Offense count: 4
52
62
  # Configuration parameters: CountComments, CountAsOne.
53
63
  Metrics/ModuleLength:
54
64
  Max: 190
55
65
 
56
66
  # Offense count: 4
57
- # Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredMethods.
67
+ # Configuration parameters: AllowedMethods, AllowedPatterns.
58
68
  Metrics/PerceivedComplexity:
59
69
  Max: 10
60
70
 
61
- # Offense count: 72
71
+ # Offense count: 66
62
72
  # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
63
- # AllowedNames: at, by, db, id, in, io, ip, of, on, os, pp, to
73
+ # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
64
74
  Naming/MethodParameterName:
65
- Exclude:
66
- - 'lib/daru_lite/category.rb'
67
- - 'lib/daru_lite/core/group_by.rb'
68
- - 'lib/daru_lite/core/merge.rb'
69
- - 'lib/daru_lite/core/query.rb'
70
- - 'lib/daru_lite/dataframe.rb'
71
- - 'lib/daru_lite/date_time/index.rb'
72
- - 'lib/daru_lite/date_time/offsets.rb'
73
- - 'lib/daru_lite/extensions/which_dsl.rb'
74
- - 'lib/daru_lite/io/io.rb'
75
- - 'lib/daru_lite/maths/statistics/dataframe.rb'
76
- - 'lib/daru_lite/maths/statistics/vector.rb'
77
- - 'lib/daru_lite/vector.rb'
75
+ Enabled: false
78
76
 
79
77
  # Offense count: 5
80
78
  # Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros.
@@ -85,13 +83,14 @@ Naming/MethodParameterName:
85
83
  Naming/PredicateName:
86
84
  Exclude:
87
85
  - 'spec/**/*'
88
- - 'lib/daru_lite/dataframe.rb'
86
+ - 'lib/daru_lite/data_frame/missable.rb'
87
+ - 'lib/daru_lite/data_frame/queryable.rb'
89
88
  - 'lib/daru_lite/vector.rb'
90
89
 
91
90
  # Offense count: 5
92
91
  Security/MarshalLoad:
93
92
  Exclude:
94
- - 'lib/daru_lite/dataframe.rb'
93
+ - 'lib/daru_lite/data_frame/i_o_able.rb'
95
94
  - 'lib/daru_lite/date_time/index.rb'
96
95
  - 'lib/daru_lite/index/index.rb'
97
96
  - 'lib/daru_lite/io/io.rb'
@@ -102,7 +101,7 @@ Style/ClassVars:
102
101
  Exclude:
103
102
  - 'lib/daru_lite.rb'
104
103
 
105
- # Offense count: 44
104
+ # Offense count: 58
106
105
  # Configuration parameters: AllowedConstants.
107
106
  Style/Documentation:
108
107
  Enabled: false
@@ -113,6 +112,10 @@ Style/MapToHash:
113
112
  Exclude:
114
113
  - 'lib/daru_lite/category.rb'
115
114
  - 'lib/daru_lite/core/group_by.rb'
115
+ - 'lib/daru_lite/data_frame/convertible.rb'
116
+ - 'lib/daru_lite/data_frame/duplicatable.rb'
117
+ - 'lib/daru_lite/data_frame/fetchable.rb'
118
+ - 'lib/daru_lite/data_frame/joinable.rb'
116
119
  - 'lib/daru_lite/dataframe.rb'
117
120
 
118
121
  # Offense count: 1
@@ -125,7 +128,7 @@ Style/MultilineBlockChain:
125
128
  # AllowedMethods: respond_to_missing?
126
129
  Style/OptionalBooleanParameter:
127
130
  Exclude:
128
- - 'lib/daru_lite/dataframe.rb'
131
+ - 'lib/daru_lite/data_frame/convertible.rb'
129
132
  - 'lib/daru_lite/maths/statistics/vector.rb'
130
133
  - 'lib/daru_lite/vector.rb'
131
134
 
@@ -133,5 +136,4 @@ Style/OptionalBooleanParameter:
133
136
  # This cop supports unsafe autocorrection (--autocorrect-all).
134
137
  Style/RedundantSelfAssignment:
135
138
  Exclude:
136
- - 'lib/daru_lite/dataframe.rb'
137
-
139
+ - 'lib/daru_lite/data_frame/joinable.rb'
@@ -0,0 +1,165 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Aggregatable
4
+ # Group elements by vector to perform operations on them. Returns a
5
+ # DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
6
+ # list of possible operations.
7
+ #
8
+ # == Arguments
9
+ #
10
+ # * vectors - An Array containing names of vectors to group by.
11
+ #
12
+ # == Usage
13
+ #
14
+ # df = DaruLite::DataFrame.new({
15
+ # a: %w{foo bar foo bar foo bar foo foo},
16
+ # b: %w{one one two three two two one three},
17
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
18
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
19
+ # })
20
+ # df.group_by([:a,:b,:c]).groups
21
+ # #=> {["bar", "one", 2]=>[1],
22
+ # # ["bar", "three", 1]=>[3],
23
+ # # ["bar", "two", 6]=>[5],
24
+ # # ["foo", "one", 1]=>[0],
25
+ # # ["foo", "one", 3]=>[6],
26
+ # # ["foo", "three", 8]=>[7],
27
+ # # ["foo", "two", 3]=>[2, 4]}
28
+ def group_by(*vectors)
29
+ vectors.flatten!
30
+ missing = vectors - @vectors.to_a
31
+ raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
32
+
33
+ vectors = [@vectors.first] if vectors.empty?
34
+
35
+ DaruLite::Core::GroupBy.new(self, vectors)
36
+ end
37
+
38
+ # Function to use for aggregating the data.
39
+ #
40
+ # @param options [Hash] options for column, you want in resultant dataframe
41
+ #
42
+ # @return [DaruLite::DataFrame]
43
+ #
44
+ # @example
45
+ # df = DaruLite::DataFrame.new(
46
+ # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
47
+ # => #<DaruLite::DataFrame(5x2)>
48
+ # col num
49
+ # 0 a 52
50
+ # 1 b 12
51
+ # 2 c 7
52
+ # 3 d 17
53
+ # 4 e 1
54
+ #
55
+ # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
56
+ # => #<DaruLite::DataFrame(5x1)>
57
+ # num_100_ti
58
+ # 0 5200
59
+ # 1 1200
60
+ # 2 700
61
+ # 3 1700
62
+ # 4 100
63
+ #
64
+ # When we have duplicate index :
65
+ #
66
+ # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
67
+ # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
68
+ # => #<DaruLite::DataFrame(5x1)>
69
+ # num
70
+ # a 52
71
+ # b 12
72
+ # a 7
73
+ # a 17
74
+ # c 1
75
+ #
76
+ # df.aggregate(num: :mean)
77
+ # => #<DaruLite::DataFrame(3x1)>
78
+ # num
79
+ # a 25.3333333
80
+ # b 12
81
+ # c 1
82
+ #
83
+ # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
84
+ # internally.
85
+ def aggregate(options = {}, multi_index_level = -1)
86
+ if block_given?
87
+ positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
88
+ else
89
+ positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
90
+ end
91
+
92
+ colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
93
+
94
+ DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
95
+ end
96
+
97
+ def group_by_and_aggregate(*group_by_keys, **aggregation_map)
98
+ group_by(*group_by_keys).aggregate(aggregation_map)
99
+ end
100
+
101
+ private
102
+
103
+ def aggregate_by_positions_tuples(options, positions_tuples)
104
+ agg_over_vectors_only, options = cast_aggregation_options(options)
105
+
106
+ if agg_over_vectors_only
107
+ options.map do |vect_name, method|
108
+ vect = self[vect_name]
109
+
110
+ positions_tuples.map do |positions|
111
+ vect.apply_method_on_sub_vector(method, keys: positions)
112
+ end
113
+ end
114
+ else
115
+ methods = options.values
116
+
117
+ # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
118
+ rows = positions_tuples.map do |positions|
119
+ apply_method_on_sub_df(methods, keys: positions)
120
+ end
121
+
122
+ rows.transpose
123
+ end
124
+ end
125
+
126
+ # convert operations over sub-vectors to operations over sub-dfs when it improves perf
127
+ # note: we don't always "cast" because aggregation over a single vector / a few vector is faster
128
+ # than aggregation over (sub-)dfs
129
+ def cast_aggregation_options(options)
130
+ vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
131
+
132
+ over_vectors = true
133
+
134
+ if non_vects.any?
135
+ options = options.clone
136
+
137
+ vects.each do |name|
138
+ proc_on_vect = options[name].to_proc
139
+ options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
140
+ end
141
+
142
+ over_vectors = false
143
+ end
144
+
145
+ [over_vectors, options]
146
+ end
147
+
148
+ def group_index_for_aggregation(index, multi_index_level = -1)
149
+ case index
150
+ when DaruLite::MultiIndex
151
+ groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
152
+
153
+ new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
154
+ pos_tuples = groups_by_pos.values
155
+ when DaruLite::Index, DaruLite::CategoricalIndex
156
+ new_index = Array(index).uniq
157
+ pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
158
+ else raise
159
+ end
160
+
161
+ [pos_tuples, new_index]
162
+ end
163
+ end
164
+ end
165
+ end
@@ -0,0 +1,140 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Calculatable
4
+ # Sum all numeric/specified vectors in the DataFrame.
5
+ #
6
+ # Returns a new vector containing the sum of all numeric
7
+ # or specified vectors of the DataFrame. By default, if the vector
8
+ # contains a nil, the sum is nil.
9
+ # With :skipnil argument set to true, nil values are assumed to be
10
+ # 0 (zero) and the sum vector is returned.
11
+ #
12
+ # @param args [Array] List of vectors to sum. Default is nil in which case
13
+ # all numeric vectors are summed.
14
+ #
15
+ # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
16
+ #
17
+ # @return Vector with sum of all vectors specified in the argument.
18
+ # If vecs parameter is empty, sum all numeric vectors.
19
+ #
20
+ # @example
21
+ # df = DaruLite::DataFrame.new({
22
+ # a: [1, 2, nil],
23
+ # b: [2, 1, 3],
24
+ # c: [1, 1, 1]
25
+ # })
26
+ # => #<DaruLite::DataFrame(3x3)>
27
+ # a b c
28
+ # 0 1 2 1
29
+ # 1 2 1 1
30
+ # 2 nil 3 1
31
+ # df.vector_sum [:a, :c]
32
+ # => #<DaruLite::Vector(3)>
33
+ # 0 2
34
+ # 1 3
35
+ # 2 nil
36
+ # df.vector_sum
37
+ # => #<DaruLite::Vector(3)>
38
+ # 0 4
39
+ # 1 4
40
+ # 2 nil
41
+ # df.vector_sum skipnil: true
42
+ # => #<DaruLite::Vector(3)>
43
+ # c
44
+ # 0 4
45
+ # 1 4
46
+ # 2 4
47
+ #
48
+ def vector_sum(*args)
49
+ defaults = { vecs: nil, skipnil: false }
50
+ options = args.last.is_a?(::Hash) ? args.pop : {}
51
+ options = defaults.merge(options)
52
+ vecs = args[0] || options[:vecs]
53
+ skipnil = args[1] || options[:skipnil]
54
+
55
+ vecs ||= numeric_vectors
56
+ sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
57
+ vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
58
+ end
59
+
60
+ # Calculate mean of the rows of the dataframe.
61
+ #
62
+ # == Arguments
63
+ #
64
+ # * +max_missing+ - The maximum number of elements in the row that can be
65
+ # missing for the mean calculation to happen. Defaults to 0.
66
+ def vector_mean(max_missing = 0)
67
+ # FIXME: in vector_sum we preserve created vector dtype, but
68
+ # here we are not. Is this by design or ...? - zverok, 2016-05-18
69
+ mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
70
+
71
+ each_row_with_index.with_object(mean_vec) do |(row, i), memo|
72
+ memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
73
+ end
74
+ end
75
+
76
+ # Returns a vector, based on a string with a calculation based
77
+ # on vector.
78
+ #
79
+ # The calculation will be eval'ed, so you can put any variable
80
+ # or expression valid on ruby.
81
+ #
82
+ # For example:
83
+ # a = DaruLite::Vector.new [1,2]
84
+ # b = DaruLite::Vector.new [3,4]
85
+ # ds = DaruLite::DataFrame.new({:a => a,:b => b})
86
+ # ds.compute("a+b")
87
+ # => Vector [4,6]
88
+ def compute(text, &block)
89
+ return instance_eval(&block) if block
90
+
91
+ instance_eval(text)
92
+ end
93
+
94
+ # DSL for yielding each row and returning a DaruLite::Vector based on the
95
+ # value each run of the block returns.
96
+ #
97
+ # == Usage
98
+ #
99
+ # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
100
+ # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
101
+ # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
102
+ # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
103
+ # total = ds.vector_by_calculation { a + b + c }
104
+ # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
105
+ # # nil
106
+ # # 0 111
107
+ # # 1 222
108
+ # # 2 333
109
+ # # 3 444
110
+ # # 4 555
111
+ # # 5 666
112
+ # # 6 777
113
+ def vector_by_calculation(&block)
114
+ a = each_row.map { |r| r.instance_eval(&block) }
115
+
116
+ DaruLite::Vector.new a, index: @index
117
+ end
118
+
119
+ def vector_count_characters(vecs = nil)
120
+ vecs ||= @vectors.to_a
121
+
122
+ collect_rows do |row|
123
+ vecs.sum { |v| row[v].to_s.size }
124
+ end
125
+ end
126
+
127
+ # Generate a summary of this DataFrame based on individual vectors in the DataFrame
128
+ # @return [String] String containing the summary of the DataFrame
129
+ def summary
130
+ summary = "= #{name}"
131
+ summary << "\n Number of rows: #{nrows}"
132
+ @vectors.each do |v|
133
+ summary << "\n Element:[#{v}]\n"
134
+ summary << self[v].summary(1)
135
+ end
136
+ summary
137
+ end
138
+ end
139
+ end
140
+ end
@@ -0,0 +1,107 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Convertible
4
+ # Create a sql, based on a given Dataset
5
+ #
6
+ # == Arguments
7
+ #
8
+ # * table - String specifying name of the table that will be created in SQL.
9
+ # * charset - Character set. Default is "UTF8".
10
+ #
11
+ # @example
12
+ #
13
+ # ds = DaruLite::DataFrame.new({
14
+ # :id => DaruLite::Vector.new([1,2,3,4,5]),
15
+ # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
16
+ # })
17
+ # ds.create_sql('names')
18
+ # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
19
+ #
20
+ def create_sql(table, charset = 'UTF8')
21
+ sql = "CREATE TABLE #{table} ("
22
+ fields = vectors.to_a.collect do |f|
23
+ v = self[f]
24
+ "#{f} #{v.db_type}"
25
+ end
26
+
27
+ sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
28
+ end
29
+
30
+ # Returns the dataframe. This can be convenient when the user does not
31
+ # know whether the object is a vector or a dataframe.
32
+ # @return [self] the dataframe
33
+ def to_df
34
+ self
35
+ end
36
+
37
+ # Convert all vectors of type *:numeric* into a Matrix.
38
+ def to_matrix
39
+ Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
40
+ end
41
+
42
+ # Converts the DataFrame into an array of hashes where key is vector name
43
+ # and value is the corresponding element. The 0th index of the array contains
44
+ # the array of hashes while the 1st index contains the indexes of each row
45
+ # of the dataframe. Each element in the index array corresponds to its row
46
+ # in the array of hashes, which has the same index.
47
+ def to_a
48
+ [each_row.map(&:to_h), @index.to_a]
49
+ end
50
+
51
+ # Convert to json. If no_index is true (the default) then the index will NOT be included
52
+ # in the JSON thus created.
53
+ def to_json(no_index = true)
54
+ if no_index
55
+ to_a[0].to_json
56
+ else
57
+ to_a.to_json
58
+ end
59
+ end
60
+
61
+ # Converts DataFrame to a hash (explicit) with keys as vector names and values as
62
+ # the corresponding vectors.
63
+ def to_h
64
+ @vectors
65
+ .each_with_index
66
+ .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
67
+ end
68
+
69
+ # Convert to html for IRuby.
70
+ def to_html(threshold = DaruLite.max_rows)
71
+ table_thead = to_html_thead
72
+ table_tbody = to_html_tbody(threshold)
73
+ path = if index.is_a?(MultiIndex)
74
+ File.expand_path('../iruby/templates/dataframe_mi.html.erb', __dir__)
75
+ else
76
+ File.expand_path('../iruby/templates/dataframe.html.erb', __dir__)
77
+ end
78
+ ERB.new(File.read(path).strip).result(binding)
79
+ end
80
+
81
+ def to_html_thead
82
+ table_thead_path =
83
+ if index.is_a?(MultiIndex)
84
+ File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __dir__)
85
+ else
86
+ File.expand_path('../iruby/templates/dataframe_thead.html.erb', __dir__)
87
+ end
88
+ ERB.new(File.read(table_thead_path).strip).result(binding)
89
+ end
90
+
91
+ def to_html_tbody(threshold = DaruLite.max_rows)
92
+ threshold ||= @size
93
+ table_tbody_path =
94
+ if index.is_a?(MultiIndex)
95
+ File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
96
+ else
97
+ File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __dir__)
98
+ end
99
+ ERB.new(File.read(table_tbody_path).strip).result(binding)
100
+ end
101
+
102
+ def to_s
103
+ "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,64 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Duplicatable
4
+ extend Gem::Deprecate
5
+
6
+ # Duplicate the DataFrame entirely.
7
+ #
8
+ # == Arguments
9
+ #
10
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
11
+ # be duplicated. Will duplicate the entire DataFrame if not specified.
12
+ def dup(vectors_to_dup = nil)
13
+ vectors_to_dup ||= @vectors.to_a
14
+
15
+ src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
16
+ new_order = DaruLite::Index.new(vectors_to_dup)
17
+
18
+ DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
19
+ end
20
+
21
+ # Only clone the structure of the DataFrame.
22
+ def clone_structure
23
+ DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
24
+ end
25
+
26
+ # Returns a 'view' of the DataFrame, i.e. the object IDs of vectors are
27
+ # preserved.
28
+ #
29
+ # == Arguments
30
+ #
31
+ # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
32
+ # a view of the whole data frame otherwise.
33
+ def clone(*vectors_to_clone)
34
+ vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
35
+ vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
36
+
37
+ h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
38
+ DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
39
+ end
40
+
41
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
42
+ # or a full copy of only valid data if missing data is present.
43
+ def clone_only_valid
44
+ if include_values?(*DaruLite::MISSING_VALUES)
45
+ reject_values(*DaruLite::MISSING_VALUES)
46
+ else
47
+ clone
48
+ end
49
+ end
50
+
51
+ # Creates a new duplicate dataframe containing only rows
52
+ # without a single missing value.
53
+ def dup_only_valid(vecs = nil)
54
+ rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
55
+ .inject(&:concat)
56
+ .uniq
57
+
58
+ row_indexes = @index.to_a
59
+ (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
60
+ end
61
+ deprecate :dup_only_valid, :reject_values, 2016, 10
62
+ end
63
+ end
64
+ end