daru_lite 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +35 -33
  3. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  4. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  5. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  6. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  7. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  8. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  9. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  10. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  11. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  12. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  13. data/lib/daru_lite/data_frame/missable.rb +75 -0
  14. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  15. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  16. data/lib/daru_lite/data_frame/setable.rb +109 -0
  17. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  18. data/lib/daru_lite/dataframe.rb +138 -2353
  19. data/lib/daru_lite/index/index.rb +13 -0
  20. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  21. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  22. data/lib/daru_lite/vector/calculatable.rb +78 -0
  23. data/lib/daru_lite/vector/convertible.rb +77 -0
  24. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  25. data/lib/daru_lite/vector/fetchable.rb +175 -0
  26. data/lib/daru_lite/vector/filterable.rb +128 -0
  27. data/lib/daru_lite/vector/indexable.rb +77 -0
  28. data/lib/daru_lite/vector/iterable.rb +95 -0
  29. data/lib/daru_lite/vector/joinable.rb +17 -0
  30. data/lib/daru_lite/vector/missable.rb +124 -0
  31. data/lib/daru_lite/vector/queryable.rb +45 -0
  32. data/lib/daru_lite/vector/setable.rb +47 -0
  33. data/lib/daru_lite/vector/sortable.rb +113 -0
  34. data/lib/daru_lite/vector.rb +36 -932
  35. data/lib/daru_lite/version.rb +1 -1
  36. data/spec/data_frame/aggregatable_example.rb +65 -0
  37. data/spec/data_frame/buildable_example.rb +109 -0
  38. data/spec/data_frame/calculatable_example.rb +135 -0
  39. data/spec/data_frame/convertible_example.rb +180 -0
  40. data/spec/data_frame/duplicatable_example.rb +111 -0
  41. data/spec/data_frame/fetchable_example.rb +476 -0
  42. data/spec/data_frame/filterable_example.rb +250 -0
  43. data/spec/data_frame/indexable_example.rb +221 -0
  44. data/spec/data_frame/iterable_example.rb +465 -0
  45. data/spec/data_frame/joinable_example.rb +106 -0
  46. data/spec/data_frame/missable_example.rb +47 -0
  47. data/spec/data_frame/pivotable_example.rb +297 -0
  48. data/spec/data_frame/queryable_example.rb +92 -0
  49. data/spec/data_frame/setable_example.rb +482 -0
  50. data/spec/data_frame/sortable_example.rb +350 -0
  51. data/spec/dataframe_spec.rb +181 -3289
  52. data/spec/index/index_spec.rb +8 -0
  53. data/spec/vector/aggregatable_example.rb +27 -0
  54. data/spec/vector/calculatable_example.rb +82 -0
  55. data/spec/vector/convertible_example.rb +126 -0
  56. data/spec/vector/duplicatable_example.rb +48 -0
  57. data/spec/vector/fetchable_example.rb +463 -0
  58. data/spec/vector/filterable_example.rb +165 -0
  59. data/spec/vector/indexable_example.rb +201 -0
  60. data/spec/vector/iterable_example.rb +111 -0
  61. data/spec/vector/joinable_example.rb +25 -0
  62. data/spec/vector/missable_example.rb +88 -0
  63. data/spec/vector/queryable_example.rb +91 -0
  64. data/spec/vector/setable_example.rb +300 -0
  65. data/spec/vector/sortable_example.rb +242 -0
  66. data/spec/vector_spec.rb +111 -1805
  67. metadata +86 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4f11fb1ea9fb6c3f43f5f8267338ed4cc81fde2f60bcc82fd6eec6e30b5d87d5
4
- data.tar.gz: e8a833062efdf095672da6684d3971a5e1eafa482b658ae7436061ad4d9ff1c7
3
+ metadata.gz: 1fca8a59ee849230424502a8ffa2f986134ccf522d15d53ab3807c22b64b30f8
4
+ data.tar.gz: 8c4e8048ea8171c463b048ac9dff8b86a8b19e3ec5dd62f16bf72311e7b03b38
5
5
  SHA512:
6
- metadata.gz: 25dc59ffa9fb012693cf7bd611e19d134d4f003ee3ff5c642a574cb4bfebc5001a8aacd8b42982001b585233b370017e440c62be9dc6389f371423edc36c058b
7
- data.tar.gz: f7373e8a86deb1f766cfb56f759cce92accfad2b571dac138d6d2eb54c2bb746b9cb6a11854891ba4137740208310837cb06edef4680cc2d4319260b12a1c917
6
+ metadata.gz: 403d6cfe869dcd152f083ea0878be37f6a8b40212f6ba5f80ece21bcadf51a4f13471f529bbddcf66b593568f31ec52f3e308c39160f0bd87bac9af6d95b30f6
7
+ data.tar.gz: dfbc2d7b5e63c54980c704c0df3d96ae8d079b921fc0ff51a34f109126a2a382d531457321737e83a2b03bc114b741e3018d0beb9cb00554aa822345d94f3144
data/.rubocop_todo.yml CHANGED
@@ -1,11 +1,19 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2022-08-16 13:20:50 UTC using RuboCop version 1.35.0.
3
+ # on 2024-03-03 13:59:21 UTC using RuboCop version 1.60.2.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
+ # Offense count: 1
10
+ # This cop supports safe autocorrection (--autocorrect).
11
+ # Configuration parameters: EnforcedStyle, IndentationWidth.
12
+ # SupportedStyles: aligned, indented
13
+ Layout/LineEndStringConcatenationIndentation:
14
+ Exclude:
15
+ - 'lib/daru_lite/data_frame/indexable.rb'
16
+
9
17
  # Offense count: 1
10
18
  # Configuration parameters: AllowComments.
11
19
  Lint/EmptyClass:
@@ -13,6 +21,7 @@ Lint/EmptyClass:
13
21
  - 'lib/daru_lite/accessors/mdarray_wrapper.rb'
14
22
 
15
23
  # Offense count: 5
24
+ # Configuration parameters: AllowedParentClasses.
16
25
  Lint/MissingSuper:
17
26
  Exclude:
18
27
  - 'lib/daru_lite/date_time/offsets.rb'
@@ -20,61 +29,50 @@ Lint/MissingSuper:
20
29
  - 'lib/daru_lite/index/index.rb'
21
30
  - 'lib/daru_lite/index/multi_index.rb'
22
31
 
23
- # Offense count: 6
32
+ # Offense count: 5
33
+ # This cop supports safe autocorrection (--autocorrect).
24
34
  # Configuration parameters: CheckForMethodsWithNoSideEffects.
25
35
  Lint/Void:
26
36
  Exclude:
27
37
  - 'lib/daru_lite/category.rb'
28
- - 'lib/daru_lite/dataframe.rb'
38
+ - 'lib/daru_lite/data_frame/indexable.rb'
29
39
  - 'lib/daru_lite/vector.rb'
30
40
 
31
- # Offense count: 40
32
- # Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredMethods, CountRepeatedAttributes.
41
+ # Offense count: 41
42
+ # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
33
43
  Metrics/AbcSize:
34
44
  Max: 34
35
45
 
36
- # Offense count: 3
46
+ # Offense count: 5
37
47
  # Configuration parameters: CountComments, CountAsOne.
38
48
  Metrics/ClassLength:
39
- Max: 189
49
+ Max: 188
40
50
 
41
51
  # Offense count: 6
42
- # Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredMethods.
52
+ # Configuration parameters: AllowedMethods, AllowedPatterns.
43
53
  Metrics/CyclomaticComplexity:
44
54
  Max: 9
45
55
 
46
- # Offense count: 61
47
- # Configuration parameters: CountComments, CountAsOne, ExcludedMethods, AllowedMethods, AllowedPatterns, IgnoredMethods.
56
+ # Offense count: 60
57
+ # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
48
58
  Metrics/MethodLength:
49
59
  Max: 15
50
60
 
51
- # Offense count: 2
61
+ # Offense count: 4
52
62
  # Configuration parameters: CountComments, CountAsOne.
53
63
  Metrics/ModuleLength:
54
64
  Max: 190
55
65
 
56
66
  # Offense count: 4
57
- # Configuration parameters: AllowedMethods, AllowedPatterns, IgnoredMethods.
67
+ # Configuration parameters: AllowedMethods, AllowedPatterns.
58
68
  Metrics/PerceivedComplexity:
59
69
  Max: 10
60
70
 
61
- # Offense count: 72
71
+ # Offense count: 66
62
72
  # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
63
- # AllowedNames: at, by, db, id, in, io, ip, of, on, os, pp, to
73
+ # AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
64
74
  Naming/MethodParameterName:
65
- Exclude:
66
- - 'lib/daru_lite/category.rb'
67
- - 'lib/daru_lite/core/group_by.rb'
68
- - 'lib/daru_lite/core/merge.rb'
69
- - 'lib/daru_lite/core/query.rb'
70
- - 'lib/daru_lite/dataframe.rb'
71
- - 'lib/daru_lite/date_time/index.rb'
72
- - 'lib/daru_lite/date_time/offsets.rb'
73
- - 'lib/daru_lite/extensions/which_dsl.rb'
74
- - 'lib/daru_lite/io/io.rb'
75
- - 'lib/daru_lite/maths/statistics/dataframe.rb'
76
- - 'lib/daru_lite/maths/statistics/vector.rb'
77
- - 'lib/daru_lite/vector.rb'
75
+ Enabled: false
78
76
 
79
77
  # Offense count: 5
80
78
  # Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros.
@@ -85,13 +83,14 @@ Naming/MethodParameterName:
85
83
  Naming/PredicateName:
86
84
  Exclude:
87
85
  - 'spec/**/*'
88
- - 'lib/daru_lite/dataframe.rb'
86
+ - 'lib/daru_lite/data_frame/missable.rb'
87
+ - 'lib/daru_lite/data_frame/queryable.rb'
89
88
  - 'lib/daru_lite/vector.rb'
90
89
 
91
90
  # Offense count: 5
92
91
  Security/MarshalLoad:
93
92
  Exclude:
94
- - 'lib/daru_lite/dataframe.rb'
93
+ - 'lib/daru_lite/data_frame/i_o_able.rb'
95
94
  - 'lib/daru_lite/date_time/index.rb'
96
95
  - 'lib/daru_lite/index/index.rb'
97
96
  - 'lib/daru_lite/io/io.rb'
@@ -102,7 +101,7 @@ Style/ClassVars:
102
101
  Exclude:
103
102
  - 'lib/daru_lite.rb'
104
103
 
105
- # Offense count: 44
104
+ # Offense count: 58
106
105
  # Configuration parameters: AllowedConstants.
107
106
  Style/Documentation:
108
107
  Enabled: false
@@ -113,6 +112,10 @@ Style/MapToHash:
113
112
  Exclude:
114
113
  - 'lib/daru_lite/category.rb'
115
114
  - 'lib/daru_lite/core/group_by.rb'
115
+ - 'lib/daru_lite/data_frame/convertible.rb'
116
+ - 'lib/daru_lite/data_frame/duplicatable.rb'
117
+ - 'lib/daru_lite/data_frame/fetchable.rb'
118
+ - 'lib/daru_lite/data_frame/joinable.rb'
116
119
  - 'lib/daru_lite/dataframe.rb'
117
120
 
118
121
  # Offense count: 1
@@ -125,7 +128,7 @@ Style/MultilineBlockChain:
125
128
  # AllowedMethods: respond_to_missing?
126
129
  Style/OptionalBooleanParameter:
127
130
  Exclude:
128
- - 'lib/daru_lite/dataframe.rb'
131
+ - 'lib/daru_lite/data_frame/convertible.rb'
129
132
  - 'lib/daru_lite/maths/statistics/vector.rb'
130
133
  - 'lib/daru_lite/vector.rb'
131
134
 
@@ -133,5 +136,4 @@ Style/OptionalBooleanParameter:
133
136
  # This cop supports unsafe autocorrection (--autocorrect-all).
134
137
  Style/RedundantSelfAssignment:
135
138
  Exclude:
136
- - 'lib/daru_lite/dataframe.rb'
137
-
139
+ - 'lib/daru_lite/data_frame/joinable.rb'
@@ -0,0 +1,165 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Aggregatable
4
+ # Group elements by vector to perform operations on them. Returns a
5
+ # DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
6
+ # list of possible operations.
7
+ #
8
+ # == Arguments
9
+ #
10
+ # * vectors - An Array containing names of vectors to group by.
11
+ #
12
+ # == Usage
13
+ #
14
+ # df = DaruLite::DataFrame.new({
15
+ # a: %w{foo bar foo bar foo bar foo foo},
16
+ # b: %w{one one two three two two one three},
17
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
18
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
19
+ # })
20
+ # df.group_by([:a,:b,:c]).groups
21
+ # #=> {["bar", "one", 2]=>[1],
22
+ # # ["bar", "three", 1]=>[3],
23
+ # # ["bar", "two", 6]=>[5],
24
+ # # ["foo", "one", 1]=>[0],
25
+ # # ["foo", "one", 3]=>[6],
26
+ # # ["foo", "three", 8]=>[7],
27
+ # # ["foo", "two", 3]=>[2, 4]}
28
+ def group_by(*vectors)
29
+ vectors.flatten!
30
+ missing = vectors - @vectors.to_a
31
+ raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
32
+
33
+ vectors = [@vectors.first] if vectors.empty?
34
+
35
+ DaruLite::Core::GroupBy.new(self, vectors)
36
+ end
37
+
38
+ # Function to use for aggregating the data.
39
+ #
40
+ # @param options [Hash] options for the columns you want in the resultant dataframe
41
+ #
42
+ # @return [DaruLite::DataFrame]
43
+ #
44
+ # @example
45
+ # df = DaruLite::DataFrame.new(
46
+ # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
47
+ # => #<DaruLite::DataFrame(5x2)>
48
+ # col num
49
+ # 0 a 52
50
+ # 1 b 12
51
+ # 2 c 7
52
+ # 3 d 17
53
+ # 4 e 1
54
+ #
55
+ # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
56
+ # => #<DaruLite::DataFrame(5x1)>
57
+ # num_100_ti
58
+ # 0 5200
59
+ # 1 1200
60
+ # 2 700
61
+ # 3 1700
62
+ # 4 100
63
+ #
64
+ # When we have duplicate index :
65
+ #
66
+ # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
67
+ # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
68
+ # => #<DaruLite::DataFrame(5x1)>
69
+ # num
70
+ # a 52
71
+ # b 12
72
+ # a 7
73
+ # a 17
74
+ # c 1
75
+ #
76
+ # df.aggregate(num: :mean)
77
+ # => #<DaruLite::DataFrame(3x1)>
78
+ # num
79
+ # a 25.3333333
80
+ # b 12
81
+ # c 1
82
+ #
83
+ # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
84
+ # internally.
85
+ def aggregate(options = {}, multi_index_level = -1)
86
+ if block_given?
87
+ positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
88
+ else
89
+ positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
90
+ end
91
+
92
+ colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
93
+
94
+ DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
95
+ end
96
+
97
+ def group_by_and_aggregate(*group_by_keys, **aggregation_map)
98
+ group_by(*group_by_keys).aggregate(aggregation_map)
99
+ end
100
+
101
+ private
102
+
103
+ def aggregate_by_positions_tuples(options, positions_tuples)
104
+ agg_over_vectors_only, options = cast_aggregation_options(options)
105
+
106
+ if agg_over_vectors_only
107
+ options.map do |vect_name, method|
108
+ vect = self[vect_name]
109
+
110
+ positions_tuples.map do |positions|
111
+ vect.apply_method_on_sub_vector(method, keys: positions)
112
+ end
113
+ end
114
+ else
115
+ methods = options.values
116
+
117
+ # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
118
+ rows = positions_tuples.map do |positions|
119
+ apply_method_on_sub_df(methods, keys: positions)
120
+ end
121
+
122
+ rows.transpose
123
+ end
124
+ end
125
+
126
+ # convert operations over sub-vectors to operations over sub-dfs when it improves perf
127
+ # note: we don't always "cast" because aggregation over a single vector / a few vectors is faster
128
+ # than aggregation over (sub-)dfs
129
+ def cast_aggregation_options(options)
130
+ vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
131
+
132
+ over_vectors = true
133
+
134
+ if non_vects.any?
135
+ options = options.clone
136
+
137
+ vects.each do |name|
138
+ proc_on_vect = options[name].to_proc
139
+ options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
140
+ end
141
+
142
+ over_vectors = false
143
+ end
144
+
145
+ [over_vectors, options]
146
+ end
147
+
148
+ def group_index_for_aggregation(index, multi_index_level = -1)
149
+ case index
150
+ when DaruLite::MultiIndex
151
+ groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
152
+
153
+ new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
154
+ pos_tuples = groups_by_pos.values
155
+ when DaruLite::Index, DaruLite::CategoricalIndex
156
+ new_index = Array(index).uniq
157
+ pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
158
+ else raise
159
+ end
160
+
161
+ [pos_tuples, new_index]
162
+ end
163
+ end
164
+ end
165
+ end
@@ -0,0 +1,140 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Calculatable
4
+ # Sum all numeric/specified vectors in the DataFrame.
5
+ #
6
+ # Returns a new vector containing the sum of all numeric
7
+ # or specified vectors of the DataFrame. By default, if the vector
8
+ # contains a nil, the sum is nil.
9
+ # With :skipnil argument set to true, nil values are assumed to be
10
+ # 0 (zero) and the sum vector is returned.
11
+ #
12
+ # @param args [Array] List of vectors to sum. Default is nil in which case
13
+ # all numeric vectors are summed.
14
+ #
15
+ # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
16
+ #
17
+ # @return Vector with sum of all vectors specified in the argument.
18
+ # If the vecs parameter is empty, sums all numeric vectors.
19
+ #
20
+ # @example
21
+ # df = DaruLite::DataFrame.new({
22
+ # a: [1, 2, nil],
23
+ # b: [2, 1, 3],
24
+ # c: [1, 1, 1]
25
+ # })
26
+ # => #<DaruLite::DataFrame(3x3)>
27
+ # a b c
28
+ # 0 1 2 1
29
+ # 1 2 1 1
30
+ # 2 nil 3 1
31
+ # df.vector_sum [:a, :c]
32
+ # => #<DaruLite::Vector(3)>
33
+ # 0 2
34
+ # 1 3
35
+ # 2 nil
36
+ # df.vector_sum
37
+ # => #<DaruLite::Vector(3)>
38
+ # 0 4
39
+ # 1 4
40
+ # 2 nil
41
+ # df.vector_sum skipnil: true
42
+ # => #<DaruLite::Vector(3)>
43
+ # c
44
+ # 0 4
45
+ # 1 4
46
+ # 2 4
47
+ #
48
+ def vector_sum(*args)
49
+ defaults = { vecs: nil, skipnil: false }
50
+ options = args.last.is_a?(::Hash) ? args.pop : {}
51
+ options = defaults.merge(options)
52
+ vecs = args[0] || options[:vecs]
53
+ skipnil = args[1] || options[:skipnil]
54
+
55
+ vecs ||= numeric_vectors
56
+ sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
57
+ vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
58
+ end
59
+
60
+ # Calculate mean of the rows of the dataframe.
61
+ #
62
+ # == Arguments
63
+ #
64
+ # * +max_missing+ - The maximum number of elements in the row that can be
65
+ # missing for the mean calculation to happen. Defaults to 0.
66
+ def vector_mean(max_missing = 0)
67
+ # FIXME: in vector_sum we preserve created vector dtype, but
68
+ # here we are not. Is this by design or ...? - zverok, 2016-05-18
69
+ mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
70
+
71
+ each_row_with_index.with_object(mean_vec) do |(row, i), memo|
72
+ memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
73
+ end
74
+ end
75
+
76
+ # Returns a vector, based on a string with a calculation based
77
+ # on vector.
78
+ #
79
+ # The calculation will be eval'ed, so you can put any variable
80
+ # or expression valid in Ruby.
81
+ #
82
+ # For example:
83
+ # a = DaruLite::Vector.new [1,2]
84
+ # b = DaruLite::Vector.new [3,4]
85
+ # ds = DaruLite::DataFrame.new({:a => a,:b => b})
86
+ # ds.compute("a+b")
87
+ # => Vector [4,6]
88
+ def compute(text, &block)
89
+ return instance_eval(&block) if block
90
+
91
+ instance_eval(text)
92
+ end
93
+
94
+ # DSL for yielding each row and returning a DaruLite::Vector based on the
95
+ # value each run of the block returns.
96
+ #
97
+ # == Usage
98
+ #
99
+ # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
100
+ # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
101
+ # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
102
+ # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
103
+ # total = ds.vector_by_calculation { a + b + c }
104
+ # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
105
+ # # nil
106
+ # # 0 111
107
+ # # 1 222
108
+ # # 2 333
109
+ # # 3 444
110
+ # # 4 555
111
+ # # 5 666
112
+ # # 6 777
113
+ def vector_by_calculation(&block)
114
+ a = each_row.map { |r| r.instance_eval(&block) }
115
+
116
+ DaruLite::Vector.new a, index: @index
117
+ end
118
+
119
+ def vector_count_characters(vecs = nil)
120
+ vecs ||= @vectors.to_a
121
+
122
+ collect_rows do |row|
123
+ vecs.sum { |v| row[v].to_s.size }
124
+ end
125
+ end
126
+
127
+ # Generate a summary of this DataFrame based on individual vectors in the DataFrame
128
+ # @return [String] String containing the summary of the DataFrame
129
+ def summary
130
+ summary = "= #{name}"
131
+ summary << "\n Number of rows: #{nrows}"
132
+ @vectors.each do |v|
133
+ summary << "\n Element:[#{v}]\n"
134
+ summary << self[v].summary(1)
135
+ end
136
+ summary
137
+ end
138
+ end
139
+ end
140
+ end
@@ -0,0 +1,107 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Convertible
4
+ # Create a SQL statement, based on a given Dataset
5
+ #
6
+ # == Arguments
7
+ #
8
+ # * table - String specifying name of the table that will be created in SQL.
9
+ # * charset - Character set. Default is "UTF8".
10
+ #
11
+ # @example
12
+ #
13
+ # ds = DaruLite::DataFrame.new({
14
+ # :id => DaruLite::Vector.new([1,2,3,4,5]),
15
+ # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
16
+ # })
17
+ # ds.create_sql('names')
18
+ # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
19
+ #
20
+ def create_sql(table, charset = 'UTF8')
21
+ sql = "CREATE TABLE #{table} ("
22
+ fields = vectors.to_a.collect do |f|
23
+ v = self[f]
24
+ "#{f} #{v.db_type}"
25
+ end
26
+
27
+ sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
28
+ end
29
+
30
+ # Returns the dataframe. This can be convenient when the user does not
31
+ # know whether the object is a vector or a dataframe.
32
+ # @return [self] the dataframe
33
+ def to_df
34
+ self
35
+ end
36
+
37
+ # Convert all vectors of type *:numeric* into a Matrix.
38
+ def to_matrix
39
+ Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
40
+ end
41
+
42
+ # Converts the DataFrame into an array of hashes where key is vector name
43
+ # and value is the corresponding element. The 0th index of the array contains
44
+ # the array of hashes while the 1st index contains the indexes of each row
45
+ # of the dataframe. Each element in the index array corresponds to its row
46
+ # in the array of hashes, which has the same index.
47
+ def to_a
48
+ [each_row.map(&:to_h), @index.to_a]
49
+ end
50
+
51
+ # Convert to json. If no_index is true (the default) then the index will NOT be included
52
+ # in the JSON thus created.
53
+ def to_json(no_index = true)
54
+ if no_index
55
+ to_a[0].to_json
56
+ else
57
+ to_a.to_json
58
+ end
59
+ end
60
+
61
+ # Converts DataFrame to a hash (explicit) with keys as vector names and values as
62
+ # the corresponding vectors.
63
+ def to_h
64
+ @vectors
65
+ .each_with_index
66
+ .map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
67
+ end
68
+
69
+ # Convert to html for IRuby.
70
+ def to_html(threshold = DaruLite.max_rows)
71
+ table_thead = to_html_thead
72
+ table_tbody = to_html_tbody(threshold)
73
+ path = if index.is_a?(MultiIndex)
74
+ File.expand_path('../iruby/templates/dataframe_mi.html.erb', __dir__)
75
+ else
76
+ File.expand_path('../iruby/templates/dataframe.html.erb', __dir__)
77
+ end
78
+ ERB.new(File.read(path).strip).result(binding)
79
+ end
80
+
81
+ def to_html_thead
82
+ table_thead_path =
83
+ if index.is_a?(MultiIndex)
84
+ File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __dir__)
85
+ else
86
+ File.expand_path('../iruby/templates/dataframe_thead.html.erb', __dir__)
87
+ end
88
+ ERB.new(File.read(table_thead_path).strip).result(binding)
89
+ end
90
+
91
+ def to_html_tbody(threshold = DaruLite.max_rows)
92
+ threshold ||= @size
93
+ table_tbody_path =
94
+ if index.is_a?(MultiIndex)
95
+ File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
96
+ else
97
+ File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __dir__)
98
+ end
99
+ ERB.new(File.read(table_tbody_path).strip).result(binding)
100
+ end
101
+
102
+ def to_s
103
+ "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
104
+ end
105
+ end
106
+ end
107
+ end
@@ -0,0 +1,64 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Duplicatable
4
+ extend Gem::Deprecate
5
+
6
+ # Duplicate the DataFrame entirely.
7
+ #
8
+ # == Arguments
9
+ #
10
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
11
+ # be duplicated. Will duplicate the entire DataFrame if not specified.
12
+ def dup(vectors_to_dup = nil)
13
+ vectors_to_dup ||= @vectors.to_a
14
+
15
+ src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
16
+ new_order = DaruLite::Index.new(vectors_to_dup)
17
+
18
+ DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
19
+ end
20
+
21
+ # Only clone the structure of the DataFrame.
22
+ def clone_structure
23
+ DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
24
+ end
25
+
26
+ # Returns a 'view' of the DataFrame, i.e. the object IDs of vectors are
27
+ # preserved.
28
+ #
29
+ # == Arguments
30
+ #
31
+ # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
32
+ # a view of the whole data frame otherwise.
33
+ def clone(*vectors_to_clone)
34
+ vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
35
+ vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
36
+
37
+ h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
38
+ DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
39
+ end
40
+
41
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
42
+ # or a full copy of only valid data if missing data is present.
43
+ def clone_only_valid
44
+ if include_values?(*DaruLite::MISSING_VALUES)
45
+ reject_values(*DaruLite::MISSING_VALUES)
46
+ else
47
+ clone
48
+ end
49
+ end
50
+
51
+ # Creates a new duplicate dataframe containing only rows
52
+ # without a single missing value.
53
+ def dup_only_valid(vecs = nil)
54
+ rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
55
+ .inject(&:concat)
56
+ .uniq
57
+
58
+ row_indexes = @index.to_a
59
+ (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
60
+ end
61
+ deprecate :dup_only_valid, :reject_values, 2016, 10
62
+ end
63
+ end
64
+ end