daru_lite 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1fca8a59ee849230424502a8ffa2f986134ccf522d15d53ab3807c22b64b30f8
|
4
|
+
data.tar.gz: 8c4e8048ea8171c463b048ac9dff8b86a8b19e3ec5dd62f16bf72311e7b03b38
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 403d6cfe869dcd152f083ea0878be37f6a8b40212f6ba5f80ece21bcadf51a4f13471f529bbddcf66b593568f31ec52f3e308c39160f0bd87bac9af6d95b30f6
|
7
|
+
data.tar.gz: dfbc2d7b5e63c54980c704c0df3d96ae8d079b921fc0ff51a34f109126a2a382d531457321737e83a2b03bc114b741e3018d0beb9cb00554aa822345d94f3144
|
data/.rubocop_todo.yml
CHANGED
@@ -1,11 +1,19 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on
|
3
|
+
# on 2024-03-03 13:59:21 UTC using RuboCop version 1.60.2.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
8
8
|
|
9
|
+
# Offense count: 1
|
10
|
+
# This cop supports safe autocorrection (--autocorrect).
|
11
|
+
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
12
|
+
# SupportedStyles: aligned, indented
|
13
|
+
Layout/LineEndStringConcatenationIndentation:
|
14
|
+
Exclude:
|
15
|
+
- 'lib/daru_lite/data_frame/indexable.rb'
|
16
|
+
|
9
17
|
# Offense count: 1
|
10
18
|
# Configuration parameters: AllowComments.
|
11
19
|
Lint/EmptyClass:
|
@@ -13,6 +21,7 @@ Lint/EmptyClass:
|
|
13
21
|
- 'lib/daru_lite/accessors/mdarray_wrapper.rb'
|
14
22
|
|
15
23
|
# Offense count: 5
|
24
|
+
# Configuration parameters: AllowedParentClasses.
|
16
25
|
Lint/MissingSuper:
|
17
26
|
Exclude:
|
18
27
|
- 'lib/daru_lite/date_time/offsets.rb'
|
@@ -20,61 +29,50 @@ Lint/MissingSuper:
|
|
20
29
|
- 'lib/daru_lite/index/index.rb'
|
21
30
|
- 'lib/daru_lite/index/multi_index.rb'
|
22
31
|
|
23
|
-
# Offense count:
|
32
|
+
# Offense count: 5
|
33
|
+
# This cop supports safe autocorrection (--autocorrect).
|
24
34
|
# Configuration parameters: CheckForMethodsWithNoSideEffects.
|
25
35
|
Lint/Void:
|
26
36
|
Exclude:
|
27
37
|
- 'lib/daru_lite/category.rb'
|
28
|
-
- 'lib/daru_lite/
|
38
|
+
- 'lib/daru_lite/data_frame/indexable.rb'
|
29
39
|
- 'lib/daru_lite/vector.rb'
|
30
40
|
|
31
|
-
# Offense count:
|
32
|
-
# Configuration parameters: AllowedMethods, AllowedPatterns,
|
41
|
+
# Offense count: 41
|
42
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
|
33
43
|
Metrics/AbcSize:
|
34
44
|
Max: 34
|
35
45
|
|
36
|
-
# Offense count:
|
46
|
+
# Offense count: 5
|
37
47
|
# Configuration parameters: CountComments, CountAsOne.
|
38
48
|
Metrics/ClassLength:
|
39
|
-
Max:
|
49
|
+
Max: 188
|
40
50
|
|
41
51
|
# Offense count: 6
|
42
|
-
# Configuration parameters: AllowedMethods, AllowedPatterns
|
52
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
43
53
|
Metrics/CyclomaticComplexity:
|
44
54
|
Max: 9
|
45
55
|
|
46
|
-
# Offense count:
|
47
|
-
# Configuration parameters: CountComments, CountAsOne,
|
56
|
+
# Offense count: 60
|
57
|
+
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
48
58
|
Metrics/MethodLength:
|
49
59
|
Max: 15
|
50
60
|
|
51
|
-
# Offense count:
|
61
|
+
# Offense count: 4
|
52
62
|
# Configuration parameters: CountComments, CountAsOne.
|
53
63
|
Metrics/ModuleLength:
|
54
64
|
Max: 190
|
55
65
|
|
56
66
|
# Offense count: 4
|
57
|
-
# Configuration parameters: AllowedMethods, AllowedPatterns
|
67
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
58
68
|
Metrics/PerceivedComplexity:
|
59
69
|
Max: 10
|
60
70
|
|
61
|
-
# Offense count:
|
71
|
+
# Offense count: 66
|
62
72
|
# Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
|
63
|
-
# AllowedNames: at, by, db, id, in, io, ip, of, on, os, pp, to
|
73
|
+
# AllowedNames: as, at, by, cc, db, id, if, in, io, ip, of, on, os, pp, to
|
64
74
|
Naming/MethodParameterName:
|
65
|
-
|
66
|
-
- 'lib/daru_lite/category.rb'
|
67
|
-
- 'lib/daru_lite/core/group_by.rb'
|
68
|
-
- 'lib/daru_lite/core/merge.rb'
|
69
|
-
- 'lib/daru_lite/core/query.rb'
|
70
|
-
- 'lib/daru_lite/dataframe.rb'
|
71
|
-
- 'lib/daru_lite/date_time/index.rb'
|
72
|
-
- 'lib/daru_lite/date_time/offsets.rb'
|
73
|
-
- 'lib/daru_lite/extensions/which_dsl.rb'
|
74
|
-
- 'lib/daru_lite/io/io.rb'
|
75
|
-
- 'lib/daru_lite/maths/statistics/dataframe.rb'
|
76
|
-
- 'lib/daru_lite/maths/statistics/vector.rb'
|
77
|
-
- 'lib/daru_lite/vector.rb'
|
75
|
+
Enabled: false
|
78
76
|
|
79
77
|
# Offense count: 5
|
80
78
|
# Configuration parameters: NamePrefix, ForbiddenPrefixes, AllowedMethods, MethodDefinitionMacros.
|
@@ -85,13 +83,14 @@ Naming/MethodParameterName:
|
|
85
83
|
Naming/PredicateName:
|
86
84
|
Exclude:
|
87
85
|
- 'spec/**/*'
|
88
|
-
- 'lib/daru_lite/
|
86
|
+
- 'lib/daru_lite/data_frame/missable.rb'
|
87
|
+
- 'lib/daru_lite/data_frame/queryable.rb'
|
89
88
|
- 'lib/daru_lite/vector.rb'
|
90
89
|
|
91
90
|
# Offense count: 5
|
92
91
|
Security/MarshalLoad:
|
93
92
|
Exclude:
|
94
|
-
- 'lib/daru_lite/
|
93
|
+
- 'lib/daru_lite/data_frame/i_o_able.rb'
|
95
94
|
- 'lib/daru_lite/date_time/index.rb'
|
96
95
|
- 'lib/daru_lite/index/index.rb'
|
97
96
|
- 'lib/daru_lite/io/io.rb'
|
@@ -102,7 +101,7 @@ Style/ClassVars:
|
|
102
101
|
Exclude:
|
103
102
|
- 'lib/daru_lite.rb'
|
104
103
|
|
105
|
-
# Offense count:
|
104
|
+
# Offense count: 58
|
106
105
|
# Configuration parameters: AllowedConstants.
|
107
106
|
Style/Documentation:
|
108
107
|
Enabled: false
|
@@ -113,6 +112,10 @@ Style/MapToHash:
|
|
113
112
|
Exclude:
|
114
113
|
- 'lib/daru_lite/category.rb'
|
115
114
|
- 'lib/daru_lite/core/group_by.rb'
|
115
|
+
- 'lib/daru_lite/data_frame/convertible.rb'
|
116
|
+
- 'lib/daru_lite/data_frame/duplicatable.rb'
|
117
|
+
- 'lib/daru_lite/data_frame/fetchable.rb'
|
118
|
+
- 'lib/daru_lite/data_frame/joinable.rb'
|
116
119
|
- 'lib/daru_lite/dataframe.rb'
|
117
120
|
|
118
121
|
# Offense count: 1
|
@@ -125,7 +128,7 @@ Style/MultilineBlockChain:
|
|
125
128
|
# AllowedMethods: respond_to_missing?
|
126
129
|
Style/OptionalBooleanParameter:
|
127
130
|
Exclude:
|
128
|
-
- 'lib/daru_lite/
|
131
|
+
- 'lib/daru_lite/data_frame/convertible.rb'
|
129
132
|
- 'lib/daru_lite/maths/statistics/vector.rb'
|
130
133
|
- 'lib/daru_lite/vector.rb'
|
131
134
|
|
@@ -133,5 +136,4 @@ Style/OptionalBooleanParameter:
|
|
133
136
|
# This cop supports unsafe autocorrection (--autocorrect-all).
|
134
137
|
Style/RedundantSelfAssignment:
|
135
138
|
Exclude:
|
136
|
-
- 'lib/daru_lite/
|
137
|
-
|
139
|
+
- 'lib/daru_lite/data_frame/joinable.rb'
|
@@ -0,0 +1,165 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Aggregatable
|
4
|
+
# Group elements by vector to perform operations on them. Returns a
|
5
|
+
# DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
|
6
|
+
# list of possible operations.
|
7
|
+
#
|
8
|
+
# == Arguments
|
9
|
+
#
|
10
|
+
# * vectors - An Array containing names of vectors to group by.
|
11
|
+
#
|
12
|
+
# == Usage
|
13
|
+
#
|
14
|
+
# df = DaruLite::DataFrame.new({
|
15
|
+
# a: %w{foo bar foo bar foo bar foo foo},
|
16
|
+
# b: %w{one one two three two two one three},
|
17
|
+
# c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
|
18
|
+
# d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
|
19
|
+
# })
|
20
|
+
# df.group_by([:a,:b,:c]).groups
|
21
|
+
# #=> {["bar", "one", 2]=>[1],
|
22
|
+
# # ["bar", "three", 1]=>[3],
|
23
|
+
# # ["bar", "two", 6]=>[5],
|
24
|
+
# # ["foo", "one", 1]=>[0],
|
25
|
+
# # ["foo", "one", 3]=>[6],
|
26
|
+
# # ["foo", "three", 8]=>[7],
|
27
|
+
# # ["foo", "two", 3]=>[2, 4]}
|
28
|
+
def group_by(*vectors)
|
29
|
+
vectors.flatten!
|
30
|
+
missing = vectors - @vectors.to_a
|
31
|
+
raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}") unless missing.empty?
|
32
|
+
|
33
|
+
vectors = [@vectors.first] if vectors.empty?
|
34
|
+
|
35
|
+
DaruLite::Core::GroupBy.new(self, vectors)
|
36
|
+
end
|
37
|
+
|
38
|
+
# Function to use for aggregating the data.
|
39
|
+
#
|
40
|
+
# @param options [Hash] options for the columns you want in the resultant dataframe
|
41
|
+
#
|
42
|
+
# @return [DaruLite::DataFrame]
|
43
|
+
#
|
44
|
+
# @example
|
45
|
+
# df = DaruLite::DataFrame.new(
|
46
|
+
# {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
|
47
|
+
# => #<DaruLite::DataFrame(5x2)>
|
48
|
+
# col num
|
49
|
+
# 0 a 52
|
50
|
+
# 1 b 12
|
51
|
+
# 2 c 7
|
52
|
+
# 3 d 17
|
53
|
+
# 4 e 1
|
54
|
+
#
|
55
|
+
# df.aggregate(num_100_times: ->(df) { (df.num*100).first })
|
56
|
+
# => #<DaruLite::DataFrame(5x1)>
|
57
|
+
# num_100_ti
|
58
|
+
# 0 5200
|
59
|
+
# 1 1200
|
60
|
+
# 2 700
|
61
|
+
# 3 1700
|
62
|
+
# 4 100
|
63
|
+
#
|
64
|
+
# When we have duplicate index :
|
65
|
+
#
|
66
|
+
# idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
|
67
|
+
# df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
|
68
|
+
# => #<DaruLite::DataFrame(5x1)>
|
69
|
+
# num
|
70
|
+
# a 52
|
71
|
+
# b 12
|
72
|
+
# a 7
|
73
|
+
# a 17
|
74
|
+
# c 1
|
75
|
+
#
|
76
|
+
# df.aggregate(num: :mean)
|
77
|
+
# => #<DaruLite::DataFrame(3x1)>
|
78
|
+
# num
|
79
|
+
# a 25.3333333
|
80
|
+
# b 12
|
81
|
+
# c 1
|
82
|
+
#
|
83
|
+
# Note: `GroupBy` class `aggregate` method uses this `aggregate` method
|
84
|
+
# internally.
|
85
|
+
def aggregate(options = {}, multi_index_level = -1)
|
86
|
+
if block_given?
|
87
|
+
positions_tuples, new_index = yield(@index) # NOTE: use of yield is private for now
|
88
|
+
else
|
89
|
+
positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
|
90
|
+
end
|
91
|
+
|
92
|
+
colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
|
93
|
+
|
94
|
+
DaruLite::DataFrame.new(colmn_value, index: new_index, order: options.keys)
|
95
|
+
end
|
96
|
+
|
97
|
+
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
|
98
|
+
group_by(*group_by_keys).aggregate(aggregation_map)
|
99
|
+
end
|
100
|
+
|
101
|
+
private
|
102
|
+
|
103
|
+
def aggregate_by_positions_tuples(options, positions_tuples)
|
104
|
+
agg_over_vectors_only, options = cast_aggregation_options(options)
|
105
|
+
|
106
|
+
if agg_over_vectors_only
|
107
|
+
options.map do |vect_name, method|
|
108
|
+
vect = self[vect_name]
|
109
|
+
|
110
|
+
positions_tuples.map do |positions|
|
111
|
+
vect.apply_method_on_sub_vector(method, keys: positions)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
else
|
115
|
+
methods = options.values
|
116
|
+
|
117
|
+
# NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
|
118
|
+
rows = positions_tuples.map do |positions|
|
119
|
+
apply_method_on_sub_df(methods, keys: positions)
|
120
|
+
end
|
121
|
+
|
122
|
+
rows.transpose
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
# convert operations over sub-vectors to operations over sub-dfs when it improves perf
|
127
|
+
# note: we don't always "cast" because aggregation over a single vector / a few vector is faster
|
128
|
+
# than aggregation over (sub-)dfs
|
129
|
+
def cast_aggregation_options(options)
|
130
|
+
vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
|
131
|
+
|
132
|
+
over_vectors = true
|
133
|
+
|
134
|
+
if non_vects.any?
|
135
|
+
options = options.clone
|
136
|
+
|
137
|
+
vects.each do |name|
|
138
|
+
proc_on_vect = options[name].to_proc
|
139
|
+
options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
|
140
|
+
end
|
141
|
+
|
142
|
+
over_vectors = false
|
143
|
+
end
|
144
|
+
|
145
|
+
[over_vectors, options]
|
146
|
+
end
|
147
|
+
|
148
|
+
def group_index_for_aggregation(index, multi_index_level = -1)
|
149
|
+
case index
|
150
|
+
when DaruLite::MultiIndex
|
151
|
+
groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
|
152
|
+
|
153
|
+
new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
|
154
|
+
pos_tuples = groups_by_pos.values
|
155
|
+
when DaruLite::Index, DaruLite::CategoricalIndex
|
156
|
+
new_index = Array(index).uniq
|
157
|
+
pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
|
158
|
+
else raise
|
159
|
+
end
|
160
|
+
|
161
|
+
[pos_tuples, new_index]
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Calculatable
|
4
|
+
# Sum all numeric/specified vectors in the DataFrame.
|
5
|
+
#
|
6
|
+
# Returns a new vector containing the sum of all numeric
|
7
|
+
# or specified vectors of the DataFrame. By default, if the vector
|
8
|
+
# contains a nil, the sum is nil.
|
9
|
+
# With :skipnil argument set to true, nil values are assumed to be
|
10
|
+
# 0 (zero) and the sum vector is returned.
|
11
|
+
#
|
12
|
+
# @param args [Array] List of vectors to sum. Default is nil in which case
|
13
|
+
# all numeric vectors are summed.
|
14
|
+
#
|
15
|
+
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
16
|
+
#
|
17
|
+
# @return Vector with sum of all vectors specified in the argument.
|
18
|
+
# If vecs parameter is empty, sum all numeric vectors.
|
19
|
+
#
|
20
|
+
# @example
|
21
|
+
# df = DaruLite::DataFrame.new({
|
22
|
+
# a: [1, 2, nil],
|
23
|
+
# b: [2, 1, 3],
|
24
|
+
# c: [1, 1, 1]
|
25
|
+
# })
|
26
|
+
# => #<DaruLite::DataFrame(3x3)>
|
27
|
+
# a b c
|
28
|
+
# 0 1 2 1
|
29
|
+
# 1 2 1 1
|
30
|
+
# 2 nil 3 1
|
31
|
+
# df.vector_sum [:a, :c]
|
32
|
+
# => #<DaruLite::Vector(3)>
|
33
|
+
# 0 2
|
34
|
+
# 1 3
|
35
|
+
# 2 nil
|
36
|
+
# df.vector_sum
|
37
|
+
# => #<DaruLite::Vector(3)>
|
38
|
+
# 0 4
|
39
|
+
# 1 4
|
40
|
+
# 2 nil
|
41
|
+
# df.vector_sum skipnil: true
|
42
|
+
# => #<DaruLite::Vector(3)>
|
43
|
+
# c
|
44
|
+
# 0 4
|
45
|
+
# 1 4
|
46
|
+
# 2 4
|
47
|
+
#
|
48
|
+
def vector_sum(*args)
|
49
|
+
defaults = { vecs: nil, skipnil: false }
|
50
|
+
options = args.last.is_a?(::Hash) ? args.pop : {}
|
51
|
+
options = defaults.merge(options)
|
52
|
+
vecs = args[0] || options[:vecs]
|
53
|
+
skipnil = args[1] || options[:skipnil]
|
54
|
+
|
55
|
+
vecs ||= numeric_vectors
|
56
|
+
sum = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
|
57
|
+
vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
|
58
|
+
end
|
59
|
+
|
60
|
+
# Calculate mean of the rows of the dataframe.
|
61
|
+
#
|
62
|
+
# == Arguments
|
63
|
+
#
|
64
|
+
# * +max_missing+ - The maximum number of elements in the row that can be
|
65
|
+
# missing for the mean calculation to happen. Defaults to 0.
|
66
|
+
def vector_mean(max_missing = 0)
|
67
|
+
# FIXME: in vector_sum we preserve created vector dtype, but
|
68
|
+
# here we are not. Is this by design or ...? - zverok, 2016-05-18
|
69
|
+
mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"
|
70
|
+
|
71
|
+
each_row_with_index.with_object(mean_vec) do |(row, i), memo|
|
72
|
+
memo[i] = row.indexes(*DaruLite::MISSING_VALUES).size > max_missing ? nil : row.mean
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
# Returns a vector, based on a string with a calculation based
|
77
|
+
# on vector.
|
78
|
+
#
|
79
|
+
# The calculation will be eval'ed, so you can put any variable
|
80
|
+
# or expression valid on ruby.
|
81
|
+
#
|
82
|
+
# For example:
|
83
|
+
# a = DaruLite::Vector.new [1,2]
|
84
|
+
# b = DaruLite::Vector.new [3,4]
|
85
|
+
# ds = DaruLite::DataFrame.new({:a => a,:b => b})
|
86
|
+
# ds.compute("a+b")
|
87
|
+
# => Vector [4,6]
|
88
|
+
def compute(text, &block)
|
89
|
+
return instance_eval(&block) if block
|
90
|
+
|
91
|
+
instance_eval(text)
|
92
|
+
end
|
93
|
+
|
94
|
+
# DSL for yielding each row and returning a DaruLite::Vector based on the
|
95
|
+
# value each run of the block returns.
|
96
|
+
#
|
97
|
+
# == Usage
|
98
|
+
#
|
99
|
+
# a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
100
|
+
# a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
101
|
+
# a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
102
|
+
# ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
103
|
+
# total = ds.vector_by_calculation { a + b + c }
|
104
|
+
# # <DaruLite::Vector:82314050 @name = nil @size = 7 >
|
105
|
+
# # nil
|
106
|
+
# # 0 111
|
107
|
+
# # 1 222
|
108
|
+
# # 2 333
|
109
|
+
# # 3 444
|
110
|
+
# # 4 555
|
111
|
+
# # 5 666
|
112
|
+
# # 6 777
|
113
|
+
def vector_by_calculation(&block)
|
114
|
+
a = each_row.map { |r| r.instance_eval(&block) }
|
115
|
+
|
116
|
+
DaruLite::Vector.new a, index: @index
|
117
|
+
end
|
118
|
+
|
119
|
+
def vector_count_characters(vecs = nil)
|
120
|
+
vecs ||= @vectors.to_a
|
121
|
+
|
122
|
+
collect_rows do |row|
|
123
|
+
vecs.sum { |v| row[v].to_s.size }
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
|
128
|
+
# @return [String] String containing the summary of the DataFrame
|
129
|
+
def summary
|
130
|
+
summary = "= #{name}"
|
131
|
+
summary << "\n Number of rows: #{nrows}"
|
132
|
+
@vectors.each do |v|
|
133
|
+
summary << "\n Element:[#{v}]\n"
|
134
|
+
summary << self[v].summary(1)
|
135
|
+
end
|
136
|
+
summary
|
137
|
+
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Convertible
|
4
|
+
# Create a SQL CREATE TABLE statement based on this DataFrame
|
5
|
+
#
|
6
|
+
# == Arguments
|
7
|
+
#
|
8
|
+
# * table - String specifying name of the table that will be created in SQL.
|
9
|
+
# * charset - Character set. Default is "UTF8".
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
#
|
13
|
+
# ds = DaruLite::DataFrame.new({
|
14
|
+
# :id => DaruLite::Vector.new([1,2,3,4,5]),
|
15
|
+
# :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
|
16
|
+
# })
|
17
|
+
# ds.create_sql('names')
|
18
|
+
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
19
|
+
#
|
20
|
+
def create_sql(table, charset = 'UTF8')
|
21
|
+
sql = "CREATE TABLE #{table} ("
|
22
|
+
fields = vectors.to_a.collect do |f|
|
23
|
+
v = self[f]
|
24
|
+
"#{f} #{v.db_type}"
|
25
|
+
end
|
26
|
+
|
27
|
+
sql + fields.join(",\n ") + ") CHARACTER SET=#{charset};"
|
28
|
+
end
|
29
|
+
|
30
|
+
# Returns the dataframe. This can be convenient when the user does not
|
31
|
+
# know whether the object is a vector or a dataframe.
|
32
|
+
# @return [self] the dataframe
|
33
|
+
def to_df
|
34
|
+
self
|
35
|
+
end
|
36
|
+
|
37
|
+
# Convert all vectors of type *:numeric* into a Matrix.
|
38
|
+
def to_matrix
|
39
|
+
Matrix.columns each_vector.select(&:numeric?).map(&:to_a)
|
40
|
+
end
|
41
|
+
|
42
|
+
# Converts the DataFrame into an array of hashes where key is vector name
|
43
|
+
# and value is the corresponding element. The 0th index of the array contains
|
44
|
+
# the array of hashes while the 1st index contains the indexes of each row
|
45
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
46
|
+
# in the array of hashes, which has the same index.
|
47
|
+
def to_a
|
48
|
+
[each_row.map(&:to_h), @index.to_a]
|
49
|
+
end
|
50
|
+
|
51
|
+
# Convert to JSON. If no_index is true (the default) then the index will NOT be included
|
52
|
+
# in the JSON thus created.
|
53
|
+
def to_json(no_index = true)
|
54
|
+
if no_index
|
55
|
+
to_a[0].to_json
|
56
|
+
else
|
57
|
+
to_a.to_json
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
62
|
+
# the corresponding vectors.
|
63
|
+
def to_h
|
64
|
+
@vectors
|
65
|
+
.each_with_index
|
66
|
+
.map { |vec_name, idx| [vec_name, @data[idx]] }.to_h
|
67
|
+
end
|
68
|
+
|
69
|
+
# Convert to html for IRuby.
|
70
|
+
def to_html(threshold = DaruLite.max_rows)
|
71
|
+
table_thead = to_html_thead
|
72
|
+
table_tbody = to_html_tbody(threshold)
|
73
|
+
path = if index.is_a?(MultiIndex)
|
74
|
+
File.expand_path('../iruby/templates/dataframe_mi.html.erb', __dir__)
|
75
|
+
else
|
76
|
+
File.expand_path('../iruby/templates/dataframe.html.erb', __dir__)
|
77
|
+
end
|
78
|
+
ERB.new(File.read(path).strip).result(binding)
|
79
|
+
end
|
80
|
+
|
81
|
+
def to_html_thead
|
82
|
+
table_thead_path =
|
83
|
+
if index.is_a?(MultiIndex)
|
84
|
+
File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __dir__)
|
85
|
+
else
|
86
|
+
File.expand_path('../iruby/templates/dataframe_thead.html.erb', __dir__)
|
87
|
+
end
|
88
|
+
ERB.new(File.read(table_thead_path).strip).result(binding)
|
89
|
+
end
|
90
|
+
|
91
|
+
def to_html_tbody(threshold = DaruLite.max_rows)
|
92
|
+
threshold ||= @size
|
93
|
+
table_tbody_path =
|
94
|
+
if index.is_a?(MultiIndex)
|
95
|
+
File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __dir__)
|
96
|
+
else
|
97
|
+
File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __dir__)
|
98
|
+
end
|
99
|
+
ERB.new(File.read(table_tbody_path).strip).result(binding)
|
100
|
+
end
|
101
|
+
|
102
|
+
def to_s
|
103
|
+
"#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Duplicatable
|
4
|
+
extend Gem::Deprecate
|
5
|
+
|
6
|
+
# Duplicate the DataFrame entirely.
|
7
|
+
#
|
8
|
+
# == Arguments
|
9
|
+
#
|
10
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
11
|
+
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
12
|
+
def dup(vectors_to_dup = nil)
|
13
|
+
vectors_to_dup ||= @vectors.to_a
|
14
|
+
|
15
|
+
src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
|
16
|
+
new_order = DaruLite::Index.new(vectors_to_dup)
|
17
|
+
|
18
|
+
DaruLite::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
|
19
|
+
end
|
20
|
+
|
21
|
+
# Only clone the structure of the DataFrame.
|
22
|
+
def clone_structure
|
23
|
+
DaruLite::DataFrame.new([], order: @vectors.dup, index: @index.dup, name: @name)
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
27
|
+
# preserved.
|
28
|
+
#
|
29
|
+
# == Arguments
|
30
|
+
#
|
31
|
+
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
32
|
+
# a view of the whole data frame otherwise.
|
33
|
+
def clone(*vectors_to_clone)
|
34
|
+
vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
|
35
|
+
vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?
|
36
|
+
|
37
|
+
h = vectors_to_clone.map { |vec| [vec, self[vec]] }.to_h
|
38
|
+
DaruLite::DataFrame.new(h, clone: false, order: vectors_to_clone, name: @name)
|
39
|
+
end
|
40
|
+
|
41
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
42
|
+
# or a full copy of only valid data if missing data is present.
|
43
|
+
def clone_only_valid
|
44
|
+
if include_values?(*DaruLite::MISSING_VALUES)
|
45
|
+
reject_values(*DaruLite::MISSING_VALUES)
|
46
|
+
else
|
47
|
+
clone
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
# Creates a new duplicate dataframe containing only rows
|
52
|
+
# without a single missing value.
|
53
|
+
def dup_only_valid(vecs = nil)
|
54
|
+
rows_with_nil = @data.map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
|
55
|
+
.inject(&:concat)
|
56
|
+
.uniq
|
57
|
+
|
58
|
+
row_indexes = @index.to_a
|
59
|
+
(vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
|
60
|
+
end
|
61
|
+
deprecate :dup_only_valid, :reject_values, 2016, 10
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|