remi 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/remi/project/features/aggregate.feature +27 -8
- data/lib/remi/project/jobs/aggregate_job.rb +13 -1
- data/lib/remi/refinements/daru.rb +8 -4
- data/lib/remi/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c802335b8494b30ff89e4c31b1ce4df34b2fc8a
|
4
|
+
data.tar.gz: b405fbefb668bf07db5bcbb4a0cf6d8550658f13
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2380bbb4cc87d67cfdb554763f8614cfcc791ceb3b70f711ed2fb975dcaf142438f2937453734479a43d3735ea61c8bcb4e093e3f380b5ae107e2dba5fe9522
|
7
|
+
data.tar.gz: 56f42c6b3608157959713478fbeee5cea4898238a6af176ab0b632cc0bb75ec5c60c66fe0ad60b45f6483aee09bb2943ddfabe586a88ef223ceaf9349eeded07
|
data/Gemfile.lock
CHANGED
@@ -4,20 +4,39 @@ Feature: Tests the aggregate refinement to the Daru library
|
|
4
4
|
Given the job is 'Aggregate'
|
5
5
|
And the job source 'Source Data'
|
6
6
|
And the job target 'Target Data'
|
7
|
+
And the job target 'Multigroup Target Data'
|
7
8
|
|
8
9
|
And the source 'Source Data'
|
9
|
-
|
10
|
+
|
10
11
|
|
11
12
|
Scenario: The aggregator should find the minimum year for each 'Alpha'
|
12
|
-
Given the
|
13
|
-
|
14
|
-
|
|
15
|
-
| a |
|
16
|
-
|
|
17
|
-
| b |
|
18
|
-
|
|
13
|
+
Given the target 'Target Data'
|
14
|
+
And the following example record for 'Source Data':
|
15
|
+
| Alpha | Beta | Year |
|
16
|
+
| a | aa | 2016 |
|
17
|
+
| a | aa | 2018 |
|
18
|
+
| b | bb | 2016 |
|
19
|
+
| b | bb | 2010 |
|
20
|
+
| a | ab | 2017 |
|
19
21
|
And the following example record called 'expected result':
|
20
22
|
| Alpha | Year |
|
21
23
|
| a | Group a has a minimum value of 2016 |
|
22
24
|
| b | Group b has a minimum value of 2010 |
|
23
25
|
Then the target should match the example 'expected result'
|
26
|
+
|
27
|
+
|
28
|
+
Scenario: The aggregator should find the minimum year for each combination of 'Alpha' and 'Beta'
|
29
|
+
Given the target 'Multigroup Target Data'
|
30
|
+
And the following example record for 'Source Data':
|
31
|
+
| Alpha | Beta | Year |
|
32
|
+
| a | aa | 2016 |
|
33
|
+
| a | aa | 2018 |
|
34
|
+
| b | bb | 2016 |
|
35
|
+
| b | bb | 2010 |
|
36
|
+
| a | ab | 2017 |
|
37
|
+
And the following example record called 'expected result':
|
38
|
+
| Alpha | Beta | Year |
|
39
|
+
| a | aa | Group ["a", "aa"] has a minimum value of 2016 |
|
40
|
+
| a | ab | Group ["a", "ab"] has a minimum value of 2017 |
|
41
|
+
| b | bb | Group ["b", "bb"] has a minimum value of 2010 |
|
42
|
+
Then the target should match the example 'expected result'
|
@@ -6,15 +6,27 @@ class AggregateJob
|
|
6
6
|
|
7
7
|
define_source :source_data, Remi::DataSource::DataFrame
|
8
8
|
define_target :target_data, Remi::DataTarget::DataFrame
|
9
|
+
define_target :multigroup_target_data, Remi::DataTarget::DataFrame
|
9
10
|
|
10
11
|
define_transform :main, sources: :source_data, targets: :target_data do
|
11
|
-
|
12
12
|
mymin = lambda do |field, df, group_key, indicies|
|
13
13
|
values = indicies.map { |idx| df.row[idx][field] }
|
14
14
|
"Group #{group_key} has a minimum value of #{values.min}"
|
15
15
|
end
|
16
16
|
|
17
|
+
# Daru groups don't use the index of the dataframe when returning groups (WTF?).
|
18
|
+
# Instead they return the position of the record in the dataframe. Here, we
|
19
|
+
# shift the index values, which causes a failure if this artifact is not handled
|
20
|
+
# properly in the aggregate function
|
21
|
+
source_data.df.index = Daru::Index.new(1.upto(source_data.df.size).to_a)
|
22
|
+
|
17
23
|
target_data.df = source_data.df.aggregate(by: :alpha, func: mymin.curry.(:year)).detach_index
|
18
24
|
target_data.df.vectors = Daru::Index.new([:alpha, :year])
|
25
|
+
|
26
|
+
multigroup_target_data.df = source_data.df.aggregate(by: [:alpha,:beta], func: mymin.curry.(:year)).detach_index
|
27
|
+
multigroup_target_data.df.vectors = Daru::Index.new([:alpha_beta, :year])
|
28
|
+
|
29
|
+
|
30
|
+
|
19
31
|
end
|
20
32
|
end
|
@@ -29,8 +29,8 @@ module Remi
|
|
29
29
|
# Example:
|
30
30
|
# df = Daru::DataFrame.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
|
31
31
|
#
|
32
|
-
# mymin = lambda do |field, df, group_key,
|
33
|
-
# values =
|
32
|
+
# mymin = lambda do |field, df, group_key, indices|
|
33
|
+
# values = indices.map { |idx| df.row[idx][field] }
|
34
34
|
# "Group #{group_key} has a minimum value of #{values.min}"
|
35
35
|
# end
|
36
36
|
#
|
@@ -40,10 +40,14 @@ module Remi
|
|
40
40
|
# Returns a Daru::Vector.
|
41
41
|
def aggregate(by:, func:)
|
42
42
|
grouped = self.group_by(by)
|
43
|
+
df_indices = self.index.to_a
|
43
44
|
::Daru::Vector.new(
|
44
|
-
grouped.groups.reduce({}) do |h, (key,
|
45
|
+
grouped.groups.reduce({}) do |h, (key, indices)|
|
46
|
+
# Daru groups don't use the index of the dataframe when returning groups (WTF?).
|
47
|
+
# Instead they return the position of the record in the dataframe. Here, we map those positions back to the dataframe's own index values.
|
48
|
+
group_df_indices = indices.map { |v| df_indices[v] }
|
45
49
|
group_key = key.size == 1 ? key.first : key
|
46
|
-
h[group_key] = func.(self, group_key,
|
50
|
+
h[group_key] = func.(self, group_key, group_df_indices)
|
47
51
|
h
|
48
52
|
end
|
49
53
|
)
|
data/lib/remi/version.rb
CHANGED