remi 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/remi/project/features/aggregate.feature +27 -8
- data/lib/remi/project/jobs/aggregate_job.rb +13 -1
- data/lib/remi/refinements/daru.rb +8 -4
- data/lib/remi/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c802335b8494b30ff89e4c31b1ce4df34b2fc8a
|
4
|
+
data.tar.gz: b405fbefb668bf07db5bcbb4a0cf6d8550658f13
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e2380bbb4cc87d67cfdb554763f8614cfcc791ceb3b70f711ed2fb975dcaf142438f2937453734479a43d3735ea61c8bcb4e093e3f380b5ae107e2dba5fe9522
|
7
|
+
data.tar.gz: 56f42c6b3608157959713478fbeee5cea4898238a6af176ab0b632cc0bb75ec5c60c66fe0ad60b45f6483aee09bb2943ddfabe586a88ef223ceaf9349eeded07
|
data/Gemfile.lock
CHANGED
@@ -4,20 +4,39 @@ Feature: Tests the aggregate refinement to the Daru library
|
|
4
4
|
Given the job is 'Aggregate'
|
5
5
|
And the job source 'Source Data'
|
6
6
|
And the job target 'Target Data'
|
7
|
+
And the job target 'Multigroup Target Data'
|
7
8
|
|
8
9
|
And the source 'Source Data'
|
9
|
-
|
10
|
+
|
10
11
|
|
11
12
|
Scenario: The aggregator should find the minimum year for each 'Alpha'
|
12
|
-
Given the
|
13
|
-
|
14
|
-
|
|
15
|
-
| a |
|
16
|
-
|
|
17
|
-
| b |
|
18
|
-
|
|
13
|
+
Given the target 'Target Data'
|
14
|
+
And the following example record for 'Source Data':
|
15
|
+
| Alpha | Beta | Year |
|
16
|
+
| a | aa | 2016 |
|
17
|
+
| a | aa | 2018 |
|
18
|
+
| b | bb | 2016 |
|
19
|
+
| b | bb | 2010 |
|
20
|
+
| a | ab | 2017 |
|
19
21
|
And the following example record called 'expected result':
|
20
22
|
| Alpha | Year |
|
21
23
|
| a | Group a has a minimum value of 2016 |
|
22
24
|
| b | Group b has a minimum value of 2010 |
|
23
25
|
Then the target should match the example 'expected result'
|
26
|
+
|
27
|
+
|
28
|
+
Scenario: The aggregator should find the minimum year for each 'Alpha' and 'Beta' combination
|
29
|
+
Given the target 'Multigroup Target Data'
|
30
|
+
And the following example record for 'Source Data':
|
31
|
+
| Alpha | Beta | Year |
|
32
|
+
| a | aa | 2016 |
|
33
|
+
| a | aa | 2018 |
|
34
|
+
| b | bb | 2016 |
|
35
|
+
| b | bb | 2010 |
|
36
|
+
| a | ab | 2017 |
|
37
|
+
And the following example record called 'expected result':
|
38
|
+
| Alpha | Beta | Year |
|
39
|
+
| a | aa | Group ["a", "aa"] has a minimum value of 2016 |
|
40
|
+
| a | ab | Group ["a", "ab"] has a minimum value of 2017 |
|
41
|
+
| b | bb | Group ["b", "bb"] has a minimum value of 2010 |
|
42
|
+
Then the target should match the example 'expected result'
|
@@ -6,15 +6,27 @@ class AggregateJob
|
|
6
6
|
|
7
7
|
define_source :source_data, Remi::DataSource::DataFrame
|
8
8
|
define_target :target_data, Remi::DataTarget::DataFrame
|
9
|
+
define_target :multigroup_target_data, Remi::DataTarget::DataFrame
|
9
10
|
|
10
11
|
define_transform :main, sources: :source_data, targets: :target_data do
|
11
|
-
|
12
12
|
mymin = lambda do |field, df, group_key, indicies|
|
13
13
|
values = indicies.map { |idx| df.row[idx][field] }
|
14
14
|
"Group #{group_key} has a minimum value of #{values.min}"
|
15
15
|
end
|
16
16
|
|
17
|
+
# Daru groups don't use the index of the dataframe when returning groups (WTF?).
|
18
|
+
# Instead they return the position of the record in the dataframe. Here, we
|
19
|
+
# shift the indexes which causes a failure if this artifact is not handled
|
20
|
+
# properly in the aggregate function
|
21
|
+
source_data.df.index = Daru::Index.new(1.upto(source_data.df.size).to_a)
|
22
|
+
|
17
23
|
target_data.df = source_data.df.aggregate(by: :alpha, func: mymin.curry.(:year)).detach_index
|
18
24
|
target_data.df.vectors = Daru::Index.new([:alpha, :year])
|
25
|
+
|
26
|
+
multigroup_target_data.df = source_data.df.aggregate(by: [:alpha,:beta], func: mymin.curry.(:year)).detach_index
|
27
|
+
multigroup_target_data.df.vectors = Daru::Index.new([:alpha_beta, :year])
|
28
|
+
|
29
|
+
|
30
|
+
|
19
31
|
end
|
20
32
|
end
|
@@ -29,8 +29,8 @@ module Remi
|
|
29
29
|
# Example:
|
30
30
|
# df = Daru::DataFrame.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
|
31
31
|
#
|
32
|
-
# mymin = lambda do |field, df, group_key,
|
33
|
-
# values =
|
32
|
+
# mymin = lambda do |field, df, group_key, indices|
|
33
|
+
# values = indices.map { |idx| df.row[idx][field] }
|
34
34
|
# "Group #{group_key} has a minimum value of #{values.min}"
|
35
35
|
# end
|
36
36
|
#
|
@@ -40,10 +40,14 @@ module Remi
|
|
40
40
|
# Returns a Daru::Vector.
|
41
41
|
def aggregate(by:, func:)
|
42
42
|
grouped = self.group_by(by)
|
43
|
+
df_indices = self.index.to_a
|
43
44
|
::Daru::Vector.new(
|
44
|
-
grouped.groups.reduce({}) do |h, (key,
|
45
|
+
grouped.groups.reduce({}) do |h, (key, indices)|
|
46
|
+
# Daru groups don't use the index of the dataframe when returning groups (WTF?).
|
47
|
+
# Instead they return the position of the record in the dataframe. Here, we map those positions back to the dataframe's own index values.
|
48
|
+
group_df_indices = indices.map { |v| df_indices[v] }
|
45
49
|
group_key = key.size == 1 ? key.first : key
|
46
|
-
h[group_key] = func.(self, group_key,
|
50
|
+
h[group_key] = func.(self, group_key, group_df_indices)
|
47
51
|
h
|
48
52
|
end
|
49
53
|
)
|
data/lib/remi/version.rb
CHANGED