remi 0.2.30 → 0.2.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/features/step_definitions/remi_step.rb +19 -4
- data/features/transforms/data_frame_sieve.feature +38 -0
- data/features/transforms/partitioner.feature +72 -0
- data/features/transforms/truthy.feature +52 -0
- data/jobs/transforms/data_frame_sieve_job.rb +30 -0
- data/jobs/transforms/partitioner_job.rb +35 -0
- data/jobs/transforms/truthy_job.rb +21 -0
- data/lib/remi/cucumber/business_rules.rb +14 -1
- data/lib/remi/source_to_target_map.rb +2 -2
- data/lib/remi/transform.rb +205 -1
- data/lib/remi/version.rb +1 -1
- metadata +11 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c7c81a972d6fc604a4761b7fff19a26ec187065
|
4
|
+
data.tar.gz: f07165f263dee57adb1802182cf72551d3326ab4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4ea4cde266110c374b75ef527f0ec71a674bba865c3da46107687fddb4871b3fda5aa601c44cd0fad5857038d8dff1c4dce297cf1d710ca335e2afafaa882dd
|
7
|
+
data.tar.gz: 7a25d3e85b4ae727998ff196b79395db40bc0d584c04c061069c4d0a457bd79530f7a66b4ce3480df739ebfcef6dda7ef011c5e8171f76bac2326ee392e1d01a
|
data/Gemfile.lock
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
GIT
|
2
2
|
remote: git@github.com:inside-track/daru.git
|
3
|
-
revision:
|
3
|
+
revision: 80b8024102bdb31ef402cb8d6c5cef86dd31cd41
|
4
4
|
branch: itk-master
|
5
5
|
specs:
|
6
6
|
daru (0.1.2)
|
@@ -19,7 +19,7 @@ GIT
|
|
19
19
|
PATH
|
20
20
|
remote: .
|
21
21
|
specs:
|
22
|
-
remi (0.2.
|
22
|
+
remi (0.2.31)
|
23
23
|
activesupport (~> 4.2)
|
24
24
|
bond (~> 0.5)
|
25
25
|
cucumber (~> 2.1)
|
@@ -65,6 +65,9 @@ Then /^the file with the latest date stamp will be downloaded for processing$/ d
|
|
65
65
|
expect(@brt.source.extract).to match_array Array(@brt.filestore.latest)
|
66
66
|
end
|
67
67
|
|
68
|
+
Then /^files will be downloaded for processing$/ do
|
69
|
+
end
|
70
|
+
|
68
71
|
Then /^no files will be downloaded for processing$/ do
|
69
72
|
@brt.filestore.generate
|
70
73
|
@brt.source.mock_extractor(@brt.filestore)
|
@@ -415,13 +418,16 @@ Then /^the source field '([^']+)' is prefixed with "([^"]*)" and loaded into the
|
|
415
418
|
source_name, source_field_name = @brt.sources.parse_full_field(source_field)
|
416
419
|
target_names, target_field_name = @brt.targets.parse_full_field(target_field, multi: true)
|
417
420
|
|
418
|
-
prefixed_source =
|
421
|
+
prefixed_source = @brt.sources[source_name].fields[source_field_name].values.map do |value|
|
422
|
+
"#{prefix}#{value}"
|
423
|
+
end.uniq.sort
|
419
424
|
|
420
425
|
@brt.run_transforms
|
421
|
-
Array(target_names).
|
422
|
-
|
423
|
-
end
|
426
|
+
results = Array(target_names).map do |target_name|
|
427
|
+
@brt.targets[target_name].fields[target_field_name].values.uniq
|
428
|
+
end.flatten.uniq.sort
|
424
429
|
|
430
|
+
expect(results).to eq prefixed_source
|
425
431
|
end
|
426
432
|
|
427
433
|
Then /^the source field is prefixed with "([^"]*)" and loaded into the target field '([^']+)'$/ do |prefix, target_field|
|
@@ -633,6 +639,15 @@ Then /^a target record is not created$/ do
|
|
633
639
|
end
|
634
640
|
|
635
641
|
|
642
|
+
### Setting up data for multiple records
|
643
|
+
|
644
|
+
Given /^the source field '([^']+)' is a unique integer$/ do |source_field|
|
645
|
+
step "the source field '#{source_field}'"
|
646
|
+
source_name, source_field_name = @brt.sources.parse_full_field(source_field)
|
647
|
+
|
648
|
+
@brt.sources[source_name].unique_integer_field(source_field_name)
|
649
|
+
end
|
650
|
+
|
636
651
|
### Record counting
|
637
652
|
|
638
653
|
Then /^the target has (\d+) record(?:s|)$/ do |nrecords|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
Feature: Tests the DataFrameSieve transform
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given the job is 'Data Frame Sieve'
|
5
|
+
And the job source 'Source Data'
|
6
|
+
And the job source 'Sieve'
|
7
|
+
And the job target 'Target Data'
|
8
|
+
|
9
|
+
And the source 'Source Data'
|
10
|
+
And the source 'Sieve'
|
11
|
+
And the target 'Target Data'
|
12
|
+
|
13
|
+
|
14
|
+
Scenario: A slightly complicated sieve.
|
15
|
+
|
16
|
+
Given the following example for 'Sieve':
|
17
|
+
| level | program | contact | group |
|
18
|
+
| Undergrad | NURS | \\nil | intensive |
|
19
|
+
| Undergrad | \\nil | true | intensive |
|
20
|
+
| Undergrad | \\nil | false | base |
|
21
|
+
| Grad | ENG | true | intensive |
|
22
|
+
| \\nil | \\nil | \\nil | base |
|
23
|
+
|
24
|
+
And the following example for 'Source Data':
|
25
|
+
| id | level | program | contact |
|
26
|
+
| 1 | Undergrad | CHEM | false |
|
27
|
+
| 2 | Undergrad | CHEM | true |
|
28
|
+
| 3 | Grad | CHEM | true |
|
29
|
+
| 4 | Undergrad | NURS | false |
|
30
|
+
| 5 | Unknown | CHEM | true |
|
31
|
+
|
32
|
+
Then the target should match the example:
|
33
|
+
| id | level | program | contact | group |
|
34
|
+
| 1 | Undergrad | CHEM | false | base |
|
35
|
+
| 2 | Undergrad | CHEM | true | intensive |
|
36
|
+
| 3 | Grad | CHEM | true | base |
|
37
|
+
| 4 | Undergrad | NURS | false | intensive |
|
38
|
+
| 5 | Unknown | CHEM | true | base |
|
@@ -0,0 +1,72 @@
|
|
1
|
+
Feature: Tests the Partitioner transform
|
2
|
+
The partitioner keeps track of which groups it has assigned records to
|
3
|
+
in order to keep the distribution of records into groups as precise as
|
4
|
+
possible.
|
5
|
+
|
6
|
+
|
7
|
+
Background:
|
8
|
+
Given the job is 'Partitioner'
|
9
|
+
And the job source 'Source Data'
|
10
|
+
And the job source 'Current Population'
|
11
|
+
And the job source 'Distribution'
|
12
|
+
And the job target 'Target Data'
|
13
|
+
|
14
|
+
And the source 'Source Data'
|
15
|
+
And the source 'Current Population'
|
16
|
+
And the source 'Distribution'
|
17
|
+
And the target 'Target Data'
|
18
|
+
|
19
|
+
|
20
|
+
|
21
|
+
Scenario: Partitioning records into groups with no prior population
|
22
|
+
|
23
|
+
Given the following example for 'Source Data':
|
24
|
+
| id |
|
25
|
+
| 1 |
|
26
|
+
| 2 |
|
27
|
+
| 3 |
|
28
|
+
| 4 |
|
29
|
+
|
30
|
+
And the following example for 'Distribution':
|
31
|
+
| group | weight |
|
32
|
+
| A | 0.5 |
|
33
|
+
| B | 0.5 |
|
34
|
+
| C | 1 |
|
35
|
+
|
36
|
+
Then the target has 1 records where 'group' is "A"
|
37
|
+
Then the target has 1 records where 'group' is "B"
|
38
|
+
Then the target has 2 records where 'group' is "C"
|
39
|
+
|
40
|
+
|
41
|
+
Scenario: Partitioning records into groups with a prior population
|
42
|
+
|
43
|
+
Given the following example for 'Source Data':
|
44
|
+
| id |
|
45
|
+
| 1 |
|
46
|
+
| 2 |
|
47
|
+
| 3 |
|
48
|
+
| 4 |
|
49
|
+
| 5 |
|
50
|
+
|
51
|
+
And the following example for 'Distribution':
|
52
|
+
| group | weight |
|
53
|
+
| A | 0.5 |
|
54
|
+
| B | 0.5 |
|
55
|
+
| C | 1 |
|
56
|
+
|
57
|
+
And the following example for 'Current Population':
|
58
|
+
| group | count |
|
59
|
+
| A | 2 |
|
60
|
+
| B | 1 |
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
Then the target has 0 records where 'group' is "A"
|
65
|
+
Then the target has 1 records where 'group' is "B"
|
66
|
+
Then the target has 4 records where 'group' is "C"
|
67
|
+
|
68
|
+
|
69
|
+
# Scenario: Remainders
|
70
|
+
# When the target population is matched exactly, the next
|
71
|
+
# assignment is random (and weighted by the given weights).
|
72
|
+
# I don't know how to test this.
|
@@ -0,0 +1,52 @@
|
|
1
|
+
Feature: Tests the Truthy transform
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given the job is 'Truthy'
|
5
|
+
And the job source 'Source Data'
|
6
|
+
And the job target 'Target Data'
|
7
|
+
|
8
|
+
And the source 'Source Data'
|
9
|
+
And the target 'Target Data'
|
10
|
+
|
11
|
+
Scenario Outline: Truthy without allowing nils
|
12
|
+
|
13
|
+
Given the source field 'truthy'
|
14
|
+
And the target field 'no_nils'
|
15
|
+
|
16
|
+
When the source field has the value "<source>"
|
17
|
+
Then the target field is set to the value "<target>"
|
18
|
+
|
19
|
+
Examples:
|
20
|
+
| source | target |
|
21
|
+
| True | true |
|
22
|
+
| t | true |
|
23
|
+
| yEs | true |
|
24
|
+
| Y | true |
|
25
|
+
| 1 | true |
|
26
|
+
| Yessir | false |
|
27
|
+
| anything | false |
|
28
|
+
|
29
|
+
|
30
|
+
|
31
|
+
Scenario Outline: Truthy allowing nils
|
32
|
+
|
33
|
+
Given the source field 'truthy'
|
34
|
+
And the target field 'allow_nils'
|
35
|
+
|
36
|
+
When the source field has the value "<source>"
|
37
|
+
Then the target field is set to the value "<target>"
|
38
|
+
|
39
|
+
Examples:
|
40
|
+
| source | target |
|
41
|
+
| True | true |
|
42
|
+
| t | true |
|
43
|
+
| yEs | true |
|
44
|
+
| Y | true |
|
45
|
+
| 1 | true |
|
46
|
+
| Yessir | |
|
47
|
+
| anything | |
|
48
|
+
| FALSE | false |
|
49
|
+
| f | false |
|
50
|
+
| no | false |
|
51
|
+
| N | false |
|
52
|
+
| 0 | false |
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative '../all_jobs_shared'
|
2
|
+
|
3
|
+
class DataFrameSieveJob
|
4
|
+
include AllJobsShared
|
5
|
+
|
6
|
+
define_source :source_data, Remi::DataSource::DataFrame,
|
7
|
+
fields: {
|
8
|
+
:id => {},
|
9
|
+
:level => {},
|
10
|
+
:program => {},
|
11
|
+
:contact => {}
|
12
|
+
}
|
13
|
+
|
14
|
+
define_source :sieve, Remi::DataSource::DataFrame,
|
15
|
+
fields: {
|
16
|
+
:level => {},
|
17
|
+
:program => {},
|
18
|
+
:contact => {},
|
19
|
+
:group => {}
|
20
|
+
}
|
21
|
+
|
22
|
+
define_target :target_data, Remi::DataTarget::DataFrame
|
23
|
+
|
24
|
+
define_transform :main, sources: :source_data, targets: :target_data do
|
25
|
+
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
26
|
+
map source(:level, :program, :contact) .target(:group)
|
27
|
+
.transform(Remi::Transform::DataFrameSieve.new(sieve.df))
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative '../all_jobs_shared'
|
2
|
+
|
3
|
+
class PartitionerJob
|
4
|
+
include AllJobsShared
|
5
|
+
|
6
|
+
define_source :source_data, Remi::DataSource::DataFrame,
|
7
|
+
fields: {
|
8
|
+
:id => {}
|
9
|
+
}
|
10
|
+
|
11
|
+
define_source :distribution, Remi::DataSource::DataFrame,
|
12
|
+
fields: {
|
13
|
+
:group => {},
|
14
|
+
:weight => {}
|
15
|
+
}
|
16
|
+
|
17
|
+
define_source :current_population, Remi::DataSource::DataFrame,
|
18
|
+
fields: {
|
19
|
+
:group => {},
|
20
|
+
:count => {}
|
21
|
+
}
|
22
|
+
|
23
|
+
define_target :target_data, Remi::DataTarget::DataFrame
|
24
|
+
|
25
|
+
define_transform :main, sources: :source_data, targets: :target_data do
|
26
|
+
|
27
|
+
distribution_hash = distribution.df.map(:row) { |row| [row[:group], row[:weight].to_f] }.to_h
|
28
|
+
current_population_hash = current_population.df.map(:row) { |row| [row[:group], row[:count].to_i] }.to_h
|
29
|
+
|
30
|
+
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
31
|
+
map source(nil) .target(:group)
|
32
|
+
.transform(Remi::Transform::Partitioner.new(buckets: distribution_hash, initial_population: current_population_hash))
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require_relative '../all_jobs_shared'
|
2
|
+
|
3
|
+
class TruthyJob
|
4
|
+
include AllJobsShared
|
5
|
+
|
6
|
+
define_source :source_data, Remi::DataSource::DataFrame,
|
7
|
+
fields: {
|
8
|
+
:truthy => {}
|
9
|
+
}
|
10
|
+
define_target :target_data, Remi::DataTarget::DataFrame
|
11
|
+
|
12
|
+
define_transform :main, sources: :source_data, targets: :target_data do
|
13
|
+
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
14
|
+
map source(:truthy) .target(:allow_nils)
|
15
|
+
.transform(Remi::Transform::Truthy.new(allow_nils: true))
|
16
|
+
|
17
|
+
map source(:truthy) .target(:no_nils)
|
18
|
+
.transform(Remi::Transform::Truthy.new)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -390,6 +390,12 @@ module Remi::BusinessRules
|
|
390
390
|
@data_subject.df.group_by(field_names).size * 1.0 / @data_subject.df.size
|
391
391
|
end
|
392
392
|
|
393
|
+
def unique_integer_field(field_name)
|
394
|
+
vector_name = fields[field_name].field_name
|
395
|
+
i = 0
|
396
|
+
@data_subject.df[vector_name].recode! { |v| i += 1 }
|
397
|
+
end
|
398
|
+
|
393
399
|
def mock_extractor(filestore)
|
394
400
|
extractor = class << @data_subject.extractor; self; end
|
395
401
|
|
@@ -526,7 +532,14 @@ module Remi::BusinessRules
|
|
526
532
|
df = Daru::DataFrame.new([], order: seed_hash.keys | table_headers)
|
527
533
|
@table.hashes.each do |example_row|
|
528
534
|
example_row_sym = example_row.reduce({}) do |h, (k,v)|
|
529
|
-
|
535
|
+
formula_value = ParseFormula.parse(v)
|
536
|
+
value = case formula_value
|
537
|
+
when '\nil'
|
538
|
+
nil
|
539
|
+
else
|
540
|
+
formula_value
|
541
|
+
end
|
542
|
+
h[k.symbolize(field_symbolizer)] = value
|
530
543
|
h
|
531
544
|
end
|
532
545
|
df.add_row(seed_hash.merge(example_row_sym))
|
@@ -84,7 +84,7 @@ module Remi
|
|
84
84
|
|
85
85
|
def do_map_single_source_and_target_vector
|
86
86
|
@target_df[@target_vectors.first] = @source_df[@source_vectors.first].recode do |vector_value|
|
87
|
-
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value
|
87
|
+
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value.nil? ? [nil] : value)) }
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
@@ -100,7 +100,7 @@ module Remi
|
|
100
100
|
end
|
101
101
|
|
102
102
|
work_vector.recode! do |vector_value|
|
103
|
-
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value
|
103
|
+
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value.nil? ? [nil] : value)) }
|
104
104
|
end
|
105
105
|
|
106
106
|
@target_vectors.each_with_index do |target_vector, vector_idx|
|
data/lib/remi/transform.rb
CHANGED
@@ -483,7 +483,7 @@ module Remi
|
|
483
483
|
when :decimal
|
484
484
|
Float("%.#{scale}f" % Float(value))
|
485
485
|
when :date
|
486
|
-
Date.strptime(value, in_format)
|
486
|
+
value.is_a?(Date) ? value : Date.strptime(value, in_format) # value.is_a?(Date) is only needed because we stub date types with actual dates, rather than strings like we probably should
|
487
487
|
when :datetime
|
488
488
|
Time.strptime(value, in_format)
|
489
489
|
else
|
@@ -493,5 +493,209 @@ module Remi
|
|
493
493
|
end
|
494
494
|
end
|
495
495
|
|
496
|
+
|
497
|
+
|
498
|
+
|
499
|
+
|
500
|
+
# Public: Converts strings into booleans.
|
501
|
+
# Uses a regex to convert strings representing booleans to actual booleans.
|
502
|
+
# The truthy regex is /^(t|true|y|yes|1)$/i and the falsey regex is /^(f|false|n|no|0)$/i
|
503
|
+
#
|
504
|
+
# allow_nils - Specifies whether to allow the result to include nils. If this is set
|
505
|
+
# to false, then the value is only checked against the truthy regex and
|
506
|
+
# the returned value is false if it doesn't match. If allow_nils
|
507
|
+
# is set to true, then both the truthy and the falsey regex are checked.
|
508
|
+
# If neither match, then the result is nil. (Default: false).
|
509
|
+
#
|
510
|
+
# Examples:
|
511
|
+
#
|
512
|
+
# Truthy.new.to_proc.call('True') # => true
|
513
|
+
# Truthy.new.to_proc.call('Yes') # => true
|
514
|
+
# Truthy.new.to_proc.call('y') # => true
|
515
|
+
# Truthy.new.to_proc.call('Yessire') # => false
|
516
|
+
# Truthy.new.to_proc.call('0') # => false
|
517
|
+
# Truthy.new.to_proc.call('Pineapple') # => false
|
518
|
+
# Truthy.new(allow_nils: true).to_proc.call('Pineapple') # => nil
|
519
|
+
class Truthy < Transform
|
520
|
+
def initialize(*args, allow_nils: false, **kargs, &block)
|
521
|
+
super
|
522
|
+
@allow_nils = allow_nils
|
523
|
+
|
524
|
+
@true_regex = /^(t|true|y|yes|1)$/i
|
525
|
+
@false_regex = /^(f|false|n|no|0)$/i
|
526
|
+
end
|
527
|
+
|
528
|
+
def match_true(value)
|
529
|
+
!!value.match(@true_regex)
|
530
|
+
end
|
531
|
+
|
532
|
+
def match_false(value)
|
533
|
+
!!value.match(@false_regex)
|
534
|
+
end
|
535
|
+
|
536
|
+
def transform(value)
|
537
|
+
value = value.to_s
|
538
|
+
|
539
|
+
if @allow_nils
|
540
|
+
if match_true(value)
|
541
|
+
true
|
542
|
+
elsif match_false(value)
|
543
|
+
false
|
544
|
+
else
|
545
|
+
nil
|
546
|
+
end
|
547
|
+
else
|
548
|
+
match_true(value)
|
549
|
+
end
|
550
|
+
end
|
551
|
+
end
|
552
|
+
|
553
|
+
|
554
|
+
# Public: Applies a DataFrame grouping sieve.
|
555
|
+
#
|
556
|
+
# The DataFrame sieve can be used to simplify very complex nested
|
557
|
+
# if-then logic to group data into buckets. Given a DataFrame
|
558
|
+
# with N columns, the first N-1 columns represent the variables
|
559
|
+
# needed to group data into buckets. The last column is the
|
560
|
+
# desired group. The sieve then progresses down the rows of the
|
561
|
+
# DataFrame and checks to see if the input data matches the values
|
562
|
+
# in the columns of the sieve. Nils in the sieve are treated as
|
563
|
+
# wildcards and match anything. The first row that matches wins
|
564
|
+
# and the sieve progression stops.
|
565
|
+
#
|
566
|
+
# sieve_df - The sieve, defined as a dataframe. The arguments
|
567
|
+
# to the transform must appear in the same order as the
|
568
|
+
# first N-1 columns of the sieve.
|
569
|
+
#
|
570
|
+
#
|
571
|
+
# Examples:
|
572
|
+
#
|
573
|
+
# # This sieve captures the following business logic
|
574
|
+
# # 1 - All Undergraduate Nursing, regardless of contact, gets assigned to the :intensive group.
|
575
|
+
# # 2 - All Undergraduate programs with contact get assigned to the :intensive group.
|
576
|
+
# # 3 - All Undergraduate programs without a contact get assigned to the :base group.
|
577
|
+
# # 4 - All Graduate engineering programs with a contact get assigned to the :intensive group.
|
578
|
+
# # 5 - All other programs get assigned to the :base group
|
579
|
+
# sieve_df = Daru::DataFrame.new([
|
580
|
+
# [ 'Undergrad' , 'NURS' , nil , :intensive ],
|
581
|
+
# [ 'Undergrad' , nil , true , :intensive ],
|
582
|
+
# [ 'Undergrad' , nil , false , :base ],
|
583
|
+
# [ 'Grad' , 'ENG' , true , :intensive ],
|
584
|
+
# [ nil , nil , nil , :base ],
|
585
|
+
# ].transpose,
|
586
|
+
# order: [:level, :program, :contact, :group]
|
587
|
+
# )
|
588
|
+
#
|
589
|
+
# test_df = Daru::DataFrame.new([
|
590
|
+
# ['Undergrad' , 'CHEM' , false],
|
591
|
+
# ['Undergrad' , 'CHEM' , true],
|
592
|
+
# ['Grad' , 'CHEM' , true],
|
593
|
+
# ['Undergrad' , 'NURS' , false],
|
594
|
+
# ['Unknown' , 'CHEM' , true],
|
595
|
+
# ].transpose,
|
596
|
+
# order: [:level, :program, :contact]
|
597
|
+
# )
|
598
|
+
#
|
599
|
+
# Remi::SourceToTargetMap.apply(test_df) do
|
600
|
+
# map source(:level, :program, :contact,) .target(:group)
|
601
|
+
# .transform(Remi::Transform::DataFrameSieve.new(sieve_df))
|
602
|
+
# end
|
603
|
+
#
|
604
|
+
# test_df
|
605
|
+
# # => #<Daru::DataFrame:70099624408400 @name = d30888fd-6ca8-48dd-9be3-558f81ae1015 @size = 5>
|
606
|
+
# level program contact group
|
607
|
+
# 0 Undergrad CHEM nil base
|
608
|
+
# 1 Undergrad CHEM true intensive
|
609
|
+
# 2 Grad CHEM true base
|
610
|
+
# 3 Undergrad NURS nil intensive
|
611
|
+
# 4 Unknown CHEM true base
|
612
|
+
class DataFrameSieve < Transform
|
613
|
+
def initialize(sieve_df, *args, **kargs, &block)
|
614
|
+
super
|
615
|
+
@sieve_df = sieve_df.transpose.to_h.values
|
616
|
+
end
|
617
|
+
|
618
|
+
def transform(*values)
|
619
|
+
sieve_keys = @sieve_df.first.index.to_a
|
620
|
+
sieve_result_key = sieve_keys.pop
|
621
|
+
|
622
|
+
@sieve_df.each.find do |sieve_row|
|
623
|
+
match_row = true
|
624
|
+
sieve_keys.each_with_index do |key,idx|
|
625
|
+
match_row &&= sieve_row[key].nil? || sieve_row[key] == values[idx]
|
626
|
+
end
|
627
|
+
match_row
|
628
|
+
end[sieve_result_key]
|
629
|
+
end
|
630
|
+
end
|
631
|
+
|
632
|
+
|
633
|
+
# Public: Used to partition elements into groups (buckets).
|
634
|
+
#
|
635
|
+
# buckets - A hash where the keys are groups and the values are weights or percentages.
|
636
|
+
# initial_population - A hashable object holding a count of the current number of
|
637
|
+
# elements in each bucket.
|
638
|
+
#
|
639
|
+
# Example:
|
640
|
+
#
|
641
|
+
# # The current population has 2 records in the A bucket and 3 in B
|
642
|
+
# current_pop = Daru::Vector.new([2,3], index: ['A', 'B'])
|
643
|
+
#
|
644
|
+
# # We want to generate 7 new records that will evenly populate the A, B, and C buckets, given the current populations.
|
645
|
+
# part = Remi::Transform::Partitioner.new(buckets: { 'A' => 1, 'B' => 1,'C' => 1 }, initial_population: current_pop)
|
646
|
+
#
|
647
|
+
# 1.upto(7).map { |iter| part.call } # => ["C", "C", "A", "C", "C", "B", "A"]
|
648
|
+
class Partitioner < Transform
|
649
|
+
def initialize(buckets:, initial_population: {}, **kargs, &block)
|
650
|
+
super
|
651
|
+
@buckets = buckets
|
652
|
+
@current_population = sanitize_initial_population(buckets, initial_population)
|
653
|
+
end
|
654
|
+
|
655
|
+
attr_reader :buckets
|
656
|
+
attr_reader :current_population
|
657
|
+
|
658
|
+
def transform(*values)
|
659
|
+
get_next_value
|
660
|
+
end
|
661
|
+
|
662
|
+
def size
|
663
|
+
@size ||= @current_population.reduce(0) { |sum, (group, n)| sum += n }
|
664
|
+
end
|
665
|
+
|
666
|
+
def total_weight
|
667
|
+
@total_weight ||= @buckets.reduce(0) { |sum, (bucket, weight)| sum += 1.0 * weight }
|
668
|
+
end
|
669
|
+
|
670
|
+
def get_next_value
|
671
|
+
assigned = @buckets.max_by do |(group, weight)|
|
672
|
+
expected = @buckets[group] / total_weight * size
|
673
|
+
actual = @current_population[group]
|
674
|
+
|
675
|
+
diff = expected - actual
|
676
|
+
if diff > 0
|
677
|
+
rand**(1.0 / diff)
|
678
|
+
else
|
679
|
+
-rand**(- 1.0 / @buckets[group])
|
680
|
+
end
|
681
|
+
end.first
|
682
|
+
|
683
|
+
@current_population[assigned] += 1
|
684
|
+
@size += 1
|
685
|
+
|
686
|
+
assigned
|
687
|
+
end
|
688
|
+
|
689
|
+
private
|
690
|
+
|
691
|
+
def sanitize_initial_population(buckets, dist)
|
692
|
+
dist = dist.to_h
|
693
|
+
|
694
|
+
zero_distribution = buckets.keys.reduce({}) { |h, group| h[group] = 0; h }
|
695
|
+
zero_distribution.merge(dist.select { |k,v| buckets.keys.include? k })
|
696
|
+
end
|
697
|
+
end
|
698
|
+
|
699
|
+
|
496
700
|
end
|
497
701
|
end
|
data/lib/remi/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.31
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bond
|
@@ -193,11 +193,14 @@ files:
|
|
193
193
|
- features/step_definitions/remi_step.rb
|
194
194
|
- features/support/env.rb
|
195
195
|
- features/support/env_app.rb
|
196
|
+
- features/transforms/data_frame_sieve.feature
|
196
197
|
- features/transforms/date_diff.feature
|
197
198
|
- features/transforms/nvl.feature
|
198
199
|
- features/transforms/parse_date.feature
|
200
|
+
- features/transforms/partitioner.feature
|
199
201
|
- features/transforms/prefix.feature
|
200
202
|
- features/transforms/truncate.feature
|
203
|
+
- features/transforms/truthy.feature
|
201
204
|
- jobs/aggregate_job.rb
|
202
205
|
- jobs/all_jobs_shared.rb
|
203
206
|
- jobs/copy_source_job.rb
|
@@ -206,12 +209,15 @@ files:
|
|
206
209
|
- jobs/parameters_job.rb
|
207
210
|
- jobs/sample_job.rb
|
208
211
|
- jobs/sftp_file_target_job.rb
|
212
|
+
- jobs/transforms/data_frame_sieve_job.rb
|
209
213
|
- jobs/transforms/date_diff_job.rb
|
210
214
|
- jobs/transforms/nvl_job.rb
|
211
215
|
- jobs/transforms/parse_date_job.rb
|
216
|
+
- jobs/transforms/partitioner_job.rb
|
212
217
|
- jobs/transforms/prefix_job.rb
|
213
218
|
- jobs/transforms/transform_jobs.rb
|
214
219
|
- jobs/transforms/truncate_job.rb
|
220
|
+
- jobs/transforms/truthy_job.rb
|
215
221
|
- lib/remi.rb
|
216
222
|
- lib/remi/cli.rb
|
217
223
|
- lib/remi/cucumber.rb
|
@@ -277,11 +283,14 @@ test_files:
|
|
277
283
|
- features/step_definitions/remi_step.rb
|
278
284
|
- features/support/env.rb
|
279
285
|
- features/support/env_app.rb
|
286
|
+
- features/transforms/data_frame_sieve.feature
|
280
287
|
- features/transforms/date_diff.feature
|
281
288
|
- features/transforms/nvl.feature
|
282
289
|
- features/transforms/parse_date.feature
|
290
|
+
- features/transforms/partitioner.feature
|
283
291
|
- features/transforms/prefix.feature
|
284
292
|
- features/transforms/truncate.feature
|
293
|
+
- features/transforms/truthy.feature
|
285
294
|
- spec/extractor/sftp_file_spec.rb
|
286
295
|
- spec/metadata_spec.rb
|
287
296
|
- spec/remi_spec.rb
|