remi 0.2.30 → 0.2.31
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/features/step_definitions/remi_step.rb +19 -4
- data/features/transforms/data_frame_sieve.feature +38 -0
- data/features/transforms/partitioner.feature +72 -0
- data/features/transforms/truthy.feature +52 -0
- data/jobs/transforms/data_frame_sieve_job.rb +30 -0
- data/jobs/transforms/partitioner_job.rb +35 -0
- data/jobs/transforms/truthy_job.rb +21 -0
- data/lib/remi/cucumber/business_rules.rb +14 -1
- data/lib/remi/source_to_target_map.rb +2 -2
- data/lib/remi/transform.rb +205 -1
- data/lib/remi/version.rb +1 -1
- metadata +11 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7c7c81a972d6fc604a4761b7fff19a26ec187065
|
4
|
+
data.tar.gz: f07165f263dee57adb1802182cf72551d3326ab4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d4ea4cde266110c374b75ef527f0ec71a674bba865c3da46107687fddb4871b3fda5aa601c44cd0fad5857038d8dff1c4dce297cf1d710ca335e2afafaa882dd
|
7
|
+
data.tar.gz: 7a25d3e85b4ae727998ff196b79395db40bc0d584c04c061069c4d0a457bd79530f7a66b4ce3480df739ebfcef6dda7ef011c5e8171f76bac2326ee392e1d01a
|
data/Gemfile.lock
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
GIT
|
2
2
|
remote: git@github.com:inside-track/daru.git
|
3
|
-
revision:
|
3
|
+
revision: 80b8024102bdb31ef402cb8d6c5cef86dd31cd41
|
4
4
|
branch: itk-master
|
5
5
|
specs:
|
6
6
|
daru (0.1.2)
|
@@ -19,7 +19,7 @@ GIT
|
|
19
19
|
PATH
|
20
20
|
remote: .
|
21
21
|
specs:
|
22
|
-
remi (0.2.30)
|
22
|
+
remi (0.2.31)
|
23
23
|
activesupport (~> 4.2)
|
24
24
|
bond (~> 0.5)
|
25
25
|
cucumber (~> 2.1)
|
@@ -65,6 +65,9 @@ Then /^the file with the latest date stamp will be downloaded for processing$/ d
|
|
65
65
|
expect(@brt.source.extract).to match_array Array(@brt.filestore.latest)
|
66
66
|
end
|
67
67
|
|
68
|
+
Then /^files will be downloaded for processing$/ do
|
69
|
+
end
|
70
|
+
|
68
71
|
Then /^no files will be downloaded for processing$/ do
|
69
72
|
@brt.filestore.generate
|
70
73
|
@brt.source.mock_extractor(@brt.filestore)
|
@@ -415,13 +418,16 @@ Then /^the source field '([^']+)' is prefixed with "([^"]*)" and loaded into the
|
|
415
418
|
source_name, source_field_name = @brt.sources.parse_full_field(source_field)
|
416
419
|
target_names, target_field_name = @brt.targets.parse_full_field(target_field, multi: true)
|
417
420
|
|
418
|
-
prefixed_source =
|
421
|
+
prefixed_source = @brt.sources[source_name].fields[source_field_name].values.map do |value|
|
422
|
+
"#{prefix}#{value}"
|
423
|
+
end.uniq.sort
|
419
424
|
|
420
425
|
@brt.run_transforms
|
421
|
-
Array(target_names).
|
422
|
-
|
423
|
-
end
|
426
|
+
results = Array(target_names).map do |target_name|
|
427
|
+
@brt.targets[target_name].fields[target_field_name].values.uniq
|
428
|
+
end.flatten.uniq.sort
|
424
429
|
|
430
|
+
expect(results).to eq prefixed_source
|
425
431
|
end
|
426
432
|
|
427
433
|
Then /^the source field is prefixed with "([^"]*)" and loaded into the target field '([^']+)'$/ do |prefix, target_field|
|
@@ -633,6 +639,15 @@ Then /^a target record is not created$/ do
|
|
633
639
|
end
|
634
640
|
|
635
641
|
|
642
|
+
### Setting up data for multiple records
|
643
|
+
|
644
|
+
Given /^the source field '([^']+)' is a unique integer$/ do |source_field|
|
645
|
+
step "the source field '#{source_field}'"
|
646
|
+
source_name, source_field_name = @brt.sources.parse_full_field(source_field)
|
647
|
+
|
648
|
+
@brt.sources[source_name].unique_integer_field(source_field_name)
|
649
|
+
end
|
650
|
+
|
636
651
|
### Record counting
|
637
652
|
|
638
653
|
Then /^the target has (\d+) record(?:s|)$/ do |nrecords|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
Feature: Tests the DataFrameSieve transform
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given the job is 'Data Frame Sieve'
|
5
|
+
And the job source 'Source Data'
|
6
|
+
And the job source 'Sieve'
|
7
|
+
And the job target 'Target Data'
|
8
|
+
|
9
|
+
And the source 'Source Data'
|
10
|
+
And the source 'Sieve'
|
11
|
+
And the target 'Target Data'
|
12
|
+
|
13
|
+
|
14
|
+
Scenario: A slightly complicated sieve.
|
15
|
+
|
16
|
+
Given the following example for 'Sieve':
|
17
|
+
| level | program | contact | group |
|
18
|
+
| Undergrad | NURS | \\nil | intensive |
|
19
|
+
| Undergrad | \\nil | true | intensive |
|
20
|
+
| Undergrad | \\nil | false | base |
|
21
|
+
| Grad | ENG | true | intensive |
|
22
|
+
| \\nil | \\nil | \\nil | base |
|
23
|
+
|
24
|
+
And the following example for 'Source Data':
|
25
|
+
| id | level | program | contact |
|
26
|
+
| 1 | Undergrad | CHEM | false |
|
27
|
+
| 2 | Undergrad | CHEM | true |
|
28
|
+
| 3 | Grad | CHEM | true |
|
29
|
+
| 4 | Undergrad | NURS | false |
|
30
|
+
| 5 | Unknown | CHEM | true |
|
31
|
+
|
32
|
+
Then the target should match the example:
|
33
|
+
| id | level | program | contact | group |
|
34
|
+
| 1 | Undergrad | CHEM | false | base |
|
35
|
+
| 2 | Undergrad | CHEM | true | intensive |
|
36
|
+
| 3 | Grad | CHEM | true | base |
|
37
|
+
| 4 | Undergrad | NURS | false | intensive |
|
38
|
+
| 5 | Unknown | CHEM | true | base |
|
@@ -0,0 +1,72 @@
|
|
1
|
+
Feature: Tests the Partitioner transform
|
2
|
+
The partitioner keeps track of which groups it has assigned records to
|
3
|
+
in order to keep the distribution of records into groups as precise as
|
4
|
+
possible.
|
5
|
+
|
6
|
+
|
7
|
+
Background:
|
8
|
+
Given the job is 'Partitioner'
|
9
|
+
And the job source 'Source Data'
|
10
|
+
And the job source 'Current Population'
|
11
|
+
And the job source 'Distribution'
|
12
|
+
And the job target 'Target Data'
|
13
|
+
|
14
|
+
And the source 'Source Data'
|
15
|
+
And the source 'Current Population'
|
16
|
+
And the source 'Distribution'
|
17
|
+
And the target 'Target Data'
|
18
|
+
|
19
|
+
|
20
|
+
|
21
|
+
Scenario: Partitioning records into groups with no prior population
|
22
|
+
|
23
|
+
Given the following example for 'Source Data':
|
24
|
+
| id |
|
25
|
+
| 1 |
|
26
|
+
| 2 |
|
27
|
+
| 3 |
|
28
|
+
| 4 |
|
29
|
+
|
30
|
+
And the following example for 'Distribution':
|
31
|
+
| group | weight |
|
32
|
+
| A | 0.5 |
|
33
|
+
| B | 0.5 |
|
34
|
+
| C | 1 |
|
35
|
+
|
36
|
+
Then the target has 1 records where 'group' is "A"
|
37
|
+
Then the target has 1 records where 'group' is "B"
|
38
|
+
Then the target has 2 records where 'group' is "C"
|
39
|
+
|
40
|
+
|
41
|
+
Scenario: Partitioning records into groups with a prior population
|
42
|
+
|
43
|
+
Given the following example for 'Source Data':
|
44
|
+
| id |
|
45
|
+
| 1 |
|
46
|
+
| 2 |
|
47
|
+
| 3 |
|
48
|
+
| 4 |
|
49
|
+
| 5 |
|
50
|
+
|
51
|
+
And the following example for 'Distribution':
|
52
|
+
| group | weight |
|
53
|
+
| A | 0.5 |
|
54
|
+
| B | 0.5 |
|
55
|
+
| C | 1 |
|
56
|
+
|
57
|
+
And the following example for 'Current Population':
|
58
|
+
| group | count |
|
59
|
+
| A | 2 |
|
60
|
+
| B | 1 |
|
61
|
+
|
62
|
+
|
63
|
+
|
64
|
+
Then the target has 0 records where 'group' is "A"
|
65
|
+
Then the target has 1 records where 'group' is "B"
|
66
|
+
Then the target has 4 records where 'group' is "C"
|
67
|
+
|
68
|
+
|
69
|
+
# Scenario: Remainders
|
70
|
+
# When the target population is matched exactly, the next
|
71
|
+
# assignment is random (and weighted by the given weights).
|
72
|
+
# I don't know how to test this.
|
@@ -0,0 +1,52 @@
|
|
1
|
+
Feature: Tests the Truthy transform
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given the job is 'Truthy'
|
5
|
+
And the job source 'Source Data'
|
6
|
+
And the job target 'Target Data'
|
7
|
+
|
8
|
+
And the source 'Source Data'
|
9
|
+
And the target 'Target Data'
|
10
|
+
|
11
|
+
Scenario Outline: Truthy without allowing nils
|
12
|
+
|
13
|
+
Given the source field 'truthy'
|
14
|
+
And the target field 'no_nils'
|
15
|
+
|
16
|
+
When the source field has the value "<source>"
|
17
|
+
Then the target field is set to the value "<target>"
|
18
|
+
|
19
|
+
Examples:
|
20
|
+
| source | target |
|
21
|
+
| True | true |
|
22
|
+
| t | true |
|
23
|
+
| yEs | true |
|
24
|
+
| Y | true |
|
25
|
+
| 1 | true |
|
26
|
+
| Yessir | false |
|
27
|
+
| anything | false |
|
28
|
+
|
29
|
+
|
30
|
+
|
31
|
+
Scenario Outline: Truthy allowing nils
|
32
|
+
|
33
|
+
Given the source field 'truthy'
|
34
|
+
And the target field 'allow_nils'
|
35
|
+
|
36
|
+
When the source field has the value "<source>"
|
37
|
+
Then the target field is set to the value "<target>"
|
38
|
+
|
39
|
+
Examples:
|
40
|
+
| source | target |
|
41
|
+
| True | true |
|
42
|
+
| t | true |
|
43
|
+
| yEs | true |
|
44
|
+
| Y | true |
|
45
|
+
| 1 | true |
|
46
|
+
| Yessir | |
|
47
|
+
| anything | |
|
48
|
+
| FALSE | false |
|
49
|
+
| f | false |
|
50
|
+
| no | false |
|
51
|
+
| N | false |
|
52
|
+
| 0 | false |
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative '../all_jobs_shared'
|
2
|
+
|
3
|
+
class DataFrameSieveJob
|
4
|
+
include AllJobsShared
|
5
|
+
|
6
|
+
define_source :source_data, Remi::DataSource::DataFrame,
|
7
|
+
fields: {
|
8
|
+
:id => {},
|
9
|
+
:level => {},
|
10
|
+
:program => {},
|
11
|
+
:contact => {}
|
12
|
+
}
|
13
|
+
|
14
|
+
define_source :sieve, Remi::DataSource::DataFrame,
|
15
|
+
fields: {
|
16
|
+
:level => {},
|
17
|
+
:program => {},
|
18
|
+
:contact => {},
|
19
|
+
:group => {}
|
20
|
+
}
|
21
|
+
|
22
|
+
define_target :target_data, Remi::DataTarget::DataFrame
|
23
|
+
|
24
|
+
define_transform :main, sources: :source_data, targets: :target_data do
|
25
|
+
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
26
|
+
map source(:level, :program, :contact) .target(:group)
|
27
|
+
.transform(Remi::Transform::DataFrameSieve.new(sieve.df))
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require_relative '../all_jobs_shared'
|
2
|
+
|
3
|
+
class PartitionerJob
|
4
|
+
include AllJobsShared
|
5
|
+
|
6
|
+
define_source :source_data, Remi::DataSource::DataFrame,
|
7
|
+
fields: {
|
8
|
+
:id => {}
|
9
|
+
}
|
10
|
+
|
11
|
+
define_source :distribution, Remi::DataSource::DataFrame,
|
12
|
+
fields: {
|
13
|
+
:group => {},
|
14
|
+
:weight => {}
|
15
|
+
}
|
16
|
+
|
17
|
+
define_source :current_population, Remi::DataSource::DataFrame,
|
18
|
+
fields: {
|
19
|
+
:group => {},
|
20
|
+
:count => {}
|
21
|
+
}
|
22
|
+
|
23
|
+
define_target :target_data, Remi::DataTarget::DataFrame
|
24
|
+
|
25
|
+
define_transform :main, sources: :source_data, targets: :target_data do
|
26
|
+
|
27
|
+
distribution_hash = distribution.df.map(:row) { |row| [row[:group], row[:weight].to_f] }.to_h
|
28
|
+
current_population_hash = current_population.df.map(:row) { |row| [row[:group], row[:count].to_i] }.to_h
|
29
|
+
|
30
|
+
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
31
|
+
map source(nil) .target(:group)
|
32
|
+
.transform(Remi::Transform::Partitioner.new(buckets: distribution_hash, initial_population: current_population_hash))
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require_relative '../all_jobs_shared'
|
2
|
+
|
3
|
+
class TruthyJob
|
4
|
+
include AllJobsShared
|
5
|
+
|
6
|
+
define_source :source_data, Remi::DataSource::DataFrame,
|
7
|
+
fields: {
|
8
|
+
:truthy => {}
|
9
|
+
}
|
10
|
+
define_target :target_data, Remi::DataTarget::DataFrame
|
11
|
+
|
12
|
+
define_transform :main, sources: :source_data, targets: :target_data do
|
13
|
+
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
14
|
+
map source(:truthy) .target(:allow_nils)
|
15
|
+
.transform(Remi::Transform::Truthy.new(allow_nils: true))
|
16
|
+
|
17
|
+
map source(:truthy) .target(:no_nils)
|
18
|
+
.transform(Remi::Transform::Truthy.new)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -390,6 +390,12 @@ module Remi::BusinessRules
|
|
390
390
|
@data_subject.df.group_by(field_names).size * 1.0 / @data_subject.df.size
|
391
391
|
end
|
392
392
|
|
393
|
+
def unique_integer_field(field_name)
|
394
|
+
vector_name = fields[field_name].field_name
|
395
|
+
i = 0
|
396
|
+
@data_subject.df[vector_name].recode! { |v| i += 1 }
|
397
|
+
end
|
398
|
+
|
393
399
|
def mock_extractor(filestore)
|
394
400
|
extractor = class << @data_subject.extractor; self; end
|
395
401
|
|
@@ -526,7 +532,14 @@ module Remi::BusinessRules
|
|
526
532
|
df = Daru::DataFrame.new([], order: seed_hash.keys | table_headers)
|
527
533
|
@table.hashes.each do |example_row|
|
528
534
|
example_row_sym = example_row.reduce({}) do |h, (k,v)|
|
529
|
-
|
535
|
+
formula_value = ParseFormula.parse(v)
|
536
|
+
value = case formula_value
|
537
|
+
when '\nil'
|
538
|
+
nil
|
539
|
+
else
|
540
|
+
formula_value
|
541
|
+
end
|
542
|
+
h[k.symbolize(field_symbolizer)] = value
|
530
543
|
h
|
531
544
|
end
|
532
545
|
df.add_row(seed_hash.merge(example_row_sym))
|
@@ -84,7 +84,7 @@ module Remi
|
|
84
84
|
|
85
85
|
def do_map_single_source_and_target_vector
|
86
86
|
@target_df[@target_vectors.first] = @source_df[@source_vectors.first].recode do |vector_value|
|
87
|
-
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value
|
87
|
+
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value.nil? ? [nil] : value)) }
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
@@ -100,7 +100,7 @@ module Remi
|
|
100
100
|
end
|
101
101
|
|
102
102
|
work_vector.recode! do |vector_value|
|
103
|
-
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value
|
103
|
+
@transform_procs.reduce(vector_value) { |value, tform| tform.call(*(value.nil? ? [nil] : value)) }
|
104
104
|
end
|
105
105
|
|
106
106
|
@target_vectors.each_with_index do |target_vector, vector_idx|
|
data/lib/remi/transform.rb
CHANGED
@@ -483,7 +483,7 @@ module Remi
|
|
483
483
|
when :decimal
|
484
484
|
Float("%.#{scale}f" % Float(value))
|
485
485
|
when :date
|
486
|
-
Date.strptime(value, in_format)
|
486
|
+
value.is_a?(Date) ? value : Date.strptime(value, in_format) # value.is_a?(Date) is only needed because we stub date types with actual dates, rather than strings like we probably should
|
487
487
|
when :datetime
|
488
488
|
Time.strptime(value, in_format)
|
489
489
|
else
|
@@ -493,5 +493,209 @@ module Remi
|
|
493
493
|
end
|
494
494
|
end
|
495
495
|
|
496
|
+
|
497
|
+
|
498
|
+
|
499
|
+
|
500
|
+
# Public: Converts strings into booleans.
|
501
|
+
# Uses a regex to convert strings representing booleans to actual booleans.
|
502
|
+
# The truthy regex is /^(t|true|y|yes|1)$/i and the falsey regex is /^(f|false|n|no|0)$/i
|
503
|
+
#
|
504
|
+
# allow_nils - Specifies whether to allow the result to include nils. If this is set
|
505
|
+
# to false, then the value is only checked against the truthy regex and
|
506
|
+
# the returned value is false if it doesn't match. If allow_nils
|
507
|
+
# is set to true, then both the truthy and the falsey regexes are checked.
|
508
|
+
# If neither match, then the result is nil. (Default: false).
|
509
|
+
#
|
510
|
+
# Examples:
|
511
|
+
#
|
512
|
+
# Truthy.new.to_proc.call('True') # => true
|
513
|
+
# Truthy.new.to_proc.call('Yes') # => true
|
514
|
+
# Truthy.new.to_proc.call('y') # => true
|
515
|
+
# Truthy.new.to_proc.call('Yessire') # => false
|
516
|
+
# Truthy.new.to_proc.call('0') # => false
|
517
|
+
# Truthy.new.to_proc.call('Pineapple') # => false
|
518
|
+
# Truthy.new(allow_nils: true).to_proc.call('Pineapple') # => nil
|
519
|
+
class Truthy < Transform
|
520
|
+
def initialize(*args, allow_nils: false, **kargs, &block)
|
521
|
+
super
|
522
|
+
@allow_nils = allow_nils
|
523
|
+
|
524
|
+
@true_regex = /^(t|true|y|yes|1)$/i
|
525
|
+
@false_regex = /^(f|false|n|no|0)$/i
|
526
|
+
end
|
527
|
+
|
528
|
+
def match_true(value)
|
529
|
+
!!value.match(@true_regex)
|
530
|
+
end
|
531
|
+
|
532
|
+
def match_false(value)
|
533
|
+
!!value.match(@false_regex)
|
534
|
+
end
|
535
|
+
|
536
|
+
def transform(value)
|
537
|
+
value = value.to_s
|
538
|
+
|
539
|
+
if @allow_nils
|
540
|
+
if match_true(value)
|
541
|
+
true
|
542
|
+
elsif match_false(value)
|
543
|
+
false
|
544
|
+
else
|
545
|
+
nil
|
546
|
+
end
|
547
|
+
else
|
548
|
+
match_true(value)
|
549
|
+
end
|
550
|
+
end
|
551
|
+
end
|
552
|
+
|
553
|
+
|
554
|
+
# Public: Applies a DataFrame grouping sieve.
|
555
|
+
#
|
556
|
+
# The DataFrame sieve can be used to simplify very complex nested
|
557
|
+
# if-then logic to group data into buckets. Given a DataFrame
|
558
|
+
# with N columns, the first N-1 columns represent the variables
|
559
|
+
# needed to group data into buckets. The last column is the
|
560
|
+
# desired group. The sieve then progresses down the rows of the
|
561
|
+
# DataFrame and checks to see if the input data matches the values
|
562
|
+
# in the columns of the sieve. Nils in the sieve are treated as
|
563
|
+
# wildcards and match anything. The first row that matches wins
|
564
|
+
# and the sieve progression stops.
|
565
|
+
#
|
566
|
+
# sieve_df - The sieve, defined as a dataframe. The arguments
|
567
|
+
# to the transform must appear in the same order as the
|
568
|
+
# first N-1 columns of the sieve.
|
569
|
+
#
|
570
|
+
#
|
571
|
+
# Examples:
|
572
|
+
#
|
573
|
+
# # This sieve captures the following business logic
|
574
|
+
# # 1 - All Non-Graduate Nursing, regardless of contact, gets assigned to the :intensive group.
|
575
|
+
# # 2 - All Undergraduate programs with contact get assigned to the :intensive group.
|
576
|
+
# # 3 - All Undergraduate programs without a contact get assigned to the :base group.
|
577
|
+
# # 4 - All Graduate engineering programs with a contact get assigned to the :intensive group.
|
578
|
+
# # 5 - All other programs get assigned to the :base group
|
579
|
+
# sieve_df = Daru::DataFrame.new([
|
580
|
+
# [ 'Undergrad' , 'NURS' , nil , :intensive ],
|
581
|
+
# [ 'Undergrad' , nil , true , :intensive ],
|
582
|
+
# [ 'Undergrad' , nil , false , :base ],
|
583
|
+
# [ 'Grad' , 'ENG' , true , :intensive ],
|
584
|
+
# [ nil , nil , nil , :base ],
|
585
|
+
# ].transpose,
|
586
|
+
# order: [:level, :program, :contact, :group]
|
587
|
+
# )
|
588
|
+
#
|
589
|
+
# test_df = Daru::DataFrame.new([
|
590
|
+
# ['Undergrad' , 'CHEM' , false],
|
591
|
+
# ['Undergrad' , 'CHEM' , true],
|
592
|
+
# ['Grad' , 'CHEM' , true],
|
593
|
+
# ['Undergrad' , 'NURS' , false],
|
594
|
+
# ['Unknown' , 'CHEM' , true],
|
595
|
+
# ].transpose,
|
596
|
+
# order: [:level, :program, :contact]
|
597
|
+
# )
|
598
|
+
#
|
599
|
+
# Remi::SourceToTargetMap.apply(test_df) do
|
600
|
+
# map source(:level, :program, :contact,) .target(:group)
|
601
|
+
# .transform(Remi::Transform::DataFrameSieve.new(sieve_df))
|
602
|
+
# end
|
603
|
+
#
|
604
|
+
# test_df
|
605
|
+
# # => #<Daru::DataFrame:70099624408400 @name = d30888fd-6ca8-48dd-9be3-558f81ae1015 @size = 5>
|
606
|
+
# level program contact group
|
607
|
+
# 0 Undergrad CHEM nil base
|
608
|
+
# 1 Undergrad CHEM true intensive
|
609
|
+
# 2 Grad CHEM true base
|
610
|
+
# 3 Undergrad NURS nil intensive
|
611
|
+
# 4 Unknown CHEM true base
|
612
|
+
class DataFrameSieve < Transform
|
613
|
+
def initialize(sieve_df, *args, **kargs, &block)
|
614
|
+
super
|
615
|
+
@sieve_df = sieve_df.transpose.to_h.values
|
616
|
+
end
|
617
|
+
|
618
|
+
def transform(*values)
|
619
|
+
sieve_keys = @sieve_df.first.index.to_a
|
620
|
+
sieve_result_key = sieve_keys.pop
|
621
|
+
|
622
|
+
@sieve_df.each.find do |sieve_row|
|
623
|
+
match_row = true
|
624
|
+
sieve_keys.each_with_index do |key,idx|
|
625
|
+
match_row &&= sieve_row[key].nil? || sieve_row[key] == values[idx]
|
626
|
+
end
|
627
|
+
match_row
|
628
|
+
end[sieve_result_key]
|
629
|
+
end
|
630
|
+
end
|
631
|
+
|
632
|
+
|
633
|
+
# Public: Used to partition elements into groups (buckets).
|
634
|
+
#
|
635
|
+
# buckets - A hash where the keys are groups and the values are weights or percentages.
|
636
|
+
# initial_population - A hashable object holding a count of the current number of
|
637
|
+
# elements in each bucket.
|
638
|
+
#
|
639
|
+
# Example:
|
640
|
+
#
|
641
|
+
# # The current population has 2 records in the A bucket and 3 in B
|
642
|
+
# current_pop = Daru::Vector.new([2,3], index: ['A', 'B'])
|
643
|
+
#
|
644
|
+
# # We want to generate 7 new records that will evenly populate the A, B, and C buckets, given the current populations.
|
645
|
+
# part = Remi::Transform::Partitioner.new(buckets: { 'A' => 1, 'B' => 1,'C' => 1 }, initial_population: current_pop)
|
646
|
+
#
|
647
|
+
# 1.upto(7).map { |iter| part.call } # => ["C", "C", "A", "C", "C", "B", "A"]
|
648
|
+
class Partitioner < Transform
|
649
|
+
def initialize(buckets:, initial_population: {}, **kargs, &block)
|
650
|
+
super
|
651
|
+
@buckets = buckets
|
652
|
+
@current_population = sanitize_initial_population(buckets, initial_population)
|
653
|
+
end
|
654
|
+
|
655
|
+
attr_reader :buckets
|
656
|
+
attr_reader :current_population
|
657
|
+
|
658
|
+
def transform(*values)
|
659
|
+
get_next_value
|
660
|
+
end
|
661
|
+
|
662
|
+
def size
|
663
|
+
@size ||= @current_population.reduce(0) { |sum, (group, n)| sum += n }
|
664
|
+
end
|
665
|
+
|
666
|
+
def total_weight
|
667
|
+
@total_weight ||= @buckets.reduce(0) { |sum, (bucket, weight)| sum += 1.0 * weight }
|
668
|
+
end
|
669
|
+
|
670
|
+
def get_next_value
|
671
|
+
assigned = @buckets.max_by do |(group, weight)|
|
672
|
+
expected = @buckets[group] / total_weight * size
|
673
|
+
actual = @current_population[group]
|
674
|
+
|
675
|
+
diff = expected - actual
|
676
|
+
if diff > 0
|
677
|
+
rand**(1.0 / diff)
|
678
|
+
else
|
679
|
+
-rand**(- 1.0 / @buckets[group])
|
680
|
+
end
|
681
|
+
end.first
|
682
|
+
|
683
|
+
@current_population[assigned] += 1
|
684
|
+
@size += 1
|
685
|
+
|
686
|
+
assigned
|
687
|
+
end
|
688
|
+
|
689
|
+
private
|
690
|
+
|
691
|
+
def sanitize_initial_population(buckets, dist)
|
692
|
+
dist = dist.to_h
|
693
|
+
|
694
|
+
zero_distribution = buckets.keys.reduce({}) { |h, group| h[group] = 0; h }
|
695
|
+
zero_distribution.merge(dist.select { |k,v| buckets.keys.include? k })
|
696
|
+
end
|
697
|
+
end
|
698
|
+
|
699
|
+
|
496
700
|
end
|
497
701
|
end
|
data/lib/remi/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.30
|
4
|
+
version: 0.2.31
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bond
|
@@ -193,11 +193,14 @@ files:
|
|
193
193
|
- features/step_definitions/remi_step.rb
|
194
194
|
- features/support/env.rb
|
195
195
|
- features/support/env_app.rb
|
196
|
+
- features/transforms/data_frame_sieve.feature
|
196
197
|
- features/transforms/date_diff.feature
|
197
198
|
- features/transforms/nvl.feature
|
198
199
|
- features/transforms/parse_date.feature
|
200
|
+
- features/transforms/partitioner.feature
|
199
201
|
- features/transforms/prefix.feature
|
200
202
|
- features/transforms/truncate.feature
|
203
|
+
- features/transforms/truthy.feature
|
201
204
|
- jobs/aggregate_job.rb
|
202
205
|
- jobs/all_jobs_shared.rb
|
203
206
|
- jobs/copy_source_job.rb
|
@@ -206,12 +209,15 @@ files:
|
|
206
209
|
- jobs/parameters_job.rb
|
207
210
|
- jobs/sample_job.rb
|
208
211
|
- jobs/sftp_file_target_job.rb
|
212
|
+
- jobs/transforms/data_frame_sieve_job.rb
|
209
213
|
- jobs/transforms/date_diff_job.rb
|
210
214
|
- jobs/transforms/nvl_job.rb
|
211
215
|
- jobs/transforms/parse_date_job.rb
|
216
|
+
- jobs/transforms/partitioner_job.rb
|
212
217
|
- jobs/transforms/prefix_job.rb
|
213
218
|
- jobs/transforms/transform_jobs.rb
|
214
219
|
- jobs/transforms/truncate_job.rb
|
220
|
+
- jobs/transforms/truthy_job.rb
|
215
221
|
- lib/remi.rb
|
216
222
|
- lib/remi/cli.rb
|
217
223
|
- lib/remi/cucumber.rb
|
@@ -277,11 +283,14 @@ test_files:
|
|
277
283
|
- features/step_definitions/remi_step.rb
|
278
284
|
- features/support/env.rb
|
279
285
|
- features/support/env_app.rb
|
286
|
+
- features/transforms/data_frame_sieve.feature
|
280
287
|
- features/transforms/date_diff.feature
|
281
288
|
- features/transforms/nvl.feature
|
282
289
|
- features/transforms/parse_date.feature
|
290
|
+
- features/transforms/partitioner.feature
|
283
291
|
- features/transforms/prefix.feature
|
284
292
|
- features/transforms/truncate.feature
|
293
|
+
- features/transforms/truthy.feature
|
285
294
|
- spec/extractor/sftp_file_spec.rb
|
286
295
|
- spec/metadata_spec.rb
|
287
296
|
- spec/remi_spec.rb
|