remi 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.bundle/config +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +45 -5
- data/README.md +245 -0
- data/features/step_definitions/remi_step.rb +16 -0
- data/jobs/sub_job_example_job.rb +5 -5
- data/lib/remi.rb +4 -1
- data/lib/remi/data_subject.rb +10 -1
- data/lib/remi/data_subjects/file_system.rb +31 -1
- data/lib/remi/data_subjects/gsheet.rb +140 -0
- data/lib/remi/data_subjects/sftp_file.rb +1 -0
- data/lib/remi/data_subjects/sub_job.rb +13 -4
- data/lib/remi/encoder.rb +1 -1
- data/lib/remi/job.rb +9 -1
- data/lib/remi/job/parameters.rb +8 -3
- data/lib/remi/job/sub_job.rb +14 -8
- data/lib/remi/loader.rb +14 -2
- data/lib/remi/testing/business_rules.rb +12 -9
- data/lib/remi/transform.rb +9 -0
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject_spec.rb +23 -5
- data/spec/data_subjects/file_system_spec.rb +43 -9
- data/spec/data_subjects/gsheet_spec.rb +133 -0
- data/spec/data_subjects/sub_job_spec.rb +40 -8
- data/spec/job_spec.rb +58 -15
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea2d2971479e9e0dfcc5de4dd01ac13f5274a6f8
|
4
|
+
data.tar.gz: ea5e3a3280613d00ae29f5265c342740973ea57c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a09f8f926a99891356bcd6363ef4729b3a27bcf9cbfacce2a6bc8a8b60a262cfe83fe40d1e0f1f0c79945693aacb0fb1099b5bad299e022511846730e98642fe
|
7
|
+
data.tar.gz: d160e9840162558b1d9e203825f8e724ce1626e9f8a6922b86f894249b78c5c6680b37fef7e24574d794aac41db025bd887c707df35f6485a3540914552a3293
|
data/.bundle/config
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
---
|
2
|
-
BUNDLE_DISABLE_SHARED_GEMS:
|
2
|
+
BUNDLE_DISABLE_SHARED_GEMS: "true"
|
data/Gemfile
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
source 'https://rubygems.org'
|
3
3
|
|
4
4
|
gemspec
|
5
|
+
gem 'google-api-client', '~> 0.9'
|
5
6
|
gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1.4.1-Remi'
|
6
7
|
gem 'restforce', '~> 2.1'
|
7
8
|
gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
|
data/Gemfile.lock
CHANGED
@@ -18,7 +18,7 @@ GIT
|
|
18
18
|
PATH
|
19
19
|
remote: .
|
20
20
|
specs:
|
21
|
-
remi (0.3.0)
|
21
|
+
remi (0.3.1)
|
22
22
|
activesupport (~> 4.2)
|
23
23
|
bond (~> 0.5)
|
24
24
|
cucumber (~> 2.1)
|
@@ -39,6 +39,7 @@ GEM
|
|
39
39
|
minitest (~> 5.1)
|
40
40
|
thread_safe (~> 0.3, >= 0.3.4)
|
41
41
|
tzinfo (~> 1.1)
|
42
|
+
addressable (2.4.0)
|
42
43
|
aws-sdk (2.3.5)
|
43
44
|
aws-sdk-resources (= 2.3.5)
|
44
45
|
aws-sdk-core (2.3.5)
|
@@ -69,7 +70,26 @@ GEM
|
|
69
70
|
faraday (>= 0.7.4, < 0.10)
|
70
71
|
gherkin (3.2.0)
|
71
72
|
github-markup (1.4.0)
|
73
|
+
google-api-client (0.9.15)
|
74
|
+
addressable (~> 2.3)
|
75
|
+
googleauth (~> 0.5)
|
76
|
+
httpclient (~> 2.7)
|
77
|
+
hurley (~> 0.1)
|
78
|
+
memoist (~> 0.11)
|
79
|
+
mime-types (>= 1.6)
|
80
|
+
representable (~> 2.3.0)
|
81
|
+
retriable (~> 2.0)
|
82
|
+
googleauth (0.5.1)
|
83
|
+
faraday (~> 0.9)
|
84
|
+
jwt (~> 1.4)
|
85
|
+
logging (~> 2.0)
|
86
|
+
memoist (~> 0.12)
|
87
|
+
multi_json (~> 1.11)
|
88
|
+
os (~> 0.9)
|
89
|
+
signet (~> 0.7)
|
72
90
|
hashie (3.4.3)
|
91
|
+
httpclient (2.8.2.4)
|
92
|
+
hurley (0.2)
|
73
93
|
i18n (0.7.0)
|
74
94
|
iruby (0.2.7)
|
75
95
|
bond (~> 0.5)
|
@@ -80,6 +100,15 @@ GEM
|
|
80
100
|
json_pure (>= 1.8.1)
|
81
101
|
json (1.8.3)
|
82
102
|
json_pure (1.8.3)
|
103
|
+
jwt (1.5.6)
|
104
|
+
little-plugger (1.1.4)
|
105
|
+
logging (2.1.0)
|
106
|
+
little-plugger (~> 1.1)
|
107
|
+
multi_json (~> 1.10)
|
108
|
+
memoist (0.15.0)
|
109
|
+
mime-types (3.1)
|
110
|
+
mime-types-data (~> 3.2015)
|
111
|
+
mime-types-data (3.2016.0521)
|
83
112
|
mimemagic (0.3.1)
|
84
113
|
minitest (5.8.4)
|
85
114
|
multi_json (1.11.2)
|
@@ -88,16 +117,20 @@ GEM
|
|
88
117
|
net-sftp (2.1.2)
|
89
118
|
net-ssh (>= 2.6.5)
|
90
119
|
net-ssh (3.1.1)
|
120
|
+
os (0.9.6)
|
91
121
|
pg (0.18.4)
|
92
122
|
rbczmq (1.7.9)
|
93
123
|
redcarpet (3.3.4)
|
94
124
|
regex_sieve (0.1.0)
|
95
125
|
regexp-examples (1.2.0)
|
126
|
+
representable (2.3.0)
|
127
|
+
uber (~> 0.0.7)
|
96
128
|
restforce (2.2.0)
|
97
129
|
faraday (~> 0.9.0)
|
98
130
|
faraday_middleware (>= 0.8.8)
|
99
131
|
hashie (>= 1.2.0, < 4.0)
|
100
132
|
json (>= 1.7.5, < 1.9.0)
|
133
|
+
retriable (2.1.0)
|
101
134
|
rspec (3.4.0)
|
102
135
|
rspec-core (~> 3.4.0)
|
103
136
|
rspec-expectations (~> 3.4.0)
|
@@ -111,9 +144,15 @@ GEM
|
|
111
144
|
diff-lcs (>= 1.2.0, < 2.0)
|
112
145
|
rspec-support (~> 3.4.0)
|
113
146
|
rspec-support (3.4.1)
|
147
|
+
signet (0.7.3)
|
148
|
+
addressable (~> 2.3)
|
149
|
+
faraday (~> 0.9)
|
150
|
+
jwt (~> 1.5)
|
151
|
+
multi_json (~> 1.10)
|
114
152
|
thread_safe (0.3.5)
|
115
153
|
tzinfo (1.2.2)
|
116
154
|
thread_safe (~> 0.1)
|
155
|
+
uber (0.0.15)
|
117
156
|
xml-simple (1.1.5)
|
118
157
|
yard (0.9.0)
|
119
158
|
|
@@ -123,13 +162,14 @@ PLATFORMS
|
|
123
162
|
DEPENDENCIES
|
124
163
|
aws-sdk (~> 2.3)
|
125
164
|
daru (= 0.1.4.1)!
|
126
|
-
github-markup
|
165
|
+
github-markup (~> 1.4)
|
166
|
+
google-api-client (~> 0.9)
|
127
167
|
iruby (= 0.2.7)
|
128
|
-
redcarpet
|
168
|
+
redcarpet (~> 3.3)
|
129
169
|
remi!
|
130
170
|
restforce (~> 2.1)
|
131
171
|
salesforce_bulk_api!
|
132
|
-
yard
|
172
|
+
yard (~> 0.9)
|
133
173
|
|
134
174
|
BUNDLED WITH
|
135
|
-
1.
|
175
|
+
1.13.5
|
data/README.md
CHANGED
@@ -24,6 +24,11 @@ Remi will follow [semantic versioning](http://semver.org/) principles.
|
|
24
24
|
Of course, while we're still on major version zero, little effort will
|
25
25
|
be made to maintain backward compatibility.
|
26
26
|
|
27
|
+
The data transformation layer is built on top of
|
28
|
+
[Daru dataframe](https://github.com/v0dro/daru). Familiarity with
|
29
|
+
Daru dataframes is essential for writing complex transformations in
|
30
|
+
Remi.
|
31
|
+
|
27
32
|
## Getting Started
|
28
33
|
|
29
34
|
Add the gem to your Gemfile, `bundle install`, and then initialize your repository as
|
@@ -40,8 +45,248 @@ sure this works by running
|
|
40
45
|
|
41
46
|
All of the test should pass.
|
42
47
|
|
48
|
+
## Remi Jobs
|
49
|
+
|
50
|
+
A Remi job describes the data sources that will be used to collect
|
51
|
+
data, the transformations that will be performed on the data, and the
|
52
|
+
data targets that will be populated when all transformations are
|
53
|
+
complete. With Remi, an ETL process is defined in a class that
|
54
|
+
inherits from the `Remi::Job` class.
|
55
|
+
|
56
|
+
### Hello World
|
57
|
+
|
58
|
+
A very simple "Hello World" example of a Remi job would be
|
59
|
+
|
60
|
+
````ruby
|
61
|
+
class HelloWorldJob < Remi::Job
|
62
|
+
transform :say_hi do
|
63
|
+
puts "Hello World"
|
64
|
+
end
|
65
|
+
end
|
66
|
+
````
|
67
|
+
|
68
|
+
This job doesn't make use of any data subjects (data sources or data
|
69
|
+
targets), but it does define a single data transform called `:say_hi`.
|
70
|
+
The full job can be executed by calling the `#execute` method on an
|
71
|
+
instance of the `HelloWorldJob` class
|
72
|
+
|
73
|
+
````ruby
|
74
|
+
job = HelloWorldJob.new
|
75
|
+
job.execute
|
76
|
+
#=> "Hello World"
|
77
|
+
````
|
78
|
+
|
79
|
+
The transform called `say_hi` is just a method of the `HelloWorldJob`
|
80
|
+
class representing a job transform object. Multiple transforms can be
|
81
|
+
defined in a Remi job. To execute a specific transform we can call that transform by
|
82
|
+
name using
|
83
|
+
|
84
|
+
````ruby
|
85
|
+
job = HelloWorldJob.new
|
86
|
+
job.say_hi.execute
|
87
|
+
#=> "Hello World"
|
88
|
+
````
|
89
|
+
|
90
|
+
### A more complete example
|
91
|
+
|
92
|
+
Suppose we have a database containing data on beer sales. It's a
|
93
|
+
normalized database where we store data on individual beers sold in a
|
94
|
+
`beer_sales_fact` table and information on the details of the beer in
|
95
|
+
a `beers_dim` table. We'd like to extract data from both of these
|
96
|
+
sources, combine them into a single flattened table and save it as a
|
97
|
+
CSV file. This operation could be performed with the following Remi
|
98
|
+
job. (Of course, if this were a real world problem, we'd do the join
|
99
|
+
in the database before extracting; this is a contrived example to show
|
100
|
+
how one can combine data from multiple arbitrary sources).
|
101
|
+
|
102
|
+
|
103
|
+
````ruby
|
104
|
+
class DenormalizeBeersJob < Remi::Job
|
105
|
+
source :beer_sales_fact do
|
106
|
+
extractor Remi::Extractor::Postgres.new(
|
107
|
+
credentials: {
|
108
|
+
dbname: 'my_local_db'
|
109
|
+
},
|
110
|
+
query: 'SELECT beer_id, sold_date, quantity FROM beer_sales_fact'
|
111
|
+
)
|
112
|
+
parser Remi::Parser::Postgres.new
|
113
|
+
|
114
|
+
fields(
|
115
|
+
{
|
116
|
+
:beer_id => {},
|
117
|
+
:sold_at => { type: :date, in_format: '%Y-%m-%d' },
|
118
|
+
:quantity => { type: :integer }
|
119
|
+
}
|
120
|
+
)
|
121
|
+
end
|
122
|
+
|
123
|
+
source :beers_dim do
|
124
|
+
extractor Remi::Extractor::Postgres.new(
|
125
|
+
credentials: {
|
126
|
+
dbname: 'my_local_db'
|
127
|
+
},
|
128
|
+
query: 'SELECT beer_id, name, price_per_unit FROM beers_dim'
|
129
|
+
)
|
130
|
+
parser Remi::Parser::Postgres.new
|
131
|
+
|
132
|
+
fields(
|
133
|
+
{
|
134
|
+
:beer_id => {},
|
135
|
+
:name => {},
|
136
|
+
:price_per_unit => { type: :decimal, scale: 2 }
|
137
|
+
}
|
138
|
+
)
|
139
|
+
end
|
140
|
+
|
141
|
+
target :flat_beer_file do
|
142
|
+
encoder Remi::Encoder::CsvFile.new
|
143
|
+
loader Remi::Loader::LocalFile.new(
|
144
|
+
path: 'flat_beers.csv'
|
145
|
+
)
|
146
|
+
end
|
147
|
+
|
148
|
+
transform :type_enforcement do
|
149
|
+
beer_sales_fact.enforce_types
|
150
|
+
beers_dim.enforce_types
|
151
|
+
end
|
152
|
+
|
153
|
+
transform :flatten do
|
154
|
+
flat_beer_file.df = beer_sales_fact.df.join(beers_dim.df, on: [:beer_id], how: :inner)
|
155
|
+
|
156
|
+
Remi::SourceToTargetMap.apply(flat_beer_file.df) do
|
157
|
+
map source(:quantity, :price_per_unit) .target(:total_price)
|
158
|
+
.transform(->(row) {
|
159
|
+
row[:quantity] * row[:price_per_unit]
|
160
|
+
})
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
````
|
165
|
+
|
166
|
+
### Components of a Remi Job
|
167
|
+
|
168
|
+
A Remi job is composed of one or more of the following elements, which are described
|
169
|
+
in more detail below. All of these elements are defined using class methods (part
|
170
|
+
of `Remi::Job`). Each of the elements is given a name and defined in a block.
|
171
|
+
|
172
|
+
* Data Subjects - A data subject is either a data source or a data target.
|
173
|
+
* Data Sources - A data source describes where data is extracted from.
|
174
|
+
````ruby
|
175
|
+
source :my_source do
|
176
|
+
# ... source definition
|
177
|
+
end
|
178
|
+
````
|
179
|
+
* Data Targets - A data target describes where data is loaded to.
|
180
|
+
````ruby
|
181
|
+
target :my_target do
|
182
|
+
# ... target definition
|
183
|
+
end
|
184
|
+
````
|
185
|
+
|
186
|
+
* Transforms - A transform is essentially an arbitrary block of Ruby
|
187
|
+
code, but is typically used to transform data sources into data targets.
|
188
|
+
````ruby
|
189
|
+
transform :my_transform do
|
190
|
+
# ... lots of code
|
191
|
+
end
|
192
|
+
````
|
193
|
+
|
194
|
+
* Job Parameters - A job parameter is a memoized block of code
|
195
|
+
(similar to RSpec's `let` method) that is used to configure a job and may
|
196
|
+
be overridden at runtime if needed.
|
197
|
+
````ruby
|
198
|
+
param :my_param do
|
199
|
+
# ... the return value of this block is memoized
|
200
|
+
end
|
201
|
+
````
|
202
|
+
|
203
|
+
* Sub Transforms - Sub transforms are essentially transforms, but they are NOT
|
204
|
+
automatically executed when the job is executed. Instead, they must be _imported_
|
205
|
+
in a transform. They are meant to be reusable bits of transform code.
|
206
|
+
````ruby
|
207
|
+
sub_transform :my_sub_transform do
|
208
|
+
#... sub_transform stuff
|
209
|
+
end
|
210
|
+
````
|
211
|
+
|
212
|
+
* Sub Jobs - Sub jobs are simply references to other Remi jobs that may be executed
|
213
|
+
within the current job.
|
214
|
+
````ruby
|
215
|
+
sub_job(:my_sub_job) { MySubJob.new }
|
216
|
+
````
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
### Execution Plan
|
221
|
+
|
222
|
+
The `DenormalizeBeersJob` example above can be executed using
|
223
|
+
|
224
|
+
````ruby
|
225
|
+
job = DenormalizeBeersJob.new
|
226
|
+
job.execute
|
227
|
+
````
|
228
|
+
|
229
|
+
Calling `#execute` on an instance of a job does the following, in this order:
|
230
|
+
1. All transforms defined in the job (via `transform :name do ... end`) are executed
|
231
|
+
in the order they were defined in the class definition.
|
232
|
+
2. All data targets are loaded in the order they are defined in the job.
|
233
|
+
|
234
|
+
Note that data sources are not extracted until the moment the data is
|
235
|
+
needed in a transform. If the source data is never referenced in a
|
236
|
+
transform, it is never extracted.
|
237
|
+
|
238
|
+
|
239
|
+
## Data Subjects
|
240
|
+
|
241
|
+
A _data subject_ refers to either a data source or a data target.
|
242
|
+
Either way, a data subject is associated with a data frame. Currently
|
243
|
+
the only data frames supported are
|
244
|
+
[Daru data frames](https://github.com/v0dro/daru), but support for
|
245
|
+
other data frames may be developed in the future. The data frame associated
|
246
|
+
with a data subject is accessed with the `#df` method and assigned with the `#df=`
|
247
|
+
method.
|
248
|
+
````ruby
|
249
|
+
my_data_subject.df #=> Daru::DataFrame
|
250
|
+
my_data_subject.df = Daru::DataFrame.new(...)
|
251
|
+
````
|
252
|
+
|
253
|
+
Additionally, all data subjects can be associated with a set of fields and field
|
254
|
+
metadata. Associating a data subject with field data allows us to develop
|
255
|
+
generic ETL routines that are triggered by arbitrary metadata that may be associated
|
256
|
+
with a field.
|
257
|
+
|
258
|
+
### Sources
|
259
|
+
|
260
|
+
### Targets
|
261
|
+
|
262
|
+
### Field Metadata
|
263
|
+
|
264
|
+
|
265
|
+
|
266
|
+
## Available Data Subjects
|
267
|
+
|
268
|
+
* CSV Files
|
269
|
+
* DataFrames
|
270
|
+
* None
|
271
|
+
* Local files
|
272
|
+
* SFTP Files
|
273
|
+
* S3 Files
|
274
|
+
* Salesforce
|
275
|
+
* Postgres
|
276
|
+
|
277
|
+
## Transforms
|
278
|
+
|
279
|
+
## Sub Jobs
|
280
|
+
|
281
|
+
## Job Parameters
|
282
|
+
|
283
|
+
## Sub Transforms
|
284
|
+
|
43
285
|
## Transforming Data
|
44
286
|
|
287
|
+
When `#execute` is called on an instance of a `Remi::Job`, all transforms are executed in
|
288
|
+
the order defined in the class.
|
289
|
+
|
45
290
|
TODO:
|
46
291
|
|
47
292
|
Describe Daru foundation
|
@@ -61,6 +61,7 @@ end
|
|
61
61
|
|
62
62
|
Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
|
63
63
|
expect(@brt.source.data_subject.extractors.map(&:most_recent_by)).to include :name
|
64
|
+
expect(@brt.source.data_subject.extractors.map(&:most_recent_only)).not_to include false
|
64
65
|
end
|
65
66
|
|
66
67
|
Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
|
@@ -626,6 +627,21 @@ Then /^only the following fields should be present on the target:$/ do |table|
|
|
626
627
|
expect(@brt.target.data_subject.df.vectors.to_a).to match_array @brt.target.fields.field_names
|
627
628
|
end
|
628
629
|
|
630
|
+
Then /^only the following fields should be present on the targets:$/ do |table|
|
631
|
+
table.rows.each do |row|
|
632
|
+
field = row[0]
|
633
|
+
targets = row[1].split(',')
|
634
|
+
targets.each { |target| step "the target field '#{target}: #{field}'" }
|
635
|
+
end
|
636
|
+
|
637
|
+
@brt.run_transforms
|
638
|
+
@brt.targets.keys.each do |target|
|
639
|
+
expect(@brt.targets[target].data_subject.df.vectors.to_a).to match_array @brt.targets[target].fields.field_names
|
640
|
+
end
|
641
|
+
end
|
642
|
+
|
643
|
+
|
644
|
+
|
629
645
|
### Record-level expectations
|
630
646
|
|
631
647
|
Then /^the record from source '(.+)' should be (?i)(Retained|Rejected)(?-i)(?: without error|)$/ do |source_name, action|
|
data/jobs/sub_job_example_job.rb
CHANGED
@@ -73,14 +73,14 @@ class SubJobExampleJob < Remi::Job
|
|
73
73
|
target :zombified_beers
|
74
74
|
|
75
75
|
transform :zombification do
|
76
|
-
# Sub jobs
|
77
|
-
beers_job
|
76
|
+
# Sub jobs are executed when data from a sub job is requested
|
77
|
+
# Here, the sub job beers_job is executed
|
78
78
|
just_beers.df = beer_fridge.df
|
79
79
|
|
80
|
-
#
|
80
|
+
# Data is supplied to the sub job on assignment
|
81
81
|
beers_to_zombify.df = just_beers.df
|
82
|
-
|
83
|
-
zombify_job
|
82
|
+
|
83
|
+
# Here, the sub job zombify_job is executed using the data supplied to it above
|
84
84
|
zombified_beers.df = zombie_fridge.df
|
85
85
|
end
|
86
86
|
end
|