RubyGems - remi - Versions diffs - 0.3.0 → 0.3.1 - Mend

remi 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +4 -4
data/.bundle/config +1 -1
data/Gemfile +1 -0
data/Gemfile.lock +45 -5
data/README.md +245 -0
data/features/step_definitions/remi_step.rb +16 -0
data/jobs/sub_job_example_job.rb +5 -5
data/lib/remi.rb +4 -1
data/lib/remi/data_subject.rb +10 -1
data/lib/remi/data_subjects/file_system.rb +31 -1
data/lib/remi/data_subjects/gsheet.rb +140 -0
data/lib/remi/data_subjects/sftp_file.rb +1 -0
data/lib/remi/data_subjects/sub_job.rb +13 -4
data/lib/remi/encoder.rb +1 -1
data/lib/remi/job.rb +9 -1
data/lib/remi/job/parameters.rb +8 -3
data/lib/remi/job/sub_job.rb +14 -8
data/lib/remi/loader.rb +14 -2
data/lib/remi/testing/business_rules.rb +12 -9
data/lib/remi/transform.rb +9 -0
data/lib/remi/version.rb +1 -1
data/spec/data_subject_spec.rb +23 -5
data/spec/data_subjects/file_system_spec.rb +43 -9
data/spec/data_subjects/gsheet_spec.rb +133 -0
data/spec/data_subjects/sub_job_spec.rb +40 -8
data/spec/job_spec.rb +58 -15
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 71164673ece850e218a1ef18a72aa02f4ca5d065
-  data.tar.gz: b52e6e86bc4995f2caf79dad79a7f0a195cdb9f9
+  metadata.gz: ea2d2971479e9e0dfcc5de4dd01ac13f5274a6f8
+  data.tar.gz: ea5e3a3280613d00ae29f5265c342740973ea57c
 SHA512:
-  metadata.gz: 708b6fdf566acb525caa85bc038685bad46c3999f65045b5281c9393d731f5a3cdb49ac30a7ecde12f16abab5a27364449a83fc840adfa531e2a04a783fcd69a
-  data.tar.gz: 48730dd300bb84dadb93d911e37baa274958d0d1deaddcbc769aa43a7f7c1887ffbcf025ce43d4188e09630de31868acc6315ae2559dbd80853febc42e13b5d7
+  metadata.gz: a09f8f926a99891356bcd6363ef4729b3a27bcf9cbfacce2a6bc8a8b60a262cfe83fe40d1e0f1f0c79945693aacb0fb1099b5bad299e022511846730e98642fe
+  data.tar.gz: d160e9840162558b1d9e203825f8e724ce1626e9f8a6922b86f894249b78c5c6680b37fef7e24574d794aac41db025bd887c707df35f6485a3540914552a3293

data/.bundle/config CHANGED Viewed

@@ -1,2 +1,2 @@
 ---
-BUNDLE_DISABLE_SHARED_GEMS: '1'
+BUNDLE_DISABLE_SHARED_GEMS: "true"

data/Gemfile CHANGED Viewed

@@ -2,6 +2,7 @@
 source 'https://rubygems.org'
 gemspec
+gem 'google-api-client', '~> 0.9'
 gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1.4.1-Remi'
 gem 'restforce', '~> 2.1'
 gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'

data/Gemfile.lock CHANGED Viewed

@@ -18,7 +18,7 @@ GIT
 PATH
   remote: .
   specs:
-    remi (0.3.0)
+    remi (0.3.1)
       activesupport (~> 4.2)
       bond (~> 0.5)
       cucumber (~> 2.1)
@@ -39,6 +39,7 @@ GEM
       minitest (~> 5.1)
       thread_safe (~> 0.3, >= 0.3.4)
       tzinfo (~> 1.1)
+    addressable (2.4.0)
     aws-sdk (2.3.5)
       aws-sdk-resources (= 2.3.5)
     aws-sdk-core (2.3.5)
@@ -69,7 +70,26 @@ GEM
       faraday (>= 0.7.4, < 0.10)
     gherkin (3.2.0)
     github-markup (1.4.0)
+    google-api-client (0.9.15)
+      addressable (~> 2.3)
+      googleauth (~> 0.5)
+      httpclient (~> 2.7)
+      hurley (~> 0.1)
+      memoist (~> 0.11)
+      mime-types (>= 1.6)
+      representable (~> 2.3.0)
+      retriable (~> 2.0)
+    googleauth (0.5.1)
+      faraday (~> 0.9)
+      jwt (~> 1.4)
+      logging (~> 2.0)
+      memoist (~> 0.12)
+      multi_json (~> 1.11)
+      os (~> 0.9)
+      signet (~> 0.7)
     hashie (3.4.3)
+    httpclient (2.8.2.4)
+    hurley (0.2)
     i18n (0.7.0)
     iruby (0.2.7)
       bond (~> 0.5)
@@ -80,6 +100,15 @@ GEM
       json_pure (>= 1.8.1)
     json (1.8.3)
     json_pure (1.8.3)
+    jwt (1.5.6)
+    little-plugger (1.1.4)
+    logging (2.1.0)
+      little-plugger (~> 1.1)
+      multi_json (~> 1.10)
+    memoist (0.15.0)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
     mimemagic (0.3.1)
     minitest (5.8.4)
     multi_json (1.11.2)
@@ -88,16 +117,20 @@ GEM
     net-sftp (2.1.2)
       net-ssh (>= 2.6.5)
     net-ssh (3.1.1)
+    os (0.9.6)
     pg (0.18.4)
     rbczmq (1.7.9)
     redcarpet (3.3.4)
     regex_sieve (0.1.0)
     regexp-examples (1.2.0)
+    representable (2.3.0)
+      uber (~> 0.0.7)
     restforce (2.2.0)
       faraday (~> 0.9.0)
       faraday_middleware (>= 0.8.8)
       hashie (>= 1.2.0, < 4.0)
       json (>= 1.7.5, < 1.9.0)
+    retriable (2.1.0)
     rspec (3.4.0)
       rspec-core (~> 3.4.0)
       rspec-expectations (~> 3.4.0)
@@ -111,9 +144,15 @@ GEM
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.4.0)
     rspec-support (3.4.1)
+    signet (0.7.3)
+      addressable (~> 2.3)
+      faraday (~> 0.9)
+      jwt (~> 1.5)
+      multi_json (~> 1.10)
     thread_safe (0.3.5)
     tzinfo (1.2.2)
       thread_safe (~> 0.1)
+    uber (0.0.15)
     xml-simple (1.1.5)
     yard (0.9.0)
@@ -123,13 +162,14 @@ PLATFORMS
 DEPENDENCIES
   aws-sdk (~> 2.3)
   daru (= 0.1.4.1)!
-  github-markup
+  github-markup (~> 1.4)
+  google-api-client (~> 0.9)
   iruby (= 0.2.7)
-  redcarpet
+  redcarpet (~> 3.3)
   remi!
   restforce (~> 2.1)
   salesforce_bulk_api!
-  yard
+  yard (~> 0.9)
 BUNDLED WITH
-   1.11.2
+   1.13.5

data/README.md CHANGED Viewed

@@ -24,6 +24,11 @@ Remi will follow [semantic versioning](http://semver.org/) principles.
 Of course, while we're still on major version zero, little effort will
 be made to maintain backward compatibility.
+The data transformation layer is built on top of
+[Daru dataframe](https://github.com/v0dro/daru).  Familiarity with
+Daru dataframes is essential for writing complex transformations in
+Remi.
 ## Getting Started
 Add the gem to your Gemfile, `bundle install`, and then initialize your repository as
@@ -40,8 +45,248 @@ sure this works by running
 All of the tests should pass.
+## Remi Jobs
+A Remi job describes the data sources that will be used to collect
+data, the transformations that will be performed on the data, and the
+data targets that will be populated when all transformations are
+complete.  With Remi, an ETL process is defined in a class that
+inherits from the `Remi::Job` class.
+### Hello World
+A very simple "Hello World" example of a Remi job would be
+````ruby
+class HelloWorldJob < Remi::Job
+  transform :say_hi do
+    puts "Hello World"
+  end
+end
+````
+This job doesn't make use of any data subjects (data sources or data
+targets), but it does define a single data transform called `:say_hi`.
+The full job can be executed by calling the `#execute` method on an
+instance of the `HelloWorldJob` class
+````ruby
+job = HelloWorldJob.new
+job.execute
+#=> "Hello World"
+````
+The transform called `say_hi` is just a method of the `HelloWorldJob`
+class representing a job transform object.  Multiple transforms can be
+defined in a Remi job.  To execute a specific transform we can call that transform by
+name using
+````ruby
+job = HelloWorldJob.new
+job.say_hi.execute
+#=> "Hello World"
+````
+### A more complete example
+Suppose we have a database containing data on beer sales.  It's a
+normalized database where we store data on individual beers sold in a
+`beer_sales_fact` table and information on the details of the beer in
+a `beers_dim` table.  We'd like to extract data from both of these
+sources, combine them into a single flattened table and save it as a
+CSV file. This operation could be performed with the following Remi
+job.  (Of course, if this were a real world problem, we'd do the join
+in the database before extracting; this is a contrived example to show
+how one can combine data from multiple arbitrary sources).
+````ruby
+class DenormalizeBeersJob < Remi::Job
+  source :beer_sales_fact do
+    extractor Remi::Extractor::Postgres.new(
+      credentials: {
+        dbname: 'my_local_db'
+      },
+      query: 'SELECT beer_id, sold_date, quantity FROM beer_sales_fact'
+    )
+    parser Remi::Parser::Postgres.new
+    fields(
+      {
+        :beer_id  => {},
+        :sold_at  => { type: :date, in_format: '%Y-%m-%d' },
+        :quantity => { type: :integer }
+      }
+    )
+  end
+  source :beers_dim do
+    extractor Remi::Extractor::Postgres.new(
+      credentials: {
+        dbname: 'my_local_db'
+      },
+      query: 'SELECT beer_id, name, price_per_unit FROM beers_dim'
+    )
+    parser Remi::Parser::Postgres.new
+    fields(
+      {
+        :beer_id        => {},
+        :name           => {},
+        :price_per_unit => { type: :decimal, scale: 2 }
+      }
+    )
+  end
+  target :flat_beer_file do
+    encoder Remi::Encoder::CsvFile.new
+    loader Remi::Loader::LocalFile.new(
+      path: 'flat_beers.csv'
+    )
+  end
+  transform :type_enforcement do
+    beer_sales_fact.enforce_types
+    beers_dim.enforce_types
+  end
+  transform :flatten do
+    flat_beer_file.df = beer_sales_fact.df.join(flat_beer_file.df, on: [:beer_id], how: :inner)
+    Remi::SourceToTargetMap.apply(flat_beer_file.df) do
+      map source(:quantity, :price_per_unit) .target(:total_price)
+        .transform(->(row) {
+          row[:quantity] * row[:price_per_unit]
+        })
+    end
+  end
+end
+````
+### Components of a Remi Job
+A Remi job is composed of one or more of the following elements, which are described
+in more detail below.  All of these elements are defined using class methods (part
+of `Remi::Job`).  Each of the elements is given a name and defined in a block.
+* Data Subjects - A data subject is either a data source or a data target.
+  * Data Sources - A data source describes where data is extracted from.
+  ````ruby
+  source :my_source do
+    # ... source definition
+  end
+  ````
+  * Data Targets - A data target describes where data is loaded to.
+  ````ruby
+  target :my_target do
+    # ... target definition
+  end
+  ````
+* Transforms - A transform is essentially an arbitrary block of Ruby
+  code, but is typically used to transform data sources into data targets.
+  ````ruby
+  transform :my_transform do
+    # ... lots of code
+  end
+  ````
+* Job Parameters - A job parameter is a memoized block of code
+  (similar to RSpec's `let` method) that is used to configure a job and may
+  be overridden at runtime if needed.
+  ````ruby
+  param :my_param do
+    # ... the return value of this block is memoized
+  end
+  ````
+* Sub Transforms - Sub transforms are essentially transforms, but they are NOT
+  automatically executed when the job is executed.  Instead, they must be _imported_
+  in a transform.  They are meant to be reusable bits of transform code.
+  ````ruby
+  sub_transform :my_sub_transform do
+    #... sub_transform stuff
+  end
+  ````
+* Sub Jobs - Sub jobs are simply references to other Remi jobs that may be executed
+  within the current job.
+  ````ruby
+  sub_job :my_sub_job { MySubJob.new }
+  ````
+### Execution Plan
+The `DenormalizeBeersJob` example above can be executed using
+````ruby
+job = DenormalizeBeersJob.new
+job.execute
+````
+Calling `#execute` on an instance of a job does the following, in this order:
+1. All transforms defined in the job (via `transform :name do ... end`) are executed
+   in the order they were defined in the class definition.
+2. All data targets are loaded in the order they are defined in the job.
+Note that data sources are not extracted until the moment the data is
+needed in a transform.  If the source data is never referenced in a
+transform, it is never extracted.
+## Data Subjects
+A _data subject_ refers to either a data source or a data target.
+Either way, a data subject is associated with a data frame.  Currently
+the only data frames supported are
+[Daru data frames](https://github.com/v0dro/daru), but support for
+other data frames may be developed in the future.  The data frame associated
+with a data subject is accessed with the `#df` method and assigned with the `#df=`
+method.
+````ruby
+  my_data_subject.df #=> Daru::DataFrame
+  my_data_subject.df = Daru::DataFrame.new(...)
+````
+Additionally, all data subjects can be associated with a set of fields and field
+metadata.  Associating a data subject with field data allows us to develop
+generic ETL routines that are triggered by arbitrary metadata that may be associated
+with a field.
+### Sources
+### Targets
+### Field Metadata
+## Available Data Subjects
+* CSV Files
+* DataFrames
+* None
+* Local files
+* SFTP Files
+* S3 Files
+* Salesforce
+* Postgres
+## Transforms
+## Sub Jobs
+## Job Parameters
+## Sub Transforms
 ## Transforming Data
+When `#execute` is called on an instance of a `Remi::Job`, all transforms are executed in
+the order defined in the class.
 TODO:
 Describe Daru foundation

data/features/step_definitions/remi_step.rb CHANGED Viewed

@@ -61,6 +61,7 @@ end
 Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
   expect(@brt.source.data_subject.extractors.map(&:most_recent_by)).to include :name
+  expect(@brt.source.data_subject.extractors.map(&:most_recent_only)).not_to include false
 end
 Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
@@ -626,6 +627,21 @@ Then /^only the following fields should be present on the target:$/ do |table|
   expect(@brt.target.data_subject.df.vectors.to_a).to match_array @brt.target.fields.field_names
 end
+Then /^only the following fields should be present on the targets:$/ do |table|
+  table.rows.each do |row|
+    field = row[0]
+    targets = row[1].split(',')
+    targets.each { |target| step "the target field '#{target}: #{field}'" }
+  end
+  @brt.run_transforms
+  @brt.targets.keys.each do |target|
+    expect(@brt.targets[target].data_subject.df.vectors.to_a).to match_array @brt.targets[target].fields.field_names
+  end
+end
 ### Record-level expectations
 Then /^the record from source '(.+)' should be (?i)(Retained|Rejected)(?-i)(?: without error|)$/ do |source_name, action|

data/jobs/sub_job_example_job.rb CHANGED Viewed

@@ -73,14 +73,14 @@ class SubJobExampleJob < Remi::Job
   target :zombified_beers
   transform :zombification do
-    # Sub jobs must be executed before their sources are available
-    beers_job.execute
+    # Sub jobs are executed when data from a sub job is requested
+    # Here, the sub job beers_job is executed
     just_beers.df = beer_fridge.df
-    # Sub job targets must be loaded before they are available to subjobs
+    # Data is supplied to the sub job on assignment
     beers_to_zombify.df = just_beers.df
-    beers_to_zombify.load
-    zombify_job.execute
+    # Here, the sub job zombify_job is executed using the data supplied to it above
     zombified_beers.df = zombie_fridge.df
   end
 end