RubyGems - jinx-migrate - Versions diffs - 2.1.1 - Mend

jinx-migrate 2.1.1

Files changed (65) hide show

data/.gitignore +14 -0
data/.rspec +3 -0
data/.yardopts +1 -0
data/Gemfile +8 -0
data/Gemfile.lock +38 -0
data/History.md +6 -0
data/LEGAL +5 -0
data/LICENSE +22 -0
data/README.md +33 -0
data/Rakefile +40 -0
data/bin/csvjoin +24 -0
data/examples/family/README.md +24 -0
data/examples/family/conf/children/fields.yaml +2 -0
data/examples/family/conf/parents/defaults.yaml +3 -0
data/examples/family/conf/parents/fields.yaml +6 -0
data/examples/family/conf/parents/values.yaml +4 -0
data/examples/family/data/children.csv +1 -0
data/examples/family/data/parents.csv +1 -0
data/examples/family/lib/shims.rb +17 -0
data/jinx-migrate.gemspec +26 -0
data/lib/jinx/csv/csvio.rb +214 -0
data/lib/jinx/csv/joiner.rb +196 -0
data/lib/jinx/migration/filter.rb +167 -0
data/lib/jinx/migration/migratable.rb +244 -0
data/lib/jinx/migration/migrator.rb +1029 -0
data/lib/jinx/migration/reader.rb +16 -0
data/lib/jinx/migration/version.rb +5 -0
data/spec/bad/bad_spec.rb +25 -0
data/spec/bad/fields.yaml +1 -0
data/spec/bad/parents.csv +1 -0
data/spec/bad/shims.rb +16 -0
data/spec/csv/join/join_helper.rb +35 -0
data/spec/csv/join/join_spec.rb +100 -0
data/spec/csv/join/jumbled_src.csv +7 -0
data/spec/csv/join/jumbled_tgt.csv +7 -0
data/spec/csv/join/source.csv +7 -0
data/spec/csv/join/target.csv +7 -0
data/spec/extract/extract.rb +13 -0
data/spec/extract/extract_spec.rb +33 -0
data/spec/extract/fields.yaml +1 -0
data/spec/extract/parents.csv +1 -0
data/spec/family/child_spec.rb +27 -0
data/spec/family/family.rb +13 -0
data/spec/family/parent_spec.rb +57 -0
data/spec/filter/fields.yaml +1 -0
data/spec/filter/filter_spec.rb +20 -0
data/spec/filter/parents.csv +1 -0
data/spec/filter/values.yaml +4 -0
data/spec/primitive/children.csv +1 -0
data/spec/primitive/fields.yaml +4 -0
data/spec/primitive/primitive_spec.rb +24 -0
data/spec/skip/fields.yaml +1 -0
data/spec/skip/parents.csv +1 -0
data/spec/skip/skip_spec.rb +17 -0
data/spec/spec_helper.rb +17 -0
data/spec/support/model.rb +7 -0
data/spec/unique/fields.yaml +1 -0
data/spec/unique/parent.rb +6 -0
data/spec/unique/parents.csv +1 -0
data/spec/unique/shims.rb +10 -0
data/spec/unique/unique_spec.rb +20 -0
data/test/fixtures/csv/data/empty.csv +1 -0
data/test/fixtures/csv/data/variety.csv +1 -0
data/test/lib/csv/csvio_test.rb +74 -0
metadata +206 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,14 @@
+.DS_Store
+*~
+*.pdf
+.project
+.loadpath
+.yardoc
+*.gem
+*.tar*
+**/ext/bin
+**/classes
+/doc/api
+**log
+/test/results

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--backtrace
+--format Fuubar
+--color

data/.yardopts ADDED Viewed

	@@ -0,0 +1 @@
1	+ -o doc/api --private --protected - History.md LEGAL LICENSE

data/Gemfile ADDED Viewed

@@ -0,0 +1,8 @@
+# Resolve released gems from the default RubyGems source.
+source :rubygems
+# Pull in the dependencies declared in the gemspec.
+gemspec
+group :development do
+  # Use the local development projects. Comment out the following lines to use
+  # the released gems instead.
+  gem 'jinx', :path => File.dirname(__FILE__) + '/../core'
+  gem 'jinx-migrate', :path => File.dirname(__FILE__)
+end

data/Gemfile.lock ADDED Viewed

@@ -0,0 +1,38 @@
+PATH
+  remote: .
+  specs:
+    jinx-migrate (2.1.1)
+      bundler
+      fastercsv
+      rack
+PATH
+  remote: /Users/loneyf/workspace/jinx/core
+  specs:
+    jinx (2.1.1)
+      bundler
+GEM
+  remote: http://rubygems.org/
+  specs:
+    diff-lcs (1.1.3)
+    fastercsv (1.5.4)
+    rack (1.4.1)
+    rake (0.9.2.2)
+    rspec (2.9.0)
+      rspec-core (~> 2.9.0)
+      rspec-expectations (~> 2.9.0)
+      rspec-mocks (~> 2.9.0)
+    rspec-core (2.9.0)
+    rspec-expectations (2.9.1)
+      diff-lcs (~> 1.1.3)
+    rspec-mocks (2.9.0)
+PLATFORMS
+  java
+DEPENDENCIES
+  jinx!
+  jinx-migrate!
+  rake
+  rspec (>= 2.6)

data/History.md ADDED Viewed

@@ -0,0 +1,6 @@
+This history lists major release themes. See the GitHub commits (https://github.com/jinx/migrate)
+for change details.
+2.1.1 / 2012-04-13
+------------------
+* Initial public release spun off from caruby/core.

data/LEGAL ADDED Viewed

@@ -0,0 +1,5 @@
+LEGAL NOTICE INFORMATION
+------------------------
+All the files in this distribution are covered under the MIT
+license (see the file LICENSE).

data/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Oregon Health & Science University
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,33 @@
+Jinx Migrator
+=============
+**Home**:         [http://github.com/jinx/migrate](http://github.com/jinx/migrate)
+**Git**:          [http://github.com/jinx/migrate](http://github.com/jinx/migrate)
+**Author**:       OHSU Knight Cancer Institute
+**Copyright**:    2012
+**License**:      MIT License
+Synopsis
+--------
+The Jinx Migrator migrates input data into a [Jinx](http://github.com/jinx/core) target.
+Installing
+----------
+The Jinx Migrator is installed as a JRuby gem:
+    [sudo] jgem install jinx-migrate
+Usage
+-----
+1. Enable Jinx for a Java package, as described in the [Jinx](http://github.com/jinx/core) Usage.
+2. Configure the input -> target mapping.
+3. Run the migrator.
+See the [Family](http://github.com/jinx/migrate/tree/master/examples/family) example for a sample migration.
+Copyright
+---------
+Jinx &copy; 2012 by [Oregon Health & Science University](http://www.ohsu.edu/xd/health/services/cancer/index.cfm).
+Jinx is licensed under the MIT license. Please see the LICENSE and LEGAL files for more information.

data/Rakefile ADDED Viewed

@@ -0,0 +1,40 @@
+require File.dirname(__FILE__) + '/lib/jinx/migration/version'
+# the gem name
+GEM = 'jinx-migrate'
+GEM_VERSION = Jinx::Migrate::VERSION
+WINDOWS = (Config::CONFIG['host_os'] =~ /mingw|win32|cygwin/ ? true : false) rescue false
+SUDO = WINDOWS ? '' : 'sudo'
+desc 'Default: run the specs'
+task :default => :spec
+desc "Builds the gem"
+task :gem do
+  sh "jgem build #{GEM}.gemspec"
+end
+desc "Installs the gem"
+task :install => :gem do
+  sh "#{SUDO} jgem install #{GEM}-#{GEM_VERSION}.gem"
+end
+desc 'Documents the API'
+task :doc do
+  FileUtils.rm_rf 'doc/api'
+  sh 'yardoc'
+end
+desc 'Runs the spec tests'
+task :spec do
+  Dir['spec/**/*_spec.rb'].each { |f| sh "rspec #{f}" rescue nil }
+end
+desc 'Runs the unit tests'
+task :unit do
+  Dir['test/**/*_test.rb'].each { |f| sh "jruby #{f}" rescue nil }
+end
+desc 'Runs all tests'
+task :test => [:spec, :unit]

data/bin/csvjoin ADDED Viewed

@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+#
+# csvjoin: joins two CSV files on their common fields
+#
+# Add the migrate lib to the path.
+$:.unshift File.join(File.dirname(__FILE__), '..', 'lib')
+require 'rubygems'
+require 'jinx'
+require 'jinx/csv/csvio'
+require 'jinx/cli/command'
+specs = [
+  [:to, '--to TARGET', 'The join target input file (default stdin)'],
+  [:as, '--as OUTPUT', 'The joined output file (default stdout)'],
+  [:source, 'SOURCE', 'The join source input file']
+]
+Jinx::CLI::Command.new(specs).start do |opts|
+  Jinx::CsvIO.join(opts.delete(:source), opts)
+end
+exit 0

data/examples/family/README.md ADDED Viewed

@@ -0,0 +1,24 @@
+Family migration example
+========================
+Synopsis
+--------
+This directory contains the Jinx migration Family example.
+The Family example demonstrates how to load the content of a source CSV file into
+a Family data store. The use cases illustrate several common migration impediments:
+* Different source-destination terminology
+* Different source-destination associations
+* Incomplete input
+* Denormalized input
+* Inconsistent input
+* Input data scrubbing
+Migration
+---------
+The example migration input data resides in the `data` directory.
+Each `parents` CSV input file holds one row for each parent.
+Each `children` CSV input file holds one row for each child.
+Each input file has a corresponding migration mapping configuration in the `conf` directory.

data/examples/family/conf/children/fields.yaml ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ Parent: Parent.name
2	+ Child: Child.name

data/examples/family/conf/parents/defaults.yaml ADDED Viewed

@@ -0,0 +1,3 @@
+# This defaults configuration file demonstrates how to set a default property value in
+# the migrated record.
+Household.address.state: IL

data/examples/family/conf/parents/fields.yaml ADDED Viewed

@@ -0,0 +1,6 @@
+Name: Parent.name
+Street: Household.address.street1
+City: Household.address.city
+Zip: Household.address.postal_code
+Spouse: Parent.spouse.name

data/examples/family/conf/parents/values.yaml ADDED Viewed

@@ -0,0 +1,4 @@
+# This value filter configuration file demonstrates how to transform an input field value
+# to a migrated value. 'Street' is abbreviated to 'St'.
+Address.street1:
+  /^(.* St)reet(.*)$/ : "$1$2"

data/examples/family/data/children.csv ADDED Viewed

	@@ -0,0 +1 @@
1	+ Parent,Child

data/examples/family/data/parents.csv ADDED Viewed

	@@ -0,0 +1 @@
1	+ Name,Street,City,Zip,Spouse

data/examples/family/lib/shims.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Family
+  # Declares the classes modified for migration.
+  shims Parent
+  class Parent
+    # Augments the migration by setting the spouse household.
+    #
+    # @param [{Symbol => Object}] row the input row field => value hash
+    # @param [<Resource>] migrated the migrated instances
+    def migrate(row, migrated)
+      super
+      if spouse then
+        spouse.household = migrated.detect { |m| Household === m }
+      end
+    end
+  end
+end

data/jinx-migrate.gemspec ADDED Viewed

@@ -0,0 +1,26 @@
+require File.dirname(__FILE__) + '/lib/jinx/migration/version'
+require 'date'
+Gem::Specification.new do |s|
+  s.name          = 'jinx-migrate'
+  s.summary       = 'Jinx JSON plug-in.'
+  s.description   = s.summary + '. See github.com/jinx/migrate for more information.'
+  s.version       = Jinx::Migrate::VERSION
+  s.date          = Date.today
+  s.author        = 'OHSU'
+  s.email         = "jinx.ruby@gmail.com"
+  s.homepage      = 'http://github.com/jinx/migrate'
+  s.require_path  = 'lib'
+  s.bindir        = 'bin'
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files spec`.split("\n")
+  s.executables   = `git ls-files bin`.split("\n").map{ |f| File.basename(f) }
+  s.add_runtime_dependency     'rack'
+  s.add_runtime_dependency     'bundler'
+  s.add_runtime_dependency     'fastercsv'
+  s.add_development_dependency 'rake'
+  s.add_development_dependency 'rspec', '>= 2.6'
+  s.has_rdoc      = 'yard'
+  s.license       = 'MIT'
+  s.rubyforge_project = 'jinx'
+end

data/lib/jinx/csv/csvio.rb ADDED Viewed

@@ -0,0 +1,214 @@
+require 'fileutils'
+require 'faster_csv'
+require 'jinx/helpers/options'
+require 'jinx/helpers/collections'
+require 'jinx/csv/joiner'
+module Jinx
+  # CsvIO reads or writes CSV records.
+  # This class wraps a FasterCSV with the following modifications:
+  # * relax the date parser to allow dd/mm/yyyy dates
+  # * don't convert integer text with a leading zero to an octal number
+  # * allow one custom converter with different semantics: if the converter block
+  #   call returns nil, then continue conversion, otherwise return the converter
+  #   result. This differs from FasterCSV converter semantics which calls converters
+  #   as long as the result equals the input field value. The CsvIO converter semantics
+  #   supports converters that intend a String result to be the converted result.
+  #
+  # CsvIO is Enumerable, but does not implement the complete Ruby IO interface.
+  class CsvIO
+    include Enumerable
+    # @return [<String>] the CSV field names
+    attr_reader :field_names
+    # @return [<Symbol>] the CSV field value accessor
+    attr_reader :accessors
+    alias :headers :accessors
+    # Opens the CSV file and calls the given block with this CsvIO as the argument.
+    #
+    # @param (see #initialize)
+    # @option (see #initialize)
+    # @yield [csvio] the optional block to execute
+    # @yieldparam [CsvIO] csvio the open CSVIO instance
+    def self.open(dev, opts=nil)
+      csvio = new(dev, opts)
+      if block_given? then
+        begin
+          yield csvio
+        ensure
+          csvio.close
+        end
+      end
+    end
+    # Opens the given CSV file and calls {#each} with the given block.
+    #
+    # @param (see #initialize)
+    # @option (see #initialize)
+    # @yield [row] the block to execute on the row
+    # @yieldparam [{Symbol => Object}] row the field symbol => value hash
+    def self.foreach(file, opts=nil, &block)
+      open(file, opts) { |csvio| csvio.each(&block) }
+    end
+    # Joins the source to the target and writes the output. The match is on all fields
+    # held in common. If there is more than one match, then all but the first match has
+    # empty values for the merged fields. Both files must be sorted in order of the
+    # common fields, sequenced by their occurence in the source header.
+    #
+    # @param [String, IO] source the join source file
+    # @param [{Symbol => String, IO, <String>}] opts the join options
+    # @option opts [String, IO] :to the join target file name or device (default stdin)
+    # @option opts [<String>] :for the target field names (default all target fields)
+    # @option opts [String, IO] :as the output file name or device (default stdout)
+    # @yield (see Csv::Joiner#join)
+    # @yieldparam (see Csv::Joiner#join)
+    def self.join(source, opts, &block)
+      flds = opts[:for] || Array::EMPTY_ARRAY
+      Csv::Joiner.new(source, opts[:to], opts[:as]).join(*flds, &block)
+    end
+    # Creates a new CsvIO for the specified source file.
+    # If a converter block is given, then it is added to the CSV converters list.
+    #
+    # @param [String, IO] dev the CSV file or stream to open
+    # @param [Hash] opts the open options
+    # @option opts [String] :mode the input mode (default +r+)
+    # @option opts [String] :headers the input field headers
+    # @yield [value, info] converts the input value
+    # @yieldparam [String] value the input value
+    # @yieldparam info the current field's FasterCSV FieldInfo metadata
+    # @raise [ArgumentError] if the input is nil
+    def initialize(dev, opts=nil, &converter)
+      raise ArgumentError.new("CSV input argument is missing") if dev.nil?
+      # the CSV file open mode
+      mode = Options.get(:mode, opts, 'r')
+      # the CSV headers option; can be boolean or array
+      hdr_opt = Options.get(:headers, opts)
+      # there is a header record by default for an input CSV file
+      hdr_opt ||= true if mode =~ /^r/
+      # make parent directories if necessary for an output CSV file
+      File.makedirs(File.dirname(dev)) if String == dev and mode =~ /^w/
+      # if headers aren't given, then convert the input CSV header record names to underscore symbols
+      hdr_cvtr = :symbol unless Enumerable === hdr_opt
+      # make a custom converter
+      custom = Proc.new { |value, info| convert(value, info, &converter) }
+      # collect the options
+      csv_opts = {:headers => hdr_opt, :header_converters => hdr_cvtr, :return_headers => true, :write_headers => true, :converters => custom}
+      # Make the parent directory if necessary.
+      FileUtils.mkdir_p(File.dirname(dev)) if String === dev and mode !~ /^r/
+      # open the CSV file
+      @csv = String === dev ? FasterCSV.open(dev, mode, csv_opts) : FasterCSV.new(dev, csv_opts)
+      # the header => field name hash:
+      # if the header option is set to true, then read the input header line.
+      # otherwise, parse an empty string which mimics an input header line.
+      hdr_row = case hdr_opt
+      when true then
+        @csv.shift
+      when Enumerable then
+        ''.parse_csv(:headers => hdr_opt, :header_converters => :symbol, :return_headers => true)
+      else
+        Jinx.fail(ArgumentError, "CSV headers option value not supported: #{hdr_opt}")
+      end
+      # The field value accessors consist of the header row headers converted to a symbol.
+      @accessors = hdr_row.headers
+      # The field names consist of the header row values.
+      @field_names = @accessors.map { |sym| hdr_row[sym] }
+      # the header name => symbol map
+      @hdr_sym_hash = hdr_row.to_hash.invert
+    end
+    # Closes the CSV file by closing the wrapped FasterCSV.
+    def close
+      @csv.close
+    end
+    # Looks up the accessor symbol for a CSV field header name in the
+    # header name => symbol map built when this CsvIO was created.
+    #
+    # @param [String] name the CSV field header name
+    # @return [Symbol] the header accessor method
+    def accessor(name)
+      @hdr_sym_hash[name]
+    end
+    # Iterates over each CSV row, yielding a row for each iteration.
+    # The iteration is delegated to the wrapped FasterCSV.
+    #
+    # @yield [row] processes the CSV row
+    # @yieldparam [FasterCSV::Row] row the CSV row
+    def each(&block)
+      @csv.each(&block)
+    end
+    # Reads the next CSV row by shifting it off the wrapped FasterCSV.
+    # This method is also available as +shift+ and +next+.
+    #
+    # @return the next CSV row
+    # @see #each
+    def readline
+      @csv.shift
+    end
+    alias :shift :readline
+    alias :next :readline
+    # Writes the given row to the CSV file and flushes the output.
+    # This method is also available as the +<<+ operator.
+    #
+    # @param [{Symbol => Object}] row the input row
+    def write(row)
+      @csv << row
+      @csv.flush
+    end
+    alias :<< :write
+    private
+    # 3-letter months => month sequence hash.
+    MMM_MM_MAP = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'].to_compact_hash_with_index do |mmm, index|
+      index < 9 ? ('0' + index.succ.to_s) : index.succ.to_s
+    end
+    # DateMatcher relaxes the FasterCSV DateMatcher to allow dd/mm/yyyy dates.
+    DateMatcher = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} | \d{1,2}-\w{3}-\d{2,4} | \d{4}[-\/]\d{1,2}[-\/]\d{1,2} | \d{1,2}[-\/]\d{1,2}[-\/]\d{2,4} )\z /x
+    DD_MMM_YYYY_RE = /^(\d{1,2})-([[:alpha:]]{3})-(\d{2,4})$/
+    # @param f the input field value to convert
+    # @param info the CSV field info
+    # @return the converted value
+    def convert(f, info)
+      return if f.nil?
+      # the block has precedence
+      value = yield(f, info) if block_given?
+      # integer conversion
+      value ||= Integer(f) if f =~ /^[1-9]\d*$/
+      # date conversion
+      value ||= convert_date(f) if f =~ CsvIO::DateMatcher
+      # float conversion
+      value ||= (Float(f) rescue f) if f =~ /^\d+\.\d*$/ or f =~ /^\d*\.\d+$/
+      # return converted value or the input field if there was no conversion
+      value || f
+    end
+    # @param [String] the input field value
+    # @return [Date] the converted date
+    def convert_date(f)
+      # If input value is in dd-mmm-yy format, then reformat.
+      # Otherwise, parse as a Date if possible.
+      if f =~ DD_MMM_YYYY_RE then
+        ddmmyy = reformat_dd_mmm_yy_date(f) || return
+        convert_date(ddmmyy)
+      else
+        Date.parse(f, true) rescue nil
+      end
+    end
+    # @param [String] the input field value in dd-mmm-yy format
+    # @return [String] the reformatted date String in mm/dd/yy format
+    def reformat_dd_mmm_yy_date(f)
+      dd, mmm, yy = DD_MMM_YYYY_RE.match(f).captures
+      mm = MMM_MM_MAP[mmm.downcase] || return
+      "#{mm}/#{dd}/#{yy}"
+    end
+  end
+end