hsds_transformer 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hsds_transformer/base_transformer.rb ADDED
@@ -0,0 +1,182 @@
+module HsdsTransformer
+  class BaseTransformer
+    include HsdsTransformer::Headers
+    include HsdsTransformer::FilePaths
+
+    attr_reader :mapping, :include_custom
+
+    SUPPORTED_HSDS_MODELS = %w(organizations services locations physical_addresses postal_addresses phones service_taxonomies regular_schedules taxonomies accessibility_for_disabilities contacts languages eligibilities services_at_locations service_areas)
+
+    def self.run(args)
+      new(args).transform
+    end
+
+    # TODO validate that incoming data is valid-ish, e.g. unique IDs
+    def initialize(args)
+      @mapping = parse_mapping(args[:mapping])
+
+      @include_custom = args[:include_custom]
+      @zip_output = args[:zip_output]
+
+      SUPPORTED_HSDS_MODELS.each do |model|
+        var_name = "@" + model
+        instance_variable_set(var_name, [])
+      end
+
+      set_file_paths(args)
+    end
+
+    def transform
+      # Initial transformation into HSDS
+      mapping.each do |input_file_name, file_mapping|
+        transform_file(input_file_name, file_mapping)
+      end
+
+      # HSDS additional formatting
+      singletonize_languages
+
+      apply_custom_transformation
+
+      # Make the data paths for these files
+      Dir.mkdir(output_datapackage_path) unless Dir.exist?(output_datapackage_path)
+      Dir.mkdir(output_data_path) unless Dir.exist?(output_data_path)
+
+      # Write the data to CSV files
+      write_output_files
+
+      zip_output if @zip_output
+
+      self
+    end
+
+    def transform_file(input_file_name, file_mapping)
+      path = @input_path + input_file_name
+      org_mapping = file_mapping["columns"]
+
+      # Process each row in a way that allows the row to create multiple objects,
+      # including multiple objects from the same row.
+      CSV.foreach(path, headers: true) do |input|
+        collected_data = hsds_objects_from_row(input, org_mapping)
+        collect_into_ivars(collected_data)
+      end
+    end
+
+    # This is overridden in a custom transformer if there is one
+    def apply_custom_transformation
+    end
+
+    private
+
+    def hsds_objects_from_row(input, org_mapping)
+      collected_data = {}
+
+      # k is the input field name;
+      # org_mapping[k] gives us the array of output fields
+      input.each do |k, v|
+        # Wrap single mappings in an array to stay backwards compatible
+        output_fields = org_mapping[k].is_a?(Array) ? org_mapping[k] : [org_mapping[k]]
+
+        # Now let's collect each object
+        output_fields.compact.each do |output_field|
+          # collected_data[output_field["model"]] should make it such that collected_data = { "organizations" => {} }
+          collected_data[output_field["model"]] ||= {}
+
+          # Append all string fields marked as "append" to a single output field
+          if output_field["append"]
+            existing_string_value = collected_data[output_field["model"]][output_field["field"]] || ""
+            existing_string_value += v.to_s unless null_type(v)
+
+            collected_data[output_field["model"]].merge!(output_field["field"] => existing_string_value)
+          else
+            value = output_field["map"] ? output_field["map"][v] : v
+            safe_val = null_type(value) ? nil : value
+            collected_data[output_field["model"]].merge!(output_field["field"] => safe_val)
+          end
+        end
+      end
+      collected_data
+    end
+
+    def null_type(string)
+      string.nil? || string.downcase.strip == "null"
+    end
+
+    # Pop each object into its respective instance variable collection so it gets written to the right file
+    def collect_into_ivars(collected_data)
+      SUPPORTED_HSDS_MODELS.each do |model|
+        collection_ivar(model) << collected_data[model] if collected_data[model] && !collected_data[model].empty?
+      end
+    end
+
+    def collection_ivar(model)
+      var_name = "@" + model
+      instance_variable_get(var_name)
+    end
+
+    def singletonize_languages
+      formatted_langs = @languages.each_with_object([]) do |language_row, array|
+        langs = language_row["language"].to_s.split(",")
+        if langs.size > 1
+          langs.each do |lang|
+            array << language_row.clone.merge("language" => lang.strip)
+          end
+        else
+          array << language_row
+        end
+      end
+      @languages = formatted_langs
+    end
+
+    def write_output_files
+      SUPPORTED_HSDS_MODELS.each do |model|
+        path_var = instance_variable_get "@output_#{model}_path"
+        write_csv path_var, headers(collection_ivar(model).first, model), collection_ivar(model)
+      end
+    end
+
+    def zip_output
+      input_data_files = Dir.glob(File.join(output_data_path, '**/*'))
+
+      File.delete(zipfile_name) if File.exist?(zipfile_name)
+
+      Zip::File.open(zipfile_name, Zip::File::CREATE) do |zipfile|
+        # Add datapackage.json
+        zipfile.add("datapackage.json", datapackage_json_path)
+
+        # Add data files
+        input_data_files.each do |file_path|
+          zipped_name = "data/" + File.basename(file_path)
+          zipfile.add(zipped_name, file_path)
+        end
+      end
+    end
+
+    # This also dedupes data by calling `uniq` on each collection before writing
+    def write_csv(path, headers, data)
+      return if data.empty?
+      CSV.open(path, 'wb') do |csv|
+        csv << headers
+        data.uniq.each do |row|
+          csv << CSV::Row.new(row.keys, row.values).values_at(*headers) unless row.values.all?(nil)
+        end
+      end
+    end
+
+    def parse_mapping(mapping_path)
+      if mapping_path.start_with?("http")
+        uri = URI(mapping_path)
+        file = Net::HTTP.get(uri)
+        YAML.load file
+      else
+        YAML.load File.read(mapping_path)
+      end
+    end
+  end
+end
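For orientation, here is a minimal mapping file consistent with how `transform_file` and `hsds_objects_from_row` read the structure above (the file, column, and value names are illustrative, not from the package):

    # mapping.yaml: one top-level key per input CSV file
    organizations.csv:
      columns:
        Agency Name:
          - model: organizations
            field: name
        Description Line 1:
          - model: organizations
            field: description
            append: true
        Status:
          - model: services
            field: status
            map:
              "Open": active
              "Shut": inactive

Each input row can fan out into several HSDS objects: `append` concatenates multi-column text into one field, and `map` translates source values before they are stored.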
data/lib/hsds_transformer/custom/ilao_transformer.rb ADDED
@@ -0,0 +1,80 @@
+module HsdsTransformer
+  class IlaoTransformer < HsdsTransformer::BaseTransformer
+
+    STATE_ABBREVIATIONS = %w(AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY)
+
+    def apply_custom_transformation
+      parse_address_data
+      # process_regular_schedule_text
+    end
+
+    private
+
+    def parse_address_data
+      # TODO do this for physical addresses too
+      @postal_addresses.each do |address_row|
+        address_str = address_row["address_1"]
+        postal_code = address_str.split(//).last(5).join
+        postal_code = postal_code.match(/\d{5}/)
+
+        # match returns MatchData or nil, so test for presence rather than comparing to ""
+        if postal_code
+          address_row["postal_code"] = postal_code.to_s
+          address_str = address_str[0..-7]
+        end
+
+        state = address_str.split(//).last(2).join.upcase
+
+        if STATE_ABBREVIATIONS.include?(state)
+          address_row["state_province"] = state
+          address_str = address_str[0..-5]
+        end
+
+        address_row["address_1"] = address_str
+      end
+    end
+
+    def process_regular_schedule_text(schedule_key:, schedule_hash:, input:)
+      if input["Hours of operation"]
+        day_entries = input["Hours of operation"].scan(/\S*day: \S*/)
+        day_entries.each do |entry|
+          day, hours = entry.split(": ")
+          if hours == "Closed"
+            opens_at = nil
+            closes_at = nil
+          else
+            opens_at, closes_at = hours.split("-")
+          end
+          collect_schedule_data(schedule_key: schedule_key,
+                                schedule_hash: schedule_hash, input: input,
+                                day: day, opens_at: opens_at, closes_at: closes_at)
+        end
+      end
+    end
+
+    def collect_schedule_data(schedule_key:, schedule_hash:, input:,
+                              day:, opens_at:, closes_at:)
+      schedule_row = {}
+      schedule_row["weekday"] = day
+      schedule_row["opens_at"] = opens_at
+      schedule_row["closes_at"] = closes_at
+
+      foreign_key = schedule_hash["foreign_key_name"]
+      foreign_key_value = schedule_hash["foreign_key_value"]
+      schedule_row[foreign_key] = input[foreign_key_value]
+      schedule_data << schedule_row
+    end
+
+    def collect_sal_data(sal_key:, sal_hash:, input:)
+      key = sal_hash["field"]
+      sal_row = {}
+      sal_row[key] = input[sal_key]
+
+      foreign_key = sal_hash["foreign_key_name"]
+      foreign_key_value = sal_hash["foreign_key_value"]
+      sal_row[foreign_key] = input[foreign_key_value]
+      sal_data << sal_row
+    end
+  end
+end
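A quick trace of parse_address_data on one row (the address value is illustrative):

    # address_row["address_1"] = "120 S LaSalle St, Chicago, IL 60603"
    # last 5 chars "60603" match /\d{5}/   -> postal_code = "60603"
    # address_str[0..-7]                   -> "120 S LaSalle St, Chicago, IL"
    # last 2 chars "IL" are a known state  -> state_province = "IL"
    # address_str[0..-5]                   -> "120 S LaSalle St, Chicago"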
data/lib/hsds_transformer/custom/open211_miami_transformer.rb ADDED
@@ -0,0 +1,168 @@
+module HsdsTransformer
+  class Open211MiamiTransformer < HsdsTransformer::BaseTransformer
+    WEEKDAYS = %w(Monday Tuesday Wednesday Thursday Friday)
+    ALL_DAYS = %w(Monday Tuesday Wednesday Thursday Friday Saturday Sunday)
+    DAY_MAPPING = {
+      "mon" => "Monday",
+      "tue" => "Tuesday",
+      "wed" => "Wednesday",
+      "thu" => "Thursday",
+      "fri" => "Friday",
+      "sat" => "Saturday",
+      "sun" => "Sunday",
+    }
+
+    TOP_LEVEL_TAXONOMIES = {
+      "B" => "Basic Needs",
+      "D" => "Consumer Services",
+      "F" => "Criminal Justice and Legal Services",
+      "H" => "Education",
+      "J" => "Environmental Quality",
+      "L" => "Health Care",
+      "N" => "Income Support and Employment",
+      "P" => "Individual and Family Life",
+      "R" => "Mental Health Care and Counseling",
+      "T" => "Organizational/Community/International Services",
+      "Y" => "Target Populations"
+    }
+
+    TAXONOMY_VOCAB = "Open211 Miami - AIRS"
+
+    def apply_custom_transformation
+      remove_child_organizations
+      determine_services
+      parse_regular_schedules_text
+      supplement_taxonomy
+    end
+
+    private
+
+    def determine_services
+      @services.each do |service|
+        # Update the name to remove the org name
+        formatted_name = service["name"].to_s.split(" - ").last
+        service.merge!("name" => formatted_name)
+
+        # Set the org ID from the parent provider ID
+        unless service["parent_provider_id"].nil?
+          service.merge!("organization_id" => service["parent_provider_id"])
+        end
+        service.delete "parent_provider_id"
+      end
+    end
+
+    # TODO figure out what to do with 24 hour text
+    # TODO add IDs
+    def parse_regular_schedules_text
+      new_schedules = @regular_schedules.each_with_object([]) do |sched_row, new_scheds|
+        # Schedule times and tidbits are mostly separated by a newline
+        sched_options = sched_row["original_text"].to_s.split("\n")
+
+        sched_options.each do |opt|
+          opt_days = find_days(opt)
+          if all_weekdays?(opt_days)
+            sched_days = WEEKDAYS
+          elsif single_days?(opt_days)
+            sched_days = single_days(opt_days)
+          else
+            sched_days = []
+          end
+
+          sched_days.each do |day|
+            new_scheds << new_sched_row(day, opt, sched_row)
+          end
+        end
+      end
+
+      @regular_schedules = new_schedules
+    end
+
+    def find_days(opt_string)
+      # The day tokens come after the first comma; [1..-1] is nil for empty lines, so default to []
+      strings = (opt_string.to_s.split(", ")[1..-1] || []).compact.flatten
+      strings.map(&:downcase)
+    end
+
+    def all_weekdays?(days)
+      days == ["mon-fri"]
+    end
+
+    def single_days?(days)
+      !single_days(days).empty?
+    end
+
+    def single_days(days)
+      DAY_MAPPING.select { |abbr, _day| days.include?(abbr) }.values
+    end
+
+    def hours(opt)
+      range = opt.split(", ")[0]
+      times = range.split("-")
+      return unless times.size == 2
+
+      open = clean_time(times[0])
+      close = clean_time(times[1])
+
+      [open, close]
+    end
+
+    # Finds the time in strings like "Admin:\\n9:00am", "9am", "9:0a", "10:00pm"
+    def clean_time(time)
+      /\d{1,2}.*\z/.match(time).to_s
+    end
+
+    def new_sched_row(day, opt, sched_row)
+      open, close = hours(opt)
+      {
+        "service_id" => sched_row["service_id"],
+        "weekday" => day,
+        "opens_at" => open,
+        "closes_at" => close,
+        "original_text" => sched_row["original_text"]
+      }
+    end
+
+    def remove_child_organizations
+      @organizations.reject! do |org|
+        !org["parent_provider_id"].nil?
+      end
+
+      @organizations.each { |org| org.delete("parent_provider_id") }
+    end
+
+    def supplement_taxonomy
+      @taxonomies.each do |tax_row|
+        if tax_row["id"].length == 1
+          category = nil # Already top-level
+        else
+          category = tax_row["id"][0]
+        end
+
+        suppl_attrs = {
+          "parent_id" => category,
+          "parent_name" => TOP_LEVEL_TAXONOMIES[category],
+          "vocabulary" => TAXONOMY_VOCAB
+        }
+
+        tax_row.merge!(suppl_attrs)
+      end
+
+      @taxonomies.concat(top_level_taxonomies)
+    end
+
+    def top_level_taxonomies
+      TOP_LEVEL_TAXONOMIES.map do |key, value|
+        {
+          "id" => key,
+          "name" => value,
+          "taxonomy_facet" => "Service",
+          "parent_id" => nil,
+          "parent_name" => nil,
+          "vocabulary" => TAXONOMY_VOCAB
+        }
+      end
+    end
+  end
+end
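A trace of parse_regular_schedules_text on one row (the original_text value is illustrative):

    # sched_row["original_text"] = "8:00am-5:00pm, Mon-Fri\n24 Hour Hotline"
    # "8:00am-5:00pm, Mon-Fri": find_days -> ["mon-fri"], all_weekdays? -> true,
    #   hours -> ["8:00am", "5:00pm"]  => five rows, Monday through Friday
    # "24 Hour Hotline": find_days -> []  => no rows (see the 24-hour TODO above)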
data/lib/hsds_transformer/exceptions.rb ADDED
@@ -0,0 +1,5 @@
+module HsdsTransformer
+
+  class InvalidCustomTransformerException < ::Exception; end
+
+end
data/lib/hsds_transformer/file_paths.rb ADDED
@@ -0,0 +1,40 @@
+module HsdsTransformer
+  module FilePaths
+    DEFAULT_OUTPUT_PATH = "#{ENV["ROOT_PATH"]}/tmp"
+    DEFAULT_INPUT_PATH = "#{ENV["ROOT_PATH"]}/"
+
+    attr_reader :input_path, :output_path, :output_datapackage_path, :output_data_path, :datapackage_json_path,
+                :zipfile_name, :output_organizations_path, :output_locations_path, :output_services_path,
+                :output_phones_path, :output_physical_addresses_path, :output_postal_addresses_path,
+                :output_services_at_locations_path, :output_eligibilities_path, :output_contacts_path,
+                :output_languages_path, :output_accessibility_for_disabilities_path, :output_taxonomies_path,
+                :output_service_taxonomies_path, :output_regular_schedules_path, :output_service_areas_path
+
+    # TODO DRY this up
+    def set_file_paths(args)
+      @input_path = args[:input_path] || DEFAULT_INPUT_PATH
+      @output_path = args[:output_path] || DEFAULT_OUTPUT_PATH
+      @output_datapackage_path = File.join(output_path, "datapackage")
+      @output_data_path = File.join(output_datapackage_path, "data")
+      @zipfile_name = File.join(output_path, "datapackage.zip")
+
+      @output_organizations_path = output_data_path + "/organizations.csv"
+      @output_locations_path = output_data_path + "/locations.csv"
+      @output_services_path = output_data_path + "/services.csv"
+      @output_phones_path = output_data_path + "/phones.csv"
+      @output_physical_addresses_path = output_data_path + "/physical_addresses.csv"
+      @output_postal_addresses_path = output_data_path + "/postal_addresses.csv"
+      @output_services_at_locations_path = output_data_path + "/services_at_location.csv"
+      @output_eligibilities_path = output_data_path + "/eligibility.csv"
+      @output_contacts_path = output_data_path + "/contacts.csv"
+      @output_languages_path = output_data_path + "/languages.csv"
+      @output_accessibility_for_disabilities_path = output_data_path + "/accessibility_for_disabilities.csv"
+      @output_taxonomies_path = output_data_path + "/taxonomy.csv"
+      @output_service_taxonomies_path = output_data_path + "/services_taxonomy.csv"
+      @output_regular_schedules_path = output_data_path + "/regular_schedules.csv"
+      @output_service_areas_path = output_data_path + "/service_areas.csv"
+
+      @datapackage_json_path = File.join(ENV["ROOT_PATH"], "lib/datapackage/datapackage.json")
+    end
+  end
+end
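With output_path set to ./out (illustrative), these paths lay out as:

    ./out/
      datapackage.zip        (written only when zip_output is set)
      datapackage/
        data/
          organizations.csv
          locations.csv
          services.csv
          ... one CSV per supported HSDS model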
data/lib/hsds_transformer/headers.rb ADDED
@@ -0,0 +1,31 @@
+module HsdsTransformer
+  module Headers
+    ORGANIZATIONS_HEADERS = %w(id name alternate_name description email url tax_status tax_id year_incorporated legal_status)
+    LOCATIONS_HEADERS = %w(id organization_id name alternate_name description transportation latitude longitude)
+    SERVICES_HEADERS = %w(id organization_id program_id name alternate_name description url email status interpretation_services application_process wait_time fees accreditations licenses)
+    PHONES_HEADERS = %w(id location_id service_id organization_id contact_id service_at_location_id number extension type language description)
+    PHYSICAL_ADDRESSES_HEADERS = %w(id location_id organization_id attention address_1 city region state_province postal_code country)
+    POSTAL_ADDRESSES_HEADERS = %w(id location_id organization_id attention address_1 city region state_province postal_code country)
+    REGULAR_SCHEDULES_HEADERS = %w(id service_id location_id service_at_location_id weekday opens_at closes_at)
+    SERVICES_AT_LOCATIONS_HEADERS = %w(id service_id location_id description)
+    ELIGIBILITIES_HEADERS = %w(id service_id eligibility)
+    CONTACTS_HEADERS = %w(id organization_id service_id service_at_location_id name title department email)
+    LANGUAGES_HEADERS = %w(id service_id location_id language)
+    ACCESSIBILITY_FOR_DISABILITIES_HEADERS = %w(id location_id accessibility details)
+    TAXONOMIES_HEADERS = %w(id name parent_id parent_name vocabulary)
+    SERVICE_TAXONOMIES_HEADERS = %w(id service_id taxonomy_id taxonomy_detail)
+    SERVICE_AREAS_HEADERS = %w(id service_id service_area description)
+
+    def headers(row, model)
+      const_name = "HsdsTransformer::Headers::" + model.upcase + "_HEADERS"
+      # TODO make sure valid
+      const = Object.const_get(const_name)
+
+      if row && @include_custom
+        (const + row.keys).uniq
+      else
+        const
+      end
+    end
+  end
+end
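A sketch of what headers returns (the "legacy_code" column is illustrative):

    # With include_custom on, non-HSDS input columns are appended after the spec columns:
    # headers({ "id" => "1", "name" => "Org", "legacy_code" => "X1" }, "organizations")
    #   #=> ORGANIZATIONS_HEADERS + ["legacy_code"]   ("id" and "name" dedupe via uniq)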
data/lib/hsds_transformer/runner.rb ADDED
@@ -0,0 +1,32 @@
+module HsdsTransformer
+  class Runner
+
+    VALID_CUSTOM_TRANSFORMERS = %w(Open211MiamiTransformer IlaoTransformer)
+
+    # Args:
+    #   input_path - the dir containing the input data files
+    #   output_path - the dir where the resulting HSDS files should go
+    #   include_custom - Default: false - whether the final output CSVs should include the non-HSDS columns that the original input CSVs had
+    #   zip_output - Default: false - whether the output should be zipped into a single datapackage.zip
+    #   custom_transformer - Default: nil - the custom transformer class to use. This arg does not get passed to transformer classes
+    def self.run(args)
+      custom = args.delete(:custom_transformer)
+      validate_custom(custom)
+
+      transformer = custom ? custom_transformer(custom) : BaseTransformer
+
+      transformer.run(args)
+    end
+
+    def self.validate_custom(custom)
+      if custom && !VALID_CUSTOM_TRANSFORMERS.include?(custom)
+        raise InvalidCustomTransformerException
+      end
+    end
+
+    def self.custom_transformer(custom)
+      klass = "HsdsTransformer::" + custom
+      Object.const_get(klass)
+    end
+  end
+end
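A usage sketch (paths illustrative; input_path is concatenated directly with the mapped file names, so it should end with a slash):

    HsdsTransformer::Runner.run(
      mapping: "/data/mapping.yaml",          # local path or http(s) URL
      input_path: "/data/source/",
      output_path: "/data/out",
      custom_transformer: "IlaoTransformer",  # must be one of VALID_CUSTOM_TRANSFORMERS
      include_custom: true,
      zip_output: true
    )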
data/lib/hsds_transformer.rb ADDED
@@ -0,0 +1,15 @@
+require "dotenv/load"
+require "csv"
+require "yaml"
+require "uri"
+require "net/http" # used by BaseTransformer#parse_mapping to fetch remote mappings
+require "zip"
+require "zip/zip"
+require "rest_client"
+
+require "hsds_transformer/file_paths"
+require "hsds_transformer/headers"
+require "hsds_transformer/exceptions"
+require "hsds_transformer/runner"
+require "hsds_transformer/base_transformer"
+
+require "hsds_transformer/custom/open211_miami_transformer"
+require "hsds_transformer/custom/ilao_transformer"
data/lib/support.rb ADDED
@@ -0,0 +1,31 @@
+# TODO implement validation
+module Support
+  def validate(filename, type)
+    file = File.new(filename.to_s, 'rb')
+    RestClient.post('http://localhost:1400/validate/csv',
+                    { "file" => file,
+                      "type" => type })
+    true
+  rescue RestClient::BadRequest
+    @valid = false
+    false
+  end
+
+  def validate_output
+    unless validate(output_organizations_path, "organization")
+      puts "Organization data not valid"
+    end
+    unless validate(output_locations_path, "location")
+      puts "Location data not valid"
+    end
+    unless validate(output_services_path, "service")
+      puts "Service data not valid"
+    end
+    unless validate(output_phones_path, "phone")
+      puts "Phone data not valid"
+    end
+  rescue Errno::ECONNREFUSED
+    puts "Can't connect to validation service."
+  end
+end