fech 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. data/.gitignore +7 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +4 -0
  4. data/Gemfile.lock +49 -0
  5. data/LICENSE +13 -0
  6. data/README.rdoc +178 -0
  7. data/Rakefile +3 -0
  8. data/autotest/discover.rb +1 -0
  9. data/fech.gemspec +32 -0
  10. data/lib/fech.rb +13 -0
  11. data/lib/fech/default_translations.rb +135 -0
  12. data/lib/fech/fech_utils.rb +41 -0
  13. data/lib/fech/filing.rb +248 -0
  14. data/lib/fech/map_generator.rb +187 -0
  15. data/lib/fech/mapped.rb +38 -0
  16. data/lib/fech/mappings.rb +66 -0
  17. data/lib/fech/translator.rb +138 -0
  18. data/lib/fech/version.rb +3 -0
  19. data/sources/F3P.csv +1 -0
  20. data/sources/F3P31.csv +1 -0
  21. data/sources/F3PS.csv +1 -0
  22. data/sources/F3S.csv +1 -0
  23. data/sources/HDR.csv +1 -0
  24. data/sources/SchA.csv +1 -0
  25. data/sources/SchB.csv +1 -0
  26. data/sources/SchC.csv +1 -0
  27. data/sources/SchC1.csv +1 -0
  28. data/sources/SchC2.csv +1 -0
  29. data/sources/SchD.csv +1 -0
  30. data/sources/SchE.csv +1 -0
  31. data/sources/SchF.csv +1 -0
  32. data/sources/TEXT.csv +1 -0
  33. data/sources/headers/3.csv +1 -0
  34. data/sources/headers/5.0.csv +1 -0
  35. data/sources/headers/5.1.csv +1 -0
  36. data/sources/headers/5.2.csv +1 -0
  37. data/sources/headers/5.3.csv +1 -0
  38. data/sources/headers/6.1.csv +1 -0
  39. data/sources/headers/6.2.csv +1 -0
  40. data/sources/headers/6.3.csv +1 -0
  41. data/sources/headers/6.4.csv +1 -0
  42. data/sources/headers/7.0.csv +1 -0
  43. data/sources/headers/ignore.csv +5 -0
  44. data/spec/data/723604.fec +4 -0
  45. data/spec/data/97405.fec +10 -0
  46. data/spec/default_translations_spec.rb +104 -0
  47. data/spec/fech_utils_spec.rb +29 -0
  48. data/spec/filing_spec.rb +251 -0
  49. data/spec/map_generator_spec.rb +49 -0
  50. data/spec/mapped_spec.rb +44 -0
  51. data/spec/mappings_spec.rb +46 -0
  52. data/spec/sources/F3P.csv +1 -0
  53. data/spec/sources/SchA.csv +1 -0
  54. data/spec/sources/SchB.csv +1 -0
  55. data/spec/sources/SchC.csv +1 -0
  56. data/spec/sources/headers/3.csv +1 -0
  57. data/spec/sources/headers/5.0.csv +1 -0
  58. data/spec/sources/headers/5.1.csv +1 -0
  59. data/spec/sources/headers/5.2.csv +1 -0
  60. data/spec/sources/headers/5.3.csv +1 -0
  61. data/spec/sources/headers/6.1.csv +1 -0
  62. data/spec/sources/headers/6.2.csv +1 -0
  63. data/spec/sources/headers/6.3.csv +1 -0
  64. data/spec/sources/headers/6.4.csv +1 -0
  65. data/spec/sources/headers/7.0.csv +1 -0
  66. data/spec/sources/headers/ignore.csv +5 -0
  67. data/spec/sources/sa.csv +1 -0
  68. data/spec/spec_helper.rb +9 -0
  69. data/spec/translator_spec.rb +195 -0
  70. data/tasks/fech.rake +41 -0
  71. metadata +280 -0
@@ -0,0 +1,41 @@
+ # Contains helper functions and static variables used by various
+ # Fech classes.
+ module FechUtils
+
+   # All supported row types, mapped to regular expressions that will correctly
+   # match that row type in the wild.
+   ROW_TYPES = {
+     :hdr => /^hdr$/i,
+     :f3p => /(^f3p$)|(^f3p[^s|3])/i,
+     :f3s => /^f3s/i,
+     :f3p31 => /^f3p31/i,
+     :f3ps => /^f3ps/i,
+     :sa => /^sa/i,
+     :sb => /^sb/i,
+     :sc => /^sc[^1-2]/i,
+     :sc1 => /^sc1/i,
+     :sc2 => /^sc2/i,
+     :sd => /^sd/i,
+     :se => /^se/i,
+     :sf => /^sf/i,
+     :text => /^text/i,
+   }
+
+   # Converts symbols and strings to Regexp objects for use in regex-keyed maps.
+   # Assumes that symbols should be matched exactly (anchored) and strings as unanchored substrings.
+   # @param [String,Symbol,Regexp] label the object to convert to a Regexp
+   def regexify(label)
+     if label.is_a?(Regexp)
+       Regexp.new(label.source, Regexp::IGNORECASE)
+     elsif label.is_a?(Symbol)
+       if ROW_TYPES.keys.include?(label)
+         ROW_TYPES[label]
+       else
+         Regexp.new("^#{label.to_s}$", Regexp::IGNORECASE)
+       end
+     else
+       Regexp.new(Regexp.escape(label.to_s), Regexp::IGNORECASE)
+     end
+   end
+
+ end
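
For reference, a quick sketch of how regexify resolves the different label types. It assumes the top-level Fech module extends FechUtils, which the Fech.regexify calls in Fech::Filing suggest; :f99 is just an example of a symbol that is not in ROW_TYPES:

    Fech.regexify(:sa)        # => /^sa/i, the built-in ROW_TYPES pattern
    Fech.regexify(:f99)       # => /^f99$/i, an anchored match for unknown symbols
    Fech.regexify("SchA")     # => /SchA/i, an unanchored, escaped substring match
    Fech.regexify(/^sa|^sb/)  # => a case-insensitive copy of the given pattern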
@@ -0,0 +1,248 @@
+ require 'tmpdir'
+ require 'open-uri'
+ require 'fastercsv'
+
+ module Fech
+
+   # Fech::Filing downloads an Electronic Filing given its ID, and will search
+   # rows by row type. Using a child Translator object, the data in each row
+   # is automatically mapped at runtime into a labeled Hash. Additional
+   # Translations may be added to change the way that data is mapped and cleaned.
+   class Filing
+     attr_accessor :filing_id, :download_dir, :translator
+
+     # Create a new Filing object, assigning the download directory to the
+     # system's temp folder by default.
+     # @param [String] download_dir override the directory where files should
+     #   be downloaded.
+     # @param [Symbol,Array] translate a list of built-in translation sets to use
+     def initialize(filing_id, opts={})
+       @filing_id = filing_id
+       @download_dir = opts[:download_dir] || Dir.tmpdir
+       @translator = Fech::Translator.new(:include => opts[:translate])
+     end
+
+     # Saves the filing data from the FEC website into the default download
+     # directory.
+     def download
+       File.open(file_path, 'w') do |file|
+         file << open(filing_url).read
+       end
+       self
+     end
+
+     # Access the header (first) line of the filing, containing information
+     # about the filing's version and metadata about the software used to file it.
+     # @return [Hash] a hash that assigns labels to the values of the filing's header row
+     def header(opts={})
+       each_row do |row|
+         return parse_row?(row)
+       end
+     end
+
+     # Access the summary (second) line of the filing, containing aggregate and
+     # top-level information about the filing.
+     # @return [Hash] a hash that assigns labels to the values of the filing's summary row
+     def summary
+       each_row_with_index do |row, index|
+         next if index == 0
+         return parse_row?(row)
+       end
+     end
+
+     # Access all lines of the filing that match a given row type. Will return an
+     # Array of all available lines if called directly, or will yield the mapped
+     # rows one by one if a block is passed.
+     #
+     # @param [String, Regexp] row_type a partial or complete name of the type of row desired
+     # @option opts [Boolean] :raw should the function return the data as an array
+     #   that has not been mapped to column names
+     # @option opts [Array] :include list of field names that should be included
+     #   in the returned hash
+     # @yield [Hash] each matched row's data, as either a mapped hash or raw array
+     # @return [Array] the complete set of mapped hashes for matched lines
+     def rows_like(row_type, opts={}, &block)
+       data = []
+       each_row do |row|
+         value = parse_row?(row, opts.merge(:parse_if => row_type))
+         next if value == false
+         if block_given?
+           yield value
+         else
+           data << value if value
+         end
+       end
+       block_given? ? nil : data
+     end
+
+     # Decides what to do with a given row. If the row's type matches the desired
+     # type, or if no type was specified, it will run the row through #map.
+     # If :raw was passed true, a flat, unmapped data array will be returned.
+     #
+     # @param [Array] row a row of parsed data from the filing
+     # @option opts [Array] :include list of field names that should be included
+     #   in the returned hash
+     def parse_row?(row, opts={})
+       # Always parse, unless :parse_if is given and does not match row
+       if opts[:parse_if].nil? || \
+           Fech.regexify(opts[:parse_if]).match(row.first.downcase)
+         opts[:raw] ? row : map(row, opts)
+       else
+         false
+       end
+     end
+
+     # Maps a raw row to a labeled hash following any rules given in the filing's
+     # Translator based on its version and row type.
+     # Finds the correct map for a given row, performs any matching Translations
+     # on the individual values, and returns either the entire dataset, or just
+     # those fields requested.
+     # @param [Array] row a row of parsed data from the filing
+     # @option opts [Array] :include list of field names that should be included
+     #   in the returned hash
+     def map(row, opts={})
+       data = Fech::Mapped.new(self, row.first)
+       row_map = map_for(row.first)
+
+       # If specific fields were asked for, return only those
+       row_map = row_map.select { |k,v| opts[:include].include?(k) } if opts[:include]
+
+       # Inserts the row into data, performing any specified preprocessing
+       # on individual cells along the way
+       row_map.each_with_index do |field, index|
+         value = row[index]
+         translator.get_translations(:row => row.first,
+             :version => filing_version, :action => :convert,
+             :field => field).each do |translation|
+           # User's Procs should be given each field's value as context
+           value = translation[:proc].call(value)
+         end
+         data[field] = value
+       end
+
+       # Performs any specified group preprocessing / combinations
+       combinations = translator.get_translations(:row => row.first,
+           :version => filing_version, :action => :combine)
+       row_hash = hash_zip(row_map, row) if combinations
+       combinations.each do |translation|
+         # User's Procs should be given the entire row as context
+         value = translation[:proc].call(row_hash)
+         field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
+         data[field] = value
+       end
+
+       data
+     end
+
+     # Returns the column names for given row type and the filing's version
+     # in the order they appear in row data.
+     # @param [String, Regexp] row_type representation of the row desired
+     def map_for(row_type)
+       mappings.for_row(row_type)
+     end
+
+     # Returns the column names for given row type and version in the order
+     # they appear in row data.
+     # @param [String, Regexp] row_type representation of the row desired
+     # @option opts [String, Regexp] :version representation of the version desired
+     def self.map_for(row_type, opts={})
+       Fech::Mappings.for_row(row_type, opts)
+     end
+
+     # @yield [t] returns a reference to the filing's Translator
+     # @yieldparam [Translator] the filing's Translator
+     def translate(&block)
+       if block_given?
+         yield translator
+       else
+         translator
+       end
+     end
+
+     # Whether this filing amends a previous filing or not.
+     def amendment?
+       !amends.nil?
+     end
+
+     # Returns the filing ID of the past filing this one amends,
+     # nil if this is a first-draft filing.
+     # :report_id in the HDR line references the amended filing
+     def amends
+       header[:report_id]
+     end
+
+
+     # Combines an array of keys and values into a Fech::Mapped object,
+     # a type of Hash.
+     # @param [Array] keys the desired keys for the new hash
+     # @param [Array] values the desired values for the new hash
+     # @return [Fech::Mapped, Hash]
+     def hash_zip(keys, values)
+       Fech::Mapped.new(self, values.first).merge(Hash[*keys.zip(values).flatten])
+     end
+
+     # The version of the FEC software used to generate this Filing
+     def filing_version
+       @filing_version ||= parse_filing_version
+     end
+
+     # Pulls out the version number from the header line.
+     # Must parse this line manually, since we don't know the version yet, and
+     # thus the delimiter type is still a mystery.
+     def parse_filing_version
+       first = File.open(file_path).first
+       if first.index("\034").nil?
+         FasterCSV.parse(first).flatten[2]
+       else
+         FasterCSV.parse(first, :col_sep => "\034").flatten[2]
+       end
+     end
+
+     # Gets or creates the Mappings instance for this filing_version
+     def mappings
+       @mapping ||= Fech::Mappings.new(filing_version)
+     end
+
+     # The location of the Filing on the file system
+     def file_path
+       File.join(download_dir, file_name)
+     end
+
+     def file_name
+       "#{filing_id}.fec"
+     end
+
+     def filing_url
+       "http://query.nictusa.com/dcdev/posted/#{filing_id}.fec"
+     end
+
+     # Iterates over and yields the Filing's lines
+     # @option opts [Boolean] :with_index yield both the item and its index
+     # @yield [Array] a row of the filing, split by the delimiter from #delimiter
+     def each_row(opts={}, &block)
+       unless File.exists?(file_path)
+         raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
+       end
+       c = 0
+       FasterCSV.foreach(file_path, :col_sep => delimiter, :skip_blanks => true) do |row|
+         if opts[:with_index]
+           yield [row, c]
+           c += 1
+         else
+           yield row
+         end
+       end
+     end
+
+     # Wrapper around .each_row to include indexes
+     def each_row_with_index(&block)
+       each_row(:with_index => true, &block)
+     end
+
+     # @return [String] the delimiter used in the filing's version
+     def delimiter
+       filing_version.to_f < 6 ? "," : "\034"
+     end
+
+   end
+ end
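
Taken together, Filing supports a download-then-iterate workflow: fetch the .fec file, read the HDR and summary rows, then stream mapped rows by type. A minimal sketch based only on the API above (the filing ID is the one used in spec/data; the :contribution_amount field name is illustrative, since actual labels depend on the version's Schedule A map):

    require 'fech'

    filing = Fech::Filing.new(723604)
    filing.download                     # writes 723604.fec into Dir.tmpdir by default

    filing.filing_version               # parsed from the HDR row, e.g. "7.0"
    filing.amendment?                   # true if the HDR row's :report_id names an earlier filing

    # Stream Schedule A rows as labeled hashes; pass :raw => true for plain arrays
    filing.rows_like(:sa) do |row|
      puts row[:contribution_amount]    # illustrative field name
    end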
@@ -0,0 +1,187 @@
+ module Fech
+
+   # Helper class to generate mapping hashes from source csv data.
+   # Needed to rebuild rendered_maps.rb with new source data, not used
+   # in main gem.
+   # rake fech:maps
+   class MapGenerator
+
+     attr_accessor :map
+     FILING_VERSIONS = ["7.0", "6.4", "6.3", "6.2", "6.1",
+                        "5.3", "5.2", "5.1", "5.0", "3"]
+     BASE_ROW_TYPES = ["HDR", "F3P", "F3P31", "F3PS", "F3S", "SchA", "SchB",
+                       "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "TEXT"]
+     ROW_TYPE_MATCHERS = {
+       "HDR" => FechUtils::ROW_TYPES[:hdr],
+       "F3P" => FechUtils::ROW_TYPES[:f3p],
+       "F3S" => FechUtils::ROW_TYPES[:f3s],
+       "F3P31" => FechUtils::ROW_TYPES[:f3p31],
+       "F3PS" => FechUtils::ROW_TYPES[:f3ps],
+       "SchA" => FechUtils::ROW_TYPES[:sa],
+       "SchB" => FechUtils::ROW_TYPES[:sb],
+       "SchC" => FechUtils::ROW_TYPES[:sc],
+       "SchC1" => FechUtils::ROW_TYPES[:sc1],
+       "SchC2" => FechUtils::ROW_TYPES[:sc2],
+       "SchD" => FechUtils::ROW_TYPES[:sd],
+       "SchE" => FechUtils::ROW_TYPES[:se],
+       "SchF" => FechUtils::ROW_TYPES[:sf],
+       "TEXT" => FechUtils::ROW_TYPES[:text],
+     }
+
+     # Goes through all version header summary files and generates
+     # row map files for each type of row inside them.
+     def self.convert_header_file_to_row_files(source_dir)
+       data = {}
+
+       ignored_fields = File.open(ignored_fields_file(source_dir)).readlines.map { |l| l.strip }
+
+       # Create a hash of data with an entry for each row type found in the source
+       # version summary files. Each row has an entry for each version map that
+       # exists for it. If maps for two different versions are identical, they
+       # are combined.
+       FILING_VERSIONS.each do |version|
+         FasterCSV.foreach(version_summary_file(source_dir, version)) do |row|
+           # Each row of a version summary file contains the ordered list of
+           # column names.
+           data[row.first] ||= {}
+           row_version_data = remove_ignored_fields(row, ignored_fields)
+
+           # Check the maps for this row type in already-processed versions.
+           # If this map is identical to a previous map, tack this version on
+           # to it instead of creating a new one.
+           data[row.first][version] = row_version_data
+           data[row.first].each do |k, v|
+             # skip the row we just added
+
+             next if k == version
+             if v == row_version_data
+               # Create the new hybrid entry
+               data[row.first]["#{k}|#{version}"] = row_version_data
+
+               # Delete the old entry, and the one for this version only
+               data[row.first].delete(k)
+               data[row.first].delete(version)
+             end
+           end
+         end
+       end
+
+       # Go through each row type and create a base map management file that
+       # will serve as a template for organizing which fields are the same
+       # between versions. This file will need to then be arranged by hand to
+       # clean up the data. Each row will represent a column across versions,
+       # each column a unique map for that row for one or more versions.
+       data.each do |row_type, row_data|
+         file_path = write_row_map_file(source_dir, row_type)
+         next unless File.exists?(file_path)
+         File.open(file_path, 'w') do |f|
+           f.write('canonical')
+
+           to_transpose = []
+           row_data.sort.reverse.each do |version, version_data|
+             to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
+             to_transpose << [nil, version_data].flatten
+           end
+
+           # standardize row size
+           max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
+           to_transpose.each { |r| r[max_size - 1] ||= nil }
+           transposed = to_transpose.transpose
+
+           transposed.each do |transposed_data|
+             transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
+             canonical = transposed_data[1] # first description
+             if canonical
+               canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
+               transposed_data = [canonical, transposed_data].flatten
+             end
+             f.write(transposed_data.join(','))
+             f.write("\n")
+           end
+         end
+       end
+
+     end
+
+     # Generates the mapping for each row type in BASE_ROW_TYPES, writes them out
+     # to file for inclusion in the gem.
+     def self.dump_row_maps_to_ruby(source_dir, file_path)
+       File.open(file_path, 'w') do |f|
+         f.write("# Generated automatically by Fech::MapGenerator.\n\n")
+         f.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
+         f.write("# contain an entry for each distinct map between a row's labels and the\n")
+         f.write("# indexes where their values can be found.\n")
+         f.write("module Fech\n")
+         f.write(" RENDERED_MAPS = {\n")
+         BASE_ROW_TYPES.each do |row_type|
+           f.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")
+           generate_row_map_from_file(source_dir, row_type).each do |k, v|
+             f.write(" \'#{k}' => [#{v.map {|x| x.to_s.gsub(/^\d+_?/, "") }.collect {|x| (x.nil? || x == "") ? "nil" : ":#{x}" }.join(', ') }],\n")
+           end
+           f.write(" },\n")
+         end
+         f.write(" }\n")
+         f.write("end")
+       end
+     end
+
+     # For a given row type, parses its source file and returns
+     # a mapping object for it.
+     def self.generate_row_map_from_file(source_dir, row_type)
+       versions = []
+       version_indexes = []
+       data = {}
+       text = open(row_map_file(source_dir, row_type)).read
+       split_char = text.index(/\r/) ? /\r/ : /\n/
+       rows = text.split(split_char).collect {|x| x.split(',')}
+       rows.each do |row|
+         row = row.collect {|x| x.gsub("\n", "")}
+         if row.first.nil?
+           require 'ruby-debug'; debugger
+         end
+         if row.first.downcase == "canonical"
+           versions = row[1..-1].uniq.collect {|x| x unless (x.nil? || x.empty?)}.compact
+           row.each_with_index {|x, ind| version_indexes << ind unless (x.nil? || x.empty?)}.slice!(1)
+           version_indexes.slice!(0, 1)
+           versions.each {|x| data[x] = [] }
+
+         elsif row.first.size > 0
+           canonical = row.first
+
+           versions.zip(version_indexes).each do |version, row_index|
+             index = row[row_index]
+             data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
+           end
+         end
+       end
+
+       row_map = {}
+       data.each {|key, value| row_map[key] = value}
+       row_map
+     end
+
+     # Remove both the row type from the beginning of the row,
+     # and any fields marked as "ignore" in sources/headers/ignore.csv
+     def self.remove_ignored_fields(row, ignore)
+       data = row[1..-1].compact # strip off the row type
+       data.reject { |f| ignore.include?(f) }
+     end
+
+     def self.row_map_file(source_dir, row_type)
+       File.join(source_dir, row_type + '.csv')
+     end
+
+     def self.ignored_fields_file(source_dir)
+       File.join(source_dir, 'headers', 'ignore.csv')
+     end
+
+     def self.version_summary_file(source_dir, version)
+       File.join(source_dir, 'headers', version + '.csv')
+     end
+
+     def self.write_row_map_file(source_dir, row_type)
+       File.join(source_dir, 'rows', row_type + '.csv')
+     end
+
+   end
+ end
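
For context, the regeneration workflow the class comment refers to (rake fech:maps) appears to come down to the two class methods above. A hedged sketch of calling them directly; the source directory mirrors the sources/ layout in the file list, and the rendered_maps.rb output path is an assumption taken from the class comment:

    require 'fech'

    source_dir = 'sources'   # contains headers/<version>.csv summaries and per-row CSVs

    # Step 1: split the per-version header summaries into per-row-type CSVs
    # (written under sources/rows/), which are then arranged by hand.
    Fech::MapGenerator.convert_header_file_to_row_files(source_dir)

    # Step 2: render the hand-arranged row maps into Ruby for inclusion in the gem.
    Fech::MapGenerator.dump_row_maps_to_ruby(source_dir, 'lib/fech/rendered_maps.rb')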