fech 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. data/.gitignore +7 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +4 -0
  4. data/Gemfile.lock +49 -0
  5. data/LICENSE +13 -0
  6. data/README.rdoc +178 -0
  7. data/Rakefile +3 -0
  8. data/autotest/discover.rb +1 -0
  9. data/fech.gemspec +32 -0
  10. data/lib/fech.rb +13 -0
  11. data/lib/fech/default_translations.rb +135 -0
  12. data/lib/fech/fech_utils.rb +41 -0
  13. data/lib/fech/filing.rb +248 -0
  14. data/lib/fech/map_generator.rb +187 -0
  15. data/lib/fech/mapped.rb +38 -0
  16. data/lib/fech/mappings.rb +66 -0
  17. data/lib/fech/translator.rb +138 -0
  18. data/lib/fech/version.rb +3 -0
  19. data/sources/F3P.csv +1 -0
  20. data/sources/F3P31.csv +1 -0
  21. data/sources/F3PS.csv +1 -0
  22. data/sources/F3S.csv +1 -0
  23. data/sources/HDR.csv +1 -0
  24. data/sources/SchA.csv +1 -0
  25. data/sources/SchB.csv +1 -0
  26. data/sources/SchC.csv +1 -0
  27. data/sources/SchC1.csv +1 -0
  28. data/sources/SchC2.csv +1 -0
  29. data/sources/SchD.csv +1 -0
  30. data/sources/SchE.csv +1 -0
  31. data/sources/SchF.csv +1 -0
  32. data/sources/TEXT.csv +1 -0
  33. data/sources/headers/3.csv +1 -0
  34. data/sources/headers/5.0.csv +1 -0
  35. data/sources/headers/5.1.csv +1 -0
  36. data/sources/headers/5.2.csv +1 -0
  37. data/sources/headers/5.3.csv +1 -0
  38. data/sources/headers/6.1.csv +1 -0
  39. data/sources/headers/6.2.csv +1 -0
  40. data/sources/headers/6.3.csv +1 -0
  41. data/sources/headers/6.4.csv +1 -0
  42. data/sources/headers/7.0.csv +1 -0
  43. data/sources/headers/ignore.csv +5 -0
  44. data/spec/data/723604.fec +4 -0
  45. data/spec/data/97405.fec +10 -0
  46. data/spec/default_translations_spec.rb +104 -0
  47. data/spec/fech_utils_spec.rb +29 -0
  48. data/spec/filing_spec.rb +251 -0
  49. data/spec/map_generator_spec.rb +49 -0
  50. data/spec/mapped_spec.rb +44 -0
  51. data/spec/mappings_spec.rb +46 -0
  52. data/spec/sources/F3P.csv +1 -0
  53. data/spec/sources/SchA.csv +1 -0
  54. data/spec/sources/SchB.csv +1 -0
  55. data/spec/sources/SchC.csv +1 -0
  56. data/spec/sources/headers/3.csv +1 -0
  57. data/spec/sources/headers/5.0.csv +1 -0
  58. data/spec/sources/headers/5.1.csv +1 -0
  59. data/spec/sources/headers/5.2.csv +1 -0
  60. data/spec/sources/headers/5.3.csv +1 -0
  61. data/spec/sources/headers/6.1.csv +1 -0
  62. data/spec/sources/headers/6.2.csv +1 -0
  63. data/spec/sources/headers/6.3.csv +1 -0
  64. data/spec/sources/headers/6.4.csv +1 -0
  65. data/spec/sources/headers/7.0.csv +1 -0
  66. data/spec/sources/headers/ignore.csv +5 -0
  67. data/spec/sources/sa.csv +1 -0
  68. data/spec/spec_helper.rb +9 -0
  69. data/spec/translator_spec.rb +195 -0
  70. data/tasks/fech.rake +41 -0
  71. metadata +280 -0
@@ -0,0 +1,41 @@
+ # Contains helper functions and static variables used by various
+ # Fech classes.
+ module FechUtils
+
+   # All supported row types, mapped to regular expressions that will correctly
+   # match that row type in the wild.
+   ROW_TYPES = {
+     :hdr => /^hdr$/i,
+     :f3p => /(^f3p$)|(^f3p[^s|3])/i,
+     :f3s => /^f3s/i,
+     :f3p31 => /^f3p31/i,
+     :f3ps => /^f3ps/i,
+     :sa => /^sa/i,
+     :sb => /^sb/i,
+     :sc => /^sc[^1-2]/i,
+     :sc1 => /^sc1/i,
+     :sc2 => /^sc2/i,
+     :sd => /^sd/i,
+     :se => /^se/i,
+     :sf => /^sf/i,
+     :text => /^text/i,
+   }
+
+   # Converts symbols and strings to Regexp objects for use in regex-keyed maps.
+   # Assumes that symbols should be matched exactly (anchored) and strings as unanchored substrings.
+   # @param [String,Symbol,Regexp] label the object to convert to a Regexp
+   def regexify(label)
+     if label.is_a?(Regexp)
+       Regexp.new(label.source, Regexp::IGNORECASE)
+     elsif label.is_a?(Symbol)
+       if ROW_TYPES.keys.include?(label)
+         ROW_TYPES[label]
+       else
+         Regexp.new("^#{label.to_s}$", Regexp::IGNORECASE)
+       end
+     else
+       Regexp.new(Regexp.escape(label.to_s), Regexp::IGNORECASE)
+     end
+   end
+
+ end
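
For reference, a quick sketch of how regexify resolves the different label types. It assumes the top-level Fech module extends FechUtils, which the Fech.regexify calls in Fech::Filing suggest; :f99 is just an example of a symbol that is not in ROW_TYPES:

    Fech.regexify(:sa)        # => /^sa/i, the built-in ROW_TYPES pattern
    Fech.regexify(:f99)       # => /^f99$/i, an anchored match for unknown symbols
    Fech.regexify("SchA")     # => /SchA/i, an unanchored, escaped substring match
    Fech.regexify(/^sa|^sb/)  # => a case-insensitive copy of the given pattern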
@@ -0,0 +1,248 @@
+ require 'tmpdir'
+ require 'open-uri'
+ require 'fastercsv'
+
+ module Fech
+
+   # Fech::Filing downloads an Electronic Filing given its ID, and will search
+   # rows by row type. Using a child Translator object, the data in each row
+   # is automatically mapped at runtime into a labeled Hash. Additional
+   # Translations may be added to change the way that data is mapped and cleaned.
+   class Filing
+     attr_accessor :filing_id, :download_dir, :translator
+
+     # Create a new Filing object, assigning the download directory to the
+     # system's temp folder by default.
+     # @param [String] download_dir override the directory where files should
+     #   be downloaded.
+     # @param [Symbol,Array] translate a list of built-in translation sets to use
+     def initialize(filing_id, opts={})
+       @filing_id = filing_id
+       @download_dir = opts[:download_dir] || Dir.tmpdir
+       @translator = Fech::Translator.new(:include => opts[:translate])
+     end
+
+     # Saves the filing data from the FEC website into the default download
+     # directory.
+     def download
+       File.open(file_path, 'w') do |file|
+         file << open(filing_url).read
+       end
+       self
+     end
+
+     # Access the header (first) line of the filing, containing information
+     # about the filing's version and metadata about the software used to file it.
+     # @return [Hash] a hash that assigns labels to the values of the filing's header row
+     def header(opts={})
+       each_row do |row|
+         return parse_row?(row)
+       end
+     end
+
+     # Access the summary (second) line of the filing, containing aggregate and
+     # top-level information about the filing.
+     # @return [Hash] a hash that assigns labels to the values of the filing's summary row
+     def summary
+       each_row_with_index do |row, index|
+         next if index == 0
+         return parse_row?(row)
+       end
+     end
+
+     # Access all lines of the filing that match a given row type. Will return an
+     # Array of all available lines if called directly, or will yield the mapped
+     # rows one by one if a block is passed.
+     #
+     # @param [String, Regexp] row_type a partial or complete name of the type of row desired
+     # @option opts [Boolean] :raw should the function return the data as an array
+     #   that has not been mapped to column names
+     # @option opts [Array] :include list of field names that should be included
+     #   in the returned hash
+     # @yield [Hash] each matched row's data, as either a mapped hash or raw array
+     # @return [Array] the complete set of mapped hashes for matched lines
+     def rows_like(row_type, opts={}, &block)
+       data = []
+       each_row do |row|
+         value = parse_row?(row, opts.merge(:parse_if => row_type))
+         next if value == false
+         if block_given?
+           yield value
+         else
+           data << value if value
+         end
+       end
+       block_given? ? nil : data
+     end
+
+     # Decides what to do with a given row. If the row's type matches the desired
+     # type, or if no type was specified, it will run the row through #map.
+     # If :raw was passed true, a flat, unmapped data array will be returned.
+     #
+     # @param [Array] row a row of parsed data from the filing
+     # @option opts [Array] :include list of field names that should be included
+     #   in the returned hash
+     def parse_row?(row, opts={})
+       # Always parse, unless :parse_if is given and does not match row
+       if opts[:parse_if].nil? || \
+           Fech.regexify(opts[:parse_if]).match(row.first.downcase)
+         opts[:raw] ? row : map(row, opts)
+       else
+         false
+       end
+     end
+
+     # Maps a raw row to a labeled hash following any rules given in the filing's
+     # Translator based on its version and row type.
+     # Finds the correct map for a given row, performs any matching Translations
+     # on the individual values, and returns either the entire dataset, or just
+     # those fields requested.
+     # @param [Array] row a row of parsed data from the filing
+     # @option opts [Array] :include list of field names that should be included
+     #   in the returned hash
+     def map(row, opts={})
+       data = Fech::Mapped.new(self, row.first)
+       row_map = map_for(row.first)
+
+       # If specific fields were asked for, return only those
+       row_map = row_map.select { |k,v| opts[:include].include?(k) } if opts[:include]
+
+       # Inserts the row into data, performing any specified preprocessing
+       # on individual cells along the way
+       row_map.each_with_index do |field, index|
+         value = row[index]
+         translator.get_translations(:row => row.first,
+             :version => filing_version, :action => :convert,
+             :field => field).each do |translation|
+           # User's Procs should be given each field's value as context
+           value = translation[:proc].call(value)
+         end
+         data[field] = value
+       end
+
+       # Performs any specified group preprocessing / combinations
+       combinations = translator.get_translations(:row => row.first,
+           :version => filing_version, :action => :combine)
+       row_hash = hash_zip(row_map, row) if combinations
+       combinations.each do |translation|
+         # User's Procs should be given the entire row as context
+         value = translation[:proc].call(row_hash)
+         field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
+         data[field] = value
+       end
+
+       data
+     end
+
+     # Returns the column names for given row type and the filing's version
+     # in the order they appear in row data.
+     # @param [String, Regexp] row_type representation of the row desired
+     def map_for(row_type)
+       mappings.for_row(row_type)
+     end
+
+     # Returns the column names for given row type and version in the order
+     # they appear in row data.
+     # @param [String, Regexp] row_type representation of the row desired
+     # @option opts [String, Regexp] :version representation of the version desired
+     def self.map_for(row_type, opts={})
+       Fech::Mappings.for_row(row_type, opts)
+     end
+
+     # @yield [t] returns a reference to the filing's Translator
+     # @yieldparam [Translator] the filing's Translator
+     def translate(&block)
+       if block_given?
+         yield translator
+       else
+         translator
+       end
+     end
+
+     # Whether this filing amends a previous filing or not.
+     def amendment?
+       !amends.nil?
+     end
+
+     # Returns the filing ID of the past filing this one amends,
+     # nil if this is a first-draft filing.
+     # :report_id in the HDR line references the amended filing
+     def amends
+       header[:report_id]
+     end
+
+
+     # Combines an array of keys and values into a Fech::Mapped object,
+     # a type of Hash.
+     # @param [Array] keys the desired keys for the new hash
+     # @param [Array] values the desired values for the new hash
+     # @return [Fech::Mapped, Hash]
+     def hash_zip(keys, values)
+       Fech::Mapped.new(self, values.first).merge(Hash[*keys.zip(values).flatten])
+     end
+
+     # The version of the FEC software used to generate this Filing
+     def filing_version
+       @filing_version ||= parse_filing_version
+     end
+
+     # Pulls out the version number from the header line.
+     # Must parse this line manually, since we don't know the version yet, and
+     # thus the delimiter type is still a mystery.
+     def parse_filing_version
+       first = File.open(file_path).first
+       if first.index("\034").nil?
+         FasterCSV.parse(first).flatten[2]
+       else
+         FasterCSV.parse(first, :col_sep => "\034").flatten[2]
+       end
+     end
+
+     # Gets or creates the Mappings instance for this filing_version
+     def mappings
+       @mapping ||= Fech::Mappings.new(filing_version)
+     end
+
+     # The location of the Filing on the file system
+     def file_path
+       File.join(download_dir, file_name)
+     end
+
+     def file_name
+       "#{filing_id}.fec"
+     end
+
+     def filing_url
+       "http://query.nictusa.com/dcdev/posted/#{filing_id}.fec"
+     end
+
+     # Iterates over and yields the Filing's lines
+     # @option opts [Boolean] :with_index yield both the item and its index
+     # @yield [Array] a row of the filing, split by the delimiter from #delimiter
+     def each_row(opts={}, &block)
+       unless File.exists?(file_path)
+         raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
+       end
+       c = 0
+       FasterCSV.foreach(file_path, :col_sep => delimiter, :skip_blanks => true) do |row|
+         if opts[:with_index]
+           yield [row, c]
+           c += 1
+         else
+           yield row
+         end
+       end
+     end
+
+     # Wrapper around .each_row to include indexes
+     def each_row_with_index(&block)
+       each_row(:with_index => true, &block)
+     end
+
+     # @return [String] the delimiter used in the filing's version
+     def delimiter
+       filing_version.to_f < 6 ? "," : "\034"
+     end
+
+   end
+ end
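
Taken together, Filing supports a download-then-iterate workflow: fetch the .fec file, read the HDR and summary rows, then stream mapped rows by type. A minimal sketch based only on the API above (the filing ID is the one used in spec/data; the :contribution_amount field name is illustrative, since actual labels depend on the version's Schedule A map):

    require 'fech'

    filing = Fech::Filing.new(723604)
    filing.download                     # writes 723604.fec into Dir.tmpdir by default

    filing.filing_version               # parsed from the HDR row, e.g. "7.0"
    filing.amendment?                   # true if the HDR row's :report_id names an earlier filing

    # Stream Schedule A rows as labeled hashes; pass :raw => true for plain arrays
    filing.rows_like(:sa) do |row|
      puts row[:contribution_amount]    # illustrative field name
    end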
@@ -0,0 +1,187 @@
+ module Fech
+
+   # Helper class to generate mapping hashes from source csv data.
+   # Needed to rebuild rendered_maps.rb with new source data, not used
+   # in main gem.
+   # rake fech:maps
+   class MapGenerator
+
+     attr_accessor :map
+     FILING_VERSIONS = ["7.0", "6.4", "6.3", "6.2", "6.1",
+                        "5.3", "5.2", "5.1", "5.0", "3"]
+     BASE_ROW_TYPES = ["HDR", "F3P", "F3P31", "F3PS", "F3S", "SchA", "SchB",
+                       "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "TEXT"]
+     ROW_TYPE_MATCHERS = {
+       "HDR" => FechUtils::ROW_TYPES[:hdr],
+       "F3P" => FechUtils::ROW_TYPES[:f3p],
+       "F3S" => FechUtils::ROW_TYPES[:f3s],
+       "F3P31" => FechUtils::ROW_TYPES[:f3p31],
+       "F3PS" => FechUtils::ROW_TYPES[:f3ps],
+       "SchA" => FechUtils::ROW_TYPES[:sa],
+       "SchB" => FechUtils::ROW_TYPES[:sb],
+       "SchC" => FechUtils::ROW_TYPES[:sc],
+       "SchC1" => FechUtils::ROW_TYPES[:sc1],
+       "SchC2" => FechUtils::ROW_TYPES[:sc2],
+       "SchD" => FechUtils::ROW_TYPES[:sd],
+       "SchE" => FechUtils::ROW_TYPES[:se],
+       "SchF" => FechUtils::ROW_TYPES[:sf],
+       "TEXT" => FechUtils::ROW_TYPES[:text],
+     }
+
+     # Goes through all version header summary files and generates
+     # row map files for each type of row inside them.
+     def self.convert_header_file_to_row_files(source_dir)
+       data = {}
+
+       ignored_fields = File.open(ignored_fields_file(source_dir)).readlines.map { |l| l.strip }
+
+       # Create a hash of data with an entry for each row type found in the source
+       # version summary files. Each row has an entry for each version map that
+       # exists for it. If maps for two different versions are identical, they
+       # are combined.
+       FILING_VERSIONS.each do |version|
+         FasterCSV.foreach(version_summary_file(source_dir, version)) do |row|
+           # Each row of a version summary file contains the ordered list of
+           # column names.
+           data[row.first] ||= {}
+           row_version_data = remove_ignored_fields(row, ignored_fields)
+
+           # Check the maps for this row type in already-processed versions.
+           # If this map is identical to a previous map, tack this version on
+           # to it instead of creating a new one.
+           data[row.first][version] = row_version_data
+           data[row.first].each do |k, v|
+             # skip the row we just added
+
+             next if k == version
+             if v == row_version_data
+               # Create the new hybrid entry
+               data[row.first]["#{k}|#{version}"] = row_version_data
+
+               # Delete the old entry, and the one for this version only
+               data[row.first].delete(k)
+               data[row.first].delete(version)
+             end
+           end
+         end
+       end
+
+       # Go through each row type and create a base map management file that
+       # will serve as a template for organizing which fields are the same
+       # between versions. This file will need to then be arranged by hand to
+       # clean up the data. Each row will represent a column across versions,
+       # each column a unique map for that row for one or more versions.
+       data.each do |row_type, row_data|
+         file_path = write_row_map_file(source_dir, row_type)
+         next unless File.exists?(file_path)
+         File.open(file_path, 'w') do |f|
+           f.write('canonical')
+
+           to_transpose = []
+           row_data.sort.reverse.each do |version, version_data|
+             to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
+             to_transpose << [nil, version_data].flatten
+           end
+
+           # standardize row size
+           max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
+           to_transpose.each { |r| r[max_size - 1] ||= nil }
+           transposed = to_transpose.transpose
+
+           transposed.each do |transposed_data|
+             transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
+             canonical = transposed_data[1] # first description
+             if canonical
+               canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
+               transposed_data = [canonical, transposed_data].flatten
+             end
+             f.write(transposed_data.join(','))
+             f.write("\n")
+           end
+         end
+       end
+
+     end
+
+     # Generates the mapping for each row type in BASE_ROW_TYPES, writes them out
+     # to file for inclusion in the gem.
+     def self.dump_row_maps_to_ruby(source_dir, file_path)
+       File.open(file_path, 'w') do |f|
+         f.write("# Generated automatically by Fech::MapGenerator.\n\n")
+         f.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
+         f.write("# contain an entry for each distinct map between a row's labels and the\n")
+         f.write("# indexes where their values can be found.\n")
+         f.write("module Fech\n")
+         f.write(" RENDERED_MAPS = {\n")
+         BASE_ROW_TYPES.each do |row_type|
+           f.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")
+           generate_row_map_from_file(source_dir, row_type).each do |k, v|
+             f.write(" \'#{k}' => [#{v.map {|x| x.to_s.gsub(/^\d+_?/, "") }.collect {|x| (x.nil? || x == "") ? "nil" : ":#{x}" }.join(', ') }],\n")
+           end
+           f.write(" },\n")
+         end
+         f.write(" }\n")
+         f.write("end")
+       end
+     end
+
+     # For a given row type, parses its source file and returns
+     # a mapping object for it.
+     def self.generate_row_map_from_file(source_dir, row_type)
+       versions = []
+       version_indexes = []
+       data = {}
+       text = open(row_map_file(source_dir, row_type)).read
+       split_char = text.index(/\r/) ? /\r/ : /\n/
+       rows = text.split(split_char).collect {|x| x.split(',')}
+       rows.each do |row|
+         row = row.collect {|x| x.gsub("\n", "")}
+         if row.first.nil?
+           require 'ruby-debug'; debugger
+         end
+         if row.first.downcase == "canonical"
+           versions = row[1..-1].uniq.collect {|x| x unless (x.nil? || x.empty?)}.compact
+           row.each_with_index {|x, ind| version_indexes << ind unless (x.nil? || x.empty?)}.slice!(1)
+           version_indexes.slice!(0, 1)
+           versions.each {|x| data[x] = [] }
+
+         elsif row.first.size > 0
+           canonical = row.first
+
+           versions.zip(version_indexes).each do |version, row_index|
+             index = row[row_index]
+             data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
+           end
+         end
+       end
+
+       row_map = {}
+       data.each {|key, value| row_map[key] = value}
+       row_map
+     end
+
+     # Remove both the row type from the beginning of the row,
+     # and any fields marked as "ignore" in sources/headers/ignore.csv
+     def self.remove_ignored_fields(row, ignore)
+       data = row[1..-1].compact # strip off the row type
+       data.reject { |f| ignore.include?(f) }
+     end
+
+     def self.row_map_file(source_dir, row_type)
+       File.join(source_dir, row_type + '.csv')
+     end
+
+     def self.ignored_fields_file(source_dir)
+       File.join(source_dir, 'headers', 'ignore.csv')
+     end
+
+     def self.version_summary_file(source_dir, version)
+       File.join(source_dir, 'headers', version + '.csv')
+     end
+
+     def self.write_row_map_file(source_dir, row_type)
+       File.join(source_dir, 'rows', row_type + '.csv')
+     end
+
+   end
+ end
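
For context, the regeneration workflow the class comment refers to (rake fech:maps) appears to come down to the two class methods above. A hedged sketch of calling them directly; the source directory mirrors the sources/ layout in the file list, and the rendered_maps.rb output path is an assumption taken from the class comment:

    require 'fech'

    source_dir = 'sources'   # contains headers/<version>.csv summaries and per-row CSVs

    # Step 1: split the per-version header summaries into per-row-type CSVs
    # (written under sources/rows/), which are then arranged by hand.
    Fech::MapGenerator.convert_header_file_to_row_files(source_dir)

    # Step 2: render the hand-arranged row maps into Ruby for inclusion in the gem.
    Fech::MapGenerator.dump_row_maps_to_ruby(source_dir, 'lib/fech/rendered_maps.rb')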