fech 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. data/.gitignore +7 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +4 -0
  4. data/Gemfile.lock +49 -0
  5. data/LICENSE +13 -0
  6. data/README.rdoc +178 -0
  7. data/Rakefile +3 -0
  8. data/autotest/discover.rb +1 -0
  9. data/fech.gemspec +32 -0
  10. data/lib/fech.rb +13 -0
  11. data/lib/fech/default_translations.rb +135 -0
  12. data/lib/fech/fech_utils.rb +41 -0
  13. data/lib/fech/filing.rb +248 -0
  14. data/lib/fech/map_generator.rb +187 -0
  15. data/lib/fech/mapped.rb +38 -0
  16. data/lib/fech/mappings.rb +66 -0
  17. data/lib/fech/translator.rb +138 -0
  18. data/lib/fech/version.rb +3 -0
  19. data/sources/F3P.csv +1 -0
  20. data/sources/F3P31.csv +1 -0
  21. data/sources/F3PS.csv +1 -0
  22. data/sources/F3S.csv +1 -0
  23. data/sources/HDR.csv +1 -0
  24. data/sources/SchA.csv +1 -0
  25. data/sources/SchB.csv +1 -0
  26. data/sources/SchC.csv +1 -0
  27. data/sources/SchC1.csv +1 -0
  28. data/sources/SchC2.csv +1 -0
  29. data/sources/SchD.csv +1 -0
  30. data/sources/SchE.csv +1 -0
  31. data/sources/SchF.csv +1 -0
  32. data/sources/TEXT.csv +1 -0
  33. data/sources/headers/3.csv +1 -0
  34. data/sources/headers/5.0.csv +1 -0
  35. data/sources/headers/5.1.csv +1 -0
  36. data/sources/headers/5.2.csv +1 -0
  37. data/sources/headers/5.3.csv +1 -0
  38. data/sources/headers/6.1.csv +1 -0
  39. data/sources/headers/6.2.csv +1 -0
  40. data/sources/headers/6.3.csv +1 -0
  41. data/sources/headers/6.4.csv +1 -0
  42. data/sources/headers/7.0.csv +1 -0
  43. data/sources/headers/ignore.csv +5 -0
  44. data/spec/data/723604.fec +4 -0
  45. data/spec/data/97405.fec +10 -0
  46. data/spec/default_translations_spec.rb +104 -0
  47. data/spec/fech_utils_spec.rb +29 -0
  48. data/spec/filing_spec.rb +251 -0
  49. data/spec/map_generator_spec.rb +49 -0
  50. data/spec/mapped_spec.rb +44 -0
  51. data/spec/mappings_spec.rb +46 -0
  52. data/spec/sources/F3P.csv +1 -0
  53. data/spec/sources/SchA.csv +1 -0
  54. data/spec/sources/SchB.csv +1 -0
  55. data/spec/sources/SchC.csv +1 -0
  56. data/spec/sources/headers/3.csv +1 -0
  57. data/spec/sources/headers/5.0.csv +1 -0
  58. data/spec/sources/headers/5.1.csv +1 -0
  59. data/spec/sources/headers/5.2.csv +1 -0
  60. data/spec/sources/headers/5.3.csv +1 -0
  61. data/spec/sources/headers/6.1.csv +1 -0
  62. data/spec/sources/headers/6.2.csv +1 -0
  63. data/spec/sources/headers/6.3.csv +1 -0
  64. data/spec/sources/headers/6.4.csv +1 -0
  65. data/spec/sources/headers/7.0.csv +1 -0
  66. data/spec/sources/headers/ignore.csv +5 -0
  67. data/spec/sources/sa.csv +1 -0
  68. data/spec/spec_helper.rb +9 -0
  69. data/spec/translator_spec.rb +195 -0
  70. data/tasks/fech.rake +41 -0
  71. metadata +280 -0
@@ -0,0 +1,41 @@
1
# Contains helper functions and static variables used by various
# Fech classes.
module FechUtils

  # All supported row types, each pointing to a regular expression that will
  # correctly match that row type in the wild.
  #
  # NOTE(review): inside the :f3p character class the "|" is a literal pipe,
  # not an alternation. The patterns are intentionally left untouched because
  # their +source+ strings are used elsewhere as keys of the generated maps.
  ROW_TYPES = {
    :hdr   => /^hdr$/i,
    :f3p   => /(^f3p$)|(^f3p[^s|3])/i,
    :f3s   => /^f3s/i,
    :f3p31 => /^f3p31/i,
    :f3ps  => /^f3ps/i,
    :sa    => /^sa/i,
    :sb    => /^sb/i,
    :sc    => /^sc[^1-2]/i,
    :sc1   => /^sc1/i,
    :sc2   => /^sc2/i,
    :sd    => /^sd/i,
    :se    => /^se/i,
    :sf    => /^sf/i,
    :text  => /^text/i,
  }

  # Converts symbols and strings to Regexp objects for use in regex-keyed maps.
  #
  # * A Regexp is returned as a case-insensitive copy; any other options the
  #   caller set on it (multiline, extended) are preserved.
  # * A Symbol naming a known row type returns that row type's matcher from
  #   ROW_TYPES; any other Symbol is anchored at both ends.
  # * Anything else is converted with +to_s+, escaped, and matched unanchored.
  #
  # All returned expressions are case-insensitive.
  #
  # @param [String,Symbol,Regexp] label the object to convert to a Regexp
  # @return [Regexp]
  def regexify(label)
    case label
    when Regexp
      # Keep the caller's flags; previously everything except IGNORECASE
      # was silently dropped.
      Regexp.new(label.source, label.options | Regexp::IGNORECASE)
    when Symbol
      # Every ROW_TYPES value is a Regexp (truthy), so a miss falls through
      # to the anchored form.
      ROW_TYPES[label] || Regexp.new("^#{label}$", Regexp::IGNORECASE)
    else
      Regexp.new(Regexp.escape(label.to_s), Regexp::IGNORECASE)
    end
  end

end
@@ -0,0 +1,248 @@
1
+ require 'tmpdir'
2
+ require 'open-uri'
3
+ require 'fastercsv'
4
+
5
module Fech

  # Fech::Filing downloads an Electronic Filing given its ID, and will search
  # rows by row type. Using a child Translator object, the data in each row
  # is automatically mapped at runtime into a labeled Hash. Additional
  # Translations may be added to change the way that data is mapped and cleaned.
  class Filing
    attr_accessor :filing_id, :download_dir, :translator

    # Create a new Filing object, assign the download directory to system's
    # temp folder by default.
    # @param [String,Integer] filing_id the ID of the filing; used to build
    #   both the local file name and the download URL
    # @option opts [String] :download_dir override the directory where files
    #   should be downloaded
    # @option opts [Symbol,Array] :translate a list of built-in translation
    #   sets to use; handed to the Translator as :include
    def initialize(filing_id, opts={})
      @filing_id    = filing_id
      @download_dir = opts[:download_dir] || Dir.tmpdir
      @translator   = Fech::Translator.new(:include => opts[:translate])
    end

    # Saves the filing data from the FEC website into the download directory.
    # NOTE(review): relies on open-uri making Kernel#open accept URLs; newer
    # Rubies expect URI.open instead - confirm the supported Ruby versions.
    # @return [Fech::Filing] self, so calls can be chained
    def download
      File.open(file_path, 'w') do |file|
        file << open(filing_url).read
      end
      self
    end

    # Access the header (first) line of the filing, containing information
    # about the filing's version and metadata about the software used to file it.
    # @param [Hash] opts accepted but currently not used
    # @return [Hash] a hash that assigns labels to the values of the filing's header row
    def header(opts={})
      each_row do |row|
        # Return from inside the block: only the first row is wanted.
        return parse_row?(row)
      end
    end

    # Access the summary (second) line of the filing, containing aggregate and
    # top-level information about the filing.
    # @return [Hash] a hash that assigns labels to the values of the filing's summary row
    def summary
      each_row_with_index do |row, index|
        next if index == 0
        return parse_row?(row)
      end
    end

    # Access all lines of the filing that match a given row type. Will return an
    # Array of all available lines if called directly, or will yield the mapped
    # rows one by one if a block is passed.
    #
    # @param [String, Regexp] row_type a partial or complete name of the type of row desired
    # @option opts [Boolean] :raw should the function return the data as an array
    #   that has not been mapped to column names
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @yield [Hash] each matched row's data, as either a mapped hash or raw array
    # @return [Array, nil] the complete set of mapped hashes for matched lines,
    #   or nil when a block was given
    def rows_like(row_type, opts={}, &block)
      data = []
      each_row do |row|
        value = parse_row?(row, opts.merge(:parse_if => row_type))
        next if value == false
        if block_given?
          yield value
        else
          data << value if value
        end
      end
      block_given? ? nil : data
    end

    # Decides what to do with a given row. If the row's type matches the desired
    # type, or if no type was specified, it will run the row through #map.
    # If :raw was passed true, a flat, unmapped data array will be returned.
    #
    # @param [Array] row the cells of one line of the filing; the first cell
    #   is the row type
    # @option opts [String, Symbol, Regexp] :parse_if only parse rows whose
    #   type matches this
    # @option opts [Boolean] :raw return the row untouched instead of mapping it
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @return [Fech::Mapped, Array, false] false when the row type does not match
    def parse_row?(row, opts={})
      # Always parse, unless :parse_if is given and does not match row
      if opts[:parse_if].nil? || \
          Fech.regexify(opts[:parse_if]).match(row.first.downcase)
        opts[:raw] ? row : map(row, opts)
      else
        false
      end
    end

    # Maps a raw row to a labeled hash following any rules given in the filing's
    # Translator based on its version and row type.
    # Finds the correct map for a given row, performs any matching Translations
    # on the individual values, and returns either the entire dataset, or just
    # those fields requested.
    # @param [Array] row the cells of one line of the filing
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @return [Fech::Mapped]
    def map(row, opts={})
      data     = Fech::Mapped.new(self, row.first)
      full_map = map_for(row.first)
      wanted   = opts[:include]

      # Inserts the row into data, performing any specified preprocessing
      # on individual cells along the way. The complete map is walked so that
      # +index+ always refers to the field's real column in +row+; fields that
      # were not asked for are skipped rather than filtered out beforehand,
      # which would shift every later field onto the wrong column.
      full_map.each_with_index do |field, index|
        next if wanted && !wanted.include?(field)
        value = row[index]
        translator.get_translations(:row => row.first,
            :version => filing_version, :action => :convert,
            :field => field).each do |translation|
          # User's Procs should be given each field's value as context
          value = translation[:proc].call(value)
        end
        data[field] = value
      end

      # Performs any specified group preprocessing / combinations
      combinations = translator.get_translations(:row => row.first,
          :version => filing_version, :action => :combine)
      # User's Procs should be given the entire row as context, so the
      # unfiltered map is zipped with the row.
      row_hash = hash_zip(full_map, row)
      combinations.each do |translation|
        value = translation[:proc].call(row_hash)
        # The translation's field is a Regexp; strip its anchors to recover
        # the plain field name.
        field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
        data[field] = value
      end

      data
    end

    # Returns the column names for given row type and the filing's version
    # in the order they appear in row data.
    # @param [String, Regexp] row_type representation of the row desired
    def map_for(row_type)
      mappings.for_row(row_type)
    end

    # Returns the column names for given row type and version in the order
    # they appear in row data.
    # @param [String, Regexp] row_type representation of the row desired
    # @option opts [String, Regexp] :version representation of the version desired
    def self.map_for(row_type, opts={})
      Fech::Mappings.for_row(row_type, opts)
    end

    # Gives access to the filing's Translator.
    # @yield [t] a reference to the filing's Translator
    # @yieldparam [Translator] t the filing's Translator
    # @return the block's result when a block is given, otherwise the Translator
    def translate(&block)
      if block_given?
        yield translator
      else
        translator
      end
    end

    # Whether this filing amends a previous filing or not.
    def amendment?
      !amends.nil?
    end

    # Returns the filing ID of the past filing this one amends,
    # nil if this is a first-draft filing.
    # :report_id in the HDR line references the amended filing
    def amends
      header[:report_id]
    end

    # Combines an array of keys and values into an Fech::Mapped object,
    # a type of Hash.
    # @param [Array] keys the desired keys for the new hash
    # @param [Array] values the desired values for the new hash
    # @return [Fech::Mapped, Hash]
    def hash_zip(keys, values)
      # Build from pairs instead of a flattened list so a value is never
      # split apart by the flatten.
      Fech::Mapped.new(self, values.first).merge(Hash[keys.zip(values)])
    end

    # The version of the FEC software used to generate this Filing
    def filing_version
      @filing_version ||= parse_filing_version
    end

    # Pulls out the version number from the header line.
    # Must parse this line manually, since we don't know the version yet, and
    # thus the delimiter type is still a mystery.
    def parse_filing_version
      # Block form closes the file handle once the first line has been read.
      first = File.open(file_path) { |file| file.gets }
      if first.index("\034").nil?
        FasterCSV.parse(first).flatten[2]
      else
        FasterCSV.parse(first, :col_sep => "\034").flatten[2]
      end
    end

    # Gets or creates the Mappings instance for this filing_version
    def mappings
      @mapping ||= Fech::Mappings.new(filing_version)
    end

    # The location of the Filing on the file system
    def file_path
      File.join(download_dir, file_name)
    end

    # The name of the Filing's local file: the filing ID plus ".fec"
    def file_name
      "#{filing_id}.fec"
    end

    # The URL the Filing is downloaded from
    def filing_url
      "http://query.nictusa.com/dcdev/posted/#{filing_id}.fec"
    end

    # Iterates over and yields the Filing's lines
    # @option opts [Boolean] :with_index yield both the item and its index
    # @yield [Array] a row of the filing, split by the delimiter from #delimiter
    # @raise [RuntimeError] if the filing has not been downloaded yet
    def each_row(opts={}, &block)
      # File.exist? is available on every Ruby version; the plural alias
      # used previously is deprecated.
      unless File.exist?(file_path)
        raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
      end
      c = 0
      FasterCSV.foreach(file_path, :col_sep => delimiter, :skip_blanks => true) do |row|
        if opts[:with_index]
          yield [row, c]
          c += 1
        else
          yield row
        end
      end
    end

    # Wrapper around .each_row to include indexes
    def each_row_with_index(&block)
      each_row(:with_index => true, &block)
    end

    # @return [String] the delimiter used in the filing's version: a comma
    #   before version 6, the "\034" character from then on
    def delimiter
      filing_version.to_f < 6 ? "," : "\034"
    end

  end
end
@@ -0,0 +1,187 @@
1
module Fech

  # Helper class to generate mapping hashes from source csv data.
  # Needed to rebuild rendered_maps.rb with new source data, not used
  # in main gem.
  #   rake fech:maps
  class MapGenerator

    attr_accessor :map
    FILING_VERSIONS   = ["7.0", "6.4", "6.3", "6.2", "6.1",
                         "5.3", "5.2", "5.1", "5.0", "3"]
    BASE_ROW_TYPES    = ["HDR", "F3P", "F3P31", "F3PS", "F3S", "SchA", "SchB",
                         "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "TEXT"]
    # Source file base name => regular expression matching that row type.
    ROW_TYPE_MATCHERS = {
      "HDR"   => FechUtils::ROW_TYPES[:hdr],
      "F3P"   => FechUtils::ROW_TYPES[:f3p],
      "F3S"   => FechUtils::ROW_TYPES[:f3s],
      "F3P31" => FechUtils::ROW_TYPES[:f3p31],
      "F3PS"  => FechUtils::ROW_TYPES[:f3ps],
      "SchA"  => FechUtils::ROW_TYPES[:sa],
      "SchB"  => FechUtils::ROW_TYPES[:sb],
      "SchC"  => FechUtils::ROW_TYPES[:sc],
      "SchC1" => FechUtils::ROW_TYPES[:sc1],
      "SchC2" => FechUtils::ROW_TYPES[:sc2],
      "SchD"  => FechUtils::ROW_TYPES[:sd],
      "SchE"  => FechUtils::ROW_TYPES[:se],
      "SchF"  => FechUtils::ROW_TYPES[:sf],
      "TEXT"  => FechUtils::ROW_TYPES[:text],
    }

    # Goes through all version header summary files and generates
    # row map files for each type of row inside them.
    # @param [String] source_dir directory holding the headers/ and rows/ folders
    def self.convert_header_file_to_row_files(source_dir)
      data = {}

      # File.readlines closes the file once it has been read.
      ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

      # Create a hash of data with an entry for each row type found in the source
      # version summary files. Each row has an entry for each version map that
      # exists for it. If maps for two different versions are identical, they
      # are combined.
      FILING_VERSIONS.each do |version|
        FasterCSV.foreach(version_summary_file(source_dir, version)) do |row|
          # Each row of a version summary file contains the ordered list of
          # column names.
          maps = (data[row.first] ||= {})
          row_version_data = remove_ignored_fields(row, ignored_fields)

          # Check the maps for this row type in already-processed versions.
          # If this map is identical to a previous map, tack this version on
          # to it instead of creating a new one. The match is located first
          # and the hash is changed afterwards: adding keys to a Hash while
          # iterating over it raises on Ruby 1.9 and later.
          twin = maps.keys.find { |k| k != version && maps[k] == row_version_data }
          if twin
            # Replace the old entry (and any entry for this version only)
            # with the new hybrid entry.
            maps.delete(twin)
            maps.delete(version)
            maps["#{twin}|#{version}"] = row_version_data
          else
            maps[version] = row_version_data
          end
        end
      end

      # Go through each row type and create a base map management file that
      # will serve as a template for organizing which fields are the same
      # between versions. This file will need to then be arranged by hand to
      # clean up the data. Each row will represent a column across versions,
      # each column a unique map for that row for one or more versions.
      data.each do |row_type, row_data|
        file_path = write_row_map_file(source_dir, row_type)
        # NOTE(review): only files that already exist are (re)written; confirm
        # this is intended rather than the opposite check.
        next unless File.exist?(file_path)
        File.open(file_path, 'w') do |f|
          # No separator is written here: the first transposed line below
          # starts with an empty canonical cell, which supplies the comma.
          f.write('canonical')

          to_transpose = []
          row_data.sort.reverse.each do |version, version_data|
            # One column of 1-based positions, one column of descriptions.
            to_transpose << ["^#{version}", (1..version_data.size).to_a].flatten
            to_transpose << [nil, version_data].flatten
          end

          # standardize row size
          max_size = to_transpose.map { |r| r.size }.max
          to_transpose.each { |r| r[max_size - 1] ||= nil }

          to_transpose.transpose.each do |transposed_data|
            cells = transposed_data.collect { |x| x.to_s.gsub(/\r/, ' ') }
            canonical = cells[1] # first description
            # Always a String after to_s above, so this branch is always taken.
            if canonical
              # NOTE(review): " -\." inside the class is a character range
              # (space through "."), so it covers more punctuation than the
              # characters listed.
              canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
              cells = [canonical, cells].flatten
            end
            f.write(cells.join(','))
            f.write("\n")
          end
        end
      end

    end

    # Generates the mapping for each row type in BASE_ROW_TYPES, writes them out
    # to file for inclusion in the gem.
    # @param [String] source_dir directory holding the per-row-type csv files
    # @param [String] file_path the Ruby file to write
    def self.dump_row_maps_to_ruby(source_dir, file_path)
      File.open(file_path, 'w') do |f|
        f.write("# Generated automatically by Fech::MapGenerator.\n\n")
        f.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
        f.write("# contain an entry for each distinct map between a row's labels and the\n")
        f.write("# indexes where their values can be found.\n")
        f.write("module Fech\n")
        f.write(" RENDERED_MAPS = {\n")
        BASE_ROW_TYPES.each do |row_type|
          f.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")
          generate_row_map_from_file(source_dir, row_type).each do |k, v|
            fields = v.map do |x|
              # Drop a leading numeric prefix; holes in the map become nil.
              name = x.to_s.gsub(/^\d+_?/, "")
              name == "" ? "nil" : ":#{name}"
            end
            f.write(" '#{k}' => [#{fields.join(', ')}],\n")
          end
          f.write(" },\n")
        end
        f.write(" }\n")
        f.write("end")
      end
    end

    # For a given row type, parses its source file and returns
    # a mapping object for it.
    # @param [String] source_dir directory holding the per-row-type csv files
    # @param [String] row_type base name of the csv file to read
    # @return [Hash] version label => Array of canonical field names (Symbols),
    #   placed at the position given in that version's column
    def self.generate_row_map_from_file(source_dir, row_type)
      versions = []
      version_indexes = []
      data = {}
      # File.read closes the file once it has been read.
      text = File.read(row_map_file(source_dir, row_type))
      # Split on carriage returns when the file contains any, else on newlines.
      split_char = text.index(/\r/) ? /\r/ : /\n/
      rows = text.split(split_char).collect { |x| x.split(',') }
      rows.each do |row|
        row = row.collect { |x| x.gsub("\n", "") }
        # A blank line splits into no cells at all; there is nothing to map.
        next if row.first.nil?

        if row.first.downcase == "canonical"
          # Header line: the non-empty cells after the first name the versions,
          # and their positions are the columns to read in every later line.
          versions = row[1..-1].uniq.reject { |x| x.nil? || x.empty? }
          row.each_with_index { |x, ind| version_indexes << ind unless (x.nil? || x.empty?) }
          version_indexes.slice!(0, 1) # drop the "canonical" cell itself
          versions.each { |x| data[x] = [] }

        elsif row.first.size > 0
          canonical = row.first

          versions.zip(version_indexes).each do |version, row_index|
            index = row[row_index]
            # Cells hold 1-based positions; empty or zero means "not present".
            data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
          end
        end
      end

      data
    end

    # Remove both the row type from the beginning of the row,
    # and any fields marked as "ignore" in sources/headers/ignore.csv
    # @param [Array] row the cells of one header summary line
    # @param [Array] ignore the field names to drop
    # @return [Array] the remaining non-nil field names, in order
    def self.remove_ignored_fields(row, ignore)
      data = row[1..-1].compact # strip off the row type
      data.reject { |f| ignore.include?(f) }
    end

    # Path of the hand-arranged map file for a row type.
    def self.row_map_file(source_dir, row_type)
      File.join(source_dir, row_type + '.csv')
    end

    # Path of the file listing field names to ignore, one per line.
    def self.ignored_fields_file(source_dir)
      File.join(source_dir, 'headers', 'ignore.csv')
    end

    # Path of the header summary file for a filing version.
    def self.version_summary_file(source_dir, version)
      File.join(source_dir, 'headers', version + '.csv')
    end

    # Path of the generated template file for a row type.
    def self.write_row_map_file(source_dir, row_type)
      File.join(source_dir, 'rows', row_type + '.csv')
    end

  end
end