myl-fech 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. data/.gitignore +7 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +4 -0
  4. data/Gemfile.lock +48 -0
  5. data/LICENSE +13 -0
  6. data/README.rdoc +82 -0
  7. data/Rakefile +3 -0
  8. data/autotest/discover.rb +1 -0
  9. data/fech.gemspec +40 -0
  10. data/lib/fech/comparison.rb +36 -0
  11. data/lib/fech/csv.rb +70 -0
  12. data/lib/fech/default_translations.rb +133 -0
  13. data/lib/fech/fech_utils.rb +76 -0
  14. data/lib/fech/filing.rb +341 -0
  15. data/lib/fech/map_generator.rb +233 -0
  16. data/lib/fech/mapped.rb +38 -0
  17. data/lib/fech/mappings.rb +67 -0
  18. data/lib/fech/rendered_maps.rb +238 -0
  19. data/lib/fech/translator.rb +138 -0
  20. data/lib/fech/version.rb +3 -0
  21. data/lib/fech.rb +15 -0
  22. data/sources/F1.csv +106 -0
  23. data/sources/F1M.csv +78 -0
  24. data/sources/F2.csv +43 -0
  25. data/sources/F24.csv +18 -0
  26. data/sources/F3.csv +1 -0
  27. data/sources/F3L.csv +27 -0
  28. data/sources/F3P.csv +208 -0
  29. data/sources/F3P31.csv +39 -0
  30. data/sources/F3PS.csv +94 -0
  31. data/sources/F3S.csv +36 -0
  32. data/sources/F3X.csv +125 -0
  33. data/sources/F4.csv +86 -0
  34. data/sources/F5.csv +39 -0
  35. data/sources/F56.csv +33 -0
  36. data/sources/F57.csv +44 -0
  37. data/sources/F6.csv +1 -0
  38. data/sources/F65.csv +1 -0
  39. data/sources/F7.csv +1 -0
  40. data/sources/F76.csv +1 -0
  41. data/sources/F9.csv +46 -0
  42. data/sources/F91.csv +17 -0
  43. data/sources/F92.csv +23 -0
  44. data/sources/F93.csv +27 -0
  45. data/sources/F94.csv +18 -0
  46. data/sources/F99.csv +1 -0
  47. data/sources/H1.csv +1 -0
  48. data/sources/H2.csv +1 -0
  49. data/sources/H3.csv +1 -0
  50. data/sources/H4.csv +1 -0
  51. data/sources/H5.csv +1 -0
  52. data/sources/H6.csv +1 -0
  53. data/sources/HDR.csv +10 -0
  54. data/sources/SchA.csv +50 -0
  55. data/sources/SchB.csv +50 -0
  56. data/sources/SchC.csv +41 -0
  57. data/sources/SchC1.csv +52 -0
  58. data/sources/SchC2.csv +19 -0
  59. data/sources/SchD.csv +34 -0
  60. data/sources/SchE.csv +57 -0
  61. data/sources/SchF.csv +55 -0
  62. data/sources/SchL.csv +1 -0
  63. data/sources/TEXT.csv +1 -0
  64. data/sources/headers/3.csv +1 -0
  65. data/sources/headers/5.0.csv +1 -0
  66. data/sources/headers/5.1.csv +1 -0
  67. data/sources/headers/5.2.csv +1 -0
  68. data/sources/headers/5.3.csv +1 -0
  69. data/sources/headers/6.1.csv +1 -0
  70. data/sources/headers/6.2.csv +1 -0
  71. data/sources/headers/6.3.csv +1 -0
  72. data/sources/headers/6.4.csv +1 -0
  73. data/sources/headers/7.0.csv +49 -0
  74. data/sources/headers/8.0.csv +49 -0
  75. data/sources/headers/ignore.csv +5 -0
  76. data/spec/comparison_spec.rb +30 -0
  77. data/spec/data/467627.fec +608 -0
  78. data/spec/data/723604.fec +4 -0
  79. data/spec/data/730635.fec +2 -0
  80. data/spec/data/747058.fec +4 -0
  81. data/spec/data/748730.fec +1196 -0
  82. data/spec/data/752356.fec +5 -0
  83. data/spec/data/753533.fec +7 -0
  84. data/spec/data/764901.fec +7 -0
  85. data/spec/data/765310.fec +2 -0
  86. data/spec/data/767339.fec +648 -0
  87. data/spec/data/82094.fec +144 -0
  88. data/spec/data/97405.fec +10 -0
  89. data/spec/default_translations_spec.rb +104 -0
  90. data/spec/fech_utils_spec.rb +29 -0
  91. data/spec/filing_spec.rb +314 -0
  92. data/spec/map_generator_spec.rb +49 -0
  93. data/spec/mapped_spec.rb +44 -0
  94. data/spec/mappings_spec.rb +46 -0
  95. data/spec/sources/F24.csv +18 -0
  96. data/spec/sources/F3P.csv +1 -0
  97. data/spec/sources/F3P31.csv +39 -0
  98. data/spec/sources/SchA.csv +1 -0
  99. data/spec/sources/SchB.csv +1 -0
  100. data/spec/sources/SchC.csv +1 -0
  101. data/spec/sources/headers/3.csv +1 -0
  102. data/spec/sources/headers/5.0.csv +1 -0
  103. data/spec/sources/headers/5.1.csv +1 -0
  104. data/spec/sources/headers/5.2.csv +1 -0
  105. data/spec/sources/headers/5.3.csv +1 -0
  106. data/spec/sources/headers/6.1.csv +1 -0
  107. data/spec/sources/headers/6.2.csv +1 -0
  108. data/spec/sources/headers/6.3.csv +1 -0
  109. data/spec/sources/headers/6.4.csv +1 -0
  110. data/spec/sources/headers/7.0.csv +1 -0
  111. data/spec/sources/headers/8.0.csv +49 -0
  112. data/spec/sources/headers/ignore.csv +5 -0
  113. data/spec/sources/sa.csv +1 -0
  114. data/spec/spec_helper.rb +9 -0
  115. data/spec/translator_spec.rb +195 -0
  116. data/tasks/fech.rake +41 -0
  117. metadata +342 -0
@@ -0,0 +1,341 @@
1
+ require 'tmpdir'
2
+ require 'open-uri'
3
+
4
+ module Fech
5
+
6
+ # Fech::Filing downloads an Electronic Filing given its ID, and will search
7
+ # rows by row type. Using a child Translator object, the data in each row
8
+ # is automatically mapped at runtime into a labeled Hash. Additional
9
+ # Translations may be added to change the way that data is mapped and cleaned.
10
+ class Filing
11
+ # first filing number using the version >=3.00 format
12
+ # note that there are plenty of <v3 filings after this, so readable? still needs to be checked
13
+ FIRST_V3_FILING = 11850
14
+
15
+ attr_accessor :filing_id, :download_dir, :translator
16
+
17
# Build a Filing for the given filing ID. Nothing is downloaded or read here;
# only configuration is stored.
# @param [Integer, String] filing_id the FEC filing number
# @option opts [String] :download_dir directory where filings are stored;
#   falls back to the system temp folder
# @option opts [Symbol,Array] :translate built-in translation sets handed to
#   the Translator
# @option opts [String] :quote_char quote character given to the CSV parser;
#   falls back to '"'
# @option opts [Object] :csv_parser parser used to read rows; falls back to
#   Fech::Csv
def initialize(filing_id, opts={})
  # State flags used by the F99 handling.
  @resaved      = false
  @customized   = false

  @filing_id    = filing_id
  @csv_parser   = opts[:csv_parser] || Fech::Csv
  @quote_char   = opts[:quote_char] || '"'
  @download_dir = opts[:download_dir] || Dir.tmpdir
  @translator   = Fech::Translator.new(:include => opts[:translate])
end
31
+
32
# Saves the filing data from #filing_url into #file_path (inside the
# download directory).
#
# The remote file is fetched through open-uri's URI#read rather than
# Kernel#open: Kernel#open no longer accepts URLs on Ruby 3.0+, while
# URI#read is provided by open-uri on old and new Rubies alike.
# @return [Filing] self, so calls can be chained
def download
  File.open(file_path, 'w') do |file|
    file << URI.parse(filing_url).read
  end
  self
end
40
+
41
# Downloads ALL the filings as zip archives over ftp and extracts them into
# download_dir.
#
# Because the zip files are deleted after extraction (to save space), it is
# safe to rerun, but a rerun has to do the whole thing over again. Update
# operations should instead iterate single file downloads starting from the
# current+1th filing number.
#
# This takes a very long time to run - on the order of an hour or two,
# depending on your bandwidth.
#
# WARNING: As of July 9, 2012, this downloads 536964 files (25.8 GB) into one
# directory. That breaks bash file globbing (e.g. ls and rm *.fec will not
# work), so download only to a dedicated FEC filings directory.
#
# The directory is entered with Dir.chdir instead of being interpolated into
# the shell command, so paths containing spaces or shell metacharacters are
# handled correctly.
# @param [String] download_dir directory to download and extract into
# @return [Integer] the number of .fec files present in download_dir afterwards
# @raise [SystemCallError] if download_dir cannot be entered
def self.download_all(download_dir)
  Dir.chdir(download_dir) do
    `ftp -a ftp.fec.gov:/FEC/electronic/*.zip`
    # Quote $z so archive names with unusual characters survive word splitting.
    `for z in *.zip; do unzip -o "$z" && rm "$z"; done`
  end
  Dir[File.join(download_dir, '*.fec')].count
end
56
+
57
# Runs the passed block on every downloaded .fec file. Pass the same options
# hash as you would to Fech::Filing.new, e.g.
#   for_all(:download_dir => Rails.root.join('db', 'data', 'fec', 'filings'),
#           :csv_parser => Fech::CsvDoctor) { |filing| ... }
# Calling filing.download is of course unnecessary.
#
# Only files named "<digits>.fec" are yielded. Anything else in the directory
# (for example the "fech_<id>.fec" copies written for F99 filings) is skipped
# rather than being yielded as a duplicate filing. The caller's options hash
# is not modified.
#
# Note that if there are a lot of files (e.g. after download_all), just
# listing them to prepare for this will take several seconds. The listing is
# not sorted here.
# @param [Hash] options options forwarded to Fech::Filing.new
# @yield [Filing] one Filing per matching file
def self.for_all(options = {})
  opts = options.merge(:download_dir => options[:download_dir] || Dir.tmpdir)
  Dir[File.join(opts[:download_dir], '*.fec')].each do |file|
    id = File.basename(file)[/\A(\d+)\.fec\z/, 1]
    next unless id
    yield Fech::Filing.new(id.to_i, opts)
  end
end
69
+
70
# Access the header (first) line of the filing, containing information
# about the filing's version and metadata about the software used to file it.
# Iteration stops as soon as the first row has been parsed.
# @param [Hash] opts accepted but not used by this method
# @return [Hash] a hash that assigns labels to the values of the filing's header row
def header(opts={})
  each_row { |first_row| return parse_row?(first_row) }
end
78
+
79
# Access the summary (second) line of the filing, containing aggregate and
# top-level information about the filing. The row at index 0 is skipped and
# iteration stops after the next row has been parsed.
# @return [Hash] a hash that assigns labels to the values of the filing's summary row
def summary
  each_row_with_index do |row, index|
    return parse_row?(row) unless index == 0
  end
end
88
+
89
# Access all lines of the filing that match a given row type. Returns an
# Array of the matched rows when called without a block; when a block is
# passed, each matched row is yielded instead and nil is returned.
#
# @param [String, Regexp] row_type a partial or complete name of the type of row desired
# @option opts [Boolean] :raw should the function return the data as an array
#   that has not been mapped to column names
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @yield [Hash] each matched row's data, as either a mapped hash or raw array
# @return [Array, nil] the matched rows, or nil when a block was given
def rows_like(row_type, opts={}, &block)
  collected = []
  each_row do |row|
    parsed = parse_row?(row, opts.merge(:parse_if => row_type))
    # parse_row? signals "row type did not match" with a literal false.
    next if parsed == false
    if !block_given?
      collected << parsed if parsed
    else
      yield parsed
    end
  end
  return nil if block_given?
  collected
end
113
+
114
# Decides what to do with a given row. The row is parsed when no :parse_if
# was given, or when the regexified :parse_if matches the downcased first
# cell of the row; otherwise false is returned.
#
# @param [Array] row the raw row, whose first element is the row type
# @option opts [String, Regexp] :parse_if row type the row must match to be parsed
# @option opts [Boolean] :raw return the row itself instead of running it through #map
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @return [Object, false] the mapped row, the raw row, or false when the row type does not match
def parse_row?(row, opts={})
  wanted = opts[:parse_if]
  matches = wanted.nil? || Fech.regexify(wanted).match(row.first.downcase)
  return false unless matches

  if opts[:raw]
    row
  else
    map(row, opts)
  end
end
130
+
131
# Maps a raw row to a labeled Fech::Mapped following any rules given in the
# filing's Translator based on its version and row type.
# Finds the correct map for the row, runs any matching :convert translations
# on the individual values, then runs any :combine translations, and returns
# either the entire dataset or just the fields requested.
#
# Combine procs always receive the complete row keyed by the complete field
# map, even when :include narrows the returned fields; zipping the narrowed
# field list against the full row would pair labels with the wrong values.
#
# @param [Array] row the raw row, whose first element is the row type
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @return [Fech::Mapped] the labeled (and translated) row
def map(row, opts={})
  data = Fech::Mapped.new(self, row.first)
  full_row_map = map_for(row.first)

  # If specific fields were asked for, return only those
  row_map =
    if opts[:include]
      full_row_map.select { |k| opts[:include].include?(k) }
    else
      full_row_map
    end

  # Column position of each label (first occurrence wins), computed once
  # instead of scanning the full map for every cell.
  positions = {}
  full_row_map.each_with_index do |field, index|
    positions[field] = index unless positions.key?(field)
  end

  # Inserts the row into data, performing any specified preprocessing
  # on individual cells along the way
  row_map.each do |field|
    value = row[positions[field]]
    translator.get_translations(:row => row.first,
        :version => filing_version, :action => :convert,
        :field => field).each do |translation|
      # User's Procs should be given each field's value as context
      value = translation[:proc].call(value)
    end
    data[field] = value
  end

  # Performs any specified group preprocessing / combinations
  combinations = translator.get_translations(:row => row.first,
      :version => filing_version, :action => :combine)
  row_hash = hash_zip(full_row_map, row)
  combinations.each do |translation|
    # User's Procs should be given the entire row as context
    value = translation[:proc].call(row_hash)
    field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
    data[field] = value
  end

  data
end
176
+
177
# Looks up the column names for the given row type by delegating to this
# filing's Mappings instance (see #mappings).
# @param [String, Regexp] row_type representation of the row desired
# @return [Object] whatever the Mappings instance returns for the row type
def map_for(row_type)
  filing_mappings = mappings
  filing_mappings.for_row(row_type)
end
183
+
184
# Class-level lookup of the column names for a row type; delegates to
# Fech::Mappings.for_row with the options unchanged.
# @param [String, Regexp] row_type representation of the row desired
# @option opts [String, Regexp] :version representation of the version desired
# @return [Object] whatever Fech::Mappings.for_row returns
def self.map_for(row_type, opts={})
  mappings_class = Fech::Mappings
  mappings_class.for_row(row_type, opts)
end
191
+
192
# Gives access to the filing's Translator. With a block, the translator is
# yielded and the block's value is returned; without one, the translator
# itself is returned.
# @yield [t] a reference to the filing's Translator
# @yieldparam [Translator] t the filing's Translator
def translate(&block)
  return translator unless block_given?
  yield translator
end
201
+
202
# Whether this filing amends a previous filing or not, i.e. whether #amends
# returns anything other than nil.
# @return [Boolean]
def amendment?
  amends.nil? ? false : true
end
206
+
207
# Returns the filing ID of the past filing this one amends,
# nil if this is a first-draft filing.
# The value is read from :report_id in the header (HDR) row, which
# references the amended filing.
def amends
  header_row = header
  header_row[:report_id]
end
213
+
214
# Combines an array of keys and values into an Fech::Mapped object,
# a type of Hash. The Mapped's row type is taken from the first value.
#
# The key/value pairs are handed to Hash[] as pairs rather than being
# flattened and splatted: flattening would also unpack any value that is
# itself an Array (producing wrong or odd-length argument lists), and
# splatting passes every cell of the row as a separate argument.
# @param [Array] keys the desired keys for the new hash
# @param [Array] values the desired values for the new hash
# @return [Fech::Mapped, Hash]
def hash_zip(keys, values)
  Fech::Mapped.new(self, values.first).merge(Hash[keys.zip(values)])
end
222
+
223
# The version of the FEC software used to generate this Filing. The value
# comes from #parse_filing_version and is memoized in @filing_version.
def filing_version
  return @filing_version if @filing_version
  @filing_version = parse_filing_version
end
227
+
228
# Pulls out the version number (third field) from the header line.
# Must parse this line manually, since we don't know the version yet, and
# thus the delimiter type is still a mystery: if the line contains the
# "\034" separator it is parsed with that, otherwise with the parser's
# default separator.
#
# The file is opened in block form so the handle is closed right away.
# @return [String, nil] the version field, or nil when the file is empty
def parse_filing_version
  first = File.open(file_path) { |file| file.gets }
  return nil if first.nil?

  if first.include?("\034")
    @csv_parser.parse(first, :col_sep => "\034").flatten[2]
  else
    @csv_parser.parse(first).flatten[2]
  end
end
239
+
240
# Only FEC format 3.00 + is supported: true when the integer part of
# #filing_version is at least 3.
# @return [Boolean]
def readable?
  major_version = filing_version.to_i
  3 <= major_version
end
244
+
245
# Gets or creates the Mappings instance for this filing_version. The
# instance is memoized in @mapping.
def mappings
  @mapping = Fech::Mappings.new(filing_version) unless @mapping
  @mapping
end
249
+
250
# The location of the Filing on the file system: #file_name inside
# #download_dir.
# @return [String]
def file_path
  directory = download_dir
  File.join(directory, file_name)
end
254
+
255
# The raw contents of the Filing, as a File opened read-only on #file_path.
# The handle is returned open; closing it is left to the caller.
# @return [File]
def file_contents
  File.new(file_path, 'r')
end
259
+
260
# Determine the form type of the filing before it's been parsed, by taking
# the first #delimiter-separated field of the second line. This is needed
# for the F99 special case.
#
# Lines are read with IO#gets (IO#lines no longer exists on Ruby 3.0+) and
# the handle obtained from #file_contents is always closed.
# @return [String, nil] the form type, or nil when the file has no second line
def form_type
  io = file_contents
  begin
    io.gets # skip the header line
    second_line = io.gets
    second_line && second_line.split(delimiter).first
  ensure
    io.close
  end
end
269
+
270
# The file path where custom versions of a filing are to be saved: the
# regular file name prefixed with "fech_", inside #download_dir.
# @return [String]
def custom_file_path
  custom_name = "fech_#{file_name}"
  File.join(download_dir, custom_name)
end
275
+
276
# Handle the contents of F99s by removing the [BEGINTEXT] and [ENDTEXT]
# delimiters (matched case-insensitively, since some filings use e.g.
# [EndText]) and putting the text content onto the same line as the
# summary, as one extra quoted field with embedded quotes doubled.
#
# The substitution uses the block form of gsub so that backslashes in the
# filing's text are copied literally instead of being read as
# backreferences, and so that each text section is replaced with its own
# content. Marks the filing as customized and closes the handle obtained
# from #file_contents.
# @return [String] the filing contents with the text sections inlined
def fix_f99_contents
  @customized = true

  io = file_contents
  content = begin
    io.read
  ensure
    io.close
  end

  regex = /\n\[BEGINTEXT\]\n(.*?)\[ENDTEXT\]\n/mi # some use eg [EndText]
  content.gsub(regex) do
    # Grab the capture before calling anything else that might run a regex.
    text = Regexp.last_match(1)
    "#{delimiter}\"#{text.gsub('"', '""')}\""
  end
end
292
+
293
# Resave the "fixed" version of an F99: writes the result of
# #fix_f99_contents to #custom_file_path. The work is done once per Filing
# object; later calls return immediately.
# @return [true]
def resave_f99_contents
  unless @resaved
    File.open(custom_file_path, 'w') { |out| out.write(fix_f99_contents) }
    @resaved = true
  end
  true
end
299
+
300
# The filing's file name: the filing ID followed by ".fec".
# @return [String]
def file_name
  id = filing_id
  "#{id}.fec"
end
303
+
304
# The URL this filing is downloaded from, built from the filing ID.
# @return [String]
def filing_url
  id = filing_id
  "http://query.nictusa.com/dcdev/posted/#{id}.fec"
end
307
+
308
# Iterates over and yields the Filing's rows, as produced by the configured
# CSV parser using the delimiter from #delimiter, the configured quote
# character and :skip_blanks.
#
# F99 filings are first rewritten via #resave_f99_contents; once a
# customized copy exists, rows are read from #custom_file_path instead of
# #file_path.
#
# Uses File.exist? (File.exists? no longer exists on Ruby 3.2+).
# @option opts [Boolean] :with_index yield [row, index] pairs instead of bare rows
# @yield [Array] a row of the filing, split by the delimiter from #delimiter
# @raise [RuntimeError] if the filing has not been downloaded yet
def each_row(opts={}, &block)
  unless File.exist?(file_path)
    raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
  end

  # If this is an F99, we need to parse it differently.
  resave_f99_contents if form_type == 'F99'

  source = @customized ? custom_file_path : file_path
  index = 0
  @csv_parser.parse_row(source, :col_sep => delimiter, :quote_char => @quote_char, :skip_blanks => true) do |row|
    if opts[:with_index]
      yield [row, index]
      index += 1
    else
      yield row
    end
  end
end
329
+
330
# Wrapper around #each_row that asks for [row, index] pairs by passing
# :with_index => true; the block is forwarded unchanged.
def each_row_with_index(&block)
  each_row({ :with_index => true }, &block)
end
334
+
335
# Picks the field separator from the filing's version: a comma when the
# version (as a float) is below 6, the "\034" character otherwise.
# @return [String] the delimiter used in the filing's version
def delimiter
  if filing_version.to_f < 6
    ","
  else
    "\034"
  end
end
339
+
340
+ end
341
+ end
@@ -0,0 +1,233 @@
1
+ module Fech
2
+
3
+ # Helper class to generate mapping hashes from source csv data.
4
+ # Needed to rebuild rendered_maps.rb with new source data, not used
5
+ # in main gem.
6
+ # rake fech:maps
7
+ class MapGenerator
8
+
9
+ attr_accessor :map
10
+ FILING_VERSIONS = ["8.0", "7.0", "6.4", "6.3", "6.2", "6.1",
11
+ "5.3", "5.2", "5.1", "5.0", "3"]
12
+ BASE_ROW_TYPES = ["HDR", "F1", "F1M", "F2", "F24", "F3", "F3L", "F3P", "F3P31", "F3PS", "F3S", "F3X",
13
+ "F4", "F5", "F56", "F57", "F6", "F65", "F7", "F76", "F9", "F91", "F92", "F93", "F94", "F99",
14
+ "H1", "H2", "H3", "H4", "H5", "H6",
15
+ "SchA", "SchB", "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "SchL", "TEXT"]
16
+ ROW_TYPE_MATCHERS = {
17
+ "HDR" => FechUtils::ROW_TYPES[:hdr],
18
+ "F1" => FechUtils::ROW_TYPES[:f1],
19
+ "F1M" => FechUtils::ROW_TYPES[:f1m],
20
+ "F2" => FechUtils::ROW_TYPES[:f2],
21
+ "F24" => FechUtils::ROW_TYPES[:f24],
22
+ "F3" => FechUtils::ROW_TYPES[:f3],
23
+ "F3L" => FechUtils::ROW_TYPES[:f3l],
24
+ "F3P" => FechUtils::ROW_TYPES[:f3p],
25
+ "F3S" => FechUtils::ROW_TYPES[:f3s],
26
+ "F3P31" => FechUtils::ROW_TYPES[:f3p31],
27
+ "F3PS" => FechUtils::ROW_TYPES[:f3ps],
28
+ "F3X" => FechUtils::ROW_TYPES[:f3x],
29
+ "F4" => FechUtils::ROW_TYPES[:f4],
30
+ "F5" => FechUtils::ROW_TYPES[:f5],
31
+ "F56" => FechUtils::ROW_TYPES[:f56],
32
+ "F57" => FechUtils::ROW_TYPES[:f57],
33
+ "F6" => FechUtils::ROW_TYPES[:f6],
34
+ "F65" => FechUtils::ROW_TYPES[:f65],
35
+ "F7" => FechUtils::ROW_TYPES[:f7],
36
+ "F76" => FechUtils::ROW_TYPES[:f76],
37
+ "F9" => FechUtils::ROW_TYPES[:f9],
38
+ "F91" => FechUtils::ROW_TYPES[:f91],
39
+ "F92" => FechUtils::ROW_TYPES[:f92],
40
+ "F93" => FechUtils::ROW_TYPES[:f93],
41
+ "F94" => FechUtils::ROW_TYPES[:f94],
42
+ "F99" => FechUtils::ROW_TYPES[:f99],
43
+ "H1" => FechUtils::ROW_TYPES[:h1],
44
+ "H2" => FechUtils::ROW_TYPES[:h2],
45
+ "H3" => FechUtils::ROW_TYPES[:h3],
46
+ "H4" => FechUtils::ROW_TYPES[:h4],
47
+ "H5" => FechUtils::ROW_TYPES[:h5],
48
+ "H6" => FechUtils::ROW_TYPES[:h6],
49
+ "SchA" => FechUtils::ROW_TYPES[:sa],
50
+ "SchB" => FechUtils::ROW_TYPES[:sb],
51
+ "SchC" => FechUtils::ROW_TYPES[:sc],
52
+ "SchC1" => FechUtils::ROW_TYPES[:sc1],
53
+ "SchC2" => FechUtils::ROW_TYPES[:sc2],
54
+ "SchD" => FechUtils::ROW_TYPES[:sd],
55
+ "SchE" => FechUtils::ROW_TYPES[:se],
56
+ "SchF" => FechUtils::ROW_TYPES[:sf],
57
+ "SchL" => FechUtils::ROW_TYPES[:sl],
58
+ "TEXT" => FechUtils::ROW_TYPES[:text],
59
+ }
60
+
61
# Goes through all version header summary files and generates
# row map files for each type of row inside them.
#
# Each version summary file is first rewritten in place with unparseable
# characters removed, then read back through Fech::Csv. The rewritten file
# is written in block form so it is flushed and closed before it is read
# again, and files are read without leaving handles open. File.exist? is
# used because File.exists? no longer exists on Ruby 3.2+.
#
# Row map files are only (re)written when they already exist on disk.
# @param [String] source_dir directory containing the headers/ and rows/ folders
def self.convert_header_file_to_row_files(source_dir)
  data = {}
  hybrid_data = {}

  ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters
    raw = File.read(filepath)
    if RUBY_VERSION < "1.9.3"
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      valid_string = ic.iconv(raw << ' ')[0..-2]
    else
      valid_string = (raw << ' ')[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode!('UTF-8', 'UTF-16')
    end
    File.open(filepath, 'w') { |cleaned| cleaned.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # Each row of a version summary file contains the ordered list of
      # column names.
      data[row.first] ||= {}
      hybrid_data[row.first] ||= {}
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on to
      # to it instead of creating a new one.
      data[row.first][version] = row_version_data
      data[row.first].each do |k, v|
        # skip the row we just added
        next if k == version
        if v == row_version_data
          # Create the new hybrid entry
          hybrid_data[row.first]["#{k}|#{version}"] = row_version_data

          # Delete the old entry, and the one for this version only
          data[row.first].delete(k)
          data[row.first].delete(version)
        end
      end
      data[row.first].update(hybrid_data[row.first])
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    next unless File.exist?(file_path)
    File.open(file_path, 'w') do |f|
      f.write('canonical')

      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size
      max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
      to_transpose.each { |r| r[max_size - 1] ||= nil }
      transposed = to_transpose.transpose

      transposed.each do |transposed_data|
        transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
        canonical = transposed_data[1] # first description
        if canonical
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          transposed_data = [canonical, transposed_data].flatten
        end
        f.write(transposed_data.join(','))
        f.write("\n")
      end
    end
  end

end
151
+
152
# Generates the mapping for each row type in BASE_ROW_TYPES and writes them
# out, as Ruby source defining Fech::RENDERED_MAPS, to file_path for
# inclusion in the gem.
#
# For every row type the key is the source of its ROW_TYPE_MATCHERS regexp;
# its versions are written in reverse sorted order. Leading digits (plus an
# optional underscore) are stripped from each label, and blank labels are
# written as nil.
# @param [String] source_dir directory holding the row map csv files
# @param [String] file_path the Ruby file to (over)write
def self.dump_row_maps_to_ruby(source_dir, file_path)
  File.open(file_path, 'w') do |out|
    out.write("# Generated automatically by Fech::MapGenerator.\n\n")
    out.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
    out.write("# contain an entry for each distinct map between a row's labels and the\n")
    out.write("# indexes where their values can be found.\n")
    out.write("module Fech\n")
    out.write(" RENDERED_MAPS = {\n")

    BASE_ROW_TYPES.each do |row_type|
      out.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")

      version_maps = generate_row_map_from_file(source_dir, row_type)
      version_maps.sort_by(&:first).reverse.each do |version, fields|
        labels = fields.map do |name|
          stripped = name.to_s.gsub(/^\d+_?/, "")
          stripped == "" ? "nil" : ":#{stripped}"
        end
        out.write(" '#{version}' => [#{labels.join(', ')}],\n")
      end

      out.write(" },\n")
    end

    out.write(" }\n")
    out.write("end")
  end
end
173
+
174
# For a given row type, parses its source csv file and returns a Hash that
# maps each version label from the "canonical" header row to an Array of
# field names (Symbols), positioned by the 1-based index found in that
# version's column.
#
# Lines are split on "\r" when the file contains any, otherwise on "\n".
# Blank lines are skipped. The file is read with File.read so no handle is
# left open, and no debugger is loaded on malformed input.
# @param [String] source_dir directory holding the row map csv files
# @param [String] row_type the row type whose csv file to read
# @return [Hash{String => Array<Symbol, nil>}]
def self.generate_row_map_from_file(source_dir, row_type)
  versions = []
  version_indexes = []
  data = {}
  text = File.read(row_map_file(source_dir, row_type))
  split_char = text.index(/\r/) ? /\r/ : /\n/
  rows = text.split(split_char).collect { |line| line.split(',') }

  rows.each do |row|
    row = row.collect { |cell| cell.gsub("\n", "") }
    next if row.empty? # blank line

    if row.first.downcase == "canonical"
      # Header row: every non-blank cell after the first names a version,
      # and its column holds that version's field indexes.
      versions = row[1..-1].uniq.reject { |cell| cell.nil? || cell.empty? }
      row.each_with_index do |cell, ind|
        version_indexes << ind unless cell.nil? || cell.empty?
      end
      version_indexes.slice!(0, 1) # drop the "canonical" column itself
      versions.each { |version| data[version] = [] }

    elsif row.first.size > 0
      canonical = row.first

      versions.zip(version_indexes).each do |version, row_index|
        index = row[row_index]
        data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
      end
    end
  end

  data
end
208
+
209
# Remove the row type from the beginning of the row, any nil cells, and
# any fields marked as "ignore" in sources/headers/ignore.csv.
# @param [Array] row a row whose first element is the row type
# @param [Array] ignore the field names to drop
# @return [Array] the remaining field names, in their original order
def self.remove_ignored_fields(row, ignore)
  fields = row[1..-1] # strip off the row type
  fields.each_with_object([]) do |field, kept|
    kept << field unless field.nil? || ignore.include?(field)
  end
end
215
+
216
# Path of the row map csv for a row type: "<row_type>.csv" directly inside
# source_dir.
def self.row_map_file(source_dir, row_type)
  csv_name = row_type + '.csv'
  File.join(source_dir, csv_name)
end
219
+
220
# Path of the list of ignored fields: headers/ignore.csv inside source_dir.
def self.ignored_fields_file(source_dir)
  headers_dir = File.join(source_dir, 'headers')
  File.join(headers_dir, 'ignore.csv')
end
223
+
224
# Path of the header summary csv for a filing version:
# headers/<version>.csv inside source_dir.
def self.version_summary_file(source_dir, version)
  csv_name = version + '.csv'
  File.join(source_dir, 'headers', csv_name)
end
227
+
228
# Path of the generated row map template for a row type:
# rows/<row_type>.csv inside source_dir.
def self.write_row_map_file(source_dir, row_type)
  csv_name = row_type + '.csv'
  File.join(source_dir, 'rows', csv_name)
end
231
+
232
+ end
233
+ end
@@ -0,0 +1,38 @@
1
module Fech

  # Fech::Mapped is a thin wrapper around Hash which allows values to be
  # referenced either by key or by an alias specified in the associated
  # Filing's Translations. Values can also be read with method-call syntax
  # (mapped.some_field), and respond_to? reports those field and alias names.
  class Mapped < Hash

    attr_accessor :filing, :row_type
    alias :old_bracket :[]

    # @param [Object] filing the owning filing; its translator's aliases are
    #   consulted when a key is missing
    # @param [String] row_type the row type, matched against each alias's :row
    def initialize(filing, row_type)
      @filing = filing
      @row_type = row_type
    end

    # Just calls Hash's [] method, unless the specified key doesn't
    # exist, in which case it checks for any aliases on the filing's
    # translator.
    # @return [Object, nil] the stored value, the aliased value, or nil
    def [](key, &block)
      return old_bracket(key, &block) if has_key?(key)

      # Read the aliased key directly; the alias target is not itself
      # resolved through aliases again.
      found = alias_for(key)
      found ? old_bracket(found[:for], &block) : nil
    end

    # Any unknown method is treated as a key lookup; arguments are ignored.
    def method_missing(method, *args, &block)
      self[method]
    end

    # Keeps respond_to? in step with method_missing for names that resolve
    # to a stored key or to an alias.
    def respond_to_missing?(method, include_private = false)
      return true if has_key?(method)
      return true if filing && !alias_for(method).nil?
      super
    end

    private

    # Look up aliases in reverse, to find the most recent one that matches
    # the key and this row type. Does not allow (obvious) recursion, i.e. an
    # alias that points at itself.
    def alias_for(key)
      filing.translator.aliases.reverse.detect do |a|
        a[:alias] == key && a[:row].match(row_type) && a[:alias] != a[:for]
      end
    end

  end
end
@@ -0,0 +1,67 @@
1
module Fech
  # Raised when no mapping matches a requested row type or version.
  class VersionError < RuntimeError; end

  # Fech::Mappings loads a set of master mappings between labels and where
  # their values can be found in Electronic Filings for various row types
  # and versions.
  # To access a map, call Mappings.for_row with the row_type,
  # and optionally the version:
  #   Mappings.for_row("SA", :version => 6.1)
  class Mappings

    attr_accessor :map, :version

    # @param [String] ver the filing version whose maps should be used
    def initialize(ver = Fech::DEFAULT_VERSION)
      @version = ver
      @map = load_map
      @cache = {}
    end

    # Returns the mappings for rows of the given row_type in this
    # instance's version. Results are cached per row_type.
    #
    # @param [String,Symbol] row_type the row type whose map to find
    def for_row(row_type)
      @cache[row_type] ||= self.class.for_row(row_type, :version => @version)
    end

    # Returns the basic, default mappings hash (see Mappings.load_map).
    def load_map
      self.class.load_map
    end

    # The master mappings hash, Fech::RENDERED_MAPS.
    def self.load_map
      Fech::RENDERED_MAPS
    end

    # Given a row type, first find the entire block of maps for that row type.
    # Then, use the requested version (Fech::DEFAULT_VERSION when none is
    # given) to choose which specific map set to use, and return it.
    # The opts hash passed in is left untouched.
    #
    # @param [Symbol,String,Regexp] row_type the row whose map to find
    # @option opts [String, Regexp] :version the version whose map to find
    # @raise [VersionError] if the row type or version has no mapping
    def self.for_row(row_type, opts={})
      version = opts[:version] || Fech::DEFAULT_VERSION
      map = key_by_regex(load_map, row_type)
      key_by_regex(map, version)
    end

    # Given a Hash whose keys are string representations of regular expressions,
    # return the value whose key best matches the given label. Keys are
    # matched case-insensitively.
    #
    # @param [Hash] hash a Hash with string regular expressions for keys
    # @param [String,Symbol,Regexp] label return the key that best matches this
    # @raise [VersionError] if no key matches the label
    def self.key_by_regex(hash, label)
      label = label.source if label.is_a?(Regexp)

      # Try matching longer keys first, to ensure more accurate keys are
      # prioritized over less accurate ones.
      hash.keys.sort { |x, y| x.length <=> y.length }.reverse.each do |key|
        return hash[key] if Regexp.new(key, Regexp::IGNORECASE).match(label.to_s)
      end

      raise VersionError, "Attempted to access mapping that has not been generated (#{label}). " +
            "Supported keys match the format: #{hash.keys.join(', ')}"
    end

  end
end