myl-fech 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. data/.gitignore +7 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +4 -0
  4. data/Gemfile.lock +48 -0
  5. data/LICENSE +13 -0
  6. data/README.rdoc +82 -0
  7. data/Rakefile +3 -0
  8. data/autotest/discover.rb +1 -0
  9. data/fech.gemspec +40 -0
  10. data/lib/fech/comparison.rb +36 -0
  11. data/lib/fech/csv.rb +70 -0
  12. data/lib/fech/default_translations.rb +133 -0
  13. data/lib/fech/fech_utils.rb +76 -0
  14. data/lib/fech/filing.rb +341 -0
  15. data/lib/fech/map_generator.rb +233 -0
  16. data/lib/fech/mapped.rb +38 -0
  17. data/lib/fech/mappings.rb +67 -0
  18. data/lib/fech/rendered_maps.rb +238 -0
  19. data/lib/fech/translator.rb +138 -0
  20. data/lib/fech/version.rb +3 -0
  21. data/lib/fech.rb +15 -0
  22. data/sources/F1.csv +106 -0
  23. data/sources/F1M.csv +78 -0
  24. data/sources/F2.csv +43 -0
  25. data/sources/F24.csv +18 -0
  26. data/sources/F3.csv +1 -0
  27. data/sources/F3L.csv +27 -0
  28. data/sources/F3P.csv +208 -0
  29. data/sources/F3P31.csv +39 -0
  30. data/sources/F3PS.csv +94 -0
  31. data/sources/F3S.csv +36 -0
  32. data/sources/F3X.csv +125 -0
  33. data/sources/F4.csv +86 -0
  34. data/sources/F5.csv +39 -0
  35. data/sources/F56.csv +33 -0
  36. data/sources/F57.csv +44 -0
  37. data/sources/F6.csv +1 -0
  38. data/sources/F65.csv +1 -0
  39. data/sources/F7.csv +1 -0
  40. data/sources/F76.csv +1 -0
  41. data/sources/F9.csv +46 -0
  42. data/sources/F91.csv +17 -0
  43. data/sources/F92.csv +23 -0
  44. data/sources/F93.csv +27 -0
  45. data/sources/F94.csv +18 -0
  46. data/sources/F99.csv +1 -0
  47. data/sources/H1.csv +1 -0
  48. data/sources/H2.csv +1 -0
  49. data/sources/H3.csv +1 -0
  50. data/sources/H4.csv +1 -0
  51. data/sources/H5.csv +1 -0
  52. data/sources/H6.csv +1 -0
  53. data/sources/HDR.csv +10 -0
  54. data/sources/SchA.csv +50 -0
  55. data/sources/SchB.csv +50 -0
  56. data/sources/SchC.csv +41 -0
  57. data/sources/SchC1.csv +52 -0
  58. data/sources/SchC2.csv +19 -0
  59. data/sources/SchD.csv +34 -0
  60. data/sources/SchE.csv +57 -0
  61. data/sources/SchF.csv +55 -0
  62. data/sources/SchL.csv +1 -0
  63. data/sources/TEXT.csv +1 -0
  64. data/sources/headers/3.csv +1 -0
  65. data/sources/headers/5.0.csv +1 -0
  66. data/sources/headers/5.1.csv +1 -0
  67. data/sources/headers/5.2.csv +1 -0
  68. data/sources/headers/5.3.csv +1 -0
  69. data/sources/headers/6.1.csv +1 -0
  70. data/sources/headers/6.2.csv +1 -0
  71. data/sources/headers/6.3.csv +1 -0
  72. data/sources/headers/6.4.csv +1 -0
  73. data/sources/headers/7.0.csv +49 -0
  74. data/sources/headers/8.0.csv +49 -0
  75. data/sources/headers/ignore.csv +5 -0
  76. data/spec/comparison_spec.rb +30 -0
  77. data/spec/data/467627.fec +608 -0
  78. data/spec/data/723604.fec +4 -0
  79. data/spec/data/730635.fec +2 -0
  80. data/spec/data/747058.fec +4 -0
  81. data/spec/data/748730.fec +1196 -0
  82. data/spec/data/752356.fec +5 -0
  83. data/spec/data/753533.fec +7 -0
  84. data/spec/data/764901.fec +7 -0
  85. data/spec/data/765310.fec +2 -0
  86. data/spec/data/767339.fec +648 -0
  87. data/spec/data/82094.fec +144 -0
  88. data/spec/data/97405.fec +10 -0
  89. data/spec/default_translations_spec.rb +104 -0
  90. data/spec/fech_utils_spec.rb +29 -0
  91. data/spec/filing_spec.rb +314 -0
  92. data/spec/map_generator_spec.rb +49 -0
  93. data/spec/mapped_spec.rb +44 -0
  94. data/spec/mappings_spec.rb +46 -0
  95. data/spec/sources/F24.csv +18 -0
  96. data/spec/sources/F3P.csv +1 -0
  97. data/spec/sources/F3P31.csv +39 -0
  98. data/spec/sources/SchA.csv +1 -0
  99. data/spec/sources/SchB.csv +1 -0
  100. data/spec/sources/SchC.csv +1 -0
  101. data/spec/sources/headers/3.csv +1 -0
  102. data/spec/sources/headers/5.0.csv +1 -0
  103. data/spec/sources/headers/5.1.csv +1 -0
  104. data/spec/sources/headers/5.2.csv +1 -0
  105. data/spec/sources/headers/5.3.csv +1 -0
  106. data/spec/sources/headers/6.1.csv +1 -0
  107. data/spec/sources/headers/6.2.csv +1 -0
  108. data/spec/sources/headers/6.3.csv +1 -0
  109. data/spec/sources/headers/6.4.csv +1 -0
  110. data/spec/sources/headers/7.0.csv +1 -0
  111. data/spec/sources/headers/8.0.csv +49 -0
  112. data/spec/sources/headers/ignore.csv +5 -0
  113. data/spec/sources/sa.csv +1 -0
  114. data/spec/spec_helper.rb +9 -0
  115. data/spec/translator_spec.rb +195 -0
  116. data/tasks/fech.rake +41 -0
  117. metadata +342 -0
@@ -0,0 +1,341 @@
1
+ require 'tmpdir'
2
+ require 'open-uri'
3
+
4
+ module Fech
5
+
6
+ # Fech::Filing downloads an Electronic Filing given its ID, and will search
7
+ # rows by row type. Using a child Translator object, the data in each row
8
+ # is automatically mapped at runtime into a labeled Hash. Additional
9
+ # Translations may be added to change the way that data is mapped and cleaned.
10
+ class Filing
11
+ # first filing number using the version >=3.00 format
12
+ # note that there are plenty of <v3 filings after this, so readable? still needs to be checked
13
+ FIRST_V3_FILING = 11850
14
+
15
+ attr_accessor :filing_id, :download_dir, :translator
16
+
17
# Builds a Filing for the given ID. Nothing is fetched here; call #download
# to retrieve the file into the download directory.
# @param [Integer, String] filing_id identifier used to build the file name and URL
# @option opts [String] :download_dir directory holding the .fec file
#   (defaults to the system temp directory)
# @option opts [Symbol,Array] :translate built-in translation sets handed to
#   the Translator
# @option opts [String] :quote_char quote character passed to the CSV parser
#   (defaults to a double quote)
# @option opts [Object] :csv_parser parser used to read rows (defaults to Fech::Csv)
def initialize(filing_id, opts={})
  @filing_id    = filing_id
  @resaved      = false
  @customized   = false
  @download_dir = opts[:download_dir] || Dir.tmpdir
  @quote_char   = opts[:quote_char] || '"'
  @csv_parser   = opts[:csv_parser] || Fech::Csv
  @translator   = Fech::Translator.new(:include => opts[:translate])
end
31
+
32
# Saves the filing data from the FEC website into the download directory.
#
# The remote body is read completely before the local file is opened, so a
# failed request does not leave an empty file behind. The request goes
# through URI#open (provided by open-uri) rather than Kernel#open, which no
# longer accepts URLs on Ruby 3.0+; the block form closes the remote handle.
# @return [Fech::Filing] self, to allow chaining
def download
  contents = URI.parse(filing_url).open { |remote| remote.read }
  File.open(file_path, 'w') do |file|
    file << contents
  end
  self
end
40
+
41
# This downloads ALL the filings.
#
# Because this trashes the zip files after extraction (to save space), while it is safe to rerun, it has to do the whole thing over again.
# Update operations should just iterate single file downloads starting from the current+1th filing number.
#
# This takes a very long time to run - on the order of an hour or two, depending on your bandwidth.
#
# WARNING: As of July 9, 2012, this downloads 536964 files (25.8 GB), into one directory.
# This means that the download directory will break bash file globbing (so e.g. ls and rm *.fec will not work).
# If you want to get all of it, make sure to download only to a dedicated FEC filings directory.
#
# @param [String] download_dir directory that receives the extracted .fec files
# @return [Integer] number of .fec files found in download_dir afterwards
def self.download_all(download_dir)
  # Loaded here because this is the only place that shells out.
  require 'shellwords'

  # Escape the directory and quote "$z" so paths containing spaces or shell
  # metacharacters cannot break (or inject into) the commands.
  dir = Shellwords.escape(download_dir.to_s)
  `cd #{dir} && ftp -a ftp.fec.gov:/FEC/electronic/*.zip`
  `cd #{dir} && for z in *.zip; do unzip -o "$z" && rm "$z"; done`
  Dir[File.join(download_dir, '*.fec')].count
end
56
+
57
# Runs the passed block on every downloaded .fec file. Pass the same options hash as you would to Fech::Filing.new.
# E.g. for_all(:download_dir => Rails.root.join('db', 'data', 'fec', 'filings'), :csv_parser => Fech::CsvDoctor, ...) {|filing| ... }
# filing.download is of course unnecessary.
#
# Only files named "<digits>.fec" are yielded. The "fech_<id>.fec" copies
# written by #resave_f99_contents live in the same directory and are skipped,
# so a filing is not yielded twice.
#
# note that if there are a lot of files (e.g. after download_all), just listing them to prepare for this will take several seconds
# @param [Hash] options forwarded to Fech::Filing.new; the caller's hash is not modified
# @yield [Fech::Filing] a Filing for each matching file
def self.for_all options = {}
  # Work on a copy so the default directory is not written into the caller's hash.
  options = options.merge(:download_dir => options[:download_dir] || Dir.tmpdir)
  Dir[File.join(options[:download_dir], '*.fec')].each do |file|
    # Match against the base name only, so digits elsewhere in the path are ignored.
    id = File.basename(file)[/\A(\d+)\.fec\z/, 1]
    next if id.nil?
    yield Fech::Filing.new(id.to_i, options)
  end
end
69
+
70
# Access the header (first) line of the filing, containing information
# about the filing's version and metadata about the software used to file it.
# @option opts [Boolean] :raw return the unmapped row array instead of a hash
# @option opts [Array] :include list of field names to keep in the returned hash
# @return [Hash, Array, nil] the header row as processed by #parse_row?, or nil
#   when the filing has no rows
def header(opts={})
  each_row do |row|
    # Only the first row is needed; opts are forwarded rather than ignored.
    return parse_row?(row, opts)
  end
  nil
end
78
+
79
# Access the summary (second) line of the filing, containing aggregate and
# top-level information about the filing.
# @return [Hash] the second row, as processed by #parse_row?
def summary
  each_row_with_index do |row, position|
    # Position 0 is the header; the first row after it is the summary.
    return parse_row?(row) unless position == 0
  end
end
88
+
89
# Collects (or yields) every row of the filing whose type matches row_type.
#
# @param [String, Regexp] row_type a partial or complete name of the type of row desired
# @option opts [Boolean] :raw return each row as an unmapped array
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @yield [Hash] each matched row's data, as either a mapped hash or raw array
# @return [Array, nil] all matched rows when no block is given, nil otherwise
def rows_like(row_type, opts={}, &block)
  matched = []
  each_row do |row|
    parsed = parse_row?(row, opts.merge(:parse_if => row_type))
    if parsed == false
      # parse_row? returns false for rows of a different type.
      next
    elsif block_given?
      yield parsed
    elsif parsed
      matched << parsed
    end
  end
  return nil if block_given?
  matched
end
113
+
114
# Decides what to do with a given row. A row is processed when no :parse_if
# was given, or when its first cell (downcased) matches :parse_if.
#
# @param [Array] row the row's cells; the first cell is the row type
# @option opts [String, Regexp] :parse_if only process rows whose type matches
# @option opts [Boolean] :raw return the row itself instead of mapping it
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @return [Fech::Mapped, Array, false] the result of #map, the raw row, or false
#   when the row type does not match
def parse_row?(row, opts={})
  wanted = opts[:parse_if]
  unless wanted.nil? || Fech.regexify(wanted).match(row.first.downcase)
    return false
  end
  opts[:raw] ? row : map(row, opts)
end
130
+
131
# Maps a raw row to a labeled hash following any rules given in the filing's
# Translator based on its version and row type.
# Finds the correct map for a given row, performs any matching Translations
# on the individual values, and returns either the entire dataset, or just
# those fields requested.
# @param [Array] row the row's cells; the first cell is the row type
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @return [Fech::Mapped] the labeled (and translated) values
def map(row, opts={})
  data = Fech::Mapped.new(self, row.first)
  full_row_map = map_for(row.first)

  # Record the first position of every label once, instead of rescanning the
  # map with Array#index for each field.
  positions = {}
  full_row_map.each_with_index do |field, index|
    positions[field] = index unless positions.key?(field)
  end

  # If specific fields were asked for, return only those
  row_map = if opts[:include]
    full_row_map.select { |k| opts[:include].include?(k) }
  else
    full_row_map
  end

  # Inserts the row into data, performing any specified preprocessing
  # on individual cells along the way
  row_map.each do |field|
    value = row[positions[field]]
    translator.get_translations(:row => row.first,
      :version => filing_version, :action => :convert,
      :field => field).each do |translation|
      # User's Procs should be given each field's value as context
      value = translation[:proc].call(value)
    end
    data[field] = value
  end

  # Performs any specified group preprocessing / combinations
  combinations = translator.get_translations(:row => row.first,
    :version => filing_version, :action => :combine)
  if combinations && !combinations.empty?
    # Zip against the FULL map: zipping the :include-filtered labels against
    # the unfiltered row would pair labels with the wrong cells.
    row_hash = hash_zip(full_row_map, row)
    combinations.each do |translation|
      # User's Procs should be given the entire row as context
      value = translation[:proc].call(row_hash)
      field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
      data[field] = value
    end
  end

  data
end
176
+
177
# Looks up the ordered column names for a row type, using the Mappings
# object tied to this filing's version.
# @param [String, Regexp] row_type representation of the row desired
def map_for(row_type)
  version_mappings = mappings
  version_mappings.for_row(row_type)
end
183
+
184
# Class-level lookup of the ordered column names for a row type; delegates
# to Fech::Mappings.for_row.
# @param [String, Regexp] row_type representation of the row desired
# @option opts [String, Regexp] :version representation of the version desired
def self.map_for(row_type, opts={})
  mapping_source = Fech::Mappings
  mapping_source.for_row(row_type, opts)
end
191
+
192
# Gives access to the filing's Translator: yields it when a block is given,
# otherwise returns it.
# @yield [t] the filing's Translator
# @yieldparam [Translator] t the filing's Translator
def translate(&block)
  return translator unless block_given?
  yield translator
end
201
+
202
# Whether this filing amends a previous filing, i.e. whether #amends is non-nil.
def amendment?
  amends.nil? ? false : true
end
206
+
207
# Returns the filing ID of the past filing this one amends,
# nil if this is a first-draft filing.
# The value is read from :report_id in the HDR line.
def amends
  header_row = header
  header_row[:report_id]
end
213
+
214
# Combines an array of keys and values into an Fech::Mapped object,
# a type of Hash. The first value is used as the Mapped object's row type.
# @param [Array] keys the desired keys for the new hash
# @param [Array] values the desired values for the new hash
# @return [Fech::Mapped, Hash]
def hash_zip(keys, values)
  # Build the hash from the pairs directly. Flattening the pairs first would
  # spill any Array-valued cell into extra key/value arguments.
  Fech::Mapped.new(self, values.first).merge(Hash[keys.zip(values)])
end
222
+
223
# The version of the FEC software used to generate this Filing.
# Parsed from the file on first use and memoized afterwards.
def filing_version
  @filing_version || (@filing_version = parse_filing_version)
end
227
+
228
# Pulls out the version number from the header line.
# Must parse this line manually, since we don't know the version yet, and
# thus the delimiter type is still a mystery.
# @return [String, nil] the third cell of the first line, or nil when the
#   file is empty
def parse_filing_version
  # Block form so the handle is closed as soon as the first line is read.
  first = File.open(file_path) { |file| file.gets }
  return nil if first.nil?

  # A \034 character in the line means it is used as the column separator.
  if first.index("\034").nil?
    @csv_parser.parse(first).flatten[2]
  else
    @csv_parser.parse(first, :col_sep => "\034").flatten[2]
  end
end
239
+
240
# Only FEC format 3.00 + is supported: true when the integer part of the
# filing version is at least 3.
def readable?
  3 <= filing_version.to_i
end
244
+
245
# Gets or creates the Mappings instance for this filing_version.
# The instance is built on first use and reused afterwards.
def mappings
  @mapping || (@mapping = Fech::Mappings.new(filing_version))
end
249
+
250
# The location of the Filing on the file system: the file name inside the
# download directory.
def file_path
  directory = download_dir
  File.join(directory, file_name)
end
254
+
255
# The raw contents of the Filing, as a File opened for reading.
# The handle is returned open; this method does not close it.
def file_contents
  File.new(file_path, 'r')
end
259
+
260
# Determine the form type of the filing
# before it's been parsed. This is needed
# for the F99 special case.
#
# Reads the second line with IO#gets (IO#lines was removed in Ruby 3.0) and
# closes the handle afterwards.
# @return [String, nil] the first delimited cell of the second line, or nil
#   when the file has no second line
def form_type
  io = file_contents
  io.gets # discard the header line
  summary_line = io.gets
  summary_line && summary_line.split(delimiter).first
ensure
  io.close if io
end
269
+
270
# The file path where custom versions
# of a filing are to be saved: the regular file name
# prefixed with "fech_", inside the download directory.
def custom_file_path
  custom_name = "fech_#{file_name}"
  File.join(download_dir, custom_name)
end
275
+
276
# Handle the contents of F99s by removing the
# [BEGINTEXT] and [ENDTEXT] delimiters and
# putting the text content onto the same
# line as the summary, as one quoted cell.
#
# Also marks the filing as customized, so #each_row reads the custom file.
# @return [String] the rewritten file contents
def fix_f99_contents
  @customized = true
  io = file_contents
  content = begin
    io.read
  ensure
    io.close
  end
  regex = /\n\[BEGINTEXT\]\n(.*?)\[ENDTEXT\]\n/mi # some use eg [EndText]
  separator = delimiter
  # Block form: each match is replaced with its own text, and backslash
  # sequences in that text are not treated as back-references.
  content.gsub(regex) do
    text = Regexp.last_match(1).gsub(/"/, '""') # double embedded quotes
    "#{separator}\"#{text}\""
  end
end
292
+
293
# Writes the "fixed" version of an F99 to the custom file path.
# The file is written at most once per Filing object.
# @return [true]
def resave_f99_contents
  unless @resaved
    File.open(custom_file_path, 'w') { |f| f.write(fix_f99_contents) }
    @resaved = true
  end
  true
end
299
+
300
# Name of the filing's file: the filing ID followed by ".fec".
def file_name
  filing_id.to_s + ".fec"
end
303
+
304
# URL the filing is downloaded from, built from the filing ID.
def filing_url
  base = "http://query.nictusa.com/dcdev/posted/"
  "#{base}#{filing_id}.fec"
end
307
+
308
# Iterates over and yields the Filing's lines
# @option opts [Boolean] :with_index yield both the item and its index
# @yield [Array] a row of the filing, split by the delimiter from #delimiter
# @raise [RuntimeError] when the filing's file does not exist
def each_row(opts={}, &block)
  # File.exist? works on every Ruby version; File.exists? was removed in 3.2.
  unless File.exist?(file_path)
    raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
  end

  # If this is an F99, we need to parse it differently.
  resave_f99_contents if form_type == 'F99'

  source = @customized ? custom_file_path : file_path
  c = 0
  @csv_parser.parse_row(source, :col_sep => delimiter, :quote_char => @quote_char, :skip_blanks => true) do |row|
    if opts[:with_index]
      yield [row, c]
      c += 1
    else
      yield row
    end
  end
end
329
+
330
# Wrapper around #each_row that asks for each row's index as well.
def each_row_with_index(&block)
  indexed = { :with_index => true }
  each_row(indexed, &block)
end
334
+
335
# @return [String] the delimiter used in the filing's version: a comma for
#   versions below 6, the \034 character otherwise
def delimiter
  return "," if filing_version.to_f < 6
  "\034"
end
339
+
340
+ end
341
+ end
@@ -0,0 +1,233 @@
1
+ module Fech
2
+
3
+ # Helper class to generate mapping hashes from source csv data.
4
+ # Needed to rebuild rendered_maps.rb with new source data, not used
5
+ # in main gem.
6
+ # rake fech:maps
7
+ class MapGenerator
8
+
9
attr_accessor :map

# Filing format versions that have a header summary file, newest first.
FILING_VERSIONS = %w[8.0 7.0 6.4 6.3 6.2 6.1 5.3 5.2 5.1 5.0 3]

# Row types for which a row map is rendered.
BASE_ROW_TYPES = %w[
  HDR F1 F1M F2 F24 F3 F3L F3P F3P31 F3PS F3S F3X
  F4 F5 F56 F57 F6 F65 F7 F76 F9 F91 F92 F93 F94 F99
  H1 H2 H3 H4 H5 H6
  SchA SchB SchC SchC1 SchC2 SchD SchE SchF SchL TEXT
]

# Maps each row type name to its entry in FechUtils::ROW_TYPES.
# The lookup key is the downcased name, with a leading "Sch" shortened to
# "s" (e.g. "SchA" => :sa, "F3P31" => :f3p31).
ROW_TYPE_MATCHERS = {}
%w[
  HDR F1 F1M F2 F24 F3 F3L F3P F3S F3P31 F3PS F3X
  F4 F5 F56 F57 F6 F65 F7 F76 F9 F91 F92 F93 F94 F99
  H1 H2 H3 H4 H5 H6
  SchA SchB SchC SchC1 SchC2 SchD SchE SchF SchL TEXT
].each do |row_type|
  lookup = row_type.sub(/\ASch/, 's').downcase.to_sym
  ROW_TYPE_MATCHERS[row_type] = FechUtils::ROW_TYPES[lookup]
end
60
+
61
# Goes through all version header summary files and generates
# row map files for each type of row inside them.
#
# Each header file is first rewritten in place with invalid UTF-8 bytes
# removed, then read row by row. A row map file is only written when a file
# already exists at the path given by write_row_map_file.
# @param [String] source_dir directory containing the headers/ and rows/ folders
def self.convert_header_file_to_row_files(source_dir)
  data = {}
  hybrid_data = {}

  ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters
    raw = File.read(filepath)
    if RUBY_VERSION < "1.9.3"
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      valid_string = ic.iconv(raw << ' ')[0..-2]
    else
      valid_string = (raw << ' ')[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode!('UTF-8', 'UTF-16')
    end
    # Block form closes (and so flushes) the file before it is re-read below.
    File.open(filepath, 'w') { |f| f.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # Each row of a version summary file contains the ordered list of
      # column names.
      data[row.first] ||= {}
      hybrid_data[row.first] ||= {}
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on to
      # it instead of creating a new one.
      data[row.first][version] = row_version_data
      data[row.first].each do |k, v|
        # skip the row we just added
        next if k == version
        if v == row_version_data
          # Create the new hybrid entry
          hybrid_data[row.first]["#{k}|#{version}"] = row_version_data

          # Delete the old entry, and the one for this version only
          data[row.first].delete(k)
          data[row.first].delete(version)
        end
      end
      data[row.first].update(hybrid_data[row.first])
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    # File.exist? works on every Ruby version; File.exists? was removed in 3.2.
    next unless File.exist?(file_path)
    File.open(file_path, 'w') do |f|
      f.write('canonical')

      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size
      max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
      to_transpose.each { |r| r[max_size - 1] ||= nil }
      transposed = to_transpose.transpose

      transposed.each do |transposed_data|
        transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
        canonical = transposed_data[1] # first description
        if canonical
          # NOTE(review): inside the character class, " -\." is a range from
          # space through ".", so other punctuation in that range is also
          # replaced with "_" - confirm this is intended.
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          transposed_data = [canonical, transposed_data].flatten
        end
        f.write(transposed_data.join(','))
        f.write("\n")
      end
    end
  end

end
151
+
152
# Renders the map of every row type in BASE_ROW_TYPES as Ruby source
# (a Fech::RENDERED_MAPS hash) and writes it to file_path for inclusion
# in the gem.
# @param [String] source_dir directory containing the row map csv files
# @param [String] file_path destination of the generated Ruby file
def self.dump_row_maps_to_ruby(source_dir, file_path)
  preamble = [
    "# Generated automatically by Fech::MapGenerator.\n\n",
    "# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n",
    "# contain an entry for each distinct map between a row's labels and the\n",
    "# indexes where their values can be found.\n",
    "module Fech\n",
    " RENDERED_MAPS = {\n"
  ]

  File.open(file_path, 'w') do |out|
    preamble.each { |line| out.write(line) }

    BASE_ROW_TYPES.each do |row_type|
      out.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")

      # Version keys are written in descending sort order.
      row_maps = generate_row_map_from_file(source_dir, row_type).sort_by(&:first).reverse
      row_maps.each do |version_key, labels|
        symbols = labels.map do |label|
          # Drop any leading digits (and one underscore) from the label;
          # empty slots are rendered as nil.
          name = label.to_s.gsub(/^\d+_?/, "")
          (name.nil? || name == "") ? "nil" : ":#{name}"
        end
        out.write(" \'#{version_key}' => [#{symbols.join(', ')}],\n")
      end

      out.write(" },\n")
    end

    out.write(" }\n")
    out.write("end")
  end
end
173
+
174
# For a given row type, parses its source file and returns
# a mapping object for it.
#
# The row starting with "canonical" names the versions; every other row
# starts with a canonical field name and holds, in each version's column,
# the 1-based position of that field. Blank lines are skipped.
# @param [String] source_dir directory containing the row map csv files
# @param [String] row_type the row type whose file should be read
# @return [Hash] version key => Array of field names (Symbols) by position
def self.generate_row_map_from_file(source_dir, row_type)
  versions = []
  version_indexes = []
  data = {}
  text = File.read(row_map_file(source_dir, row_type))
  # Files containing carriage returns are split on them; any leftover "\n"
  # is stripped from the cells below.
  split_char = text.index(/\r/) ? /\r/ : /\n/
  text.split(split_char).each do |line|
    row = line.split(',').collect { |x| x.gsub("\n", "") }
    # A blank line splits into no cells at all; there is nothing to map.
    next if row.first.nil?

    if row.first.downcase == "canonical"
      versions = row[1..-1].uniq.reject { |x| x.nil? || x.empty? }
      row.each_with_index { |x, ind| version_indexes << ind unless (x.nil? || x.empty?) }
      # Drop index 0, which is the "canonical" cell itself.
      version_indexes.slice!(0, 1)
      versions.each { |x| data[x] = [] }

    elsif row.first.size > 0
      canonical = row.first

      versions.zip(version_indexes).each do |version, row_index|
        index = row[row_index]
        data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
      end
    end
  end

  data
end
208
+
209
# Remove both the row type from the beginning of the row,
# and any fields marked as "ignore" in sources/headers/ignore.csv
# @param [Array] row the cells of a header summary row
# @param [Array] ignore field names to drop
# @return [Array] the remaining non-nil field names, in order
def self.remove_ignored_fields(row, ignore)
  # An empty row has no slice from index 1 (nil); treat it as no fields.
  fields = row[1..-1] || []
  fields.compact.reject { |field| ignore.include?(field) }
end
215
+
216
# Path of the row map csv for row_type, directly inside source_dir.
def self.row_map_file(source_dir, row_type)
  csv_name = row_type + '.csv'
  File.join(source_dir, csv_name)
end
219
+
220
# Path of the csv listing ignored fields: headers/ignore.csv under source_dir.
def self.ignored_fields_file(source_dir)
  headers_dir = File.join(source_dir, 'headers')
  File.join(headers_dir, 'ignore.csv')
end
223
+
224
# Path of the header summary csv for a version: headers/<version>.csv
# under source_dir.
def self.version_summary_file(source_dir, version)
  csv_name = version + '.csv'
  File.join(source_dir, 'headers', csv_name)
end
227
+
228
# Path a generated row map is written to: rows/<row_type>.csv under source_dir.
def self.write_row_map_file(source_dir, row_type)
  csv_name = row_type + '.csv'
  File.join(source_dir, 'rows', csv_name)
end
231
+
232
+ end
233
+ end
@@ -0,0 +1,38 @@
1
module Fech

  # Fech::Mapped is a thin wrapper around Hash which allows values to be
  # referenced either by key or by an alias specified in the associated
  # Filing's Translations. Values can also be read with method syntax
  # (mapped.some_field).
  class Mapped < Hash

    attr_accessor :filing, :row_type
    alias :old_bracket :[]

    # @param [Object] filing the owning filing; its translator supplies aliases
    # @param [String] row_type the type of the row this hash holds
    def initialize(filing, row_type)
      super()
      @filing = filing
      @row_type = row_type
    end

    # Just calls Hash's [] method, unless the specified key doesn't
    # exist, in which case it checks for any aliases on the filing's
    # translator and reads the key the alias points to.
    # Aliases are resolved one level only; an alias pointing to another
    # alias is not followed.
    def [](key, &block)
      return old_bracket(key, &block) if has_key?(key)

      aliias = alias_for(key)
      aliias ? old_bracket(aliias[:for], &block) : nil
    end

    # Any unknown method is treated as a field lookup by name.
    def method_missing(method, *args, &block)
      self[method]
    end

    # Reports the fields and aliases that method_missing can resolve, so
    # respond_to? agrees with what method-style access actually finds.
    def respond_to_missing?(method, include_private = false)
      has_key?(method) || !alias_for(method).nil? || super
    end

    private

    # Finds the most recently added alias for key that applies to this
    # row type. Self-referencing aliases are ignored.
    # @return [Hash, nil] the alias definition, or nil when there is none
    def alias_for(key)
      return nil if filing.nil?

      # Look up aliases in reverse, to find the most recent one
      filing.translator.aliases.reverse.detect do |a|
        a[:alias] == key && a[:row].match(row_type) && a[:alias] != a[:for]
      end
    end

  end
end
@@ -0,0 +1,67 @@
1
module Fech
  class VersionError < RuntimeError; end

  # Fech::Mappings loads a set of master mappings between labels and where
  # their values can be found in Electronic Filings for various row types
  # and versions.
  # To access a map, call Mappings.for_row with the row_type,
  # and optionally the version:
  #   Mappings.for_row("SA", :version => 6.1)
  class Mappings

    attr_accessor :map, :version

    # @param [String] ver the filing version whose maps this instance serves
    def initialize(ver = Fech::DEFAULT_VERSION)
      @version = ver
      @map = load_map
      @cache = {}
    end

    # Returns the mappings for row with given row_type, for this instance's
    # version. Results are cached per row_type.
    #
    # @param [String,Symbol] row_type the row type whose map to find
    def for_row(row_type)
      @cache[row_type] ||= self.class.for_row(row_type, :version => @version)
    end

    # Returns the basic, default mappings hash (see Mappings.load_map).
    def load_map
      self.class.load_map
    end

    # @return [Hash] the pre-rendered maps, Fech::RENDERED_MAPS
    def self.load_map
      Fech::RENDERED_MAPS
    end

    # Given a row type, first find the entire block of maps for that row type.
    # Then, use the filing's version to choose which specific map set to use,
    # and return it.
    #
    # @param [Symbol,String,Regexp] row_type the row whose map to find
    # @option opts [String,Regexp] :version the version to look up
    #   (defaults to Fech::DEFAULT_VERSION); the opts hash itself is not modified
    # @raise [VersionError] when no map matches the row type or the version
    def self.for_row(row_type, opts={})
      # Read the default into a local instead of writing it back into the
      # caller's hash (which also fails when that hash is frozen).
      version = opts[:version] || Fech::DEFAULT_VERSION
      map = key_by_regex(load_map, row_type)
      key_by_regex(map, version)
    end

    # Given a Hash whose keys are string representations of regular expressions,
    # return the value whose key best matches the given label.
    #
    # @param [Hash] hash a Hash with string regular expressions for keys
    # @param [String,Symbol,Regexp] label return the key that best matches this
    # @raise [VersionError] when no key matches the label
    def self.key_by_regex(hash, label)
      label = label.source if label.is_a?(Regexp)

      # Try matching longer keys first, to ensure more accurate keys are
      # prioritized over less accurate ones.
      hash.keys.sort { |x, y| x.length <=> y.length }.reverse.each do |key|
        return hash[key] if Regexp.new(key, Regexp::IGNORECASE).match(label.to_s)
      end

      raise VersionError, "Attempted to access mapping that has not been generated (#{label}). " +
        "Supported keys match the format: #{hash.keys.join(', ')}"
    end

  end
end