myl-fech 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +48 -0
- data/LICENSE +13 -0
- data/README.rdoc +82 -0
- data/Rakefile +3 -0
- data/autotest/discover.rb +1 -0
- data/fech.gemspec +40 -0
- data/lib/fech/comparison.rb +36 -0
- data/lib/fech/csv.rb +70 -0
- data/lib/fech/default_translations.rb +133 -0
- data/lib/fech/fech_utils.rb +76 -0
- data/lib/fech/filing.rb +341 -0
- data/lib/fech/map_generator.rb +233 -0
- data/lib/fech/mapped.rb +38 -0
- data/lib/fech/mappings.rb +67 -0
- data/lib/fech/rendered_maps.rb +238 -0
- data/lib/fech/translator.rb +138 -0
- data/lib/fech/version.rb +3 -0
- data/lib/fech.rb +15 -0
- data/sources/F1.csv +106 -0
- data/sources/F1M.csv +78 -0
- data/sources/F2.csv +43 -0
- data/sources/F24.csv +18 -0
- data/sources/F3.csv +1 -0
- data/sources/F3L.csv +27 -0
- data/sources/F3P.csv +208 -0
- data/sources/F3P31.csv +39 -0
- data/sources/F3PS.csv +94 -0
- data/sources/F3S.csv +36 -0
- data/sources/F3X.csv +125 -0
- data/sources/F4.csv +86 -0
- data/sources/F5.csv +39 -0
- data/sources/F56.csv +33 -0
- data/sources/F57.csv +44 -0
- data/sources/F6.csv +1 -0
- data/sources/F65.csv +1 -0
- data/sources/F7.csv +1 -0
- data/sources/F76.csv +1 -0
- data/sources/F9.csv +46 -0
- data/sources/F91.csv +17 -0
- data/sources/F92.csv +23 -0
- data/sources/F93.csv +27 -0
- data/sources/F94.csv +18 -0
- data/sources/F99.csv +1 -0
- data/sources/H1.csv +1 -0
- data/sources/H2.csv +1 -0
- data/sources/H3.csv +1 -0
- data/sources/H4.csv +1 -0
- data/sources/H5.csv +1 -0
- data/sources/H6.csv +1 -0
- data/sources/HDR.csv +10 -0
- data/sources/SchA.csv +50 -0
- data/sources/SchB.csv +50 -0
- data/sources/SchC.csv +41 -0
- data/sources/SchC1.csv +52 -0
- data/sources/SchC2.csv +19 -0
- data/sources/SchD.csv +34 -0
- data/sources/SchE.csv +57 -0
- data/sources/SchF.csv +55 -0
- data/sources/SchL.csv +1 -0
- data/sources/TEXT.csv +1 -0
- data/sources/headers/3.csv +1 -0
- data/sources/headers/5.0.csv +1 -0
- data/sources/headers/5.1.csv +1 -0
- data/sources/headers/5.2.csv +1 -0
- data/sources/headers/5.3.csv +1 -0
- data/sources/headers/6.1.csv +1 -0
- data/sources/headers/6.2.csv +1 -0
- data/sources/headers/6.3.csv +1 -0
- data/sources/headers/6.4.csv +1 -0
- data/sources/headers/7.0.csv +49 -0
- data/sources/headers/8.0.csv +49 -0
- data/sources/headers/ignore.csv +5 -0
- data/spec/comparison_spec.rb +30 -0
- data/spec/data/467627.fec +608 -0
- data/spec/data/723604.fec +4 -0
- data/spec/data/730635.fec +2 -0
- data/spec/data/747058.fec +4 -0
- data/spec/data/748730.fec +1196 -0
- data/spec/data/752356.fec +5 -0
- data/spec/data/753533.fec +7 -0
- data/spec/data/764901.fec +7 -0
- data/spec/data/765310.fec +2 -0
- data/spec/data/767339.fec +648 -0
- data/spec/data/82094.fec +144 -0
- data/spec/data/97405.fec +10 -0
- data/spec/default_translations_spec.rb +104 -0
- data/spec/fech_utils_spec.rb +29 -0
- data/spec/filing_spec.rb +314 -0
- data/spec/map_generator_spec.rb +49 -0
- data/spec/mapped_spec.rb +44 -0
- data/spec/mappings_spec.rb +46 -0
- data/spec/sources/F24.csv +18 -0
- data/spec/sources/F3P.csv +1 -0
- data/spec/sources/F3P31.csv +39 -0
- data/spec/sources/SchA.csv +1 -0
- data/spec/sources/SchB.csv +1 -0
- data/spec/sources/SchC.csv +1 -0
- data/spec/sources/headers/3.csv +1 -0
- data/spec/sources/headers/5.0.csv +1 -0
- data/spec/sources/headers/5.1.csv +1 -0
- data/spec/sources/headers/5.2.csv +1 -0
- data/spec/sources/headers/5.3.csv +1 -0
- data/spec/sources/headers/6.1.csv +1 -0
- data/spec/sources/headers/6.2.csv +1 -0
- data/spec/sources/headers/6.3.csv +1 -0
- data/spec/sources/headers/6.4.csv +1 -0
- data/spec/sources/headers/7.0.csv +1 -0
- data/spec/sources/headers/8.0.csv +49 -0
- data/spec/sources/headers/ignore.csv +5 -0
- data/spec/sources/sa.csv +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/translator_spec.rb +195 -0
- data/tasks/fech.rake +41 -0
- metadata +342 -0
data/lib/fech/filing.rb
ADDED
@@ -0,0 +1,341 @@
|
|
1
|
+
require 'tmpdir'
|
2
|
+
require 'open-uri'
|
3
|
+
|
4
|
+
module Fech
|
5
|
+
|
6
|
+
# Fech::Filing downloads an Electronic Filing given its ID, and will search
|
7
|
+
# rows by row type. Using a child Translator object, the data in each row
|
8
|
+
# is automatically mapped at runtime into a labeled Hash. Additional
|
9
|
+
# Translations may be added to change the way that data is mapped and cleaned.
|
10
|
+
class Filing
|
11
|
+
# Lowest filing ID that uses the version >= 3.00 format.
# Plenty of pre-v3 filings exist after this ID, so #readable? still has to
# be checked for every filing.
FIRST_V3_FILING = 11850

attr_accessor :filing_id, :download_dir, :translator
|
16
|
+
|
17
|
+
# Create a new Filing object. Nothing is downloaded or read here; the
# options are only recorded for later use.
#
# @param [Integer,String] filing_id the filing's ID, used to build the
#   local file name and the download URL
# @option opts [String] :download_dir directory where filings are stored
#   (defaults to the system's temp folder)
# @option opts [Symbol,Array] :translate a list of built-in translation
#   sets, handed to Fech::Translator as :include
# @option opts [String] :quote_char quote character passed to the CSV
#   parser (defaults to a double quote)
# @option opts [Object] :csv_parser parser used to read the filing
#   (defaults to Fech::Csv)
def initialize(filing_id, opts={})
  @filing_id    = filing_id
  @download_dir = opts[:download_dir] || Dir.tmpdir
  @translator   = Fech::Translator.new(:include => opts[:translate])
  @quote_char   = opts[:quote_char] || '"'
  @csv_parser   = opts[:csv_parser] || Fech::Csv
  # Both flags are flipped when an F99 is rewritten to the custom file.
  @resaved      = false
  @customized   = false
end
|
31
|
+
|
32
|
+
# Saves the filing data from the FEC website into the download directory.
#
# The remote body is fetched before the local file is opened, so a failed
# request does not truncate a previously downloaded copy.
#
# @return [Fech::Filing] self, to allow chaining
def download
  # Kernel#open stopped accepting URLs in Ruby 3.0; URI.open is the
  # open-uri entry point there. Older Rubies only offer Kernel#open.
  body = if URI.respond_to?(:open)
    URI.open(filing_url) { |remote| remote.read }
  else
    open(filing_url) { |remote| remote.read }
  end

  File.open(file_path, 'w') { |file| file << body }
  self
end
|
40
|
+
|
41
|
+
# This downloads ALL the filings.
#
# Because this trashes the zip files after extraction (to save space), it is
# safe to rerun, but it has to do the whole thing over again.
# Update operations should just iterate single file downloads starting from
# the current+1th filing number.
#
# This takes a very long time to run - on the order of an hour or two,
# depending on your bandwidth.
#
# WARNING: As of July 9, 2012, this downloads 536964 files (25.8 GB), into
# one directory. This means that the download directory will break bash file
# globbing (so e.g. ls and rm *.fec will not work). If you want to get all of
# it, make sure to download only to a dedicated FEC filings directory.
#
# Requires the `ftp` and `unzip` commands to be available in the shell.
#
# @param [String] download_dir directory to download and extract into
# @return [Integer] number of .fec files in download_dir afterwards
def self.download_all(download_dir)
  require 'shellwords'
  # Escape the directory so spaces or shell metacharacters in the path
  # cannot break (or inject into) the command line.
  dir = Shellwords.escape(download_dir.to_s)
  `cd #{dir} && ftp -a ftp.fec.gov:/FEC/electronic/*.zip`
  `cd #{dir} && for z in *.zip; do unzip -o "$z" && rm "$z"; done`
  Dir[File.join(download_dir, '*.fec')].count
end
|
56
|
+
|
57
|
+
# Runs the passed block on every downloaded .fec file. Pass the same options
# hash as you would to Fech::Filing.new, e.g.
#   for_all(:download_dir => Rails.root.join('db', 'data', 'fec', 'filings'),
#           :csv_parser => Fech::CsvDoctor) { |filing| ... }
# filing.download is of course unnecessary.
#
# Only files named "<digits>.fec" are yielded; the "fech_<id>.fec" custom
# copies written next to them (see #custom_file_path) and any other stray
# .fec files are skipped. The caller's options hash is not modified.
#
# Note that if there are a lot of files (e.g. after download_all), just
# listing them to prepare for this will take several seconds.
#
# @param [Hash] options options forwarded to Fech::Filing.new;
#   :download_dir defaults to the system temp directory
# @yield [Fech::Filing] a Filing for each downloaded file
def self.for_all(options = {})
  options = options.merge(:download_dir => options[:download_dir] || Dir.tmpdir)
  # No sorting: listing order is left to Dir[].
  Dir[File.join(options[:download_dir], '*.fec')].each do |file|
    id = File.basename(file)[/\A(\d+)\.fec\z/, 1]
    next unless id
    yield Fech::Filing.new(id.to_i, options)
  end
end
|
69
|
+
|
70
|
+
# Access the header (first) line of the filing, containing information
# about the filing's version and metadata about the software used to file it.
#
# @param [Hash] opts accepted for compatibility; not used
# @return [Hash] a hash that assigns labels to the values of the filing's header row
def header(opts={})
  # Returning from inside the block stops after the first row.
  each_row { |row| return parse_row?(row) }
end
|
78
|
+
|
79
|
+
# Access the summary (second) line of the filing, containing aggregate and
# top-level information about the filing.
#
# @return [Hash] a hash that assigns labels to the values of the filing's summary row
def summary
  each_row_with_index do |row, index|
    # Index 0 is the header row; the first row after it is the summary.
    return parse_row?(row) unless index == 0
  end
end
|
88
|
+
|
89
|
+
# Access all lines of the filing that match a given row type. Will return an
# Array of all available lines if called directly, or will yield the mapped
# rows one by one if a block is passed.
#
# @param [String, Regexp] row_type a partial or complete name of the type of row desired
# @option opts [Boolean] :raw should the function return the data as an array
#   that has not been mapped to column names
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @yield [Hash] each matched row's data, as either a mapped hash or raw array
# @return [Array, nil] the complete set of mapped hashes for matched lines,
#   or nil when a block is given
def rows_like(row_type, opts={}, &block)
  # The parse options are the same for every row, so build them once.
  parse_opts = opts.merge(:parse_if => row_type)
  matches = []
  each_row do |row|
    value = parse_row?(row, parse_opts)
    next if value == false
    if block_given?
      yield value
    elsif value
      matches << value
    end
  end
  block_given? ? nil : matches
end
|
113
|
+
|
114
|
+
# Decides what to do with a given row. If the row's type matches the desired
# type, or if no type was specified, it will run the row through #map.
# If :raw was passed true, a flat, unmapped data array will be returned.
#
# @param [Array] row the row's cells; the first cell is the row type
# @option opts [String, Regexp] :parse_if only parse rows whose type matches
#   this (it is converted with Fech.regexify before matching)
# @option opts [Boolean] :raw return the row itself instead of mapping it
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @return [Fech::Mapped, Array, false] the mapped or raw row, or false when
#   the row type does not match :parse_if
def parse_row?(row, opts={})
  wanted = opts[:parse_if]
  # Always parse, unless :parse_if is given and does not match row.
  # to_s guards against a blank (nil) first cell.
  unless wanted.nil? || Fech.regexify(wanted).match(row.first.to_s.downcase)
    return false
  end
  opts[:raw] ? row : map(row, opts)
end
|
130
|
+
|
131
|
+
# Maps a raw row to a labeled hash following any rules given in the filing's
# Translator based on its version and row type.
# Finds the correct map for a given row, performs any matching Translations
# on the individual values, and returns either the entire dataset, or just
# those fields requested.
#
# @param [Array] row the row's cells; the first cell is the row type
# @option opts [Array] :include list of field names that should be included
#   in the returned hash
# @return [Fech::Mapped] the labeled values, plus any combined fields
def map(row, opts={})
  row_type = row.first
  data = Fech::Mapped.new(self, row_type)
  full_row_map = map_for(row_type)

  # If specific fields were asked for, return only those
  row_map = if opts[:include]
    full_row_map.select { |k| opts[:include].include?(k) }
  else
    full_row_map
  end

  # Column of the first occurrence of each field name, computed once
  # instead of scanning the full map for every field.
  columns = {}
  full_row_map.each_with_index do |field, position|
    columns[field] = position unless columns.key?(field)
  end

  # Inserts the row into data, performing any specified preprocessing
  # on individual cells along the way
  row_map.each do |field|
    value = row[columns[field]]
    translator.get_translations(:row => row_type,
        :version => filing_version, :action => :convert,
        :field => field).each do |translation|
      # User's Procs should be given each field's value as context
      value = translation[:proc].call(value)
    end
    data[field] = value
  end

  # Performs any specified group preprocessing / combinations
  combinations = translator.get_translations(:row => row_type,
      :version => filing_version, :action => :combine)
  # Zip against the FULL map: zipping the :include-filtered map would pair
  # the remaining labels with the wrong columns.
  row_hash = hash_zip(full_row_map, row)
  combinations.each do |translation|
    # User's Procs should be given the entire row as context
    value = translation[:proc].call(row_hash)
    field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
    data[field] = value
  end

  data
end
|
176
|
+
|
177
|
+
# Returns the column names for given row type and the filing's version
# in the order they appear in row data.
#
# @param [String, Regexp] row_type representation of the row desired
# @return the map that this filing's Mappings instance holds for row_type
def map_for(row_type)
  mappings.for_row(row_type)
end
|
183
|
+
|
184
|
+
# Returns the column names for given row type and version in the order
# they appear in row data. Class-level counterpart of #map_for that does not
# need a downloaded filing.
#
# @param [String, Regexp] row_type representation of the row desired
# @option opts [String, Regexp] :version representation of the version desired
def self.map_for(row_type, opts={})
  Fech::Mappings.for_row(row_type, opts)
end
|
191
|
+
|
192
|
+
# Gives access to the filing's Translator, either by yielding it to the
# block or, without a block, by returning it.
#
# @yield [t] returns a reference to the filing's Translator
# @yieldparam [Translator] the filing's Translator
# @return the block's result, or the Translator when no block is given
def translate(&block)
  return translator unless block_given?
  yield translator
end
|
201
|
+
|
202
|
+
# Whether this filing amends a previous filing or not.
#
# @return [Boolean] true when #amends returns anything other than nil
def amendment?
  !amends.nil?
end
|
206
|
+
|
207
|
+
# Returns the filing ID of the past filing this one amends,
# nil if this is a first-draft filing.
# :report_id in the HDR line references the amended filing.
def amends
  header[:report_id]
end
|
213
|
+
|
214
|
+
# Combines an array of keys and values into an Fech::Mapped object,
# a type of Hash. Values are kept as they are, so a cell that is itself an
# Array stays one value. Values beyond the last key are dropped.
#
# @param [Array] keys the desired keys for the new hash
# @param [Array] values the desired values for the new hash; the first
#   value is also used as the Mapped object's row type
# @return [Fech::Mapped, Hash]
def hash_zip(keys, values)
  # Hash[pairs] rather than Hash[*pairs.flatten]: flattening would unpack
  # array-valued cells and break the key/value pairing.
  Fech::Mapped.new(self, values.first).merge(Hash[keys.zip(values)])
end
|
222
|
+
|
223
|
+
# The version of the FEC software used to generate this Filing.
# Parsed from the file on first use, then memoized.
def filing_version
  @filing_version ||= parse_filing_version
end
|
227
|
+
|
228
|
+
# Pulls out the version number from the header line.
# Must parse this line manually, since we don't know the version yet, and
# thus the delimiter type is still a mystery.
#
# @return [String] the third field of the first line of the file
def parse_filing_version
  # Block form so the file handle is closed as soon as the line is read.
  first = File.open(file_path) { |file| file.first }
  if first.index("\034").nil?
    @csv_parser.parse(first).flatten[2]
  else
    # The line contains the "\034" separator, so split on that instead.
    @csv_parser.parse(first, :col_sep => "\034").flatten[2]
  end
end
|
239
|
+
|
240
|
+
# Only FEC format 3.00 + is supported.
#
# @return [Boolean] whether the filing's major version is at least 3
def readable?
  filing_version.to_i >= 3
end
|
244
|
+
|
245
|
+
# Gets or creates the Mappings instance for this filing_version.
def mappings
  @mapping ||= Fech::Mappings.new(filing_version)
end
|
249
|
+
|
250
|
+
# The location of the Filing on the file system: #file_name inside
# the download directory.
def file_path
  File.join(download_dir, file_name)
end
|
254
|
+
|
255
|
+
# The raw contents of the Filing.
#
# @return [File] a handle on #file_path opened for reading; it is not closed
#   here, so the caller is responsible for closing it
def file_contents
  File.open(file_path, 'r')
end
|
259
|
+
|
260
|
+
# Determine the form type of the filing before it's been parsed. This is
# needed for the F99 special case.
#
# @return [String, nil] the first delimited field of the file's second line,
#   or nil when the file has fewer than two lines
def form_type
  io = file_contents
  begin
    # IO#lines was removed in Ruby 3.0; iterate the handle directly.
    io.each_with_index do |row, index|
      next if index == 0
      return row.split(delimiter).first
    end
    nil
  ensure
    # Runs on the early return as well, so the handle never leaks.
    io.close unless io.closed?
  end
end
|
269
|
+
|
270
|
+
# The file path where custom versions of a filing are to be saved:
# the regular file name prefixed with "fech_", in the download directory.
def custom_file_path
  File.join(download_dir, "fech_#{file_name}")
end
|
275
|
+
|
276
|
+
# Handle the contents of F99s by removing the [BEGINTEXT] and [ENDTEXT]
# delimiters and putting the text content onto the same line as the summary,
# as one extra quoted field (embedded double quotes are doubled).
# Also marks the filing as customized.
#
# @return [String] the file's contents with every text block folded in, or
#   the unchanged contents when there is no such block
def fix_f99_contents
  @customized = true
  io = file_contents
  content = begin
    io.read
  ensure
    io.close
  end

  regex = /\n\[BEGINTEXT\]\n(.*?)\[ENDTEXT\]\n/mi # some use eg [EndText]
  # Block form: each block is replaced with its own text, and backslashes
  # in the text are not read as back-references by gsub.
  content.gsub(regex) do
    text = Regexp.last_match(1)
    "#{delimiter}\"#{text.gsub(/"/, '""')}\""
  end
end
|
292
|
+
|
293
|
+
# Resave the "fixed" version of an F99 to #custom_file_path.
# Does the work once per Filing object; later calls return immediately.
#
# @return [true]
def resave_f99_contents
  return true if @resaved
  File.open(custom_file_path, 'w') { |f| f.write(fix_f99_contents) }
  @resaved = true
end
|
299
|
+
|
300
|
+
# The filing's local file name: its ID with a .fec extension.
def file_name
  "#{filing_id}.fec"
end
|
303
|
+
|
304
|
+
# The URL the filing is downloaded from.
# NOTE(review): the host is hard-coded - confirm it is still the current
# location for posted filings.
def filing_url
  "http://query.nictusa.com/dcdev/posted/#{filing_id}.fec"
end
|
307
|
+
|
308
|
+
# Iterates over and yields the Filing's lines.
#
# @option opts [Boolean] :with_index yield both the item and its index
# @yield [Array] a row of the filing, split by the delimiter from #delimiter
# @raise [RuntimeError] if the filing has not been downloaded yet
def each_row(opts={}, &block)
  # File.exist? works on every Ruby; File.exists? was removed in 3.2.
  unless File.exist?(file_path)
    raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
  end

  # If this is an F99, we need to parse it differently.
  resave_f99_contents if form_type == 'F99'

  source = @customized ? custom_file_path : file_path
  index = 0
  @csv_parser.parse_row(source, :col_sep => delimiter, :quote_char => @quote_char, :skip_blanks => true) do |row|
    if opts[:with_index]
      yield [row, index]
      index += 1
    else
      yield row
    end
  end
end
|
329
|
+
|
330
|
+
# Wrapper around .each_row to include indexes.
#
# @yield [Array] a [row, index] pair for every row
def each_row_with_index(&block)
  each_row(:with_index => true, &block)
end
|
334
|
+
|
335
|
+
# Versions below 6 are comma-delimited; later versions use the "\034"
# character.
#
# @return [String] the delimiter used in the filing's version
def delimiter
  filing_version.to_f < 6 ? "," : "\034"
end
|
339
|
+
|
340
|
+
end
|
341
|
+
end
|
@@ -0,0 +1,233 @@
|
|
1
|
+
module Fech
|
2
|
+
|
3
|
+
# Helper class to generate mapping hashes from source csv data.
|
4
|
+
# Needed to rebuild rendered_maps.rb with new source data, not used
|
5
|
+
# in main gem.
|
6
|
+
# rake fech:maps
|
7
|
+
class MapGenerator
|
8
|
+
|
9
|
+
attr_accessor :map

# Filing versions that have a header summary file, newest first.
FILING_VERSIONS = ["8.0", "7.0", "6.4", "6.3", "6.2", "6.1",
                   "5.3", "5.2", "5.1", "5.0", "3"].freeze

# Row types for which a row map is rendered.
BASE_ROW_TYPES = ["HDR", "F1", "F1M", "F2", "F24", "F3", "F3L", "F3P", "F3P31", "F3PS", "F3S", "F3X",
                  "F4", "F5", "F56", "F57", "F6", "F65", "F7", "F76", "F9", "F91", "F92", "F93", "F94", "F99",
                  "H1", "H2", "H3", "H4", "H5", "H6",
                  "SchA", "SchB", "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "SchL", "TEXT"].freeze

# Source file name (without extension) => matcher from FechUtils::ROW_TYPES
# whose source becomes the row type's key in the rendered maps.
ROW_TYPE_MATCHERS = {
  "HDR"   => FechUtils::ROW_TYPES[:hdr],
  "F1"    => FechUtils::ROW_TYPES[:f1],
  "F1M"   => FechUtils::ROW_TYPES[:f1m],
  "F2"    => FechUtils::ROW_TYPES[:f2],
  "F24"   => FechUtils::ROW_TYPES[:f24],
  "F3"    => FechUtils::ROW_TYPES[:f3],
  "F3L"   => FechUtils::ROW_TYPES[:f3l],
  "F3P"   => FechUtils::ROW_TYPES[:f3p],
  "F3S"   => FechUtils::ROW_TYPES[:f3s],
  "F3P31" => FechUtils::ROW_TYPES[:f3p31],
  "F3PS"  => FechUtils::ROW_TYPES[:f3ps],
  "F3X"   => FechUtils::ROW_TYPES[:f3x],
  "F4"    => FechUtils::ROW_TYPES[:f4],
  "F5"    => FechUtils::ROW_TYPES[:f5],
  "F56"   => FechUtils::ROW_TYPES[:f56],
  "F57"   => FechUtils::ROW_TYPES[:f57],
  "F6"    => FechUtils::ROW_TYPES[:f6],
  "F65"   => FechUtils::ROW_TYPES[:f65],
  "F7"    => FechUtils::ROW_TYPES[:f7],
  "F76"   => FechUtils::ROW_TYPES[:f76],
  "F9"    => FechUtils::ROW_TYPES[:f9],
  "F91"   => FechUtils::ROW_TYPES[:f91],
  "F92"   => FechUtils::ROW_TYPES[:f92],
  "F93"   => FechUtils::ROW_TYPES[:f93],
  "F94"   => FechUtils::ROW_TYPES[:f94],
  "F99"   => FechUtils::ROW_TYPES[:f99],
  "H1"    => FechUtils::ROW_TYPES[:h1],
  "H2"    => FechUtils::ROW_TYPES[:h2],
  "H3"    => FechUtils::ROW_TYPES[:h3],
  "H4"    => FechUtils::ROW_TYPES[:h4],
  "H5"    => FechUtils::ROW_TYPES[:h5],
  "H6"    => FechUtils::ROW_TYPES[:h6],
  "SchA"  => FechUtils::ROW_TYPES[:sa],
  "SchB"  => FechUtils::ROW_TYPES[:sb],
  "SchC"  => FechUtils::ROW_TYPES[:sc],
  "SchC1" => FechUtils::ROW_TYPES[:sc1],
  "SchC2" => FechUtils::ROW_TYPES[:sc2],
  "SchD"  => FechUtils::ROW_TYPES[:sd],
  "SchE"  => FechUtils::ROW_TYPES[:se],
  "SchF"  => FechUtils::ROW_TYPES[:sf],
  "SchL"  => FechUtils::ROW_TYPES[:sl],
  "TEXT"  => FechUtils::ROW_TYPES[:text],
}.freeze
|
60
|
+
|
61
|
+
# Goes through all version header summary files and generates
# row map files for each type of row inside them.
#
# Side effects: every header summary file is rewritten in place with
# unparseable characters removed, and each row map file that already exists
# under <source_dir>/rows is overwritten. Row types without an existing row
# map file are skipped.
#
# @param [String] source_dir directory holding the headers/ and rows/ files
def self.convert_header_file_to_row_files(source_dir)
  data = {}
  hybrid_data = {}

  ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters
    raw = File.open(filepath) { |f| f.read }
    if RUBY_VERSION < "1.9.3"
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      valid_string = ic.iconv(raw << ' ')[0..-2]
    else
      valid_string = (raw << ' ')[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode!('UTF-8', 'UTF-16')
    end
    # Block form closes the file, so the cleaned text is on disk before the
    # CSV reader below opens it again.
    File.open(filepath, 'w') { |f| f.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # Each row of a version summary file contains the ordered list of
      # column names.
      row_type = row.first
      data[row_type] ||= {}
      hybrid_data[row_type] ||= {}
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on
      # to it instead of creating a new one.
      data[row_type][version] = row_version_data
      # Walk a snapshot, because matching entries are deleted as we go.
      data[row_type].to_a.each do |k, v|
        # skip the row we just added
        next if k == version
        next unless v == row_version_data

        # Create the new hybrid entry
        hybrid_data[row_type]["#{k}|#{version}"] = row_version_data

        # Delete the old entry, and the one for this version only
        data[row_type].delete(k)
        data[row_type].delete(version)
      end
      data[row_type].update(hybrid_data[row_type])
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    next unless File.exist?(file_path)
    File.open(file_path, 'w') do |f|
      f.write('canonical')

      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        # One column of 1-based positions, one column of descriptions.
        to_transpose << ["^#{version}", (1..version_data.size).to_a].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size
      max_size = to_transpose.map { |r| r.size }.max
      to_transpose.each { |r| r[max_size - 1] ||= nil }
      transposed = to_transpose.transpose

      transposed.each do |transposed_data|
        transposed_data.collect! { |x| x.to_s.gsub(/\r/, ' ') }
        canonical = transposed_data[1] # first description
        if canonical
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          transposed_data = [canonical, transposed_data].flatten
        end
        f.write(transposed_data.join(','))
        f.write("\n")
      end
    end
  end

end
|
151
|
+
|
152
|
+
# Generates the mapping for each row type in BASE_ROW_TYPES, writes them out
# to file for inclusion in the gem.
#
# @param [String] source_dir directory holding the row map CSV files
# @param [String] file_path Ruby file to (over)write with RENDERED_MAPS
def self.dump_row_maps_to_ruby(source_dir, file_path)
  File.open(file_path, 'w') do |f|
    f.write("# Generated automatically by Fech::MapGenerator.\n\n")
    f.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
    f.write("# contain an entry for each distinct map between a row's labels and the\n")
    f.write("# indexes where their values can be found.\n")
    f.write("module Fech\n")
    f.write(" RENDERED_MAPS = {\n")
    BASE_ROW_TYPES.each do |row_type|
      f.write(" \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")
      generate_row_map_from_file(source_dir, row_type).sort_by(&:first).reverse.each do |version, fields|
        symbols = fields.map do |field|
          # Drop a leading "<digits>_" prefix; unmapped columns become nil.
          name = field.to_s.gsub(/^\d+_?/, "")
          name == "" ? "nil" : ":#{name}"
        end
        f.write(" '#{version}' => [#{symbols.join(', ')}],\n")
      end
      f.write(" },\n")
    end
    f.write(" }\n")
    f.write("end")
  end
end
|
173
|
+
|
174
|
+
# For a given row type, parses its source file and returns
# a mapping object for it.
#
# The file's "canonical" row names the versions; every following row starts
# with a canonical field name and holds, in each version's column, the
# 1-based position of that field in that version. Blank lines and rows
# without a canonical name are ignored.
#
# @param [String] source_dir directory holding the row map CSV files
# @param [String] row_type row type whose file should be read
# @return [Hash] version key => Array of field symbols (nil for gaps)
def self.generate_row_map_from_file(source_dir, row_type)
  versions = []
  version_indexes = []
  data = {}
  text = File.open(row_map_file(source_dir, row_type)) { |f| f.read }
  # Files containing carriage returns are split on those instead.
  split_char = text.index(/\r/) ? /\r/ : /\n/
  rows = text.split(split_char).collect { |line| line.split(',') }
  rows.each do |row|
    row = row.collect { |cell| cell.gsub("\n", "") }
    # A blank line splits into an empty array; there is nothing to map.
    next if row.first.nil?

    if row.first.downcase == "canonical"
      versions = row[1..-1].uniq.reject { |x| x.nil? || x.empty? }
      row.each_with_index { |x, ind| version_indexes << ind unless (x.nil? || x.empty?) }
      # Drop the index of the "canonical" cell itself.
      version_indexes.slice!(0, 1)
      versions.each { |x| data[x] = [] }

    elsif row.first.size > 0
      canonical = row.first

      versions.zip(version_indexes).each do |version, row_index|
        index = row[row_index]
        data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
      end
    end
  end

  data.dup
end
|
208
|
+
|
209
|
+
# Remove both the row type from the beginning of the row,
# and any fields marked as "ignore" in sources/headers/ignore.csv.
# nil cells are dropped as well.
#
# @param [Array] row a row whose first cell is the row type
# @param [Array] ignore field names to leave out
# @return [Array] the remaining fields, in their original order
def self.remove_ignored_fields(row, ignore)
  # drop(1) strips off the row type and, unlike row[1..-1], returns an
  # empty array for an empty row.
  row.drop(1).compact.reject { |field| ignore.include?(field) }
end
|
215
|
+
|
216
|
+
# Path of the row map CSV that is read for a row type.
#
# @param [String] source_dir directory holding the row map CSV files
# @param [String, Symbol] row_type row type, used as the file's base name
def self.row_map_file(source_dir, row_type)
  File.join(source_dir, "#{row_type}.csv")
end
|
219
|
+
|
220
|
+
# Path of the file listing the header fields to ignore.
#
# @param [String] source_dir directory holding the headers/ folder
def self.ignored_fields_file(source_dir)
  File.join(source_dir, 'headers', 'ignore.csv')
end
|
223
|
+
|
224
|
+
# Path of the header summary file for a filing version.
#
# @param [String] source_dir directory holding the headers/ folder
# @param [String, Numeric] version filing version, used as the file's base name
def self.version_summary_file(source_dir, version)
  File.join(source_dir, 'headers', "#{version}.csv")
end
|
227
|
+
|
228
|
+
# Path, under rows/, of the row map template that is written for a row type.
#
# @param [String] source_dir directory holding the rows/ folder
# @param [String, Symbol] row_type row type, used as the file's base name
def self.write_row_map_file(source_dir, row_type)
  File.join(source_dir, 'rows', "#{row_type}.csv")
end
|
231
|
+
|
232
|
+
end
|
233
|
+
end
|
data/lib/fech/mapped.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
module Fech

  # Fech::Mapped is a thin wrapper around Hash which allows values to be
  # referenced either by key or by an alias specified in the associated
  # Filing's Translations. Values can also be read with method syntax
  # (row.some_field).
  class Mapped < Hash

    attr_accessor :filing, :row_type
    alias :old_bracket :[]

    # @param [Object] filing the owning filing; its translator's aliases are
    #   consulted for keys that are not present
    # @param [String] row_type the type of the row this hash represents,
    #   matched against each alias's :row pattern
    def initialize(filing, row_type)
      super()
      @filing   = filing
      @row_type = row_type
    end

    # Just calls Hash's [] method, unless the specified key doesn't
    # exist, in which case it checks for any aliases on the filing's
    # translator.
    #
    # @return the stored value, the aliased key's value, or nil
    def [](key, &block)
      return old_bracket(key, &block) if has_key?(key)

      target = alias_target(key)
      target ? old_bracket(target, &block) : nil
    end

    # Any unknown method is treated as a key lookup; arguments are ignored
    # and unknown names give nil.
    def method_missing(method, *args, &block)
      self[method]
    end

    # Keeps respond_to? truthful for the names that resolve to a stored key
    # or to an alias.
    def respond_to_missing?(method, include_private = false)
      has_key?(method) || !alias_target(method).nil? || super
    end

    private

    # Finds the key that an alias stands for on this row type.
    # Aliases are searched in reverse, to find the most recent one, and an
    # alias pointing at itself is ignored (no obvious recursion).
    #
    # @return the aliased key, or nil when there is no usable alias
    def alias_target(key)
      return nil unless filing

      found = filing.translator.aliases.reverse.detect do |a|
        a[:alias] == key && a[:row].match(row_type) && a[:alias] != a[:for]
      end
      found ? found[:for] : nil
    end

  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Fech
  # Raised when no map exists for a requested row type or version.
  class VersionError < RuntimeError; end

  # Fech::Mappings loads a set of master mappings between labels and where
  # their values can be found in Electronic Filings for various row types
  # and versions.
  # To access a map, call Mappings.for_row with the row_type,
  # and optionally the version:
  #   Mappings.for_row("SA", :version => 6.1)
  class Mappings

    attr_accessor :map, :version

    # @param [String] ver the filing version whose maps this instance serves
    def initialize(ver = Fech::DEFAULT_VERSION)
      @version = ver
      @map     = load_map
      @cache   = {}
    end

    # Returns the map for rows of the given row_type at this instance's
    # version. Results are cached per row_type.
    #
    # @param [String,Symbol] row_type the row type whose map to find
    def for_row(row_type)
      @cache[row_type] ||= self.class.for_row(row_type, :version => @version)
    end

    # Returns the basic, default mappings hash (see Mappings.load_map).
    def load_map
      self.class.load_map
    end

    # @return [Hash] the pre-rendered maps, Fech::RENDERED_MAPS
    def self.load_map
      Fech::RENDERED_MAPS
    end

    # Given a row type, first find the entire block of maps for that row type.
    # Then, use the filing's version to choose which specific map set to use,
    # and return it. The opts hash passed in is left untouched.
    #
    # @param [Symbol,String,Regex] row_type the row whose map to find
    # @option opts [String,Regexp] :version the version whose map to find
    #   (defaults to Fech::DEFAULT_VERSION)
    # @raise [VersionError] if the row type or the version has no map
    def self.for_row(row_type, opts={})
      version = opts[:version] || Fech::DEFAULT_VERSION
      row_maps = key_by_regex(load_map, row_type)
      key_by_regex(row_maps, version)
    end

    # Given a Hash whose keys are string representations of regular expressions,
    # return the value whose key best matches the given label.
    #
    # @param [Hash] hash a Hash with string regular expressions for keys
    # @param [String,Symbol,Regexp] label return the key that best matches this
    # @raise [VersionError] if no key matches the label
    def self.key_by_regex(hash, label)
      label = label.source if label.is_a?(Regexp)
      text = label.to_s

      # Try matching longer keys first, to ensure more accurate keys are
      # prioritized over less accurate ones.
      hash.keys.sort { |x, y| x.length <=> y.length }.reverse_each do |key|
        return hash[key] if Regexp.new(key, Regexp::IGNORECASE).match(text)
      end

      raise VersionError, "Attempted to access mapping that has not been generated (#{label}). " +
        "Supported keys match the format: #{hash.keys.join(', ')}"
    end

  end
end
|