myl-fech 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +7 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +48 -0
- data/LICENSE +13 -0
- data/README.rdoc +82 -0
- data/Rakefile +3 -0
- data/autotest/discover.rb +1 -0
- data/fech.gemspec +40 -0
- data/lib/fech/comparison.rb +36 -0
- data/lib/fech/csv.rb +70 -0
- data/lib/fech/default_translations.rb +133 -0
- data/lib/fech/fech_utils.rb +76 -0
- data/lib/fech/filing.rb +341 -0
- data/lib/fech/map_generator.rb +233 -0
- data/lib/fech/mapped.rb +38 -0
- data/lib/fech/mappings.rb +67 -0
- data/lib/fech/rendered_maps.rb +238 -0
- data/lib/fech/translator.rb +138 -0
- data/lib/fech/version.rb +3 -0
- data/lib/fech.rb +15 -0
- data/sources/F1.csv +106 -0
- data/sources/F1M.csv +78 -0
- data/sources/F2.csv +43 -0
- data/sources/F24.csv +18 -0
- data/sources/F3.csv +1 -0
- data/sources/F3L.csv +27 -0
- data/sources/F3P.csv +208 -0
- data/sources/F3P31.csv +39 -0
- data/sources/F3PS.csv +94 -0
- data/sources/F3S.csv +36 -0
- data/sources/F3X.csv +125 -0
- data/sources/F4.csv +86 -0
- data/sources/F5.csv +39 -0
- data/sources/F56.csv +33 -0
- data/sources/F57.csv +44 -0
- data/sources/F6.csv +1 -0
- data/sources/F65.csv +1 -0
- data/sources/F7.csv +1 -0
- data/sources/F76.csv +1 -0
- data/sources/F9.csv +46 -0
- data/sources/F91.csv +17 -0
- data/sources/F92.csv +23 -0
- data/sources/F93.csv +27 -0
- data/sources/F94.csv +18 -0
- data/sources/F99.csv +1 -0
- data/sources/H1.csv +1 -0
- data/sources/H2.csv +1 -0
- data/sources/H3.csv +1 -0
- data/sources/H4.csv +1 -0
- data/sources/H5.csv +1 -0
- data/sources/H6.csv +1 -0
- data/sources/HDR.csv +10 -0
- data/sources/SchA.csv +50 -0
- data/sources/SchB.csv +50 -0
- data/sources/SchC.csv +41 -0
- data/sources/SchC1.csv +52 -0
- data/sources/SchC2.csv +19 -0
- data/sources/SchD.csv +34 -0
- data/sources/SchE.csv +57 -0
- data/sources/SchF.csv +55 -0
- data/sources/SchL.csv +1 -0
- data/sources/TEXT.csv +1 -0
- data/sources/headers/3.csv +1 -0
- data/sources/headers/5.0.csv +1 -0
- data/sources/headers/5.1.csv +1 -0
- data/sources/headers/5.2.csv +1 -0
- data/sources/headers/5.3.csv +1 -0
- data/sources/headers/6.1.csv +1 -0
- data/sources/headers/6.2.csv +1 -0
- data/sources/headers/6.3.csv +1 -0
- data/sources/headers/6.4.csv +1 -0
- data/sources/headers/7.0.csv +49 -0
- data/sources/headers/8.0.csv +49 -0
- data/sources/headers/ignore.csv +5 -0
- data/spec/comparison_spec.rb +30 -0
- data/spec/data/467627.fec +608 -0
- data/spec/data/723604.fec +4 -0
- data/spec/data/730635.fec +2 -0
- data/spec/data/747058.fec +4 -0
- data/spec/data/748730.fec +1196 -0
- data/spec/data/752356.fec +5 -0
- data/spec/data/753533.fec +7 -0
- data/spec/data/764901.fec +7 -0
- data/spec/data/765310.fec +2 -0
- data/spec/data/767339.fec +648 -0
- data/spec/data/82094.fec +144 -0
- data/spec/data/97405.fec +10 -0
- data/spec/default_translations_spec.rb +104 -0
- data/spec/fech_utils_spec.rb +29 -0
- data/spec/filing_spec.rb +314 -0
- data/spec/map_generator_spec.rb +49 -0
- data/spec/mapped_spec.rb +44 -0
- data/spec/mappings_spec.rb +46 -0
- data/spec/sources/F24.csv +18 -0
- data/spec/sources/F3P.csv +1 -0
- data/spec/sources/F3P31.csv +39 -0
- data/spec/sources/SchA.csv +1 -0
- data/spec/sources/SchB.csv +1 -0
- data/spec/sources/SchC.csv +1 -0
- data/spec/sources/headers/3.csv +1 -0
- data/spec/sources/headers/5.0.csv +1 -0
- data/spec/sources/headers/5.1.csv +1 -0
- data/spec/sources/headers/5.2.csv +1 -0
- data/spec/sources/headers/5.3.csv +1 -0
- data/spec/sources/headers/6.1.csv +1 -0
- data/spec/sources/headers/6.2.csv +1 -0
- data/spec/sources/headers/6.3.csv +1 -0
- data/spec/sources/headers/6.4.csv +1 -0
- data/spec/sources/headers/7.0.csv +1 -0
- data/spec/sources/headers/8.0.csv +49 -0
- data/spec/sources/headers/ignore.csv +5 -0
- data/spec/sources/sa.csv +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/translator_spec.rb +195 -0
- data/tasks/fech.rake +41 -0
- metadata +342 -0
data/lib/fech/filing.rb
ADDED
@@ -0,0 +1,341 @@
require 'tmpdir'
require 'open-uri'

module Fech

  # Fech::Filing downloads an Electronic Filing given its ID, and will search
  # rows by row type. Using a child Translator object, the data in each row
  # is automatically mapped at runtime into a labeled Hash. Additional
  # Translations may be added to change the way that data is mapped and cleaned.
  class Filing
    # first filing number using the version >=3.00 format
    # note that there are plenty of <v3 filings after this, so readable? still needs to be checked
    FIRST_V3_FILING = 11850

    attr_accessor :filing_id, :download_dir, :translator

    # Create a new Filing object, assign the download directory to system's
    # temp folder by default.
    # @param [String] download_dir override the directory where files should
    #   be downloaded.
    # @param [Symbol,Array] translate a list of built-in translation sets to use
    def initialize(filing_id, opts={})
      @filing_id = filing_id
      @download_dir = opts[:download_dir] || Dir.tmpdir
      @translator = Fech::Translator.new(:include => opts[:translate])
      @quote_char = opts[:quote_char] || '"'
      @csv_parser = opts[:csv_parser] || Fech::Csv
      @resaved = false
      @customized = false
    end

    # Saves the filing data from the FEC website into the default download
    # directory.
    def download
      File.open(file_path, 'w') do |file|
        file << open(filing_url).read
      end
      self
    end

    # This downloads ALL the filings.
    #
    # Because this trashes the zip files after extraction (to save space), while it is safe to rerun, it has to do the whole thing over again.
    # Update operations should just iterate single file downloads starting from the current+1th filing number.
    #
    # This takes a very long time to run - on the order of an hour or two, depending on your bandwidth.
    #
    # WARNING: As of July 9, 2012, this downloads 536964 files (25.8 GB), into one directory.
    # This means that the download directory will break bash file globbing (so e.g. ls and rm *.fec will not work).
    # If you want to get all of it, make sure to download only to a dedicated FEC filings directory.
    def self.download_all download_dir
      `cd #{download_dir} && ftp -a ftp.fec.gov:/FEC/electronic/*.zip`
      `cd #{download_dir} && for z in *.zip; do unzip -o $z && rm $z; done`
      Dir[File.join(download_dir, '*.fec')].count
    end

    # Runs the passed block on every downloaded .fec file. Pass the same options hash as you would to Fech::Filing.new.
    # E.g. for_all(:download_dir => Rails.root.join('db', 'data', 'fec', 'filings', :csv_parser => Fech::CsvDoctor, ...) {|filing| ... }
    # filing.download is of course unnecessary.
    #
    # note that if there are a lot of files (e.g. after download_all), just listing them to prepare for this will take several seconds
    def self.for_all options = {}
      options[:download_dir] ||= Dir.tmpdir
      # .sort{|x| x.scan/\d+/.to_i } # should be no need to spend time on sort, since the file system should already do that
      Dir[File.join(options[:download_dir], '*.fec')].each do |file|
        yield Fech::Filing.new(file.scan(/(\d+)\.fec/)[0][0].to_i, options)
      end
    end

    # Access the header (first) line of the filing, containing information
    # about the filing's version and metadata about the software used to file it.
    # @return [Hash] a hash that assigns labels to the values of the filing's header row
    def header(opts={})
      each_row do |row|
        return parse_row?(row)
      end
    end

    # Access the summary (second) line of the filing, containing aggregate and
    # top-level information about the filing.
    # @return [Hash] a hash that assigns labels to the values of the filing's summary row
    def summary
      each_row_with_index do |row, index|
        next if index == 0
        return parse_row?(row)
      end
    end

    # Access all lines of the filing that match a given row type. Will return an
    # Array of all available lines if called directly, or will yield the mapped
    # rows one by one if a block is passed.
    #
    # @param [String, Regexp] row_type a partial or complete name of the type of row desired
    # @option opts [Boolean] :raw should the function return the data as an array
    #   that has not been mapped to column names
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @yield [Hash] each matched row's data, as either a mapped hash or raw array
    # @return [Array] the complete set of mapped hashes for matched lines
    def rows_like(row_type, opts={}, &block)
      data = []
      each_row do |row|
        value = parse_row?(row, opts.merge(:parse_if => row_type))
        next if value == false
        if block_given?
          yield value
        else
          data << value if value
        end
      end
      block_given? ? nil : data
    end

    # Decides what to do with a given row. If the row's type matches the desired
    # type, or if no type was specified, it will run the row through #map.
    # If :raw was passed true, a flat, unmapped data array will be returned.
    #
    # @param [String, Regexp] row a partial or complete name of the type of row desired
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    def parse_row?(row, opts={})
      # Always parse, unless :parse_if is given and does not match row
      if opts[:parse_if].nil? || \
          Fech.regexify(opts[:parse_if]).match(row.first.downcase)
        opts[:raw] ? row : map(row, opts)
      else
        false
      end
    end

    # Maps a raw row to a labeled hash following any rules given in the filing's
    # Translator based on its version and row type.
    # Finds the correct map for a given row, performs any matching Translations
    # on the individual values, and returns either the entire dataset, or just
    # those fields requested.
    # @param [String, Regexp] row a partial or complete name of the type of row desired
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    def map(row, opts={})
      data = Fech::Mapped.new(self, row.first)
      full_row_map = map_for(row.first)

      # If specific fields were asked for, return only those
      if opts[:include]
        row_map = full_row_map.select { |k| opts[:include].include?(k) }
      else
        row_map = full_row_map
      end

      # Inserts the row into data, performing any specified preprocessing
      # on individual cells along the way
      row_map.each_with_index do |field, index|
        value = row[full_row_map.index(field)]
        translator.get_translations(:row => row.first,
            :version => filing_version, :action => :convert,
            :field => field).each do |translation|
          # User's Procs should be given each field's value as context
          value = translation[:proc].call(value)
        end
        data[field] = value
      end

      # Performs any specified group preprocessing / combinations
      combinations = translator.get_translations(:row => row.first,
          :version => filing_version, :action => :combine)
      row_hash = hash_zip(row_map, row) if combinations
      combinations.each do |translation|
        # User's Procs should be given the entire row as context
        value = translation[:proc].call(row_hash)
        field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
        data[field] = value
      end

      data
    end

    # Returns the column names for given row type and the filing's version
    # in the order they appear in row data.
    # @param [String, Regexp] row_type representation of the row desired
    def map_for(row_type)
      mappings.for_row(row_type)
    end

    # Returns the column names for given row type and version in the order
    # they appear in row data.
    # @param [String, Regexp] row_type representation of the row desired
    # @option opts [String, Regexp] :version representation of the version desired
    def self.map_for(row_type, opts={})
      Fech::Mappings.for_row(row_type, opts)
    end

    # @yield [t] returns a reference to the filing's Translator
    # @yieldparam [Translator] the filing's Translator
    def translate(&block)
      if block_given?
        yield translator
      else
        translator
      end
    end

    # Whether this filing amends a previous filing or not.
    def amendment?
      !amends.nil?
    end

    # Returns the filing ID of the past filing this one amends,
    # nil if this is a first-draft filing.
    # :report_id in the HDR line references the amended filing
    def amends
      header[:report_id]
    end

    # Combines an array of keys and values into an Fech::Mapped object,
    # a type of Hash.
    # @param [Array] keys the desired keys for the new hash
    # @param [Array] values the desired values for the new hash
    # @return [Fech::Mapped, Hash]
    def hash_zip(keys, values)
      Fech::Mapped.new(self, values.first).merge(Hash[*keys.zip(values).flatten])
    end

    # The version of the FEC software used to generate this Filing
    def filing_version
      @filing_version ||= parse_filing_version
    end

    # Pulls out the version number from the header line.
    # Must parse this line manually, since we don't know the version yet, and
    # thus the delimiter type is still a mystery.
    def parse_filing_version
      first = File.open(file_path).first
      if first.index("\034").nil?
        @csv_parser.parse(first).flatten[2]
      else
        @csv_parser.parse(first, :col_sep => "\034").flatten[2]
      end
    end

    # Only FEC format 3.00 + is supported
    def readable?
      filing_version.to_i >= 3
    end

    # Gets or creats the Mappings instance for this filing_version
    def mappings
      @mapping ||= Fech::Mappings.new(filing_version)
    end

    # The location of the Filing on the file system
    def file_path
      File.join(download_dir, file_name)
    end

    # The raw contents of the Filing
    def file_contents
      File.open(file_path, 'r')
    end

    # Determine the form type of the filing
    # before it's been parsed. This is needed
    # for the F99 special case.
    def form_type
      file_contents.lines.each_with_index do |row, index|
        next if index == 0
        return row.split(delimiter).first
      end
    end

    # The file path where custom versions
    # of a filing are to be saved.
    def custom_file_path
      File.join(download_dir, "fech_#{file_name}")
    end

    # Handle the contents of F99s by removing the
    # [BEGINTEXT] and [ENDTEXT] delimiters and
    # putting the text content onto the same
    # line as the summary.
    def fix_f99_contents
      @customized = true
      content = file_contents.read
      regex = /\n\[BEGINTEXT\]\n(.*?)\[ENDTEXT\]\n/mi # some use eg [EndText]
      match = content.match(regex)
      if match
        repl = match[1].gsub(/"/, '""')
        content.gsub(regex, "#{delimiter}\"#{repl}\"")
      else
        content
      end
    end

    # Resave the "fixed" version of an F99
    def resave_f99_contents
      return true if @resaved
      File.open(custom_file_path, 'w') { |f| f.write(fix_f99_contents) }
      @resaved = true
    end

    def file_name
      "#{filing_id}.fec"
    end

    def filing_url
      "http://query.nictusa.com/dcdev/posted/#{filing_id}.fec"
    end

    # Iterates over and yields the Filing's lines
    # @option opts [Boolean] :with_index yield both the item and its index
    # @yield [Array] a row of the filing, split by the delimiter from #delimiter
    def each_row(opts={}, &block)
      unless File.exists?(file_path)
        raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
      end

      # If this is an F99, we need to parse it differently.
      resave_f99_contents if form_type == 'F99'

      c = 0
      @csv_parser.parse_row(@customized ? custom_file_path : file_path, :col_sep => delimiter, :quote_char => @quote_char, :skip_blanks => true) do |row|
        if opts[:with_index]
          yield [row, c]
          c += 1
        else
          yield row
        end
      end
    end

    # Wrapper around .each_row to include indexes
    def each_row_with_index(&block)
      each_row(:with_index => true, &block)
    end

    # @return [String] the delimiter used in the filing's version
    def delimiter
      filing_version.to_f < 6 ? "," : "\034"
    end

  end
end
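
For orientation, a minimal usage sketch of the Filing class added above. The filing ID, download directory, and the :contribution_amount field name are illustrative only; the labels available on each row depend on the filing's form type and version map.

  require 'fech'

  filing = Fech::Filing.new(723604, :download_dir => '/tmp/fec')  # illustrative ID and path
  filing.download            # fetches 723604.fec from the FEC site into the download dir

  filing.filing_version      # version string taken from the HDR row, e.g. "8.0"
  filing.header              # HDR row as a labeled Fech::Mapped hash
  filing.summary             # second row: top-level totals for the filing

  # Stream every Schedule A row; passing a block avoids building the whole array.
  filing.rows_like(/^sa/) do |row|
    row[:contribution_amount]   # field name is illustrative; see the SchA map for this version
  end

  filing.amends if filing.amendment?   # ID of the filing this one amends, if any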
data/lib/fech/map_generator.rb
ADDED
@@ -0,0 +1,233 @@
module Fech

  # Helper class to generate mapping hashes from source csv data.
  # Needed to rebuild rendered_maps.rb with new source data, not used
  # in main gem.
  # rake fech:maps
  class MapGenerator

    attr_accessor :map
    FILING_VERSIONS = ["8.0", "7.0", "6.4", "6.3", "6.2", "6.1",
                       "5.3", "5.2", "5.1", "5.0", "3"]
    BASE_ROW_TYPES = ["HDR", "F1", "F1M", "F2", "F24", "F3", "F3L", "F3P", "F3P31", "F3PS", "F3S", "F3X",
                      "F4", "F5", "F56", "F57", "F6", "F65", "F7", "F76", "F9", "F91", "F92", "F93", "F94", "F99",
                      "H1", "H2", "H3", "H4", "H5", "H6",
                      "SchA", "SchB", "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "SchL", "TEXT"]
    ROW_TYPE_MATCHERS = {
      "HDR" => FechUtils::ROW_TYPES[:hdr],
      "F1" => FechUtils::ROW_TYPES[:f1],
      "F1M" => FechUtils::ROW_TYPES[:f1m],
      "F2" => FechUtils::ROW_TYPES[:f2],
      "F24" => FechUtils::ROW_TYPES[:f24],
      "F3" => FechUtils::ROW_TYPES[:f3],
      "F3L" => FechUtils::ROW_TYPES[:f3l],
      "F3P" => FechUtils::ROW_TYPES[:f3p],
      "F3S" => FechUtils::ROW_TYPES[:f3s],
      "F3P31" => FechUtils::ROW_TYPES[:f3p31],
      "F3PS" => FechUtils::ROW_TYPES[:f3ps],
      "F3X" => FechUtils::ROW_TYPES[:f3x],
      "F4" => FechUtils::ROW_TYPES[:f4],
      "F5" => FechUtils::ROW_TYPES[:f5],
      "F56" => FechUtils::ROW_TYPES[:f56],
      "F57" => FechUtils::ROW_TYPES[:f57],
      "F6" => FechUtils::ROW_TYPES[:f6],
      "F65" => FechUtils::ROW_TYPES[:f65],
      "F7" => FechUtils::ROW_TYPES[:f7],
      "F76" => FechUtils::ROW_TYPES[:f76],
      "F9" => FechUtils::ROW_TYPES[:f9],
      "F91" => FechUtils::ROW_TYPES[:f91],
      "F92" => FechUtils::ROW_TYPES[:f92],
      "F93" => FechUtils::ROW_TYPES[:f93],
      "F94" => FechUtils::ROW_TYPES[:f94],
      "F99" => FechUtils::ROW_TYPES[:f99],
      "H1" => FechUtils::ROW_TYPES[:h1],
      "H2" => FechUtils::ROW_TYPES[:h2],
      "H3" => FechUtils::ROW_TYPES[:h3],
      "H4" => FechUtils::ROW_TYPES[:h4],
      "H5" => FechUtils::ROW_TYPES[:h5],
      "H6" => FechUtils::ROW_TYPES[:h6],
      "SchA" => FechUtils::ROW_TYPES[:sa],
      "SchB" => FechUtils::ROW_TYPES[:sb],
      "SchC" => FechUtils::ROW_TYPES[:sc],
      "SchC1" => FechUtils::ROW_TYPES[:sc1],
      "SchC2" => FechUtils::ROW_TYPES[:sc2],
      "SchD" => FechUtils::ROW_TYPES[:sd],
      "SchE" => FechUtils::ROW_TYPES[:se],
      "SchF" => FechUtils::ROW_TYPES[:sf],
      "SchL" => FechUtils::ROW_TYPES[:sl],
      "TEXT" => FechUtils::ROW_TYPES[:text],
    }

    # Goes through all version header summary files and generates
    # row map files for each type of row inside them.
    def self.convert_header_file_to_row_files(source_dir)
      data = {}
      hybrid_data = {}

      ignored_fields = File.open(ignored_fields_file(source_dir)).readlines.map { |l| l.strip }

      # Create a hash of data with an entry for each row type found in the source
      # version summary files. Each row has an entry for each version map that
      # exists for it. If maps for two different versions are identical, they
      # are combined.
      FILING_VERSIONS.each do |version|
        filepath = version_summary_file(source_dir, version)

        # Clean the source files by removing unparseable characters
        if RUBY_VERSION < "1.9.3"
          require 'iconv'
          ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
          valid_string = ic.iconv(open(filepath).read << ' ')[0..-2]
        else
          valid_string = (open(filepath).read << ' ')[0..-2].encode!('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
          valid_string = valid_string.encode!('UTF-8', 'UTF-16')
        end
        open(filepath, 'w').write(valid_string)

        Fech::Csv.foreach(filepath) do |row|
          # Each row of a version summary file contains the ordered list of
          # column names.
          data[row.first] ||= {}
          hybrid_data[row.first] ||= {}
          row_version_data = remove_ignored_fields(row, ignored_fields)

          # Check the maps for this row type in already-processed versions.
          # If this map is identical to a previous map, tack this version on to
          # to it instead of creating a new one.
          data[row.first][version] = row_version_data
          data[row.first].each do |k, v|
            # skip the row we just added

            next if k == version
            if v == row_version_data
              # Create the new hybrid entry
              hybrid_data[row.first]["#{k}|#{version}"] = row_version_data

              # Delete the old entry, and the one for this version only
              data[row.first].delete(k)
              data[row.first].delete(version)
            end
          end
          data[row.first].update(hybrid_data[row.first])
        end
      end

      # Go through each row type and create a base map management file that
      # will serve as a template for organizing which fields are the same
      # between versions. This file will need to then be arranged by hand to
      # clean up the data. Each row will represent a column across versions,
      # each column a unique map for that row for one or more versions.
      data.each do |row_type, row_data|
        file_path = write_row_map_file(source_dir, row_type)
        next unless File.exists?(file_path)
        File.open(file_path, 'w') do |f|
          f.write('canonical')

          to_transpose = []
          row_data.sort.reverse.each do |version, version_data|
            to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
            to_transpose << [nil, version_data].flatten
          end

          # standardize row size
          max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
          to_transpose.each { |r| r[max_size - 1] ||= nil }
          transposed = to_transpose.transpose

          transposed.each do |transposed_data|
            transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
            canonical = transposed_data[1] # first description
            if canonical
              canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
              transposed_data = [canonical, transposed_data].flatten
            end
            f.write(transposed_data.join(','))
            f.write("\n")
          end
        end
      end

    end

    # Generates the mapping for each row type in BASE_ROW_TYPES, writes them out
    # to file for inclusion in the gem.
    def self.dump_row_maps_to_ruby(source_dir, file_path)
      File.open(file_path, 'w') do |f|
        f.write("# Generated automatically by Fech::MapGenerator.\n\n")
        f.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
        f.write("# contain an entry for each distinct map between a row's labels and the\n")
        f.write("# indexes where their values can be found.\n")
        f.write("module Fech\n")
        f.write("  RENDERED_MAPS = {\n")
        BASE_ROW_TYPES.each do |row_type|
          f.write("    \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")
          generate_row_map_from_file(source_dir, row_type).sort_by(&:first).reverse.each do |k, v|
            f.write("      \'#{k}' => [#{v.map {|x| x.to_s.gsub(/^\d+_?/, "") }.collect {|x| (x.nil? || x == "") ? "nil" : ":#{x}" }.join(', ') }],\n")
          end
          f.write("    },\n")
        end
        f.write("  }\n")
        f.write("end")
      end
    end

    # For a given row type, parses its source file and returns
    # a mapping object for it.
    def self.generate_row_map_from_file(source_dir, row_type)
      versions = []
      version_indexes = []
      data = {}
      text = open(row_map_file(source_dir, row_type)).read
      split_char = text.index(/\r/) ? /\r/ : /\n/
      rows = text.split(split_char).collect {|x| x.split(',')}
      rows.each do |row|
        row = row.collect {|x| x.gsub("\n", "")}
        if row.first.nil?
          require 'ruby-debug'; debugger
        end
        if row.first.downcase == "canonical"
          versions = row[1..-1].uniq.collect {|x| x unless (x.nil? || x.empty?)}.compact
          row.each_with_index {|x, ind| version_indexes << ind unless (x.nil? || x.empty?)}.slice!(1)
          version_indexes.slice!(0, 1)
          versions.each {|x| data[x] = [] }

        elsif row.first.size > 0
          canonical = row.first

          versions.zip(version_indexes).each do |version, row_index|
            index = row[row_index]
            data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
          end
        end
      end

      row_map = {}
      data.each {|key, value| row_map[key] = value}
      row_map
    end

    # Remove both the row type from the beginning of the row,
    # and any fields marked as "ignore" in sources/headers/ignore.csv
    def self.remove_ignored_fields(row, ignore)
      data = row[1..-1].compact # strip off the row type
      data.reject { |f| ignore.include?(f) }
    end

    def self.row_map_file(source_dir, row_type)
      File.join(source_dir, row_type + '.csv')
    end

    def self.ignored_fields_file(source_dir)
      File.join(source_dir, 'headers', 'ignore.csv')
    end

    def self.version_summary_file(source_dir, version)
      File.join(source_dir, 'headers', version + '.csv')
    end

    def self.write_row_map_file(source_dir, row_type)
      File.join(source_dir, 'rows', row_type + '.csv')
    end

  end
end
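
The two class methods above are what the gem's fech:maps rake task (tasks/fech.rake, listed earlier) would drive when a new FEC format version appears. A rough sketch of that flow, assuming the sources/ layout shown in the file list; both paths below are illustrative:

  # Illustrative paths; the fech:maps rake task wires these up inside the gem.
  source_dir = 'sources'

  # Step 1: read each sources/headers/<version>.csv and write a per-row-type
  # template CSV under sources/rows/ (it only overwrites files that already exist).
  Fech::MapGenerator.convert_header_file_to_row_files(source_dir)

  # Step 2 (after hand-arranging the per-row CSVs such as sources/SchA.csv):
  # render Fech::RENDERED_MAPS into the Ruby file the gem ships with.
  Fech::MapGenerator.dump_row_maps_to_ruby(source_dir, 'lib/fech/rendered_maps.rb')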
data/lib/fech/mapped.rb
ADDED
@@ -0,0 +1,38 @@
module Fech

  # Fech::Mapped is a thin wrapper around Hash which allows values to be
  # referenced either by key or by an alias specified in the associated
  # Filing's Translations.
  class Mapped < Hash

    attr_accessor :filing, :row_type
    alias :old_bracket :[]

    def initialize(filing, row_type)
      @filing = filing
      @row_type = row_type
    end

    # Just calls Hash's [] method, unless the specified key doesn't
    # exist, in which case it checks for any aliases on the filing's
    # translator.
    def [](key, &block)
      if has_key?(key)
        old_bracket(key, &block)
      else
        # Look up aliases in reverse, to find the most recent one
        # Does not allow (obvious) recursion
        aliias = filing.translator.aliases.reverse.detect do |a|
          a[:alias] == key && a[:row].match(row_type) && a[:alias] != a[:for]
        end
        # Pass the key this alias references back to this function
        aliias ? old_bracket(aliias[:for], &block) : nil
      end
    end

    def method_missing(method, *args, &block)
      self[method]
    end

  end
end
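
A short sketch of how the bracket fallback and method_missing defined above behave on a mapped row. The :amount alias is hypothetical (it would have to be registered on the filing's Translator), and the field names are illustrative:

  row = filing.rows_like(/^sa/).first   # a Fech::Mapped instance built by Filing#map

  row[:contribution_amount]   # ordinary Hash lookup
  row.contribution_amount     # same value, via method_missing

  # When the key is missing, #[] scans filing.translator.aliases newest-first for
  # an entry whose :alias matches the key and whose :row regex matches this row
  # type, then returns the value stored under that entry's :for key (or nil).
  row[:amount]                # returns row[:contribution_amount] only if such an alias exists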
data/lib/fech/mappings.rb
ADDED
@@ -0,0 +1,67 @@
module Fech
  class VersionError < RuntimeError; end

  # Fech::Mappings loads a set of master mappings between labels and where
  # their values can be found in Electronic Filings for various row types
  # and versions.
  # To access a map, call Mappings.for_row with the row_type,
  # and optionally the version:
  #   Mappings.for_row("SA", :version => 6.1)
  class Mappings

    attr_accessor :map, :version

    def initialize(ver = Fech::DEFAULT_VERSION)
      @version = ver
      @map = load_map
      @cache = {}
    end

    # Returns a hash of mappings for row with given row_type
    #
    # @param [String,Symbol] row_type the row type whose map to find
    def for_row(row_type)
      @cache[row_type] ||= self.class.for_row(row_type, :version => @version)
    end

    # Returns the basic, default mappings hash by reading in a mappings
    # file and saving the variable to the class's context.
    def load_map
      self.class.load_map
    end

    def self.load_map
      Fech::RENDERED_MAPS
    end

    # Given a row type, first find the entire block of maps for that row type.
    # Then, use the filing's version to choose which specific map set to use,
    # and return it.
    #
    # @param [Symbol,String,Regex] row_type the row whose map to find
    def self.for_row(row_type, opts={})
      opts[:version] ||= Fech::DEFAULT_VERSION
      map = key_by_regex(load_map, row_type)
      key_by_regex(map, opts[:version])
    end

    # Given a Hash whose keys are string representations of regular expressions,
    # return the value whose key best matches the given label.
    #
    # @param [Hash] hash a Hash with string regular expressions for keys
    # @param [String,Symbol,Regexp] label return the key that best matches this
    def self.key_by_regex(hash, label)
      label = label.source if label.is_a?(Regexp)

      # Try matching longer keys first, to ensure more accurate keys are
      # prioritized over less accurate ones.
      hash.keys.sort { |x, y| x.length <=> y.length }.reverse.each do |key|
        return hash[key] if Regexp.new(key, Regexp::IGNORECASE).match(label.to_s)
      end

      raise VersionError, "Attempted to access mapping that has not been generated (#{label}). " +
        "Supported keys match the format: #{hash.keys.join(', ')}"
    end

  end
end
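
Finally, a brief sketch of the lookup API defined above, following the example in the class comment; the return values described in the comments are illustrative:

  # Class-level lookup: both the row type and the version are matched by regex keys.
  Fech::Mappings.for_row("SA", :version => 6.1)
  # => the ordered array of column symbols for Schedule A rows under version 6.1

  # Instance form: fix the version once and cache maps per row type.
  maps = Fech::Mappings.new("8.0")
  maps.for_row(:sb)

  # A row type or version with no generated map raises Fech::VersionError via key_by_regex.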