fech 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +49 -0
- data/LICENSE +13 -0
- data/README.rdoc +178 -0
- data/Rakefile +3 -0
- data/autotest/discover.rb +1 -0
- data/fech.gemspec +32 -0
- data/lib/fech.rb +13 -0
- data/lib/fech/default_translations.rb +135 -0
- data/lib/fech/fech_utils.rb +41 -0
- data/lib/fech/filing.rb +248 -0
- data/lib/fech/map_generator.rb +187 -0
- data/lib/fech/mapped.rb +38 -0
- data/lib/fech/mappings.rb +66 -0
- data/lib/fech/translator.rb +138 -0
- data/lib/fech/version.rb +3 -0
- data/sources/F3P.csv +1 -0
- data/sources/F3P31.csv +1 -0
- data/sources/F3PS.csv +1 -0
- data/sources/F3S.csv +1 -0
- data/sources/HDR.csv +1 -0
- data/sources/SchA.csv +1 -0
- data/sources/SchB.csv +1 -0
- data/sources/SchC.csv +1 -0
- data/sources/SchC1.csv +1 -0
- data/sources/SchC2.csv +1 -0
- data/sources/SchD.csv +1 -0
- data/sources/SchE.csv +1 -0
- data/sources/SchF.csv +1 -0
- data/sources/TEXT.csv +1 -0
- data/sources/headers/3.csv +1 -0
- data/sources/headers/5.0.csv +1 -0
- data/sources/headers/5.1.csv +1 -0
- data/sources/headers/5.2.csv +1 -0
- data/sources/headers/5.3.csv +1 -0
- data/sources/headers/6.1.csv +1 -0
- data/sources/headers/6.2.csv +1 -0
- data/sources/headers/6.3.csv +1 -0
- data/sources/headers/6.4.csv +1 -0
- data/sources/headers/7.0.csv +1 -0
- data/sources/headers/ignore.csv +5 -0
- data/spec/data/723604.fec +4 -0
- data/spec/data/97405.fec +10 -0
- data/spec/default_translations_spec.rb +104 -0
- data/spec/fech_utils_spec.rb +29 -0
- data/spec/filing_spec.rb +251 -0
- data/spec/map_generator_spec.rb +49 -0
- data/spec/mapped_spec.rb +44 -0
- data/spec/mappings_spec.rb +46 -0
- data/spec/sources/F3P.csv +1 -0
- data/spec/sources/SchA.csv +1 -0
- data/spec/sources/SchB.csv +1 -0
- data/spec/sources/SchC.csv +1 -0
- data/spec/sources/headers/3.csv +1 -0
- data/spec/sources/headers/5.0.csv +1 -0
- data/spec/sources/headers/5.1.csv +1 -0
- data/spec/sources/headers/5.2.csv +1 -0
- data/spec/sources/headers/5.3.csv +1 -0
- data/spec/sources/headers/6.1.csv +1 -0
- data/spec/sources/headers/6.2.csv +1 -0
- data/spec/sources/headers/6.3.csv +1 -0
- data/spec/sources/headers/6.4.csv +1 -0
- data/spec/sources/headers/7.0.csv +1 -0
- data/spec/sources/headers/ignore.csv +5 -0
- data/spec/sources/sa.csv +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/translator_spec.rb +195 -0
- data/tasks/fech.rake +41 -0
- metadata +280 -0
# Contains helper functions and static variables used by various
# Fech classes.
module FechUtils

  # All supported row types, pointed to regular expressions that will
  # correctly match that row type in the wild.
  ROW_TYPES = {
    :hdr   => /^hdr$/i,
    # Match "F3P" exactly, or "F3P" followed by anything except "S" or "3",
    # so that F3PS and F3P31 rows fall through to their own types.
    # NOTE: the original class was [^s|3]; the "|" was unintended alternation
    # syntax inside a character class (it excluded a literal pipe) and has
    # been removed.
    :f3p   => /(^f3p$)|(^f3p[^s3])/i,
    :f3s   => /^f3s/i,
    :f3p31 => /^f3p31/i,
    :f3ps  => /^f3ps/i,
    :sa    => /^sa/i,
    :sb    => /^sb/i,
    # [^1-2] keeps SC1 and SC2 rows from matching the plain SC type.
    :sc    => /^sc[^1-2]/i,
    :sc1   => /^sc1/i,
    :sc2   => /^sc2/i,
    :sd    => /^sd/i,
    :se    => /^se/i,
    :sf    => /^sf/i,
    :text  => /^text/i,
  }

  # Converts symbols and strings to Regexp objects for use in regex-keyed maps.
  # Assumes that symbols should be matched literally, strings unanchored.
  # @param [String,Symbol,Regexp] label the object to convert to a Regexp
  # @return [Regexp] a case-insensitive pattern for label
  def regexify(label)
    if label.is_a?(Regexp)
      # Keep the pattern's source but force case-insensitive matching.
      Regexp.new(label.source, Regexp::IGNORECASE)
    elsif label.is_a?(Symbol)
      # Known row types map to their canonical pattern; any other symbol is
      # matched literally, anchored at both ends.
      if ROW_TYPES.keys.include?(label)
        ROW_TYPES[label]
      else
        Regexp.new("^#{label.to_s}$", Regexp::IGNORECASE)
      end
    else
      # Strings are escaped and left unanchored, so partial matches succeed.
      Regexp.new(Regexp.escape(label.to_s), Regexp::IGNORECASE)
    end
  end

end
|
require 'tmpdir'
require 'open-uri'
require 'fastercsv'

module Fech

  # Fech::Filing downloads an Electronic Filing given its ID, and will search
  # rows by row type. Using a child Translator object, the data in each row
  # is automatically mapped at runtime into a labeled Hash. Additional
  # Translations may be added to change the way that data is mapped and cleaned.
  class Filing
    attr_accessor :filing_id, :download_dir, :translator

    # Create a new Filing object, assign the download directory to system's
    # temp folder by default.
    # @param [String,Integer] filing_id the FEC electronic filing ID
    # @option opts [String] :download_dir override the directory where files
    #   should be downloaded.
    # @option opts [Symbol,Array] :translate a list of built-in translation
    #   sets to use
    def initialize(filing_id, opts={})
      @filing_id    = filing_id
      @download_dir = opts[:download_dir] || Dir.tmpdir
      @translator   = Fech::Translator.new(:include => opts[:translate])
    end

    # Saves the filing data from the FEC website into the default download
    # directory.
    # @return [Filing] self, so calls can be chained (Filing.new(id).download)
    def download
      File.open(file_path, 'w') do |file|
        # open-uri's Kernel#open fetches the remote .fec file
        file << open(filing_url).read
      end
      self
    end

    # Access the header (first) line of the filing, containing information
    # about the filing's version and metadata about the software used to file it.
    # @return [Hash] a hash that assigns labels to the values of the filing's header row
    def header(opts={})
      each_row do |row|
        # The first row is always the header; parse it and return immediately.
        return parse_row?(row)
      end
    end

    # Access the summary (second) line of the filing, containing aggregate and
    # top-level information about the filing.
    # @return [Hash] a hash that assigns labels to the values of the filing's summary row
    def summary
      each_row_with_index do |row, index|
        next if index == 0
        return parse_row?(row)
      end
    end

    # Access all lines of the filing that match a given row type. Will return an
    # Array of all available lines if called directly, or will yield the mapped
    # rows one by one if a block is passed.
    #
    # @param [String, Regexp] row_type a partial or complete name of the type of row desired
    # @option opts [Boolean] :raw should the function return the data as an array
    #   that has not been mapped to column names
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @yield [Hash] each matched row's data, as either a mapped hash or raw array
    # @return [Array] the complete set of mapped hashes for matched lines
    def rows_like(row_type, opts={}, &block)
      data = []
      each_row do |row|
        value = parse_row?(row, opts.merge(:parse_if => row_type))
        next if value == false
        if block_given?
          yield value
        else
          data << value if value
        end
      end
      # When a block consumed the rows there is nothing to return.
      block_given? ? nil : data
    end

    # Decides what to do with a given row. If the row's type matches the desired
    # type, or if no type was specified, it will run the row through #map.
    # If :raw was passed true, a flat, unmapped data array will be returned.
    #
    # @param [Array] row the raw row data; row.first is the row type
    # @option opts [String, Regexp, Symbol] :parse_if only parse rows whose
    #   type matches this pattern
    # @option opts [Boolean] :raw skip mapping and return the raw array
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @return [Hash, Array, false] the parsed row, or false if it was skipped
    def parse_row?(row, opts={})
      # Always parse, unless :parse_if is given and does not match row
      if opts[:parse_if].nil? || \
          Fech.regexify(opts[:parse_if]).match(row.first.downcase)
        opts[:raw] ? row : map(row, opts)
      else
        false
      end
    end

    # Maps a raw row to a labeled hash following any rules given in the filing's
    # Translator based on its version and row type.
    # Finds the correct map for a given row, performs any matching Translations
    # on the individual values, and returns either the entire dataset, or just
    # those fields requested.
    # @param [Array] row the raw row data; row.first is the row type
    # @option opts [Array] :include list of field names that should be included
    #   in the returned hash
    # @return [Fech::Mapped] the row's values keyed by canonical field name
    def map(row, opts={})
      data = Fech::Mapped.new(self, row.first)
      row_map = map_for(row.first)

      # If specific fields were asked for, return only those
      row_map = row_map.select { |k,v| opts[:include].include?(k) } if opts[:include]

      # Inserts the row into data, performing any specified preprocessing
      # on individual cells along the way
      row_map.each_with_index do |field, index|
        value = row[index]
        translator.get_translations(:row => row.first,
            :version => filing_version, :action => :convert,
            :field => field).each do |translation|
          # User's Procs should be given each field's value as context
          value = translation[:proc].call(value)
        end
        data[field] = value
      end

      # Performs any specified group preprocessing / combinations
      combinations = translator.get_translations(:row => row.first,
          :version => filing_version, :action => :combine)
      row_hash = hash_zip(row_map, row) if combinations
      combinations.each do |translation|
        # User's Procs should be given the entire row as context
        value = translation[:proc].call(row_hash)
        # Derive the destination field name from the translation's regex source
        field = translation[:field].source.gsub(/[\^\$]*/, "").to_sym
        data[field] = value
      end

      data
    end

    # Returns the column names for given row type and the filing's version
    # in the order they appear in row data.
    # @param [String, Regexp] row_type representation of the row desired
    def map_for(row_type)
      mappings.for_row(row_type)
    end

    # Returns the column names for given row type and version in the order
    # they appear in row data.
    # @param [String, Regexp] row_type representation of the row desired
    # @option opts [String, Regexp] :version representation of the version desired
    def self.map_for(row_type, opts={})
      Fech::Mappings.for_row(row_type, opts)
    end

    # @yield [t] returns a reference to the filing's Translator
    # @yieldparam [Translator] the filing's Translator
    def translate(&block)
      if block_given?
        yield translator
      else
        translator
      end
    end

    # Whether this filing amends a previous filing or not.
    def amendment?
      !amends.nil?
    end

    # Returns the filing ID of the past filing this one amends,
    # nil if this is a first-draft filing.
    # :report_id in the HDR line references the amended filing
    def amends
      header[:report_id]
    end

    # Combines an array of keys and values into an Fech::Mapped object,
    # a type of Hash.
    # @param [Array] keys the desired keys for the new hash
    # @param [Array] values the desired values for the new hash
    # @return [Fech::Mapped, Hash]
    def hash_zip(keys, values)
      Fech::Mapped.new(self, values.first).merge(Hash[*keys.zip(values).flatten])
    end

    # The version of the FEC software used to generate this Filing
    def filing_version
      @filing_version ||= parse_filing_version
    end

    # Pulls out the version number from the header line.
    # Must parse this line manually, since we don't know the version yet, and
    # thus the delimiter type is still a mystery.
    def parse_filing_version
      # Read only the first line, and make sure the handle is closed
      # (the original leaked the File object returned by File.open).
      first = File.open(file_path) { |f| f.gets }
      if first.index("\034").nil?
        FasterCSV.parse(first).flatten[2]
      else
        FasterCSV.parse(first, :col_sep => "\034").flatten[2]
      end
    end

    # Gets or creates the Mappings instance for this filing_version
    def mappings
      @mapping ||= Fech::Mappings.new(filing_version)
    end

    # The location of the Filing on the file system
    def file_path
      File.join(download_dir, file_name)
    end

    def file_name
      "#{filing_id}.fec"
    end

    def filing_url
      "http://query.nictusa.com/dcdev/posted/#{filing_id}.fec"
    end

    # Iterates over and yields the Filing's lines
    # @option opts [Boolean] :with_index yield both the item and its index
    # @yield [Array] a row of the filing, split by the delimiter from #delimiter
    def each_row(opts={}, &block)
      unless File.exist?(file_path)
        raise "File #{file_path} does not exist. Try invoking the .download method on this Filing object."
      end
      c = 0
      FasterCSV.foreach(file_path, :col_sep => delimiter, :skip_blanks => true) do |row|
        if opts[:with_index]
          yield [row, c]
          c += 1
        else
          yield row
        end
      end
    end

    # Wrapper around .each_row to include indexes
    def each_row_with_index(&block)
      each_row(:with_index => true, &block)
    end

    # @return [String] the delimiter used in the filing's version;
    #   versions before 6 are comma-delimited, later ones use ASCII 28.
    def delimiter
      filing_version.to_f < 6 ? "," : "\034"
    end

  end
end
module Fech

  # Helper class to generate mapping hashes from source csv data.
  # Needed to rebuild rendered_maps.rb with new source data, not used
  # in main gem.
  #   rake fech:maps
  class MapGenerator

    attr_accessor :map

    # Versions are listed newest-first; identical maps across versions are
    # merged as they are processed in this order.
    FILING_VERSIONS = ["7.0", "6.4", "6.3", "6.2", "6.1",
                       "5.3", "5.2", "5.1", "5.0", "3"]
    BASE_ROW_TYPES = ["HDR", "F3P", "F3P31", "F3PS", "F3S", "SchA", "SchB",
                      "SchC", "SchC1", "SchC2", "SchD", "SchE", "SchF", "TEXT"]
    # Source file base names keyed to the canonical row-type patterns.
    ROW_TYPE_MATCHERS = {
      "HDR"   => FechUtils::ROW_TYPES[:hdr],
      "F3P"   => FechUtils::ROW_TYPES[:f3p],
      "F3S"   => FechUtils::ROW_TYPES[:f3s],
      "F3P31" => FechUtils::ROW_TYPES[:f3p31],
      "F3PS"  => FechUtils::ROW_TYPES[:f3ps],
      "SchA"  => FechUtils::ROW_TYPES[:sa],
      "SchB"  => FechUtils::ROW_TYPES[:sb],
      "SchC"  => FechUtils::ROW_TYPES[:sc],
      "SchC1" => FechUtils::ROW_TYPES[:sc1],
      "SchC2" => FechUtils::ROW_TYPES[:sc2],
      "SchD"  => FechUtils::ROW_TYPES[:sd],
      "SchE"  => FechUtils::ROW_TYPES[:se],
      "SchF"  => FechUtils::ROW_TYPES[:sf],
      "TEXT"  => FechUtils::ROW_TYPES[:text],
    }

    # Goes through all version header summary files and generates
    # row map files for each type of row inside them.
    def self.convert_header_file_to_row_files(source_dir)
      data = {}

      ignored_fields = File.open(ignored_fields_file(source_dir)).readlines.map { |l| l.strip }

      # Create a hash of data with an entry for each row type found in the source
      # version summary files. Each row has an entry for each version map that
      # exists for it. If maps for two different versions are identical, they
      # are combined.
      FILING_VERSIONS.each do |version|
        FasterCSV.foreach(version_summary_file(source_dir, version)) do |row|
          # Each row of a version summary file contains the ordered list of
          # column names.
          data[row.first] ||= {}
          row_version_data = remove_ignored_fields(row, ignored_fields)

          # Check the maps for this row type in already-processed versions.
          # If this map is identical to a previous map, tack this version on
          # to it instead of creating a new one.
          data[row.first][version] = row_version_data
          data[row.first].each do |k, v|
            # skip the row we just added
            next if k == version
            if v == row_version_data
              # Create the new hybrid entry
              data[row.first]["#{k}|#{version}"] = row_version_data

              # Delete the old entry, and the one for this version only
              data[row.first].delete(k)
              data[row.first].delete(version)
            end
          end
        end
      end

      # Go through each row type and create a base map management file that
      # will serve as a template for organizing which fields are the same
      # between versions. This file will need to then be arranged by hand to
      # clean up the data. Each row will represent a column across versions,
      # each column a unique map for that row for one or more versions.
      data.each do |row_type, row_data|
        file_path = write_row_map_file(source_dir, row_type)
        # NOTE(review): this skips row types whose map file does not already
        # exist, so new row types need a placeholder file created by hand —
        # confirm this guard is intended and not inverted.
        next unless File.exist?(file_path)
        File.open(file_path, 'w') do |f|
          f.write('canonical')

          to_transpose = []
          row_data.sort.reverse.each do |version, version_data|
            # One column of 1-based indexes headed by "^version", and one
            # column of the field names themselves.
            to_transpose << ["^#{version}", version_data.each_with_index.collect {|x, idx| idx+1}].flatten
            to_transpose << [nil, version_data].flatten
          end

          # standardize row size
          max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
          to_transpose.each { |r| r[max_size - 1] ||= nil }
          transposed = to_transpose.transpose

          transposed.each do |transposed_data|
            transposed_data.collect! {|x| x.to_s.gsub(/\r/, ' ')}
            canonical = transposed_data[1] # first description
            if canonical
              # Normalize the description into a snake_case identifier.
              canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
              transposed_data = [canonical, transposed_data].flatten
            end
            f.write(transposed_data.join(','))
            f.write("\n")
          end
        end
      end

    end

    # Generates the mapping for each row type in BASE_ROW_TYPES, writes them out
    # to file for inclusion in the gem.
    def self.dump_row_maps_to_ruby(source_dir, file_path)
      File.open(file_path, 'w') do |f|
        f.write("# Generated automatically by Fech::MapGenerator.\n\n")
        f.write("# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n")
        f.write("# contain an entry for each distinct map between a row's labels and the\n")
        f.write("# indexes where their values can be found.\n")
        f.write("module Fech\n")
        f.write("  RENDERED_MAPS = {\n")
        BASE_ROW_TYPES.each do |row_type|
          f.write("    \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")
          generate_row_map_from_file(source_dir, row_type).each do |k, v|
            f.write("      \'#{k}' => [#{v.map {|x| x.to_s.gsub(/^\d+_?/, "") }.collect {|x| (x.nil? || x == "") ? "nil" : ":#{x}" }.join(', ') }],\n")
          end
          f.write("    },\n")
        end
        f.write("  }\n")
        f.write("end")
      end
    end

    # For a given row type, parses its source file and returns
    # a mapping object for it.
    def self.generate_row_map_from_file(source_dir, row_type)
      versions = []
      version_indexes = []
      data = {}
      text = open(row_map_file(source_dir, row_type)).read
      # Source files may use CR or LF line endings; detect which.
      split_char = text.index(/\r/) ? /\r/ : /\n/
      rows = text.split(split_char).collect {|x| x.split(',')}
      rows.each do |row|
        row = row.collect {|x| x.gsub("\n", "")}
        if row.first.nil?
          # Fail loudly on a malformed (empty) row. This replaces a leftover
          # debugging hook (require 'ruby-debug'; debugger) in the original.
          raise "Malformed row (empty leading cell) in #{row_map_file(source_dir, row_type)}"
        end
        if row.first.downcase == "canonical"
          # The header row: collect the version labels and the column index
          # at which each version's data lives.
          versions = row[1..-1].uniq.collect {|x| x unless (x.nil? || x.empty?)}.compact
          row.each_with_index {|x, ind| version_indexes << ind unless (x.nil? || x.empty?)}.slice!(1)
          version_indexes.slice!(0, 1)
          versions.each {|x| data[x] = [] }

        elsif row.first.size > 0
          canonical = row.first

          versions.zip(version_indexes).each do |version, row_index|
            index = row[row_index]
            # A positive index places this canonical field at that 1-based
            # position for the given version; zero/blank means absent.
            data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
          end
        end
      end

      row_map = {}
      data.each {|key, value| row_map[key] = value}
      row_map
    end

    # Remove both the row type from the beginning of the row,
    # and any fields marked as "ignore" in sources/headers/ignore.csv
    def self.remove_ignored_fields(row, ignore)
      data = row[1..-1].compact # strip off the row type
      data.reject { |f| ignore.include?(f) }
    end

    def self.row_map_file(source_dir, row_type)
      File.join(source_dir, row_type + '.csv')
    end

    def self.ignored_fields_file(source_dir)
      File.join(source_dir, 'headers', 'ignore.csv')
    end

    def self.version_summary_file(source_dir, version)
      File.join(source_dir, 'headers', version + '.csv')
    end

    # Despite the name, this only computes the destination path; the caller
    # performs the actual write.
    def self.write_row_map_file(source_dir, row_type)
      File.join(source_dir, 'rows', row_type + '.csv')
    end

  end
end