ofac 1.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/History.txt +72 -0
- data/LICENSE +20 -0
- data/PostInstall.txt +11 -0
- data/README.rdoc +123 -0
- data/Rakefile +60 -0
- data/VERSION.yml +4 -0
- data/generators/ofac_migration/ofac_migration_generator.rb +12 -0
- data/generators/ofac_migration/templates/migration.rb +31 -0
- data/lib/ofac.rb +9 -0
- data/lib/ofac/models/ofac.rb +177 -0
- data/lib/ofac/models/ofac_sdn.rb +5 -0
- data/lib/ofac/models/ofac_sdn_loader.rb +305 -0
- data/lib/ofac/ofac_match.rb +139 -0
- data/lib/ofac/ruby_string_extensions.rb +22 -0
- data/lib/tasks/ofac.rake +8 -0
- data/ofac.gemspec +104 -0
- data/test/files/test_address_data_load.pip +10 -0
- data/test/files/test_alt_data_load.pip +10 -0
- data/test/files/test_sdn_data_load.pip +9 -0
- data/test/files/valid_flattened_file.csv +19 -0
- data/test/mocks/test/ofac_sdn_loader.rb +20 -0
- data/test/ofac_sdn_loader_test.rb +40 -0
- data/test/ofac_test.rb +138 -0
- data/test/test_helper.rb +49 -0
- metadata +119 -0
data/lib/ofac/models/ofac_sdn_loader.rb
@@ -0,0 +1,305 @@
+require 'net/http'
+require 'activerecord'
+require 'active_record/connection_adapters/mysql_adapter'
+
+class OfacSdnLoader
+
+
+  #Loads the most recent file from http://www.treas.gov/offices/enforcement/ofac/sdn/delimit/index.shtml
+  def self.load_current_sdn_file
+    puts "Reloading OFAC sdn data"
+    puts "Downloading OFAC data from http://www.treas.gov/offices/enforcement/ofac/sdn"
+    #get the 3 data files
+    sdn = Tempfile.new('sdn')
+    sdn.write(Net::HTTP.get(URI.parse('http://www.treas.gov/offices/enforcement/ofac/sdn/delimit/sdn.pip')))
+    sdn.rewind
+    address = Tempfile.new('sdn')
+    address.write(Net::HTTP.get(URI.parse('http://www.treas.gov/offices/enforcement/ofac/sdn/delimit/add.pip')))
+    address.rewind
+    alt = Tempfile.new('sdn')
+    alt.write(Net::HTTP.get(URI.parse('http://www.treas.gov/offices/enforcement/ofac/sdn/delimit/alt.pip')))
+    alt.rewind
+
+    if OfacSdn.connection.kind_of?(ActiveRecord::ConnectionAdapters::MysqlAdapter)
+      puts "Converting file to csv format for Mysql import. This could take several minutes."
+
+      csv_file = convert_to_flattened_csv(sdn, address, alt)
+
+      bulk_mysql_update(csv_file)
+    else
+      active_record_file_load(sdn, address, alt)
+    end
+
+    sdn.close
+    @address.close
+    @alt.close
+  end
+
+
+  private
+
+  #convert the file's null value to an empty string
+  #and removes " chars.
+  def self.clean_file_string(line)
+    line.gsub!(/-0-(\s)?/,'')
+    line.gsub!(/\n/,'')
+    line.gsub(/\"/,'')
+  end
+
+  #split the line into an array
+  def self.convert_line_to_array(line)
+    clean_file_string(line).split('|') unless line.nil?
+  end
+
+  #return an 2 arrays of the records matching the sdn primary key
+  #1 array of address records and one array of alt records
+  def self.foreign_key_records(sdn_id)
+    address_records = []
+    alt_records = []
+
+    #the first element in each array is the primary and foreign keys
+    #we are denormalizing the data
+    if @current_address_hash && @current_address_hash[:id] == sdn_id
+      address_records << @current_address_hash
+      loop do
+        @current_address_hash = address_text_to_hash(@address.gets)
+        if @current_address_hash && @current_address_hash[:id] == sdn_id
+          address_records << @current_address_hash
+        else
+          break
+        end
+      end
+    end
+
+    if @current_alt_hash && @current_alt_hash[:id] == sdn_id
+      alt_records << @current_alt_hash
+      loop do
+        @current_alt_hash = alt_text_to_hash(@alt.gets)
+        if @current_alt_hash && @current_alt_hash[:id] == sdn_id
+          alt_records << @current_alt_hash
+        else
+          break
+        end
+      end
+    end
+    return address_records, alt_records
+  end
+
+  def self.sdn_text_to_hash(line)
+    unless line.nil?
+      value_array = convert_line_to_array(line)
+      {:id => value_array[0],
+       :name => value_array[1],
+       :sdn_type => value_array[2],
+       :program => value_array[3],
+       :title => value_array[4],
+       :vessel_call_sign => value_array[5],
+       :vessel_type => value_array[6],
+       :vessel_tonnage => value_array[7],
+       :gross_registered_tonnage => value_array[8],
+       :vessel_flag => value_array[9],
+       :vessel_owner => value_array[10],
+       :remarks => value_array[11]
+      }
+    end
+  end
+
+  def self.address_text_to_hash(line)
+    unless line.nil?
+      value_array = convert_line_to_array(line)
+      {:id => value_array[0],
+       :address => value_array[2],
+       :city => value_array[3],
+       :country => value_array[4],
+       :address_remarks => value_array[5]
+      }
+    end
+  end
+
+  def self.alt_text_to_hash(line)
+    unless line.nil?
+      value_array = convert_line_to_array(line)
+      {:id => value_array[0],
+       :alternate_identity_type => value_array[2],
+       :alternate_identity_name => value_array[3],
+       :alternate_identity_remarks => value_array[4]
+      }
+    end
+  end
+
+  def self.convert_hash_to_mysql_import_string(record_hash)
+    # empty field for id to be generated by mysql.
+    new_line = "``|" +
+      # :name
+      "`#{record_hash[:name]}`|" +
+      # :sdn_type
+      "`#{record_hash[:sdn_type]}`|" +
+      # :program
+      "`#{record_hash[:program]}`|" +
+      # :title
+      "`#{record_hash[:title]}`|" +
+      # :vessel_call_sign
+      "`#{record_hash[:vessel_call_sign]}`|" +
+      # :vessel_type
+      "`#{record_hash[:vessel_type]}`|" +
+      # :vessel_tonnage
+      "`#{record_hash[:vessel_tonnage]}`|" +
+      # :gross_registered_tonnage
+      "`#{record_hash[:gross_registered_tonnage]}`|" +
+      # :vessel_flag
+      "`#{record_hash[:vessel_flag]}`|" +
+      # :vessel_owner
+      "`#{record_hash[:vessel_owner]}`|" +
+      # :remarks
+      "`#{record_hash[:remarks]}`|" +
+      # :address
+      "`#{record_hash[:address]}`|" +
+      # :city
+      "`#{record_hash[:city]}`|" +
+      # :country
+      "`#{record_hash[:country]}`|" +
+      # :address_remarks
+      "`#{record_hash[:address_remarks]}`|" +
+      # :alternate_identity_type
+      "`#{record_hash[:alternate_identity_type]}`|" +
+      # :alternate_identity_name
+      "`#{record_hash[:alternate_identity_name]}`|" +
+      # :alternate_identity_remarks
+      "`#{record_hash[:alternate_identity_remarks]}`|" +
+      #:created_at
+      "`#{Time.now.to_s(:db)}`|" +
+      # updated_at
+      "`#{Time.now.to_s(:db)}`" + "\n"
+
+    new_line
+  end
+
+  def self.convert_to_flattened_csv(sdn_file, address_file, alt_file)
+    @address = address_file
+    @alt = alt_file
+
+    csv_file = Tempfile.new("ofac") # create temp file for converted csv format.
+    #get the first line from the address and alt files
+    @current_address_hash = address_text_to_hash(@address.gets)
+    @current_alt_hash = alt_text_to_hash(@alt.gets)
+
+    start = Time.now
+
+    sdn_file.each_with_index do |line, i|
+
+      #initialize the address and alt atributes to empty strings
+      address_attributes = address_text_to_hash("|||||")
+      alt_attributes = alt_text_to_hash("||||")
+
+      sdn_attributes = sdn_text_to_hash(line)
+
+      #get the foreign key records for this sdn
+      address_records, alt_records = foreign_key_records(sdn_attributes[:id])
+
+      if address_records.empty?
+        #no matching address records, so initialized blank values will be used.
+        if alt_records.empty?
+          #no matching address records, so initialized blank values will be used.
+          csv_file.syswrite(convert_hash_to_mysql_import_string(sdn_attributes.merge(address_attributes).merge(alt_attributes)))
+        else
+          alt_records.each do |alt|
+            csv_file.syswrite(convert_hash_to_mysql_import_string(sdn_attributes.merge(address_attributes).merge(alt)))
+          end
+        end
+      else
+        address_records.each do |address|
+          if alt_records.empty?
+            #no matching address records, so initialized blank values will be used.
+            csv_file.syswrite(convert_hash_to_mysql_import_string(sdn_attributes.merge(address).merge(alt_attributes)))
+          else
+            alt_records.each do |alt|
+              csv_file.syswrite(convert_hash_to_mysql_import_string(sdn_attributes.merge(address).merge(alt)))
+            end
+          end
+        end
+      end
+      puts "#{i} records processed." if (i % 1000 == 0) && (i > 0)
+    end
+    puts "File conversion ran for #{(Time.now - start) / 60} minutes."
+    return csv_file
+  end
+
+  def self.active_record_file_load(sdn_file, address_file, alt_file)
+    @address = address_file
+    @alt = alt_file
+
+    #OFAC data is a complete list, so we have to dump and load
+    OfacSdn.delete_all
+
+    #get the first line from the address and alt files
+    @current_address_hash = address_text_to_hash(@address.gets)
+    @current_alt_hash = alt_text_to_hash(@alt.gets)
+    attributes = {}
+    sdn_file.each_with_index do |line, i|
+
+      #initialize the address and alt atributes to empty strings
+      address_attributes = address_text_to_hash("|||||")
+      alt_attributes = alt_text_to_hash("||||")
+
+      sdn_attributes = sdn_text_to_hash(line)
+
+      #get the foreign key records for this sdn
+      address_records, alt_records = foreign_key_records(sdn_attributes[:id])
+
+      if address_records.empty?
+        #no matching address records, so initialized blank values will be used.
+        if alt_records.empty?
+          #no matching address records, so initialized blank values will be used.
+          attributes = sdn_attributes.merge(address_attributes).merge(alt_attributes)
+          attributes.delete(:id)
+          OfacSdn.create(attributes)
+        else
+          alt_records.each do |alt|
+            attributes = sdn_attributes.merge(address_attributes).merge(alt)
+            attributes.delete(:id)
+            OfacSdn.create(attributes)
+          end
+        end
+      else
+        address_records.each do |address|
+          if alt_records.empty?
+            #no matching address records, so initialized blank values will be used.
+            attributes = sdn_attributes.merge(address).merge(alt_attributes)
+            attributes.delete(:id)
+            OfacSdn.create(attributes)
+          else
+            alt_records.each do |alt|
+              attributes = sdn_attributes.merge(address).merge(alt)
+              attributes.delete(:id)
+              OfacSdn.create(attributes)
+            end
+          end
+        end
+      end
+
+      puts "#{i} records processed." if (i % 5000 == 0) && (i > 0)
+    end
+  end
+
+  # For mysql, use:
+  # LOAD DATA LOCAL INFILE 'ssdm1.csv' INTO TABLE death_master_files FIELDS TERMINATED BY '|' ENCLOSED BY "`" LINES TERMINATED BY '\n';
+  # This is a much faster way of loading large amounts of data into mysql. For information on the LOAD DATA command
+  # see http://dev.mysql.com/doc/refman/5.1/en/load-data.html
+  def self.bulk_mysql_update(csv_file)
+    puts "Deleting all records in ofac_sdn..."
+
+    #OFAC data is a complete list, so we have to dump and load
+    OfacSdn.delete_all
+
+    puts "Importing into Mysql..."
+
+    mysql_command = <<-TEXT
+      LOAD DATA LOCAL INFILE '#{csv_file.path}' REPLACE INTO TABLE ofac_sdns FIELDS TERMINATED BY '|' ENCLOSED BY "`" LINES TERMINATED BY '\n';
+    TEXT
+
+    OfacSdn.connection.execute(mysql_command)
+    puts "Mysql import complete."
+
+  end
+
+end
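
For context, the public entry point in the loader above is OfacSdnLoader.load_current_sdn_file; the rake task shipped in data/lib/tasks/ofac.rake presumably just wraps it. A minimal, hypothetical console sketch of a manual refresh (not itself part of the released files), assuming lib/ofac.rb pulls in the loader and an ActiveRecord connection is already established — a MySQL connection takes the bulk LOAD DATA path, anything else falls back to row-by-row OfacSdn.create:

    # Hypothetical manual refresh from an IRB/Rails console.
    require 'ofac'                       # assumed to load OfacSdn and OfacSdnLoader
    OfacSdnLoader.load_current_sdn_file  # downloads sdn.pip, add.pip, alt.pip and reloads ofac_sdns
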
data/lib/ofac/ofac_match.rb
@@ -0,0 +1,139 @@
+class OfacMatch
+
+  attr_reader :possible_hits
+
+  #Intialize a Match object with a record hash of fields you want to match on.
+  #Each key in the hash, also has a data hash value for the weight, token, and type.
+  #
+  #   match = Ofac::Match.new({:name => {:weight => 10, :token => 'Kevin Tyll'},
+  #                            :city => {:weight => 40, :token => 'Clearwater', },
+  #                            :address => {:weight => 40, :token => '1234 Park St.', },
+  #                            :zip => {:weight => 10, :token => '33759', :type => :number}})
+  #
+  # data hash keys:
+  # * <tt>data[:weight]</tt> - value to apply to the score if there is a match (Default is 100/number of keys in the record hash)
+  # * <tt>data[:token]</tt> - string to match
+  # * <tt>data[:match]</tt> - set from records hash
+  # * <tt>data[:score]</tt> - output field
+  # * <tt>data[:type]</tt> - the type of match that should be performed (valid values are +:sound+ | +:number+) (Default is +:sound+)
+  def initialize(stats={})
+    @possible_hits = []
+    @stats = stats.dup
+    weight = 100
+    weight = 100 / @stats.length if @stats.length > 0
+    @stats.each_value do |data|
+      data[:weight] ||= weight
+      data[:match] ||= ''
+      data[:type] ||= :sound
+      data[:score] ||= 0
+      data[:token] = data[:token].to_s.upcase
+    end
+  end
+
+  # match_records is an array of hashes.
+  #
+  # The hash keys must match the record hash keys set when initialized.
+  #
+  # score will return the highest score of all the records that
+  # are sent in match_records.
+  def score(match_records)
+    score_results = Array.new
+    unless match_records.empty?
+      #place the match_records information
+      #into our @stats hash
+      match_records.each do |match|
+        match.each do |key, value|
+          @stats[key.to_sym][:match] = value.to_s.upcase
+        end
+        record_score = calculate_record
+        score_results.push(record_score)
+        @possible_hits << match.merge(:score => record_score) if record_score > 0
+      end
+      score = score_results.max #take max score
+    end
+    @possible_hits.uniq!
+    score ||= 0
+  end
+
+  private
+
+
+  # calculate the score for this record
+  # comparing the token to the match fields in the @stats hash
+  # and storing the score into the record
+  def calculate_record
+    score = 0
+    unless @stats.nil?
+      #need to make sure we check the name first, since city and address don't
+      #get added to the score unless there is a name match
+      [:name,:city,:address].each do |field|
+        data = @stats[field]
+        if (data[:token].blank?)
+          value = 0 #token is blank can't be sure of a match if nothing to match against
+        else
+          if (data[:match].blank?)
+            value = 0 #token has value match is blank
+          else
+            #token and match both have values
+            if (data[:type] == :number)
+              value = data[:token] == data[:match] ? 1 : 0
+            else
+              #first see if there is an exact match
+              value = data[:match].split('|').include?(data[:token]) ? 1 : 0
+
+              unless value > 0
+                #do a sounds like with the data as given to see if we get a match
+                #if match on sounds_like, only give .75 of the weight.
+                data[:match].split('|').each do |separate_value|
+                  if data[:token].ofac_sounds_like(separate_value,false)
+                    value = 0.75
+                    break
+                  else
+                    value = 0
+                  end
+                end
+
+              end
+
+              #if no match, then break the data down and see if we can find matches on the
+              #individual words
+              unless value > 0
+                token_data = data[:token].gsub(/\W/,'|')
+                token_array = token_data.split('|')
+                token_array.delete('')
+
+                match_data = data[:match].gsub(/\W/,'|')
+                match_array = match_data.split('|')
+                match_array.delete('')
+
+                value = 0
+                partial_weight = 1/token_array.length.to_f
+
+                token_array.each do |partial_token|
+                  #first see if we get an exact match of the partial
+                  if success = match_array.include?(partial_token)
+                    value += partial_weight
+                  end
+                  unless success
+                    #if this for :address or :city
+                    #and there is no match at all, subtract 10% of the weight from :name score
+                    unless field == :name
+                      value -= partial_weight * 0.1
+                    end
+                  end
+                end
+              end
+            end
+          end
+        end
+        data[:score] = data[:weight] * value
+        score += data[:score]
+        break if field == :name && data[:score] == 0
+      end
+
+    end
+    score.round
+  end
+
+end
+
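
The rdoc comment above refers to Ofac::Match, but the class defined in this file is the top-level OfacMatch. Note that calculate_record only ever weighs the :name, :city and :address fields, and :name must score above zero before the others count. A small, hypothetical sketch of direct use (not itself part of the released files), reusing the tokens from the rdoc example with made-up weights and candidate data, and assuming the gem's other requires (ActiveSupport's blank? and the ruby_string_extensions) are loaded:

    # Hypothetical scoring run.
    match = OfacMatch.new(:name    => {:weight => 60, :token => 'Kevin Tyll'},
                          :city    => {:weight => 20, :token => 'Clearwater'},
                          :address => {:weight => 20, :token => '1234 Park St.'})

    # Each candidate hash must use the same keys given to new;
    # in the gem these hashes would come from OfacSdn records.
    candidates = [{:name => 'Kevin Tyll', :city => 'Clearwater', :address => '1234 Park St.'}]
    match.score(candidates)  # => 100 here, since every field matches exactly
    match.possible_hits      # candidates scoring above 0, each merged with its :score
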