hsds_transformer 0.0.2

data/lib/hsds_transformer/base_transformer.rb ADDED
@@ -0,0 +1,182 @@
+ module HsdsTransformer
+   class BaseTransformer
+     include HsdsTransformer::Headers
+     include HsdsTransformer::FilePaths
+
+     attr_reader :mapping, :include_custom
+
+     SUPPORTED_HSDS_MODELS = %w(organizations services locations physical_addresses postal_addresses phones service_taxonomies regular_schedules taxonomies accessibility_for_disabilities contacts languages eligibilities services_at_locations service_areas)
+
+     def self.run(args)
+       new(args).transform
+     end
+
+     # TODO: validate that incoming data is valid-ish, e.g. that IDs are unique
+     def initialize(args)
+       @mapping = parse_mapping(args[:mapping])
+
+       @include_custom = args[:include_custom]
+       @zip_output = args[:zip_output]
+
+       SUPPORTED_HSDS_MODELS.each do |model|
+         var_name = "@" + model
+         instance_variable_set(var_name, [])
+       end
+
+       set_file_paths(args)
+     end
+
+     def transform
+       # Initial transformation into HSDS
+       mapping.each do |input_file_name, file_mapping|
+         transform_file(input_file_name, file_mapping)
+       end
+
+       # HSDS additional formatting
+       singletonize_languages
+
+       apply_custom_transformation
+
+       # Make the data path for these files
+       Dir.mkdir(output_datapackage_path) unless Dir.exist?(output_datapackage_path)
+       Dir.mkdir(output_data_path) unless Dir.exist?(output_data_path)
+
+       # Write the data to CSV files
+       write_output_files
+
+       zip_output if @zip_output
+
+       self
+     end
+
+     def transform_file(input_file_name, file_mapping)
+       path = File.join(input_path, input_file_name)
+       org_mapping = file_mapping["columns"]
+
+       # Process each row in a way that allows the row to create multiple objects,
+       # including multiple objects of the same model from a single row.
+       CSV.foreach(path, headers: true) do |input|
+         collected_data = hsds_objects_from_row(input, org_mapping)
+         collect_into_ivars(collected_data)
+       end
+     end
+
+
+     # This is overridden in a custom transformer if there is one
+     def apply_custom_transformation
+     end
+
+     private
+
+     def hsds_objects_from_row(input, org_mapping)
+       collected_data = {}
+
+       # k is the input field name
+       # org_mapping[k] gives us the array of output fields
+       input.each do |k, v|
+         # Wrap single mappings in an array to stay backwards compatible
+         output_fields = org_mapping[k].is_a?(Array) ? org_mapping[k] : [org_mapping[k]]
+
+         # Now collect each object
+         output_fields.compact.each do |output_field|
+
+           # collected_data[output_field["model"]] ensures that, e.g., collected_data = { "organizations" => {} }
+           collected_data[output_field["model"]] ||= {}
+
+           # Append all string fields marked as "append" to a single output field
+           if output_field["append"]
+             existing_string_value = collected_data[output_field["model"]][output_field["field"]] || ""
+             existing_string_value += v.to_s unless null_type(v)
+
+             collected_data[output_field["model"]].merge!(output_field["field"] => existing_string_value)
+           else
+             if output_field["map"]
+               value = output_field["map"][v]
+             else
+               value = v
+             end
+             safe_val = null_type(value) ? nil : value
+             collected_data[output_field["model"]].merge!(output_field["field"] => safe_val)
+           end
+         end
+       end
+       collected_data
+     end
+
+     def null_type(string)
+       string.nil? || string.downcase.strip == "null"
+     end
+
+     # Pop each object into its respective instance variable collection to be written to the right file
+     def collect_into_ivars(collected_data)
+       SUPPORTED_HSDS_MODELS.each do |model|
+         collection_ivar(model) << collected_data[model] if collected_data[model] && !collected_data[model].empty?
+       end
+     end
+
+     def collection_ivar(model)
+       var_name = "@" + model
+       instance_variable_get(var_name)
+     end
+
+     def singletonize_languages
+       formatted_langs = @languages.each_with_object([]) do |language_row, array|
+         langs = language_row["language"].to_s.split(",")
+         if langs.size > 1
+           langs.each do |lang|
+             array << language_row.clone.merge("language" => lang.strip)
+           end
+         else
+           array << language_row
+         end
+       end
+       @languages = formatted_langs
+     end
+
+     def write_output_files
+       SUPPORTED_HSDS_MODELS.each do |model|
+         path_var = instance_variable_get "@output_#{model}_path"
+         write_csv path_var, headers(collection_ivar(model).first, model), collection_ivar(model)
+       end
+     end
+
+     def zip_output
+       input_data_files = Dir.glob(File.join(output_data_path, '**/*'))
+
+
+       File.delete(zipfile_name) if File.exist?(zipfile_name)
+
+       Zip::File.open(zipfile_name, Zip::File::CREATE) do |zipfile|
+         # Add datapackage.json
+         zipfile.add("datapackage.json", datapackage_json_path)
+
+         # Add data files
+         input_data_files.each do |file_path|
+           zipped_name = "data/" + File.basename(file_path)
+           zipfile.add(zipped_name, file_path)
+         end
+       end
+     end
+
+     # This also dedupes data by calling `uniq` on each collection before writing
+     def write_csv(path, headers, data)
+       return if data.empty?
+       CSV.open(path, 'wb') do |csv|
+         csv << headers
+         data.uniq.each do |row|
+           csv << CSV::Row.new(row.keys, row.values).values_at(*headers) unless row.values.all?(nil)
+         end
+       end
+     end
+
+     def parse_mapping(mapping_path)
+       if mapping_path.start_with?("http")
+         uri = URI(mapping_path)
+         file = Net::HTTP.get(uri)
+         YAML.load file
+       else
+         YAML.load File.read(mapping_path)
+       end
+     end
+   end
+ end
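To make the mapping logic in hsds_objects_from_row concrete, here is a minimal sketch of how a parsed mapping drives a single CSV row. The column names and values are hypothetical; only the "model", "field", "append", and "map" keys come from the code above.

    require "csv"

    # Hypothetical parsed mapping for one input file (normally loaded from YAML):
    # each input column maps to one output field, or to an array of them.
    org_mapping = {
      "Agency Name"  => { "model" => "organizations", "field" => "name" },
      "Notes"        => { "model" => "organizations", "field" => "description", "append" => true },
      "Agency Email" => [
        { "model" => "organizations", "field" => "email" },
        { "model" => "contacts",      "field" => "email" },
      ],
      "Active"       => { "model" => "organizations", "field" => "status",
                          "map" => { "Y" => "active", "N" => "defunct" } },
    }

    row = CSV::Row.new(["Agency Name", "Notes", "Agency Email", "Active"],
                       ["Legal Aid", "Walk-ins welcome", "info@example.org", "Y"])

    # hsds_objects_from_row(row, org_mapping) would return:
    # { "organizations" => { "name" => "Legal Aid", "description" => "Walk-ins welcome",
    #                        "email" => "info@example.org", "status" => "active" },
    #   "contacts"      => { "email" => "info@example.org" } }

Note how the "Agency Email" array fans one input column out into two models, which is why collect_into_ivars then routes each hash to its own output file.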
data/lib/hsds_transformer/custom/ilao_transformer.rb ADDED
@@ -0,0 +1,80 @@
+ module HsdsTransformer
+   class IlaoTransformer < HsdsTransformer::BaseTransformer
+
+     STATE_ABBREVIATIONS = %w(AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY)
+
+     def apply_custom_transformation
+       parse_address_data
+       # process_regular_schedule_text
+     end
+
+     private
+
+     def parse_address_data
+       # TODO: do this for physical addresses too
+       @postal_addresses.each do |address_row|
+         address_str = address_row["address_1"].to_s
+         postal_code = address_str.split(//).last(5).join
+         postal_code = postal_code.match(/\d{5}/) # MatchData or nil
+
+         if postal_code
+           address_row["postal_code"] = postal_code.to_s
+           address_str = address_str[0..-7]
+         end
+
+         state = address_str.split(//).last(2).join.upcase
+
+         if STATE_ABBREVIATIONS.include?(state)
+           address_row["state_province"] = state
+           address_str = address_str[0..-5]
+         end
+
+         address_row["address_1"] = address_str
+       end
+     end
+
+     def process_regular_schedule_text(schedule_key:, schedule_hash:, input:)
+       if input["Hours of operation"]
+         matches = input["Hours of operation"].scan(/\S*day: \S*/)
+         matches.each do |match_str|
+           day = match_str.split(': ')[0]
+           hours = match_str.split(': ')[1]
+           if hours == "Closed"
+             opens_at = nil
+             closes_at = nil
+           else
+             opens_at = hours.split('-')[0]
+             closes_at = hours.split('-')[1]
+           end
+           collect_schedule_data(schedule_key: schedule_key,
+                                 schedule_hash: schedule_hash, input: input,
+                                 day: day, opens_at: opens_at, closes_at: closes_at)
+         end
+       end
+     end
+
+     def collect_schedule_data(schedule_key:, schedule_hash:, input:,
+                               day:, opens_at:, closes_at:)
+       schedule_row = {}
+       schedule_row["weekday"] = day
+       schedule_row["opens_at"] = opens_at
+       schedule_row["closes_at"] = closes_at
+
+       foreign_key = schedule_hash["foreign_key_name"]
+       foreign_key_value = schedule_hash["foreign_key_value"]
+       schedule_row[foreign_key] = input[foreign_key_value]
+       (@schedule_data ||= []) << schedule_row # accumulated on an ivar; nothing consumes this yet
+     end
+
+     def collect_sal_data(sal_key:, sal_hash:, input:)
+       key = sal_hash["field"]
+       sal_row = {}
+       sal_row[key] = input[sal_key]
+
+       foreign_key = sal_hash["foreign_key_name"]
+       foreign_key_value = sal_hash["foreign_key_value"]
+       sal_row[foreign_key] = input[foreign_key_value]
+       (@sal_data ||= []) << sal_row # accumulated on an ivar; nothing consumes this yet
+     end
+   end
+ end
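A quick sketch of what parse_address_data does to one combined address string; the sample data is made up, but the slicing follows the code above.

    address_row = { "address_1" => "120 W Madison St, Chicago, IL 60602" }

    # 1. The trailing five characters match /\d{5}/, so postal_code = "60602"
    #    and the trailing " 60602" (six characters) is cut off:
    #    "120 W Madison St, Chicago, IL"
    # 2. The trailing two characters "IL" are a known abbreviation, so
    #    state_province = "IL" and the trailing ", IL" (four characters) is cut off.
    #
    # Result: { "address_1" => "120 W Madison St, Chicago",
    #           "state_province" => "IL", "postal_code" => "60602" }

Note the four-character state cut assumes the comma-separated ", IL 60602" layout; a state not preceded by a comma would lose a character of the street text.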
data/lib/hsds_transformer/custom/open211_miami_transformer.rb ADDED
@@ -0,0 +1,168 @@
+ module HsdsTransformer
+   class Open211MiamiTransformer < HsdsTransformer::BaseTransformer
+     WEEKDAYS = %w(Monday Tuesday Wednesday Thursday Friday)
+     ALL_DAYS = %w(Monday Tuesday Wednesday Thursday Friday Saturday Sunday)
+     DAY_MAPPING = {
+       "mon" => "Monday",
+       "tue" => "Tuesday",
+       "wed" => "Wednesday",
+       "thu" => "Thursday",
+       "fri" => "Friday",
+       "sat" => "Saturday",
+       "sun" => "Sunday",
+     }
+
+     TOP_LEVEL_TAXONOMIES = {
+       "B" => "Basic Needs",
+       "D" => "Consumer Services",
+       "F" => "Criminal Justice and Legal Services",
+       "H" => "Education",
+       "J" => "Environmental Quality",
+       "L" => "Health Care",
+       "N" => "Income Support and Employment",
+       "P" => "Individual and Family Life",
+       "R" => "Mental Health Care and Counseling",
+       "T" => "Organizational/Community/International Services",
+       "Y" => "Target Populations"
+     }
+
+     TAXONOMY_VOCAB = "Open211 Miami - AIRS"
+
+     def apply_custom_transformation
+       remove_child_organizations
+       determine_services
+       parse_regular_schedules_text
+       supplement_taxonomy
+     end
+
+     private
+
+     def determine_services
+       new_services = @services.map do |service|
+         # Update the name to remove the org name
+         formatted_name = service["name"].to_s.split(" - ").last
+         service.merge!("name" => formatted_name)
+
+         # Set the org ID as the parent provider ID
+         if !service["parent_provider_id"].nil?
+           service.merge!("organization_id" => service["parent_provider_id"])
+         end
+         service.delete "parent_provider_id"
+         service
+       end
+
+       @services = new_services
+     end
+
+     # TODO: figure out what to do with 24-hour text
+     # TODO: add IDs
+     def parse_regular_schedules_text
+       new_schedules = @regular_schedules.each_with_object([]) do |sched_row, new_scheds|
+         # Schedule times and tidbits are mostly separated by a newline
+         sched_options = sched_row["original_text"].to_s.split("\n")
+
+         sched_options.each do |opt|
+           opt_days = find_days(opt)
+           if all_weekdays?(opt_days)
+             sched_days = WEEKDAYS
+           elsif single_days?(opt_days)
+             sched_days = single_days(opt_days)
+           else
+             sched_days = []
+           end
+
+           sched_days.each do |day|
+             new_scheds << new_sched_row(day, opt, sched_row)
+           end
+         end
+       end
+
+       @regular_schedules = new_schedules
+     end
+
+     def find_days(opt_string)
+       strings = opt_string.to_s.split(", ")[1..-1].to_a.flatten
+       strings.map(&:downcase)
+     end
+
+     def all_weekdays?(days)
+       days == ["mon-fri"]
+     end
+
+     def single_days?(days)
+       !single_days(days).empty?
+     end
+
+     def single_days(days)
+       DAY_MAPPING.select { |abbrev, _day| days.include?(abbrev) }.values
+     end
+
+     def hours(opt)
+       range = opt.split(", ")[0].to_s
+       times = range.split("-")
+       return unless times.size == 2
+
+       open = clean_time(times[0])
+       close = clean_time(times[1])
+
+       [open, close]
+     end
+
+     # Finds the time in strings like "Admin:\\n9:00am", "9am", "9:0a", "10:00pm"
+     def clean_time(time)
+       /\d{1,2}.*\z/.match(time).to_s
+     end
+
+     def new_sched_row(day, opt, sched_row)
+       open, close = hours(opt)
+       {
+         "service_id" => sched_row["service_id"],
+         "weekday" => day,
+         "opens_at" => open,
+         "closes_at" => close,
+         "original_text" => sched_row["original_text"]
+       }
+     end
+
+     def remove_child_organizations
+       @organizations.reject! do |org|
+         !org["parent_provider_id"].nil?
+       end
+
+       @organizations.each { |org| org.delete("parent_provider_id") }
+     end
+
+     def supplement_taxonomy
+       @taxonomies.each do |tax_row|
+         if tax_row["id"].length == 1
+           category = nil # Already top-level
+         else
+           category = tax_row["id"][0]
+         end
+
+         suppl_attrs = {
+           "parent_id" => category,
+           "parent_name" => TOP_LEVEL_TAXONOMIES[category],
+           "vocabulary" => TAXONOMY_VOCAB
+         }
+
+         tax_row.merge!(suppl_attrs)
+       end
+
+       @taxonomies.concat(top_level_taxonomies)
+     end
+
+     def top_level_taxonomies
+       TOP_LEVEL_TAXONOMIES.map do |key, value|
+         {
+           "id" => key,
+           "name" => value,
+           "taxonomy_facet" => "Service",
+           "parent_id" => nil,
+           "parent_name" => nil,
+           "vocabulary" => TAXONOMY_VOCAB
+         }
+       end
+     end
+   end
+ end
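A sketch of how parse_regular_schedules_text fans one original_text value out into per-day rows. The schedule text and service ID here are made up; the splitting and day detection follow the code above.

    sched_row = {
      "service_id"    => "svc-1",  # hypothetical ID
      "original_text" => "8:30am-5:00pm, Mon-Fri\n9:00am-1:00pm, Sat"
    }

    # "8:30am-5:00pm, Mon-Fri" -> find_days => ["mon-fri"] -> all_weekdays?
    #                             -> one row per WEEKDAYS entry (Monday..Friday)
    # "9:00am-1:00pm, Sat"     -> find_days => ["sat"] -> single_days
    #                             -> one row for "Saturday"
    #
    # First of the six resulting rows:
    # { "service_id" => "svc-1", "weekday" => "Monday",
    #   "opens_at" => "8:30am", "closes_at" => "5:00pm",
    #   "original_text" => "8:30am-5:00pm, Mon-Fri\n9:00am-1:00pm, Sat" }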
data/lib/hsds_transformer/exceptions.rb ADDED
@@ -0,0 +1,5 @@
+ module HsdsTransformer
+
+   class InvalidCustomTransformerException < StandardError; end
+
+ end
data/lib/hsds_transformer/file_paths.rb ADDED
@@ -0,0 +1,40 @@
+ module HsdsTransformer
+   module FilePaths
+     DEFAULT_OUTPUT_PATH = "#{ENV["ROOT_PATH"]}/tmp"
+     DEFAULT_INPUT_PATH = "#{ENV["ROOT_PATH"]}/"
+
+     attr_reader :input_path, :output_path, :output_datapackage_path, :output_data_path, :datapackage_json_path,
+                 :zipfile_name, :output_organizations_path, :output_locations_path, :output_services_path,
+                 :output_phones_path, :output_physical_addresses_path, :output_postal_addresses_path,
+                 :output_services_at_locations_path, :output_eligibilities_path, :output_contacts_path,
+                 :output_languages_path, :output_accessibility_for_disabilities_path, :output_taxonomies_path,
+                 :output_service_taxonomies_path, :output_regular_schedules_path, :output_service_areas_path
+
+     # TODO: DRY this up
+     def set_file_paths(args)
+       @input_path = args[:input_path] || DEFAULT_INPUT_PATH
+       @output_path = args[:output_path] || DEFAULT_OUTPUT_PATH
+       @output_datapackage_path = File.join(output_path, "datapackage")
+       @output_data_path = File.join(output_datapackage_path, "data")
+       @zipfile_name = File.join(output_path, "datapackage.zip")
+
+       @output_organizations_path = File.join(output_data_path, "organizations.csv")
+       @output_locations_path = File.join(output_data_path, "locations.csv")
+       @output_services_path = File.join(output_data_path, "services.csv")
+       @output_phones_path = File.join(output_data_path, "phones.csv")
+       @output_physical_addresses_path = File.join(output_data_path, "physical_addresses.csv")
+       @output_postal_addresses_path = File.join(output_data_path, "postal_addresses.csv")
+       @output_services_at_locations_path = File.join(output_data_path, "services_at_location.csv")
+       @output_eligibilities_path = File.join(output_data_path, "eligibility.csv")
+       @output_contacts_path = File.join(output_data_path, "contacts.csv")
+       @output_languages_path = File.join(output_data_path, "languages.csv")
+       @output_accessibility_for_disabilities_path = File.join(output_data_path, "accessibility_for_disabilities.csv")
+       @output_taxonomies_path = File.join(output_data_path, "taxonomy.csv")
+       @output_service_taxonomies_path = File.join(output_data_path, "services_taxonomy.csv")
+       @output_regular_schedules_path = File.join(output_data_path, "regular_schedules.csv")
+       @output_service_areas_path = File.join(output_data_path, "service_areas.csv")
+
+       @datapackage_json_path = File.join(ENV["ROOT_PATH"].to_s, "lib/datapackage/datapackage.json")
+     end
+   end
+ end
data/lib/hsds_transformer/headers.rb ADDED
@@ -0,0 +1,31 @@
+ module HsdsTransformer
+   module Headers
+     ORGANIZATIONS_HEADERS = %w(id name alternate_name description email url tax_status tax_id year_incorporated legal_status)
+     LOCATIONS_HEADERS = %w(id organization_id name alternate_name description transportation latitude longitude)
+     SERVICES_HEADERS = %w(id organization_id program_id name alternate_name description url email status interpretation_services application_process wait_time fees accreditations licenses)
+     PHONES_HEADERS = %w(id location_id service_id organization_id contact_id service_at_location_id number extension type language description)
+     PHYSICAL_ADDRESSES_HEADERS = %w(id location_id organization_id attention address_1 city region state_province postal_code country)
+     POSTAL_ADDRESSES_HEADERS = %w(id location_id organization_id attention address_1 city region state_province postal_code country)
+     REGULAR_SCHEDULES_HEADERS = %w(id service_id location_id service_at_location_id weekday opens_at closes_at)
+     SERVICES_AT_LOCATIONS_HEADERS = %w(id service_id location_id description)
+     ELIGIBILITIES_HEADERS = %w(id service_id eligibility)
+     CONTACTS_HEADERS = %w(id organization_id service_id service_at_location_id name title department email)
+     LANGUAGES_HEADERS = %w(id service_id location_id language)
+     ACCESSIBILITY_FOR_DISABILITIES_HEADERS = %w(id location_id accessibility details)
+     TAXONOMIES_HEADERS = %w(id name parent_id parent_name vocabulary)
+     SERVICE_TAXONOMIES_HEADERS = %w(id service_id taxonomy_id taxonomy_detail)
+     SERVICE_AREAS_HEADERS = %w(id service_id service_area description)
+
+     def headers(row, model)
+       const_name = "HsdsTransformer::Headers::" + model.upcase + "_HEADERS"
+       # TODO: make sure the constant is valid
+       const = Object.const_get(const_name)
+
+       if row && @include_custom
+         (const + row.keys).uniq
+       else
+         const
+       end
+     end
+   end
+ end
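In effect, headers resolves the constant for a model name and optionally unions in any custom columns found on the first output row. A small sketch with a made-up row:

    # With include_custom enabled and a row carrying a non-HSDS column:
    row = { "id" => "1", "name" => "Legal Aid", "intake_notes" => "call first" }  # hypothetical
    headers(row, "organizations")
    # => ORGANIZATIONS_HEADERS plus "intake_notes" ("id" and "name" dedupe via uniq)
    #
    # Without include_custom, or when the collection is empty, just ORGANIZATIONS_HEADERS.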
data/lib/hsds_transformer/runner.rb ADDED
@@ -0,0 +1,32 @@
+ module HsdsTransformer
+   class Runner
+
+     VALID_CUSTOM_TRANSFORMERS = %w(Open211MiamiTransformer IlaoTransformer)
+
+     # Args:
+     #   input_path - the dir containing the input data files
+     #   output_path - the dir where the resulting HSDS files should go
+     #   include_custom - Default: false - whether the final output CSVs should include the non-HSDS columns that the original input CSVs had
+     #   zip_output - Default: false - whether the output should be zipped into a single datapackage.zip
+     #   custom_transformer - Default: nil - the custom transformer class to use. This arg does not get passed to transformer classes
+     def self.run(args)
+       custom = args.delete(:custom_transformer)
+       validate_custom(custom)
+
+       transformer = custom ? custom_transformer(custom) : BaseTransformer
+
+       transformer.run(args)
+     end
+
+     def self.validate_custom(custom)
+       if custom && !VALID_CUSTOM_TRANSFORMERS.include?(custom)
+         raise InvalidCustomTransformerException
+       end
+     end
+
+     def self.custom_transformer(custom)
+       klass = "HsdsTransformer::" + custom
+       Object.const_get(klass)
+     end
+   end
+ end
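A hypothetical invocation, pulling the pieces together; the paths and mapping file are placeholders, and the arg names come from the Runner docs above.

    require "hsds_transformer"

    HsdsTransformer::Runner.run(
      input_path:         "/data/ilao/",              # dir containing the input CSVs
      output_path:        "/data/ilao/out",           # dir for the HSDS output
      mapping:            "/data/ilao/mapping.yaml",  # local path or http(s) URL
      include_custom:     true,                       # keep non-HSDS columns in the output
      zip_output:         true,                       # bundle output into datapackage.zip
      custom_transformer: "IlaoTransformer"           # one of VALID_CUSTOM_TRANSFORMERS, or omit
    )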
data/lib/hsds_transformer.rb ADDED
@@ -0,0 +1,15 @@
+ require "dotenv/load"
+ require "csv"
+ require "yaml"
+ require "net/http" # used by BaseTransformer#parse_mapping for remote mappings
+ require "zip"
+ require "rest_client"
+
+ require "hsds_transformer/file_paths"
+ require "hsds_transformer/headers"
+ require "hsds_transformer/exceptions"
+ require "hsds_transformer/runner"
+ require "hsds_transformer/base_transformer"
+
+ require "hsds_transformer/custom/open211_miami_transformer"
+ require "hsds_transformer/custom/ilao_transformer"
data/lib/support.rb ADDED
@@ -0,0 +1,31 @@
+ # TODO: implement validation
+ module Support
+   def validate(filename, type)
+     filename = filename.to_s
+     file = File.new(filename, 'rb')
+     RestClient.post('http://localhost:1400/validate/csv',
+                     {"file" => file,
+                      "type" => type})
+     return true
+   rescue RestClient::BadRequest
+     @valid = false
+     return false
+   end
+
+   def validate_output
+     unless validate(output_organizations_path, "organization")
+       puts "Organization data not valid"
+     end
+     unless validate(output_locations_path, "location")
+       puts "Location data not valid"
+     end
+     unless validate(output_services_path, "service")
+       puts "Service data not valid"
+     end
+     unless validate(output_phones_path, "phone")
+       puts "Phone data not valid"
+     end
+   rescue Errno::ECONNREFUSED
+     puts "Can't connect to validation service."
+   end
+ end