hsds_transformer 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/hsds_transformer/base_transformer.rb ADDED
@@ -0,0 +1,182 @@
+module HsdsTransformer
+  class BaseTransformer
+    include HsdsTransformer::Headers
+    include HsdsTransformer::FilePaths
+
+    attr_reader :mapping, :include_custom
+
+    SUPPORTED_HSDS_MODELS = %w(organizations services locations physical_addresses postal_addresses phones service_taxonomies regular_schedules taxonomies accessibility_for_disabilities contacts languages eligibilities services_at_locations service_areas)
+
+    def self.run(args)
+      new(args).transform
+    end
+
+    # TODO validate that incoming data is valid-ish, e.g. unique IDs
+    def initialize(args)
+      @mapping = parse_mapping(args[:mapping])
+
+      @include_custom = args[:include_custom]
+      @zip_output = args[:zip_output]
+
+      SUPPORTED_HSDS_MODELS.each do |model|
+        var_name = "@" + model
+        instance_variable_set(var_name, [])
+      end
+
+      set_file_paths(args)
+    end
+
+    def transform
+      # Initial transformation into HSDS
+      mapping.each do |input_file_name, file_mapping|
+        transform_file(input_file_name, file_mapping)
+      end
+
+      # HSDS additional formatting
+      singletonize_languages
+
+      apply_custom_transformation
+
+      # Make the data paths for these files
+      Dir.mkdir(output_datapackage_path) unless Dir.exist?(output_datapackage_path)
+      Dir.mkdir(output_data_path) unless Dir.exist?(output_data_path)
+
+      # Write the data to CSV files
+      write_output_files
+
+      zip_output if @zip_output
+
+      self
+    end
+
+    def transform_file(input_file_name, file_mapping)
+      path = @input_path + input_file_name
+      org_mapping = file_mapping["columns"]
+
+      # Process each row in a way that allows the row to create multiple objects,
+      # including multiple objects from the same row.
+      CSV.foreach(path, headers: true) do |input|
+        collected_data = hsds_objects_from_row(input, org_mapping)
+        collect_into_ivars(collected_data)
+      end
+    end
+
+    # This is overridden in a custom transformer if there is one
+    def apply_custom_transformation
+    end
+
+    private
+
+    def hsds_objects_from_row(input, org_mapping)
+      collected_data = {}
+
+      # k is the input field name;
+      # org_mapping[k] gives us the array of output fields
+      input.each do |k, v|
+        # Wrap single mappings in an array to stay backwards compatible
+        output_fields = org_mapping[k].is_a?(Array) ? org_mapping[k] : [org_mapping[k]]
+
+        # Now let's collect each object
+        output_fields.compact.each do |output_field|
+          # collected_data[output_field["model"]] should make it such that collected_data = { "organizations" => {} }
+          collected_data[output_field["model"]] ||= {}
+
+          # Append all string fields marked as "append" to a single output field
+          if output_field["append"]
+            existing_string_value = collected_data[output_field["model"]][output_field["field"]] || ""
+            existing_string_value += v.to_s unless null_type(v)
+
+            collected_data[output_field["model"]].merge!(output_field["field"] => existing_string_value)
+          else
+            value = output_field["map"] ? output_field["map"][v] : v
+            safe_val = null_type(value) ? nil : value
+            collected_data[output_field["model"]].merge!(output_field["field"] => safe_val)
+          end
+        end
+      end
+      collected_data
+    end
+
+    def null_type(string)
+      string.nil? || string.downcase.strip == "null"
+    end
+
+    # Pop each object into its respective instance variable collection so it gets written to the right file
+    def collect_into_ivars(collected_data)
+      SUPPORTED_HSDS_MODELS.each do |model|
+        collection_ivar(model) << collected_data[model] if collected_data[model] && !collected_data[model].empty?
+      end
+    end
+
+    def collection_ivar(model)
+      var_name = "@" + model
+      instance_variable_get(var_name)
+    end
+
+    def singletonize_languages
+      formatted_langs = @languages.each_with_object([]) do |language_row, array|
+        langs = language_row["language"].to_s.split(",")
+        if langs.size > 1
+          langs.each do |lang|
+            array << language_row.clone.merge("language" => lang.strip)
+          end
+        else
+          array << language_row
+        end
+      end
+      @languages = formatted_langs
+    end
+
+    def write_output_files
+      SUPPORTED_HSDS_MODELS.each do |model|
+        path_var = instance_variable_get "@output_#{model}_path"
+        write_csv path_var, headers(collection_ivar(model).first, model), collection_ivar(model)
+      end
+    end
+
+    def zip_output
+      input_data_files = Dir.glob(File.join(output_data_path, '**/*'))
+
+      File.delete(zipfile_name) if File.exist?(zipfile_name)
+
+      Zip::File.open(zipfile_name, Zip::File::CREATE) do |zipfile|
+        # Add datapackage.json
+        zipfile.add("datapackage.json", datapackage_json_path)
+
+        # Add data files
+        input_data_files.each do |file_path|
+          zipped_name = "data/" + File.basename(file_path)
+          zipfile.add(zipped_name, file_path)
+        end
+      end
+    end
+
+    # This also dedupes data by calling `uniq` on each collection before writing
+    def write_csv(path, headers, data)
+      return if data.empty?
+      CSV.open(path, 'wb') do |csv|
+        csv << headers
+        data.uniq.each do |row|
+          csv << CSV::Row.new(row.keys, row.values).values_at(*headers) unless row.values.all?(nil)
+        end
+      end
+    end
+
+    def parse_mapping(mapping_path)
+      if mapping_path.start_with?("http")
+        uri = URI(mapping_path)
+        file = Net::HTTP.get(uri)
+        YAML.load file
+      else
+        YAML.load File.read(mapping_path)
+      end
+    end
+  end
+end
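For orientation, here is a minimal mapping file consistent with how `transform_file` and `hsds_objects_from_row` read the structure above (the file, column, and value names are illustrative, not from the package):

    # mapping.yaml: one top-level key per input CSV file
    organizations.csv:
      columns:
        Agency Name:
          - model: organizations
            field: name
        Description Line 1:
          - model: organizations
            field: description
            append: true
        Status:
          - model: services
            field: status
            map:
              "Open": active
              "Shut": inactive

Each input row can fan out into several HSDS objects: `append` concatenates multi-column text into one field, and `map` translates source values before they are stored.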
data/lib/hsds_transformer/custom/ilao_transformer.rb ADDED
@@ -0,0 +1,80 @@
+module HsdsTransformer
+  class IlaoTransformer < HsdsTransformer::BaseTransformer
+
+    STATE_ABBREVIATIONS = %w(AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY)
+
+    def apply_custom_transformation
+      parse_address_data
+      # process_regular_schedule_text
+    end
+
+    private
+
+    def parse_address_data
+      # TODO do this for physical addresses too
+      @postal_addresses.each do |address_row|
+        address_str = address_row["address_1"]
+        postal_code = address_str.split(//).last(5).join
+        postal_code = postal_code.match(/\d{5}/)
+
+        # match returns MatchData or nil, so test for presence rather than comparing to ""
+        if postal_code
+          address_row["postal_code"] = postal_code.to_s
+          address_str = address_str[0..-7]
+        end
+
+        state = address_str.split(//).last(2).join.upcase
+
+        if STATE_ABBREVIATIONS.include?(state)
+          address_row["state_province"] = state
+          address_str = address_str[0..-5]
+        end
+
+        address_row["address_1"] = address_str
+      end
+    end
+
+    def process_regular_schedule_text(schedule_key:, schedule_hash:, input:)
+      if input["Hours of operation"]
+        day_entries = input["Hours of operation"].scan(/\S*day: \S*/)
+        day_entries.each do |entry|
+          day, hours = entry.split(": ")
+          if hours == "Closed"
+            opens_at = nil
+            closes_at = nil
+          else
+            opens_at, closes_at = hours.split("-")
+          end
+          collect_schedule_data(schedule_key: schedule_key,
+                                schedule_hash: schedule_hash, input: input,
+                                day: day, opens_at: opens_at, closes_at: closes_at)
+        end
+      end
+    end
+
+    def collect_schedule_data(schedule_key:, schedule_hash:, input:,
+                              day:, opens_at:, closes_at:)
+      schedule_row = {}
+      schedule_row["weekday"] = day
+      schedule_row["opens_at"] = opens_at
+      schedule_row["closes_at"] = closes_at
+
+      foreign_key = schedule_hash["foreign_key_name"]
+      foreign_key_value = schedule_hash["foreign_key_value"]
+      schedule_row[foreign_key] = input[foreign_key_value]
+      schedule_data << schedule_row
+    end
+
+    def collect_sal_data(sal_key:, sal_hash:, input:)
+      key = sal_hash["field"]
+      sal_row = {}
+      sal_row[key] = input[sal_key]
+
+      foreign_key = sal_hash["foreign_key_name"]
+      foreign_key_value = sal_hash["foreign_key_value"]
+      sal_row[foreign_key] = input[foreign_key_value]
+      sal_data << sal_row
+    end
+  end
+end
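A quick trace of parse_address_data on one row (the address value is illustrative):

    # address_row["address_1"] = "120 S LaSalle St, Chicago, IL 60603"
    # last 5 chars "60603" match /\d{5}/   -> postal_code = "60603"
    # address_str[0..-7]                   -> "120 S LaSalle St, Chicago, IL"
    # last 2 chars "IL" are a known state  -> state_province = "IL"
    # address_str[0..-5]                   -> "120 S LaSalle St, Chicago"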
data/lib/hsds_transformer/custom/open211_miami_transformer.rb ADDED
@@ -0,0 +1,168 @@
+module HsdsTransformer
+  class Open211MiamiTransformer < HsdsTransformer::BaseTransformer
+    WEEKDAYS = %w(Monday Tuesday Wednesday Thursday Friday)
+    ALL_DAYS = %w(Monday Tuesday Wednesday Thursday Friday Saturday Sunday)
+    DAY_MAPPING = {
+      "mon" => "Monday",
+      "tue" => "Tuesday",
+      "wed" => "Wednesday",
+      "thu" => "Thursday",
+      "fri" => "Friday",
+      "sat" => "Saturday",
+      "sun" => "Sunday",
+    }
+
+    TOP_LEVEL_TAXONOMIES = {
+      "B" => "Basic Needs",
+      "D" => "Consumer Services",
+      "F" => "Criminal Justice and Legal Services",
+      "H" => "Education",
+      "J" => "Environmental Quality",
+      "L" => "Health Care",
+      "N" => "Income Support and Employment",
+      "P" => "Individual and Family Life",
+      "R" => "Mental Health Care and Counseling",
+      "T" => "Organizational/Community/International Services",
+      "Y" => "Target Populations"
+    }
+
+    TAXONOMY_VOCAB = "Open211 Miami - AIRS"
+
+    def apply_custom_transformation
+      remove_child_organizations
+      determine_services
+      parse_regular_schedules_text
+      supplement_taxonomy
+    end
+
+    private
+
+    def determine_services
+      @services.each do |service|
+        # Update the name to remove the org name
+        formatted_name = service["name"].to_s.split(" - ").last
+        service.merge!("name" => formatted_name)
+
+        # Set the org ID from the parent provider ID
+        unless service["parent_provider_id"].nil?
+          service.merge!("organization_id" => service["parent_provider_id"])
+        end
+        service.delete "parent_provider_id"
+      end
+    end
+
+    # TODO figure out what to do with 24 hour text
+    # TODO add IDs
+    def parse_regular_schedules_text
+      new_schedules = @regular_schedules.each_with_object([]) do |sched_row, new_scheds|
+        # Schedule times and tidbits are mostly separated by a newline
+        sched_options = sched_row["original_text"].to_s.split("\n")
+
+        sched_options.each do |opt|
+          opt_days = find_days(opt)
+          if all_weekdays?(opt_days)
+            sched_days = WEEKDAYS
+          elsif single_days?(opt_days)
+            sched_days = single_days(opt_days)
+          else
+            sched_days = []
+          end
+
+          sched_days.each do |day|
+            new_scheds << new_sched_row(day, opt, sched_row)
+          end
+        end
+      end
+
+      @regular_schedules = new_schedules
+    end
+
+    def find_days(opt_string)
+      # The day tokens come after the first comma; [1..-1] is nil for empty lines, so default to []
+      strings = (opt_string.to_s.split(", ")[1..-1] || []).compact.flatten
+      strings.map(&:downcase)
+    end
+
+    def all_weekdays?(days)
+      days == ["mon-fri"]
+    end
+
+    def single_days?(days)
+      !single_days(days).empty?
+    end
+
+    def single_days(days)
+      DAY_MAPPING.select { |abbr, _day| days.include?(abbr) }.values
+    end
+
+    def hours(opt)
+      range = opt.split(", ")[0]
+      times = range.split("-")
+      return unless times.size == 2
+
+      open = clean_time(times[0])
+      close = clean_time(times[1])
+
+      [open, close]
+    end
+
+    # Finds the time in strings like "Admin:\\n9:00am", "9am", "9:0a", "10:00pm"
+    def clean_time(time)
+      /\d{1,2}.*\z/.match(time).to_s
+    end
+
+    def new_sched_row(day, opt, sched_row)
+      open, close = hours(opt)
+      {
+        "service_id" => sched_row["service_id"],
+        "weekday" => day,
+        "opens_at" => open,
+        "closes_at" => close,
+        "original_text" => sched_row["original_text"]
+      }
+    end
+
+    def remove_child_organizations
+      @organizations.reject! do |org|
+        !org["parent_provider_id"].nil?
+      end
+
+      @organizations.each { |org| org.delete("parent_provider_id") }
+    end
+
+    def supplement_taxonomy
+      @taxonomies.each do |tax_row|
+        if tax_row["id"].length == 1
+          category = nil # Already top-level
+        else
+          category = tax_row["id"][0]
+        end
+
+        suppl_attrs = {
+          "parent_id" => category,
+          "parent_name" => TOP_LEVEL_TAXONOMIES[category],
+          "vocabulary" => TAXONOMY_VOCAB
+        }
+
+        tax_row.merge!(suppl_attrs)
+      end
+
+      @taxonomies.concat(top_level_taxonomies)
+    end
+
+    def top_level_taxonomies
+      TOP_LEVEL_TAXONOMIES.map do |key, value|
+        {
+          "id" => key,
+          "name" => value,
+          "taxonomy_facet" => "Service",
+          "parent_id" => nil,
+          "parent_name" => nil,
+          "vocabulary" => TAXONOMY_VOCAB
+        }
+      end
+    end
+  end
+end
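A trace of parse_regular_schedules_text on one row (the original_text value is illustrative):

    # sched_row["original_text"] = "8:00am-5:00pm, Mon-Fri\n24 Hour Hotline"
    # "8:00am-5:00pm, Mon-Fri": find_days -> ["mon-fri"], all_weekdays? -> true,
    #   hours -> ["8:00am", "5:00pm"]  => five rows, Monday through Friday
    # "24 Hour Hotline": find_days -> []  => no rows (see the 24-hour TODO above)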
data/lib/hsds_transformer/exceptions.rb ADDED
@@ -0,0 +1,5 @@
+module HsdsTransformer
+
+  class InvalidCustomTransformerException < ::Exception; end
+
+end
data/lib/hsds_transformer/file_paths.rb ADDED
@@ -0,0 +1,40 @@
+module HsdsTransformer
+  module FilePaths
+    DEFAULT_OUTPUT_PATH = "#{ENV["ROOT_PATH"]}/tmp"
+    DEFAULT_INPUT_PATH = "#{ENV["ROOT_PATH"]}/"
+
+    attr_reader :input_path, :output_path, :output_datapackage_path, :output_data_path, :datapackage_json_path,
+                :zipfile_name, :output_organizations_path, :output_locations_path, :output_services_path,
+                :output_phones_path, :output_physical_addresses_path, :output_postal_addresses_path,
+                :output_services_at_locations_path, :output_eligibilities_path, :output_contacts_path,
+                :output_languages_path, :output_accessibility_for_disabilities_path, :output_taxonomies_path,
+                :output_service_taxonomies_path, :output_regular_schedules_path, :output_service_areas_path
+
+    # TODO DRY this up
+    def set_file_paths(args)
+      @input_path = args[:input_path] || DEFAULT_INPUT_PATH
+      @output_path = args[:output_path] || DEFAULT_OUTPUT_PATH
+      @output_datapackage_path = File.join(output_path, "datapackage")
+      @output_data_path = File.join(output_datapackage_path, "data")
+      @zipfile_name = File.join(output_path, "datapackage.zip")
+
+      @output_organizations_path = output_data_path + "/organizations.csv"
+      @output_locations_path = output_data_path + "/locations.csv"
+      @output_services_path = output_data_path + "/services.csv"
+      @output_phones_path = output_data_path + "/phones.csv"
+      @output_physical_addresses_path = output_data_path + "/physical_addresses.csv"
+      @output_postal_addresses_path = output_data_path + "/postal_addresses.csv"
+      @output_services_at_locations_path = output_data_path + "/services_at_location.csv"
+      @output_eligibilities_path = output_data_path + "/eligibility.csv"
+      @output_contacts_path = output_data_path + "/contacts.csv"
+      @output_languages_path = output_data_path + "/languages.csv"
+      @output_accessibility_for_disabilities_path = output_data_path + "/accessibility_for_disabilities.csv"
+      @output_taxonomies_path = output_data_path + "/taxonomy.csv"
+      @output_service_taxonomies_path = output_data_path + "/services_taxonomy.csv"
+      @output_regular_schedules_path = output_data_path + "/regular_schedules.csv"
+      @output_service_areas_path = output_data_path + "/service_areas.csv"
+
+      @datapackage_json_path = File.join(ENV["ROOT_PATH"], "lib/datapackage/datapackage.json")
+    end
+  end
+end
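With output_path set to ./out (illustrative), these paths lay out as:

    ./out/
      datapackage.zip        (written only when zip_output is set)
      datapackage/
        data/
          organizations.csv
          locations.csv
          services.csv
          ... one CSV per supported HSDS model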
data/lib/hsds_transformer/headers.rb ADDED
@@ -0,0 +1,31 @@
+module HsdsTransformer
+  module Headers
+    ORGANIZATIONS_HEADERS = %w(id name alternate_name description email url tax_status tax_id year_incorporated legal_status)
+    LOCATIONS_HEADERS = %w(id organization_id name alternate_name description transportation latitude longitude)
+    SERVICES_HEADERS = %w(id organization_id program_id name alternate_name description url email status interpretation_services application_process wait_time fees accreditations licenses)
+    PHONES_HEADERS = %w(id location_id service_id organization_id contact_id service_at_location_id number extension type language description)
+    PHYSICAL_ADDRESSES_HEADERS = %w(id location_id organization_id attention address_1 city region state_province postal_code country)
+    POSTAL_ADDRESSES_HEADERS = %w(id location_id organization_id attention address_1 city region state_province postal_code country)
+    REGULAR_SCHEDULES_HEADERS = %w(id service_id location_id service_at_location_id weekday opens_at closes_at)
+    SERVICES_AT_LOCATIONS_HEADERS = %w(id service_id location_id description)
+    ELIGIBILITIES_HEADERS = %w(id service_id eligibility)
+    CONTACTS_HEADERS = %w(id organization_id service_id service_at_location_id name title department email)
+    LANGUAGES_HEADERS = %w(id service_id location_id language)
+    ACCESSIBILITY_FOR_DISABILITIES_HEADERS = %w(id location_id accessibility details)
+    TAXONOMIES_HEADERS = %w(id name parent_id parent_name vocabulary)
+    SERVICE_TAXONOMIES_HEADERS = %w(id service_id taxonomy_id taxonomy_detail)
+    SERVICE_AREAS_HEADERS = %w(id service_id service_area description)
+
+    def headers(row, model)
+      const_name = "HsdsTransformer::Headers::" + model.upcase + "_HEADERS"
+      # TODO make sure valid
+      const = Object.const_get(const_name)
+
+      if row && @include_custom
+        (const + row.keys).uniq
+      else
+        const
+      end
+    end
+  end
+end
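A sketch of what headers returns (the "legacy_code" column is illustrative):

    # With include_custom on, non-HSDS input columns are appended after the spec columns:
    # headers({ "id" => "1", "name" => "Org", "legacy_code" => "X1" }, "organizations")
    #   #=> ORGANIZATIONS_HEADERS + ["legacy_code"]   ("id" and "name" dedupe via uniq)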
data/lib/hsds_transformer/runner.rb ADDED
@@ -0,0 +1,32 @@
+module HsdsTransformer
+  class Runner
+
+    VALID_CUSTOM_TRANSFORMERS = %w(Open211MiamiTransformer IlaoTransformer)
+
+    # Args:
+    #   input_path - the dir containing the input data files
+    #   output_path - the dir where the resulting HSDS files should go
+    #   include_custom - Default: false - whether the final output CSVs should include the non-HSDS columns that the original input CSVs had
+    #   zip_output - Default: false - whether the output should be zipped into a single datapackage.zip
+    #   custom_transformer - Default: nil - the custom transformer class to use. This arg does not get passed to transformer classes
+    def self.run(args)
+      custom = args.delete(:custom_transformer)
+      validate_custom(custom)
+
+      transformer = custom ? custom_transformer(custom) : BaseTransformer
+
+      transformer.run(args)
+    end
+
+    def self.validate_custom(custom)
+      if custom && !VALID_CUSTOM_TRANSFORMERS.include?(custom)
+        raise InvalidCustomTransformerException
+      end
+    end
+
+    def self.custom_transformer(custom)
+      klass = "HsdsTransformer::" + custom
+      Object.const_get(klass)
+    end
+  end
+end
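A usage sketch (paths illustrative; input_path is concatenated directly with the mapped file names, so it should end with a slash):

    HsdsTransformer::Runner.run(
      mapping: "/data/mapping.yaml",          # local path or http(s) URL
      input_path: "/data/source/",
      output_path: "/data/out",
      custom_transformer: "IlaoTransformer",  # must be one of VALID_CUSTOM_TRANSFORMERS
      include_custom: true,
      zip_output: true
    )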
data/lib/hsds_transformer.rb ADDED
@@ -0,0 +1,15 @@
+require "dotenv/load"
+require "csv"
+require "yaml"
+require "uri"
+require "net/http" # used by BaseTransformer#parse_mapping to fetch remote mappings
+require "zip"
+require "zip/zip"
+require "rest_client"
+
+require "hsds_transformer/file_paths"
+require "hsds_transformer/headers"
+require "hsds_transformer/exceptions"
+require "hsds_transformer/runner"
+require "hsds_transformer/base_transformer"
+
+require "hsds_transformer/custom/open211_miami_transformer"
+require "hsds_transformer/custom/ilao_transformer"
data/lib/support.rb ADDED
@@ -0,0 +1,31 @@
+# TODO implement validation
+module Support
+  def validate(filename, type)
+    file = File.new(filename.to_s, 'rb')
+    RestClient.post('http://localhost:1400/validate/csv',
+                    { "file" => file,
+                      "type" => type })
+    true
+  rescue RestClient::BadRequest
+    @valid = false
+    false
+  end
+
+  def validate_output
+    unless validate(output_organizations_path, "organization")
+      puts "Organization data not valid"
+    end
+    unless validate(output_locations_path, "location")
+      puts "Location data not valid"
+    end
+    unless validate(output_services_path, "service")
+      puts "Service data not valid"
+    end
+    unless validate(output_phones_path, "phone")
+      puts "Phone data not valid"
+    end
+  rescue Errno::ECONNREFUSED
+    puts "Can't connect to validation service."
+  end
+end