stats_package_syntax_file_generator 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,126 @@
1
+ # This file is part of the Minnesota Population Center's stats_package_syntax_file_generator project.
2
+ # For copyright and licensing information, see the NOTICE and LICENSE files
3
+ # in this project's top-level directory, and also on-line at:
4
+ # https://github.com/mnpopcenter/stats_package_syntax_file_generator
5
+
6
+ module StatsPackageSyntaxFileGenerator
7
class Maker
  # Base class for the package-specific syntax makers. It holds the helpers
  # the subclasses share: quoting, value formatting, splitting long labels
  # into segments, and the explanatory comments placed at the start or end
  # of a generated syntax file.

  # Matches a string that consists solely of an optionally negative integer.
  # Anchored with \A and \z so that a multi-line value is never mistaken for
  # an integer merely because one of its lines looks like one.
  INTEGER_PATTERN = /\A-?\d+\z/

  attr_reader :sfc, :syntax_type
  attr_accessor :cmd_end

  # sfc         - object supplying the settings read by the makers (this
  #               class only reads `sfc.caller`).
  # syntax_type - name of the target package; compared against 'stata' and
  #               'sas' in comments_start.
  def initialize(sfc, syntax_type)
    @sfc         = sfc
    @syntax_type = syntax_type
    @cmd_end     = ''
  end


  # Syntax terminator.

  # Returns the command terminator followed by a blank line.
  def syntax_end
    [@cmd_end, blank]
  end

  # Returns the empty string used to represent a blank output line.
  def blank
    ''
  end


  # Quoting methods.

  # Wraps s in double quotes, doubling any embedded double quotes.
  def q(s)
    '"' + s.to_s.gsub('"', '""') + '"'
  end

  # Quotes v only when var is a string variable; otherwise returns v as a
  # plain string.
  def val_q(var, v)
    var.is_string_var ? q(v) : v.to_s
  end

  # Writing a value in a syntax file varies by variable type:
  #   - Numeric variable: simply return the value as a string.
  #   - String variable:  zero-pad the value (to var.width) if it looks like
  #                       an integer.
  def val_as_s(var, val_orig)
    v = val_orig.to_s
    return v unless var.is_string_var
    return v unless v =~ INTEGER_PATTERN
    sprintf("%0#{var.width}i", v.to_i)
  end


  # Methods to deal with long labels.

  # Returns at most the first `limit` characters of label (nil becomes '').
  def label_trunc(label, limit)
    label.to_s[0, limit]
  end

  # Takes a string and a max length.
  # Returns the array of strings that results from chopping the
  # original string into segments no longer than max length.
  # This is needed because some stats packages have max line lengths.
  #
  # The caller's string is never modified. Raises ArgumentError when a
  # non-empty label would have to be cut into segments of non-positive
  # length (which could never terminate).
  def label_segments(label, max_length)
    label = label.to_s
    return [label] if label.length <= max_length
    return [] if label.empty?
    raise ArgumentError, 'max_length must be positive' unless max_length > 0

    (0...label.length).step(max_length).map { |start| label[start, max_length] }
  end

  # The function takes a sprintf format, two lists (a, b), and
  # two strings (the assignment and concatenation operators used
  # by the stats package). The purpose of the function is to handle
  # long values and labels for stats packages that have a max syntax
  # line length.
  #
  # Output lines, in order:
  #   - every element of a except the last, followed by op_c;
  #   - the last element of a, op_a, and the first element of b;
  #   - each remaining element of b, preceded by op_c.
  #
  # Neither a nor b is modified.
  def weave_label_segments(fmt, a, b, op_a, op_c)
    *a_leading, a_last = a
    b_first, *b_trailing = b

    lines = a_leading.map { |seg| sprintf(fmt, seg, op_c, '') }
    lines << sprintf(fmt, a_last, op_a, b_first)
    lines.concat(b_trailing.map { |seg| sprintf(fmt, '', op_c, seg) })
  end


  # Helper methods for values and their labels.

  # For non-string variables, only values that look
  # like integers can be labeled.
  def labelable_values(var)
    return var.values if var.is_string_var
    var.values.find_all { |val| val.value.to_s =~ INTEGER_PATTERN }
  end

  # Length of the longest value in val_list once formatted by val_as_s;
  # 0 for an empty list.
  def max_value_length(var, val_list)
    return 0 if val_list.empty?
    val_list.map { |val| val_as_s(var, val.value).length }.max
  end


  # Methods for comments at the start or end of the syntax file.

  # Returns the lines of the introductory note. Comments are not needed
  # unless the syntax file is for the web app.
  def comments_start
    return [] unless @sfc.caller == 'web_app'

    if @syntax_type == 'stata'
      return [
        'NOTE: You need to set the Stata working directory to the path',
        'where the data file is located.',
      ]
    end

    for_sas = (@syntax_type == 'sas')
    cmd     = for_sas ? 'libname' : 'cd'
    result  = [
      "NOTE: You need to edit the `#{cmd}` command to specify the path to the directory",
      'where the data file is located. For example: "C:\ipums_directory".'
    ]
    # SAS additionally needs the full path in its `filename` command.
    result << "Edit the `filename` command similarly to include the full path (the directory and the data file name)." if for_sas
    result
  end

  # No closing comments by default.
  def comments_end
    []
  end

end
126
+ end
@@ -0,0 +1,306 @@
1
+ # This file is part of the Minnesota Population Center's stats_package_syntax_file_generator project.
2
+ # For copyright and licensing information, see the NOTICE and LICENSE files
3
+ # in this project's top-level directory, and also on-line at:
4
+ # https://github.com/mnpopcenter/stats_package_syntax_file_generator
5
+
6
+ module StatsPackageSyntaxFileGenerator
7
class MakerSAS < Maker
  # Builds the lines of a SAS syntax file from the settings object held in
  # @sfc. Every syn_* method returns either a single line (String) or a
  # flat Array of lines; `syntax` assembles them into the full file.
  #
  # All list-building methods use the non-destructive Array#flatten:
  # Array#flatten! returns nil when it has nothing to flatten, which made
  # several methods return nil instead of [] for empty input.

  def initialize(sfc, syntax_type)
    super
    m = @sfc.max_var_name_length
    # sprintf formats; variable names are left-justified to the longest name.
    @var_loc_format    = " %-#{m}s "
    @var_lab_format    = " %-#{m}s %s %s"
    @fmt_link_format   = " %-#{m}s %s."
    @bignum_int_format = " %-#{m}s %d."
    @bignum_dec_format = " %-#{m}s %d.%d"
    @cmd_end            = ';'
    @label_max_leng     = 256   # labels are truncated to this many characters
    @segment_max_leng   = 100   # longer values/labels are split into segments
    @sas_library_handle = 'IPUMS'
    @sas_file_handle    = 'ASCIIDAT'
    @sas_fmt_suffix     = '_f'
    @sas_data_file_name = @sas_library_handle + '.' + @sfc.data_file_name_stem
  end

  # Returns the complete syntax file as a flat array of lines.
  def syntax
    [
      comments_start,
      syn_libname,
      syn_filename,
      blank,
      syn_val_labs,
      syn_df,
      syn_var_labs,
      syn_fmt_link,
      syn_fmt_big_nums,
      syn_run,
      comments_end,
    ].flatten
  end

  def comments_start
    convert_to_comments(super)
  end

  def comments_end
    convert_to_comments(super)
  end

  # Wraps the given lines in a /* ... */ block followed by a blank line.
  # Returns [] when there is nothing to wrap.
  def convert_to_comments(lines)
    return [] if lines.empty?
    ['/*'] + lines.map { |ln| ' ' + ln } + ['*/', blank]
  end

  def syn_libname
    'libname ' + @sas_library_handle + ' ' + q(@sfc.data_dir_name) + @cmd_end
  end

  def syn_filename
    'filename ' + @sas_file_handle + ' ' + q(@sfc.data_file_name) + @cmd_end
  end

  # The `proc format` section defining the value labels, or [] when no
  # variable has values.
  def syn_val_labs
    var_list = @sfc.get_vars_with_values
    return [] if var_list.empty?
    [
      syn_proc_format,
      blank,
      syn_val_labs_for_var_list(var_list),
      syn_run,
    ].flatten
  end

  def syn_proc_format
    'proc format cntlout = ' + @sas_data_file_name + @sas_fmt_suffix + @cmd_end
  end

  # One `value` statement (start line, label lines, terminator) per variable.
  def syn_val_labs_for_var_list(var_list)
    statements = var_list.map do |var|
      [
        syn_val_labs_for_var_start(var),
        syn_val_labs_for_var(var),
        syntax_end,
      ]
    end
    statements.flatten
  end

  # String variables get a `$` before the format name.
  def syn_val_labs_for_var_start(var)
    'value' + (var.is_string_var ? ' $ ' : ' ') + var.name + @sas_fmt_suffix
  end

  # The value = label lines for one variable, or [] when none of its values
  # can be labeled.
  def syn_val_labs_for_var(var)
    val_list = labelable_values(var)
    return [] if val_list.empty?
    width = max_value_length(var, val_list)
    width += 2 if var.is_string_var                # room for the quotes
    width = [width, @segment_max_leng + 2].min     # cap at one quoted segment
    fmt = " %-#{width}s %s %s"
    val_list.map { |val| syn_val_lab_for_val(var, val, fmt) }.flatten
  end

  # Lines for a single value and its label. A blank label falls back to the
  # value itself; long values and labels are split into segments.
  def syn_val_lab_for_val(var, val, fmt)
    lab = label_trunc(val.label, @label_max_leng)
    lab = val.value if lab.nil? || lab.strip.empty?
    vs  = val_as_s(var, val.value)
    val_segments = label_segments(vs,  @segment_max_leng).map { |s| val_q(var, s) }
    lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
    weave_label_segments(fmt, val_segments, lab_segments, '=', ' ')
  end

  def syn_run
    ['run' + @cmd_end, blank]
  end

  # The data step: `data`, `infile`, then the input statements for either a
  # hierarchical ('hier') or a rectangular data structure.
  def syn_df
    [
      syn_df_start,
      syn_df_infile,
      blank,
      @sfc.data_structure == 'hier' ? syn_dfh : syn_dfr,
    ].flatten
  end

  def syn_df_start
    'data ' + @sas_data_file_name + @cmd_end
  end

  # The LRECL specification is needed because the default behavior on some
  # operating systems is to truncate records to 256 columns.
  def syn_df_infile
    c = @sfc.last_column_used
    'infile ' + @sas_file_handle + ' pad missover lrecl=' + c.to_s + @cmd_end
  end

  # Input statement for rectangular data, followed by a blank line.
  def syn_dfr
    syn_input(@sfc.variables).push(blank)
  end

  # Returns ['input', <one line per variable>..., terminator].
  def syn_input(var_list)
    ['input', syn_var_locations(var_list), @cmd_end].flatten
  end

  # One line per variable: name, `$` for string variables, column
  # locations, and the implied-decimal suffix if any.
  def syn_var_locations(var_list)
    var_list.map do |v|
      sprintf(@var_loc_format, v.name) +
        (v.is_string_var ? '$ ' : ' ') +
        v.column_locations_as_s +
        implied_decimal_fmt(v)
    end
  end

  # Input statements for hierarchical data.
  def syn_dfh
    [
      syn_dfh_retain,
      syn_dfh_rec_type_block,
      blank,
      syn_dfh_if_blocks,
    ].flatten
  end

  # The `retain` statement, emitted only when rectangularizing and only
  # when there is at least one variable to retain.
  def syn_dfh_retain
    return [] unless @sfc.rectangularize
    var_list = non_last_non_common_vars
    return [] if var_list.empty?
    ['retain', var_list.map { |var| ' ' + var.name }, syntax_end].flatten
  end

  # Input statement that reads only the record type variable; ' @' is
  # appended to that variable's line.
  def syn_dfh_rec_type_block
    r = syn_input([@sfc.record_type_var])
    r[1] += ' @'
    r
  end

  # An if / else if block per record type, each containing that record
  # type's input statement.
  def syn_dfh_if_blocks
    if_cmd = 'if'
    r = []
    @sfc.record_types.each do |rt|
      r.push(
        syn_dfh_if_start(if_cmd, rt),
        syn_input(@sfc.get_vars_by_record_type(rt)),
        syn_dfh_if_end(rt)
      )
      if_cmd = 'else if'   # every block after the first
    end
    r.flatten
  end

  def syn_dfh_if_start(if_cmd, rt)
    rt_var = @sfc.record_type_var
    [
      if_cmd,
      rt_var.name,
      '=',
      val_q(rt_var, val_as_s(rt_var, rt)),
      'then do' + @cmd_end,
    ].join(' ')
  end

  # Closes an if block. `output` is written for every record type unless
  # rectangularizing, in which case only for the last record type.
  def syn_dfh_if_end(rt)
    r = []
    r.push('output' + @cmd_end) if !@sfc.rectangularize || @sfc.is_last_record_type(rt)
    r.push('end' + @cmd_end)
    r.push(blank)
    r
  end

  # The `label` statement, or [] when no variable has a label.
  def syn_var_labs
    var_list = @sfc.get_vars_with_var_labels
    return [] if var_list.empty?
    [
      'label',
      var_list.map { |var| syn_var_lab_for_var(var) },
      syntax_end,
    ].flatten
  end

  def syn_var_lab_for_var(var)
    lab = label_trunc(var.label, @label_max_leng)
    lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
    weave_label_segments(@var_lab_format, [var.name], lab_segments, '=', ' ')
  end

  # The `format` statement for the variables returned by get_big_nums,
  # or [] when there are none.
  def syn_fmt_big_nums
    big_num_vars = @sfc.get_big_nums
    return [] if big_num_vars.empty?
    [
      'format',
      syn_fmt_big_nums_for_var_list(big_num_vars),
      syntax_end,
    ].flatten
  end

  # Variables with implied decimals get a width.decimals format whose width
  # is one larger than the variable's width; others get an integer format.
  def syn_fmt_big_nums_for_var_list(var_list)
    var_list.map do |v|
      if v.implied_decimals > 0
        sprintf(@bignum_dec_format, v.name, v.width + 1, v.implied_decimals)
      else
        sprintf(@bignum_int_format, v.name, v.width)
      end
    end
  end


  # The `format` statement linking each variable that has values to its
  # value-label format, or [] when there are none.
  def syn_fmt_link
    var_list = @sfc.get_vars_with_values
    return [] if var_list.empty?
    [
      'format',
      syn_fmt_link_for_var_list(var_list),
      syntax_end,
    ].flatten
  end

  def syn_fmt_link_for_var_list(var_list)
    var_list.map { |v| sprintf(@fmt_link_format, v.name, v.name + @sas_fmt_suffix) }
  end

  # Suffix giving the number of implied decimals; empty for string
  # variables and for variables without implied decimals.
  def implied_decimal_fmt(var)
    return '' if var.is_string_var || var.implied_decimals == 0
    ' .' + var.implied_decimals.to_s
  end

  # Returns a list of variables, excluding:
  #   - variables from the last record type
  #   - common variables
  # Always returns an Array (empty when there is nothing to list).
  def non_last_non_common_vars
    per_record_type = @sfc.rec_types_except_last.map do |rt|
      @sfc.get_vars_by_record_type(rt).reject { |var| var.is_common_var }
    end
    per_record_type.flatten
  end

end
306
+ end
@@ -0,0 +1,194 @@
1
+ # This file is part of the Minnesota Population Center's stats_package_syntax_file_generator project.
2
+ # For copyright and licensing information, see the NOTICE and LICENSE files
3
+ # in this project's top-level directory, and also on-line at:
4
+ # https://github.com/mnpopcenter/stats_package_syntax_file_generator
5
+
6
+ module StatsPackageSyntaxFileGenerator
7
class MakerSPSS < Maker
  # Builds the lines of an SPSS syntax file from the settings object held
  # in @sfc. Every syn_* method returns either a single line (String) or an
  # Array of lines; `syntax` assembles them into the full file.

  def initialize(sfc, syntax_type)
    super
    m = @sfc.max_var_name_length
    # sprintf formats; variable names are left-justified to the longest name.
    @var_loc_format = " %-#{m}s %s"
    @var_lab_format = " %-#{m}s %s %s"
    @cmd_end            = '.'
    @var_label_max_leng = 120   # variable labels are truncated to this length
    @val_label_max_leng = 120   # value labels are truncated to this length
    @segment_max_leng   = 100   # longer values/labels are split into segments
  end

  # Returns the complete syntax file as a flat array of lines.
  def syntax
    [
      comments_start,
      syn_cd,
      blank,
      syn_df,
      syn_var_labs,
      syn_val_labs,
      syn_execute,
      blank,
      comments_end,
    ].flatten
  end

  def comments_start
    convert_to_comments(super)
  end

  def comments_end
    convert_to_comments(super)
  end

  # Prefixes each line with '* ', adds a final '* ' line carrying the
  # command terminator, and appends a blank line. Returns [] when there is
  # nothing to convert. The caller's array is left untouched.
  def convert_to_comments(lines)
    return [] if lines.empty?
    (lines + [@cmd_end]).map { |ln| '* ' + ln } + [blank]
  end

  def syn_cd
    'cd ' + q(@sfc.data_dir_name) + @cmd_end
  end

  # Data definition for either a hierarchical ('hier') or a rectangular
  # data structure.
  def syn_df
    @sfc.data_structure == 'hier' ? syn_dfh : syn_dfr
  end

  # `data list` command for rectangular data.
  def syn_dfr
    [
      syn_dfr_start,
      syn_var_locations(@sfc.variables),
      syntax_end,
    ].flatten
  end

  def syn_dfr_start
    'data list file = ' + q(@sfc.data_file_name) + ' /'
  end

  # `file type ... end file type` block for hierarchical data.
  def syn_dfh
    [
      syn_dfh_file_type,
      syn_dfh_data_blocks,
      'end file type' + @cmd_end,
      blank,
    ].flatten
  end

  def syn_dfh_file_type
    [
      'file type ' + nested_or_mixed,
      ' /file = ' + q(@sfc.data_file_name),
      ' /record = ' + var_loc_with_fmt(@sfc.record_type_var).to_s,
      syntax_end,
    ].flatten
  end

  # One `record type` / `data list` block per record type.
  def syn_dfh_data_blocks
    blocks = @sfc.record_types.map do |rt|
      [
        syn_dfh_data_block_start(rt),
        syn_var_locations(@sfc.get_vars_by_record_type(rt)),
        syntax_end,
      ]
    end
    blocks.flatten
  end

  def syn_dfh_data_block_start(rt)
    rt_var = @sfc.record_type_var
    [
      'record type ' + val_q(rt_var, val_as_s(rt_var, rt)) + @cmd_end,
      'data list /',
    ]
  end

  # One line per variable: name followed by its location and format.
  def syn_var_locations(var_list)
    var_list.map { |v| sprintf(@var_loc_format, v.name, var_loc_with_fmt(v)) }
  end

  # The `variable labels` command. With no (or an empty) var_list, the
  # variables returned by @sfc.get_vars_with_var_labels are used; returns []
  # when there are none.
  def syn_var_labs(var_list = [])
    var_list = @sfc.get_vars_with_var_labels if var_list.empty?
    return [] if var_list.empty?
    [
      'variable labels',
      var_list.map { |var| syn_var_lab_for_var(var) },
      syntax_end,
    ].flatten
  end

  def syn_var_lab_for_var(var)
    lab = label_trunc(var.label, @var_label_max_leng)
    lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
    weave_label_segments(@var_lab_format, [var.name], lab_segments, ' ', '+')
  end

  # The `value labels` command, or [] when no variable has values.
  def syn_val_labs
    var_list = @sfc.get_vars_with_values
    return [] if var_list.empty?
    [
      'value labels',
      syn_val_labs_for_var_list(var_list),
      syntax_end,
    ].flatten
  end

  # Returns one array of lines per variable (not flattened).
  def syn_val_labs_for_var_list(var_list)
    var_list.map { |var| syn_val_labs_for_var(var) }
  end

  # The ' /NAME' line followed by the value/label lines for one variable,
  # or [] when none of its values can be labeled.
  def syn_val_labs_for_var(var)
    val_list = labelable_values(var)
    return [] if val_list.empty?
    width = max_value_length(var, val_list)
    width += 2 if var.is_string_var                # room for the quotes
    width = [width, @segment_max_leng + 2].min     # cap at one quoted segment
    value_format = " %-#{width}s %s %s"
    [
      syn_val_labs_for_var_start(var),
      val_list.map { |val| syn_val_lab_for_val(var, val, value_format) },
    ].flatten
  end

  def syn_val_labs_for_var_start(var)
    ' /' + var.name
  end

  # Lines for a single value and its label. A blank label falls back to the
  # value itself; long values and labels are split into segments joined
  # with '+'.
  def syn_val_lab_for_val(var, val, fmt)
    lab = label_trunc(val.label, @val_label_max_leng)
    lab = val.value if lab.nil? || lab.strip.empty?
    vs  = val_as_s(var, val.value)
    val_segments = label_segments(vs,  @segment_max_leng).map { |s| val_q(var, s) }
    lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
    weave_label_segments(fmt, val_segments, lab_segments, ' ', '+')
  end

  def syn_execute
    'execute' + @cmd_end
  end

  # 'nested' when rectangularizing, 'mixed' otherwise.
  def nested_or_mixed
    @sfc.rectangularize ? 'nested' : 'mixed'
  end

  def var_loc_with_fmt(var)
    var.column_locations_as_s + var_fmt(var)
  end

  # Format suffix: ' (a)' for string variables, ' (N)' for N implied
  # decimals, and nothing otherwise.
  def var_fmt(var)
    return ' (a)' if var.is_string_var
    return '' unless var.implied_decimals > 0
    ' (' + var.implied_decimals.to_s + ')'
  end

end
194
+ end