stats_package_syntax_file_generator 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ # This file is part of the Minnesota Population Center's stats_package_syntax_file_generator project.
2
+ # For copyright and licensing information, see the NOTICE and LICENSE files
3
+ # in this project's top-level directory, and also on-line at:
4
+ # https://github.com/mnpopcenter/stats_package_syntax_file_generator
5
+
6
module StatsPackageSyntaxFileGenerator
  # Base class shared by the package-specific syntax makers.
  #
  # It holds the controller object (+sfc+) and the syntax type, and provides
  # helpers for quoting, value formatting, splitting long labels into
  # segments, and building the optional comment block at the start of a
  # syntax file. Subclasses set +cmd_end+ to their command terminator.
  class Maker

    attr_reader :sfc, :syntax_type
    attr_accessor :cmd_end

    # sfc         - controller object; this class only calls +sfc.caller+.
    # syntax_type - string; compared against 'stata' and 'sas' in comments_start.
    def initialize(sfc, syntax_type)
      @sfc = sfc
      @syntax_type = syntax_type
      @cmd_end = ''
    end


    # Syntax terminator.

    # Returns the command terminator followed by a blank line.
    def syntax_end
      [@cmd_end, blank]
    end

    # Returns the string used for an empty output line.
    def blank
      ''
    end


    # Quoting methods.

    # Wraps +s+ (converted with to_s) in double quotes, doubling any
    # double-quote characters embedded in it.
    def q(s)
      '"' + s.to_s.gsub('"', '""') + '"'
    end

    # Quotes +v+ only when +var+ is a string variable; otherwise returns
    # +v+ as a plain string.
    def val_q(var, v)
      var.is_string_var ? q(v) : v.to_s
    end

    # Returns +val_orig+ as a string suitable for a syntax file:
    # - Numeric variable: simply the value as a string.
    # - String variable: zero-padded to var.width if it looks like an integer.
    def val_as_s(var, val_orig)
      v = val_orig.to_s
      return v unless var.is_string_var
      # Anchor on the whole string (\A..\z). Line anchors (^..$) would accept
      # a multi-line value whose first line is numeric, and the to_i below
      # would then silently drop the rest of the value.
      return v unless v =~ /\A-?\d+\z/
      sprintf('%0' + var.width.to_s + 'i', v.to_i)
    end


    # Methods to deal with long labels.

    # Returns at most the first +limit+ characters of +label+ (nil becomes '').
    def label_trunc(label, limit)
      label.to_s[0, limit]
    end

    # Takes a string and a max length.
    # Returns the array of strings that results from chopping the
    # original string into segments no longer than max length.
    # This is needed because some stats packages have max line lengths.
    #
    # Raises ArgumentError when the label does not already fit and
    # +max_length+ is not positive, because no progress could be made.
    def label_segments(label, max_length)
      label = label.to_s
      return [label] if label.length <= max_length
      unless max_length > 0
        raise ArgumentError, "max_length must be positive (got #{max_length.inspect})"
      end
      remaining = label.dup # slice! is destructive; never touch the caller's string
      segments = []
      segments.push(remaining.slice!(0, max_length)) until remaining.empty?
      segments
    end

    # The function takes a sprintf format, two lists (a, b), and
    # two strings (the assignment and concatenation operators used
    # by the stats package). The purpose of the function is to handle
    # long values and labels for stats packages that have a max syntax
    # line length. See unit tests for an illustration.
    #
    # Output lines, in order:
    # - every element of +a+ except the last, followed by +op_c+;
    # - the last element of +a+, +op_a+, and the first element of +b+;
    # - every remaining element of +b+, preceded by +op_c+.
    #
    # The arguments are copied first, so the caller's arrays are not modified.
    def weave_label_segments(fmt, a, b, op_a, op_c)
      a = a.dup
      b = b.dup
      r = []
      r.push(sprintf(fmt, a.shift, op_c, '')) while a.size > 1
      r.push(sprintf(fmt, a.shift, op_a, b.shift))
      r.push(sprintf(fmt, '', op_c, b.shift)) while b.size > 0
      r
    end


    # Helper methods for values and their labels.

    # For non-string variables, only values that look
    # like integers can be labeled.
    def labelable_values(var)
      return var.values if var.is_string_var
      # Whole-string anchors: a value is an integer only if all of it is.
      var.values.find_all { |val| val.value.to_s =~ /\A-?\d+\z/ }
    end

    # Length of the longest value in +val_list+ once formatted by val_as_s;
    # 0 for an empty list.
    def max_value_length(var, val_list)
      return 0 if val_list.empty?
      val_list.map { |val| val_as_s(var, val.value).length }.max
    end


    # Methods for comments at the start or end of the syntax file.

    # Returns the lines of the introductory note, or [] when the caller is
    # not the web app. The wording depends on the syntax type.
    def comments_start
      # Comments not needed unless syntax file is for the web app.
      return [] unless @sfc.caller == 'web_app'

      if @syntax_type == 'stata'
        return [
          'NOTE: You need to set the Stata working directory to the path',
          'where the data file is located.',
        ]
      end

      cmd = (@syntax_type == 'sas') ? 'libname' : 'cd'
      result = [
        "NOTE: You need to edit the `#{cmd}` command to specify the path to the directory",
        'where the data file is located. For example: "C:\ipums_directory".'
      ]
      if @syntax_type == 'sas'
        result << "Edit the `filename` command similarly to include the full path (the directory and the data file name)."
      end
      result
    end

    # No closing comments by default.
    def comments_end
      []
    end

  end
end
@@ -0,0 +1,306 @@
1
+ # This file is part of the Minnesota Population Center's stats_package_syntax_file_generator project.
2
+ # For copyright and licensing information, see the NOTICE and LICENSE files
3
+ # in this project's top-level directory, and also on-line at:
4
+ # https://github.com/mnpopcenter/stats_package_syntax_file_generator
5
+
6
module StatsPackageSyntaxFileGenerator
  # Maker subclass for syntax_type SAS. Builds the syntax file as a flat
  # array of lines.
  #
  # Every builder method returns either a single string (one line) or an
  # array of lines; composite builders collect those and flatten the result.
  # The non-destructive Array#flatten is used throughout: Array#flatten!
  # returns nil when it finds nothing to flatten, which would leak nil into
  # the output (or raise NoMethodError in the caller) for empty inputs.
  class MakerSAS < Maker

    # Sets the sprintf formats (padded to the longest variable name), the
    # command terminator, the label/segment length limits and the handles
    # used in the libname/filename statements.
    def initialize(sfc, syntax_type)
      super
      m = @sfc.max_var_name_length
      @var_loc_format = " %-#{m}s "
      @var_lab_format = " %-#{m}s %s %s"
      @fmt_link_format = " %-#{m}s %s."
      @bignum_int_format = " %-#{m}s %d."
      @bignum_dec_format = " %-#{m}s %d.%d"
      @cmd_end = ';'
      @label_max_leng = 256
      @segment_max_leng = 100
      @sas_library_handle = 'IPUMS'
      @sas_file_handle = 'ASCIIDAT'
      @sas_fmt_suffix = '_f'
      @sas_data_file_name = @sas_library_handle + '.' + @sfc.data_file_name_stem
    end

    # Returns the complete syntax file as a flat array of lines.
    def syntax
      [
        comments_start,
        syn_libname,
        syn_filename,
        blank,
        syn_val_labs,
        syn_df,
        syn_var_labs,
        syn_fmt_link,
        syn_fmt_big_nums,
        syn_run,
        comments_end,
      ].flatten
    end

    # Base-class comments wrapped as a block comment.
    def comments_start
      convert_to_comments(super)
    end

    # Base-class comments wrapped as a block comment.
    def comments_end
      convert_to_comments(super)
    end

    # Wraps +lines+ between '/*' and '*/' and appends a blank line.
    # Returns [] when there is nothing to wrap.
    def convert_to_comments(lines)
      return [] if lines.empty?
      [
        '/*',
        lines.map { |ln| ' ' + ln },
        '*/',
        blank,
      ].flatten
    end

    # The libname statement, pointing the library handle at the data directory.
    def syn_libname
      'libname ' + @sas_library_handle + ' ' + q(@sfc.data_dir_name) + @cmd_end
    end

    # The filename statement, pointing the file handle at the data file.
    def syn_filename
      'filename ' + @sas_file_handle + ' ' + q(@sfc.data_file_name) + @cmd_end
    end

    # The 'proc format' section with one value block per variable that has
    # values; [] when no variable has values.
    def syn_val_labs
      var_list = @sfc.get_vars_with_values
      return [] if var_list.empty?
      [
        syn_proc_format,
        blank,
        syn_val_labs_for_var_list(var_list),
        syn_run,
      ].flatten
    end

    # Opening line of the 'proc format' section.
    def syn_proc_format
      'proc format cntlout = ' + @sas_data_file_name + @sas_fmt_suffix + @cmd_end
    end

    # For each variable: the 'value' header, its value/label lines, and the
    # terminator.
    def syn_val_labs_for_var_list(var_list)
      r = []
      var_list.each do |var|
        r.push syn_val_labs_for_var_start(var)
        r.push syn_val_labs_for_var(var)
        r.push syntax_end
      end
      r.flatten
    end

    # 'value' header for one variable; string variables get a '$' marker.
    def syn_val_labs_for_var_start(var)
      'value' +
        (var.is_string_var ? ' $ ' : ' ') +
        var.name +
        @sas_fmt_suffix
    end

    # Value/label lines for one variable; [] when it has no labelable values.
    def syn_val_labs_for_var(var)
      val_list = labelable_values(var)
      return [] if val_list.empty?
      m = max_value_length(var, val_list)
      m = m + 2 if var.is_string_var # room for the surrounding quotes
      # Values are split into segments of at most @segment_max_leng
      # characters, so the column never needs to be wider than one quoted segment.
      m = @segment_max_leng + 2 if m > @segment_max_leng + 2
      fmt = " %-#{m}s %s %s"
      val_list.collect { |val| syn_val_lab_for_val(var, val, fmt) }.flatten
    end

    # Lines for a single value and its label. A blank label falls back to
    # the value itself. Long values and labels are split into segments.
    def syn_val_lab_for_val(var, val, fmt)
      lab = label_trunc(val.label, @label_max_leng)
      lab = val.value if lab.nil? || (lab.strip.length == 0)
      vs = val_as_s(var, val.value)
      val_segments = label_segments(vs, @segment_max_leng).map { |s| val_q(var, s) }
      lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
      weave_label_segments(fmt, val_segments, lab_segments, '=', ' ')
    end

    # The 'run' statement followed by a blank line.
    def syn_run
      ['run' + @cmd_end, blank]
    end

    # The data step: data/infile statements followed by the input section
    # for a hierarchical ('hier') or rectangular data structure.
    def syn_df
      [
        syn_df_start,
        syn_df_infile,
        blank,
        @sfc.data_structure == 'hier' ? syn_dfh : syn_dfr,
      ].flatten
    end

    # Opening line of the data step.
    def syn_df_start
      'data ' + @sas_data_file_name + @cmd_end
    end

    # The infile statement.
    def syn_df_infile
      # The LRECL specification is needed because the default behavior on some
      # operating systems is to truncate records to 256 columns.
      c = @sfc.last_column_used
      'infile ' + @sas_file_handle + ' pad missover lrecl=' + c.to_s + @cmd_end
    end

    # Input section for rectangular data: one input statement covering all
    # variables, followed by a blank line.
    def syn_dfr
      r = syn_input(@sfc.variables)
      r.push blank
      r
    end

    # An input statement: 'input', one location line per variable, terminator.
    def syn_input(var_list)
      [
        'input',
        syn_var_locations(var_list),
        @cmd_end,
      ].flatten
    end

    # One line per variable: name, '$' marker for string variables, column
    # locations and implied-decimal suffix.
    def syn_var_locations(var_list)
      var_list.collect { |v|
        sprintf(@var_loc_format, v.name) +
          (v.is_string_var ? '$ ' : ' ') +
          v.column_locations_as_s +
          implied_decimal_fmt(v)
      }
    end

    # Input section for hierarchical data.
    def syn_dfh
      [
        syn_dfh_retain,
        syn_dfh_rec_type_block,
        blank,
        syn_dfh_if_blocks,
      ].flatten
    end

    # The retain statement, emitted only when rectangularizing and only when
    # there is at least one variable to retain.
    def syn_dfh_retain
      return [] unless @sfc.rectangularize
      var_list = non_last_non_common_vars
      return [] if var_list.size == 0
      [
        'retain',
        var_list.map { |var| ' ' + var.name },
        syntax_end,
      ].flatten
    end

    # Input statement for the record-type variable, with ' @' appended to
    # its location line.
    def syn_dfh_rec_type_block
      r = syn_input([@sfc.record_type_var])
      r[1] = r[1] + ' @'
      r
    end

    # One if/else-if block per record type, each with its own input statement.
    def syn_dfh_if_blocks
      if_cmd = 'if'
      r = []
      @sfc.record_types.each do |rt|
        r.push(
          syn_dfh_if_start(if_cmd, rt),
          syn_input(@sfc.get_vars_by_record_type(rt)),
          syn_dfh_if_end(rt)
        )
        if_cmd = 'else if' # every block after the first chains onto the previous one
      end
      r.flatten
    end

    # Opening line of the block for record type +rt+.
    def syn_dfh_if_start(if_cmd, rt)
      rt_var = @sfc.record_type_var
      [
        if_cmd,
        rt_var.name,
        '=',
        val_q(rt_var, val_as_s(rt_var, rt)),
        'then do' + @cmd_end,
      ].join(' ')
    end

    # Closing lines of the block for record type +rt+. 'output' is emitted
    # for every record type unless rectangularizing, in which case only for
    # the last record type.
    def syn_dfh_if_end(rt)
      r = []
      r.push 'output' + @cmd_end if !@sfc.rectangularize || @sfc.is_last_record_type(rt)
      r.push 'end' + @cmd_end
      r.push blank
      r
    end

    # The label statement for all variables with labels; [] when there are none.
    def syn_var_labs
      var_list = @sfc.get_vars_with_var_labels
      return [] if var_list.empty?
      [
        'label',
        var_list.map { |var| syn_var_lab_for_var(var) },
        syntax_end,
      ].flatten
    end

    # Lines for one variable label, truncated and split into segments.
    def syn_var_lab_for_var(var)
      lab = label_trunc(var.label, @label_max_leng)
      lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
      weave_label_segments(@var_lab_format, [var.name], lab_segments, '=', ' ')
    end

    # The format statement for the variables returned by sfc.get_big_nums;
    # [] when there are none.
    def syn_fmt_big_nums
      big_num_vars = @sfc.get_big_nums
      return [] if big_num_vars.empty?
      [
        'format',
        syn_fmt_big_nums_for_var_list(big_num_vars),
        syntax_end,
      ].flatten
    end

    # One format line per variable. Variables with implied decimals use
    # width + 1 and the number of decimals; others use the plain width.
    def syn_fmt_big_nums_for_var_list(var_list)
      var_list.map do |v|
        if v.implied_decimals > 0
          sprintf @bignum_dec_format, v.name, v.width + 1, v.implied_decimals
        else
          sprintf @bignum_int_format, v.name, v.width
        end
      end
    end


    # The format statement linking each variable that has values to its
    # value format (variable name plus the format suffix); [] when there
    # are none.
    def syn_fmt_link
      var_list = @sfc.get_vars_with_values
      return [] if var_list.empty?
      [
        'format',
        syn_fmt_link_for_var_list(var_list),
        syntax_end,
      ].flatten
    end

    # One line per variable: the variable name and its format name.
    def syn_fmt_link_for_var_list(var_list)
      var_list.map { |v|
        sprintf @fmt_link_format, v.name, v.name + @sas_fmt_suffix
      }
    end

    # Implied-decimal suffix for an input location; '' for string variables
    # and for variables without implied decimals.
    def implied_decimal_fmt(var)
      return '' if var.is_string_var || var.implied_decimals == 0
      ' .' + var.implied_decimals.to_s
    end

    # Returns a list of variables, excluding:
    # - variables from the last record type
    # - common variables
    # Always returns an array (possibly empty), never nil.
    def non_last_non_common_vars
      var_list = @sfc.rec_types_except_last.map do |rt|
        vars = @sfc.get_vars_by_record_type(rt)
        vars.find_all { |var| !var.is_common_var }
      end
      var_list.flatten
    end

  end
end
@@ -0,0 +1,194 @@
1
+ # This file is part of the Minnesota Population Center's stats_package_syntax_file_generator project.
2
+ # For copyright and licensing information, see the NOTICE and LICENSE files
3
+ # in this project's top-level directory, and also on-line at:
4
+ # https://github.com/mnpopcenter/stats_package_syntax_file_generator
5
+
6
module StatsPackageSyntaxFileGenerator
  # Maker subclass for syntax_type SPSS. Builds the syntax file as a flat
  # array of lines.
  #
  # Every builder method returns either a single string (one line) or an
  # array of lines; composite builders collect those and flatten the result.
  class MakerSPSS < Maker

    # Sets the sprintf formats (padded to the longest variable name), the
    # command terminator and the label/segment length limits.
    def initialize(sfc, syntax_type)
      super
      m = @sfc.max_var_name_length
      @var_loc_format = " %-#{m}s %s"
      @var_lab_format = " %-#{m}s %s %s"
      @cmd_end = '.'
      @var_label_max_leng = 120
      @val_label_max_leng = 120
      @segment_max_leng = 100
    end

    # Returns the complete syntax file as a flat array of lines.
    def syntax
      [
        comments_start,
        syn_cd,
        blank,
        syn_df,
        syn_var_labs,
        syn_val_labs,
        syn_execute,
        blank,
        comments_end,
      ].flatten
    end

    # Base-class comments converted to '* ' comment lines.
    def comments_start
      convert_to_comments(super)
    end

    # Base-class comments converted to '* ' comment lines.
    def comments_end
      convert_to_comments(super)
    end

    # Prefixes each of +lines+ with '* ', adds a final '* ' line holding the
    # command terminator, and appends a blank line. Returns [] when there is
    # nothing to convert.
    #
    # A new array is built for the terminator line so that the caller's
    # +lines+ array is left unmodified.
    def convert_to_comments(lines)
      return [] if lines.empty?
      [
        (lines + [@cmd_end]).map { |ln| '* ' + ln },
        blank,
      ].flatten
    end

    # The cd command pointing at the data directory.
    def syn_cd
      'cd ' + q(@sfc.data_dir_name) + @cmd_end
    end

    # The data-definition section for a hierarchical ('hier') or rectangular
    # data structure.
    def syn_df
      @sfc.data_structure == 'hier' ? syn_dfh : syn_dfr
    end

    # Data definition for rectangular data: one 'data list' command covering
    # all variables.
    def syn_dfr
      [
        syn_dfr_start,
        syn_var_locations(@sfc.variables),
        syntax_end,
      ].flatten
    end

    # Opening line of the 'data list' command.
    def syn_dfr_start
      'data list file = ' + q(@sfc.data_file_name) + ' /'
    end

    # Data definition for hierarchical data: 'file type' header, one block
    # per record type, and the 'end file type' command.
    def syn_dfh
      [
        syn_dfh_file_type,
        syn_dfh_data_blocks,
        'end file type' + @cmd_end,
        blank,
      ].flatten
    end

    # The 'file type' command with its file and record subcommands.
    def syn_dfh_file_type
      [
        'file type ' + nested_or_mixed(),
        ' /file = ' + q(@sfc.data_file_name),
        ' /record = ' + var_loc_with_fmt(@sfc.record_type_var).to_s,
        syntax_end,
      ].flatten
    end

    # For each record type: the block header, the variable locations, and
    # the terminator.
    def syn_dfh_data_blocks
      blocks = @sfc.record_types.map { |rt|
        [
          syn_dfh_data_block_start(rt),
          syn_var_locations(@sfc.get_vars_by_record_type(rt)),
          syntax_end,
        ]
      }
      blocks.flatten
    end

    # The 'record type' command for +rt+ and the opening of its 'data list'.
    def syn_dfh_data_block_start(rt)
      rt_var = @sfc.record_type_var
      [
        'record type ' + val_q(rt_var, val_as_s(rt_var, rt)) + @cmd_end,
        'data list /',
      ]
    end

    # One line per variable: name followed by column locations and format.
    def syn_var_locations(var_list)
      var_list.map { |v| sprintf @var_loc_format, v.name, var_loc_with_fmt(v) }
    end

    # The 'variable labels' command. When +var_list+ is empty (the default),
    # the variables with labels are fetched from the controller; returns []
    # if there are still none.
    def syn_var_labs(var_list = [])
      var_list = @sfc.get_vars_with_var_labels if var_list.empty?
      return [] if var_list.empty?
      [
        'variable labels',
        var_list.map { |var| syn_var_lab_for_var(var) },
        syntax_end,
      ].flatten
    end

    # Lines for one variable label, truncated and split into segments
    # joined with '+'.
    def syn_var_lab_for_var(var)
      lab = label_trunc(var.label, @var_label_max_leng)
      lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
      weave_label_segments(@var_lab_format, [var.name], lab_segments, ' ', '+')
    end

    # The 'value labels' command for all variables with values; [] when
    # there are none.
    def syn_val_labs
      var_list = @sfc.get_vars_with_values
      return [] if var_list.empty?
      [
        'value labels',
        syn_val_labs_for_var_list(var_list),
        syntax_end,
      ].flatten
    end

    # Value-label lines per variable. The result is nested (one array per
    # variable); the caller flattens it.
    def syn_val_labs_for_var_list(var_list)
      var_list.map { |var| syn_val_labs_for_var(var) }
    end

    # Header and value/label lines for one variable; [] when it has no
    # labelable values.
    def syn_val_labs_for_var(var)
      val_list = labelable_values(var)
      return [] if val_list.empty?
      m = max_value_length(var, val_list)
      m = m + 2 if var.is_string_var # room for the surrounding quotes
      # Values are split into segments of at most @segment_max_leng
      # characters, so the column never needs to be wider than one quoted segment.
      m = @segment_max_leng + 2 if m > @segment_max_leng + 2
      value_format = " %-#{m}s %s %s"
      [
        syn_val_labs_for_var_start(var),
        val_list.map { |val| syn_val_lab_for_val(var, val, value_format) },
      ].flatten
    end

    # Header line introducing the value labels of one variable.
    def syn_val_labs_for_var_start(var)
      ' /' + var.name
    end

    # Lines for a single value and its label. A blank label falls back to
    # the value itself. Long values and labels are split into segments
    # joined with '+'.
    def syn_val_lab_for_val(var, val, fmt)
      lab = label_trunc(val.label, @val_label_max_leng)
      lab = val.value if lab.nil? || (lab.strip.length == 0)
      vs = val_as_s(var, val.value)
      val_segments = label_segments(vs, @segment_max_leng).map { |s| val_q(var, s) }
      lab_segments = label_segments(lab, @segment_max_leng).map { |s| q(s) }
      weave_label_segments(fmt, val_segments, lab_segments, ' ', '+')
    end

    # The 'execute' command.
    def syn_execute
      'execute' + @cmd_end
    end

    # File type keyword: 'nested' when rectangularizing, otherwise 'mixed'.
    def nested_or_mixed
      @sfc.rectangularize ? 'nested' : 'mixed'
    end

    # Column locations of +var+ followed by its format suffix.
    def var_loc_with_fmt(var)
      var.column_locations_as_s + var_fmt(var)
    end

    # Format suffix: ' (a)' for string variables, ' (n)' for numeric
    # variables with n implied decimals, '' otherwise.
    def var_fmt(var)
      return ' (a)' if var.is_string_var
      return '' unless var.implied_decimals > 0
      ' (' + var.implied_decimals.to_s + ')'
    end

  end
end