stats_package_syntax_file_generator 1.1.7 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2006302f8dde73e249ee6175ef65b7c41439a14911f515f82f32f6f7dbf5403b
4
- data.tar.gz: 4e3eaf39e8537c7e49ac3ea01a37c2e4440d4a07e80d5d8810b103e02257ec8d
3
+ metadata.gz: 8e54f08bcc60310213618ae309728257a75fde34af6dd0c9958a04279d47e111
4
+ data.tar.gz: 2c09c27ee28137762017fe52845ae61496310a2175adba9b2656390576b66e14
5
5
  SHA512:
6
- metadata.gz: 68dd339e0624e374c374d7c006a7747dcf3800b9661e976594a6f005acdb46a00d972ab1cea25becc3ddb4c945672b3cf15bd3ede23487f21bdd5e0f3685cb5e
7
- data.tar.gz: 553b2e56987e63490a7684c00e2bd3a12b4b18027a7fd60343e0d7089efc00f75f6385a49c94cd8b4301fb489b04e469819897e9c3cb815c9baafa1255ac1170
6
+ metadata.gz: 853aa7c596cf1964227877e6f562ad1b7e274262bd161e947e9138afff0d3d570d831efaaa96ec4694c1b6d794ba9798762b9311a41a35063127e1cde2416fc0
7
+ data.tar.gz: c4cfec89e1b2cf998f5aaf9b9edecd9cb2aec820bb9b04701300b378a41ed9dc0738458526255b5bf446d8aa803fce345bbb0e27c6f25c4f05a75c002f7280da
@@ -6,7 +6,7 @@
6
6
  module SyntaxFile
7
7
  class Controller
8
8
 
9
- VERSION = "1.1.7"
9
+ VERSION = "1.2.0"
10
10
 
11
11
  ATTR = {
12
12
  :project => { :req => false, :rw => 'rw', :def => '', :yaml => true },
@@ -52,9 +52,17 @@ module SyntaxFile
52
52
  @record_types = [] if @record_types.nil?
53
53
  @variables = [] if @variables.nil?
54
54
  @yaml_files = [] if @yaml_files.nil?
55
+
56
+ if @data_structure == 'hier' && is_csv?
57
+ raise(ArgumentError, 'Hierarchical data_structure is not supported for CSV data')
58
+ end
55
59
  read_metadata_from_yaml
56
60
  end
57
61
 
62
+ def is_csv?
63
+ @data_file_name.end_with?('.csv')
64
+ end
65
+
58
66
  # Methods to import metadata from YAML files into the Controller object.
59
67
 
60
68
  def yaml_files=(file_names)
@@ -66,11 +74,31 @@ module SyntaxFile
66
74
  def read_metadata_from_yaml
67
75
  return if @yaml_files.empty?
68
76
  md = {}
69
- @yaml_files.each { |f| md.merge! YAML.load_file(f) }
77
+ @yaml_files.each { |f|
78
+ md.merge! parse_yaml(f)
79
+ }
70
80
  md = symbolize_keys(md)
71
81
  load_yaml_md(md)
72
82
  end
73
83
 
84
+ def parse_yaml(yaml_file)
85
+ if RUBY_ENGINE == 'jruby'
86
+ # This code exists to handle the new size limit in the SnakeYAML Java library
87
+ # of three million code points. The issue is discussed here:
88
+ # https://github.com/jruby/jruby/issues/7543
89
+ tree_builder = Psych::TreeBuilder.new
90
+ parser = Psych::Parser.new(tree_builder)
91
+ parser.code_point_limit = 20_000_000
92
+
93
+ yaml_data = File.read(yaml_file)
94
+ parser.parse(yaml_data)
95
+ # Convert to Ruby and get the hash out of the document array
96
+ tree_builder.root.to_ruby.first
97
+ else
98
+ YAML.load_file(yaml_file)
99
+ end
100
+ end
101
+
74
102
  def load_yaml_md(md)
75
103
  # Uses metadata from yaml to set metadata-related instance variables.
76
104
  ATTR.each_key do |k|
@@ -13,6 +13,14 @@ module SyntaxFile
13
13
  @sfc = sfc
14
14
  @syntax_type = syntax_type
15
15
  @cmd_end = ''
16
+
17
+ if @sfc.is_csv? && !supports_csv?
18
+ raise "CSV data not supported for #{@syntax_type.upcase} syntax files"
19
+ end
20
+ end
21
+
22
+ def supports_csv?
23
+ false
16
24
  end
17
25
 
18
26
  # Syntax terminator.
@@ -18,11 +18,15 @@ module SyntaxFile
18
18
  @label_max_leng = 256
19
19
  @segment_max_leng = 100
20
20
  @sas_library_handle = 'IPUMS'
21
- @sas_file_handle = 'ASCIIDAT'
21
+ @sas_file_handle = @sfc.is_csv? ? 'CSV' : 'ASCIIDAT'
22
22
  @sas_fmt_suffix = '_f'
23
23
  @sas_data_file_name = @sas_library_handle + '.' + @sfc.data_file_name_stem
24
24
  end
25
25
 
26
+ def supports_csv?
27
+ true
28
+ end
29
+
26
30
  def syntax
27
31
  r = [
28
32
  comments_start,
@@ -138,10 +142,14 @@ module SyntaxFile
138
142
  end
139
143
 
140
144
  def syn_df_infile
141
- # The LRECL specification is needed because the default behavior on some
142
- # operating systems is to truncate records to 256 columns.
143
- c = @sfc.last_column_used
144
- 'infile ' + @sas_file_handle + ' pad missover lrecl=' + c.to_s + @cmd_end
145
+ if @sfc.is_csv?
146
+ 'infile ' + @sas_file_handle + " missover dsd delimiter=" + q(',') + " firstobs=2" + @cmd_end
147
+ else
148
+ # The LRECL specification is needed because the default behavior on some
149
+ # operating systems is to truncate records to 256 columns.
150
+ c = @sfc.last_column_used
151
+ 'infile ' + @sas_file_handle + ' pad missover lrecl=' + c.to_s + @cmd_end
152
+ end
145
153
  end
146
154
 
147
155
  def syn_dfr
@@ -163,7 +171,7 @@ module SyntaxFile
163
171
  var_list.collect { |v|
164
172
  sprintf(@var_loc_format, v.name) +
165
173
  (v.is_string_var ? '$ ' : ' ') +
166
- v.column_locations_as_s +
174
+ (@sfc.is_csv? ? '' : v.column_locations_as_s) +
167
175
  implied_decimal_fmt(v)
168
176
  }
169
177
  end
@@ -249,7 +257,7 @@ module SyntaxFile
249
257
 
250
258
  def syn_fmt_big_nums
251
259
  big_num_vars = @sfc.get_big_nums
252
- return [] if big_num_vars.empty?
260
+ return [] if big_num_vars.empty? || @sfc.is_csv?
253
261
  r = [
254
262
  'format',
255
263
  syn_fmt_big_nums_for_var_list(big_num_vars),
@@ -270,7 +278,7 @@ module SyntaxFile
270
278
 
271
279
  def syn_fmt_link
272
280
  var_list = @sfc.get_vars_with_values
273
- return [] if var_list.empty?
281
+ return [] if var_list.empty? || @sfc.is_csv?
274
282
  r = [
275
283
  'format',
276
284
  syn_fmt_link_for_var_list(var_list),
@@ -286,7 +294,7 @@ module SyntaxFile
286
294
  end
287
295
 
288
296
  def implied_decimal_fmt(var)
289
- return '' if var.is_string_var or var.implied_decimals == 0
297
+ return '' if var.is_string_var or var.implied_decimals == 0 or @sfc.is_csv?
290
298
  return ' .' + var.implied_decimals.to_s
291
299
  end
292
300
 
@@ -17,6 +17,10 @@ module SyntaxFile
17
17
  @segment_max_leng = 100
18
18
  end
19
19
 
20
+ def supports_csv?
21
+ true
22
+ end
23
+
20
24
  def syntax
21
25
  r = [
22
26
  comments_start,
@@ -54,7 +58,33 @@ module SyntaxFile
54
58
  end
55
59
 
56
60
  def syn_df
57
- @sfc.data_structure == 'hier' ? syn_dfh : syn_dfr
61
+ if @sfc.is_csv?
62
+ syn_dfr_csv
63
+ else
64
+ @sfc.data_structure == 'hier' ? syn_dfh : syn_dfr
65
+ end
66
+ end
67
+
68
+ def syn_dfr_csv
69
+ r = [
70
+ 'GET DATA /TYPE=TXT',
71
+ ' /FILE=' + q(@sfc.data_file_name),
72
+ ' /ENCODING=\'UTF8\'',
73
+ ' /DELIMITERS=","',
74
+ ' /QUALIFIER=\'"\'',
75
+ ' /ARRANGEMENT=DELIMITED',
76
+ ' /FIRSTCASE=2',
77
+ ' /DATATYPEMIN PERCENTAGE=100.0',
78
+ ' /VARIABLES=',
79
+ syn_vars_csv(@sfc.variables),
80
+ ' /MAP.',
81
+ 'execute.'
82
+ ]
83
+ r.flatten
84
+ end
85
+
86
+ def syn_vars_csv(var_list)
87
+ var_list.map { |v| sprintf @var_loc_format, v.name, v.is_string_var ? 'A' : 'AUTO' }
58
88
  end
59
89
 
60
90
  def syn_dfr
@@ -16,7 +16,8 @@ module SyntaxFile
16
16
  @infix_format = "%#{mx_col + mx_var + 4}s"
17
17
  @replace_format = "replace %-#{mx_var}s = %-#{mx_var}s / %d"
18
18
  @fixed_point_display_format = "format %-#{mx_var}s %%%d.%df"
19
- @general_display_format = "format %-#{mx_var}s %%%d.%dg"
19
+ @general_display_format = "format %-#{mx_var}s %%%d.0g"
20
+ @integer_display_format = "format %-#{mx_var}s %%%d.0f"
20
21
 
21
22
  @cmd_end = ''
22
23
  @cmd_continue = ' ///'
@@ -25,6 +26,10 @@ module SyntaxFile
25
26
  @sort_var_stem = '_line_num'
26
27
  end
27
28
 
29
+ def supports_csv?
30
+ true
31
+ end
32
+
28
33
  def syntax
29
34
  r = [
30
35
  comments_start,
@@ -79,17 +84,24 @@ module SyntaxFile
79
84
 
80
85
  def syn_infix(var_list)
81
86
  r = [
82
- syn_infix_start,
83
- syn_infix_var_locs(var_list),
84
- syn_infix_end,
87
+ syn_infix_start
85
88
  ]
89
+ if !@sfc.is_csv?
90
+ r.push syn_infix_var_locs(var_list)
91
+ r.push syn_infix_end
92
+ end
86
93
  r.flatten
87
94
  end
88
95
 
89
96
  def syn_infix_start
97
+ if @sfc.is_csv?
98
+ infix_cmd = "quietly import delimited #{q(@sfc.data_file_name)}, stringcols(#{list_stringcols(@sfc.variables)})"
99
+ else
100
+ infix_cmd = 'quietly infix'
101
+ end
90
102
  [
91
103
  'clear',
92
- 'quietly infix' + sprintf(@infix_format, @cmd_continue),
104
+ infix_cmd + sprintf(@infix_format, @cmd_continue),
93
105
  ]
94
106
  end
95
107
 
@@ -189,7 +201,7 @@ module SyntaxFile
189
201
 
190
202
  def syn_convert_implied_decim
191
203
  var_list = @sfc.variables.find_all { |var| var.implied_decimals > 0 }
192
- return [] if var_list.empty?
204
+ return [] if var_list.empty? || @sfc.is_csv?
193
205
  var_list.map { |var|
194
206
  v = var.name.downcase
195
207
  sprintf @replace_format, v, v, 10 ** var.implied_decimals
@@ -201,17 +213,25 @@ module SyntaxFile
201
213
  vf = var_fmt(var)
202
214
  vf == 'double' or vf == 'float'
203
215
  }
204
- return [] if var_list.empty?
216
+ return [] if var_list.empty? || @sfc.is_csv?
205
217
  var_list.map { |var|
206
218
  v = var.name.downcase
207
219
 
208
- # If implied decimals set, it means we know exactly how much precision
209
- # to show. Otherwise we go with the underlying value and the 'g' (general) formatting rules in Stata
210
- formatting = var.implied_decimals > 0 ?
211
- sprintf(@fixed_point_display_format, v, var.width, var.implied_decimals) :
212
- sprintf(@general_display_format, v, var.width, var.implied_decimals)
213
-
214
- formatting
220
+ # When implied decimals are set, it means we know exactly how much precision to show: '.nf' where n is the
221
+ # number of implied decimals.
222
+ # When the number is a double float in our metadata, we go with the underlying value and the '.0g' (general)
223
+ # formatting rules in Stata.
224
+ # Otherwise the variable is an integer in our metadata, and we therefore use '.0f' for formatting
225
+ #
226
+ # In Stata you store large integer type data in 'double' or 'long double' type variables. It's confusing.
227
+ case
228
+ when var.implied_decimals > 0
229
+ sprintf(@fixed_point_display_format, v, var.width, var.implied_decimals)
230
+ when var.is_double_var
231
+ sprintf(@general_display_format, v, var.width)
232
+ else # default is integer
233
+ sprintf(@integer_display_format, v, var.width)
234
+ end
215
235
  }
216
236
  end
217
237
 
@@ -314,5 +334,17 @@ module SyntaxFile
314
334
  rt_var.name.downcase + ' != ' + val_q(rt_var, val_as_s(rt_var, rt))
315
335
  end
316
336
 
337
+ def list_stringcols(vars)
338
+ positions = []
339
+ index = 1
340
+ vars.each do |v|
341
+ if v.is_string_var
342
+ positions << index
343
+ end
344
+ index += 1
345
+ end
346
+ positions.join(' ')
347
+ end
348
+
317
349
  end
318
350
  end
data/tests/setup.rb CHANGED
@@ -23,6 +23,13 @@ def new_controller
23
23
  SyntaxFile::Controller.new(:yaml_files => YAML_FILES)
24
24
  end
25
25
 
26
+ def new_controller_csv
27
+ controller = new_controller
28
+ controller.data_file_name = 'data.csv'
29
+ controller.data_structure = 'rect'
30
+ controller
31
+ end
32
+
26
33
  def new_variable
27
34
  SyntaxFile::Variable.new params_variable()
28
35
  end
@@ -31,9 +38,10 @@ def new_value
31
38
  SyntaxFile::Value.new params_value()
32
39
  end
33
40
 
34
- def new_maker(syntax_type = '')
41
+ def new_maker(syntax_type = '', csv: false)
35
42
  maker_class = 'SyntaxFile::Maker' + syntax_type.upcase
36
- eval(maker_class).new(new_controller, syntax_type)
43
+ controller = csv ? new_controller_csv : new_controller
44
+ eval(maker_class).new(controller, syntax_type)
37
45
  end
38
46
 
39
47
  # Parameters used when creating objects with known values.
@@ -374,5 +374,12 @@ def test_rec_type_lookup_hash
374
374
  assert_equal( {}, sfc.rec_type_lookup_hash, msg )
375
375
  end
376
376
 
377
+ def test_hier_csv
378
+ error = assert_raises ArgumentError do
379
+ SyntaxFile::Controller.new(data_structure: 'hier', data_file_name: 'data.csv')
380
+ end
381
+ assert_equal('Hierarchical data_structure is not supported for CSV data', error.message)
382
+ end
383
+
377
384
  end
378
385
  end
@@ -247,5 +247,43 @@ def test_non_last_non_common_vars
247
247
  assert_equal [], vars_to_names(var_list), msg
248
248
  end
249
249
 
250
+ def test_csv_import
251
+ msg = 'Compare against hardcoded result.'
252
+ mk = new_maker('sas', csv: true)
253
+ expected = [
254
+ 'data IPUMS.data;',
255
+ 'infile CSV missover dsd delimiter="," firstobs=2;',
256
+ '',
257
+ 'input',
258
+ " RECTYPE $ ",
259
+ " DWNUM ",
260
+ " HHNUM ",
261
+ " HDFIRSTD ",
262
+ " FBIG_ND ",
263
+ " BADDW ",
264
+ " CANTON ",
265
+ " URBAN ",
266
+ " DWTYPE ",
267
+ " OWNERSHP ",
268
+ " RENT $ ",
269
+ " RELATE ",
270
+ " SEX ",
271
+ " AGE ",
272
+ " RESPREV2 ",
273
+ " SOCSEC ",
274
+ " EDLEVEL ",
275
+ " LIT ",
276
+ " BIGDEC ",
277
+ " BIGINT ",
278
+ " BIGSTR $ ",
279
+ ";",
280
+ ''
281
+ ]
282
+ assert_equal expected, mk.syn_df, msg
283
+
284
+ expected2 = []
285
+ assert_equal expected2, mk.syn_fmt_link, msg
286
+ end
287
+
250
288
  end
251
289
  end
@@ -116,6 +116,17 @@ def test_var_fmt
116
116
  assert_equal expected, actual, msg
117
117
  end
118
118
 
119
+ def test_csv_import
120
+ msg = 'Compare against hardcoded result.'
121
+ mk = new_maker('spss', csv: true)
122
+ syn_df = mk.syn_df
123
+ assert_equal 'GET DATA /TYPE=TXT', syn_df[0], msg
124
+ assert_equal ' /FILE="data.csv"', syn_df[1], msg
125
+ assert_equal ' RECTYPE A', syn_df[9], msg
126
+ assert_equal ' /MAP.', syn_df[-2], msg
127
+ assert_equal 'execute.', syn_df[-1], msg
128
+ end
129
+
119
130
 
120
131
  end
121
132
  end
@@ -145,7 +145,7 @@ def test_syn_display_format
145
145
  "format canton %3.1f",
146
146
  "format resprev2 %4.3f",
147
147
  "format bigdec %10.5f",
148
- "format bigint %19.0g"
148
+ "format bigint %19.0f"
149
149
  ]
150
150
  actual = mk.syn_display_format
151
151
  assert_equal expected, actual, msg
@@ -220,5 +220,12 @@ def test_rt_ne_statement
220
220
  assert_equal expected, mk.rt_ne_statement('H'), msg
221
221
  end
222
222
 
223
+ def test_csv_import
224
+ msg = 'Compare against hardcoded result.'
225
+ mk = new_maker('stata', csv: true)
226
+ expected = ['clear', 'quietly import delimited `"data.csv"\', stringcols(1 11 21) ///']
227
+ assert_equal expected, mk.syn_df, msg
228
+ end
229
+
223
230
  end
224
231
  end
@@ -185,5 +185,12 @@ def test_syn_var_locations
185
185
  assert_equal expected, actual, msg
186
186
  end
187
187
 
188
+ def test_csv_import
189
+ error = assert_raises RuntimeError do
190
+ new_maker('sts', csv: true)
191
+ end
192
+ assert_equal('CSV data not supported for STS syntax files', error.message)
193
+ end
194
+
188
195
  end
189
196
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stats_package_syntax_file_generator
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.7
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Monty Hindman, Marcus Peterson, Colin Davis, Dan Elbert, Jayandra Pokharel
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-03-29 00:00:00.000000000 Z
11
+ date: 2026-02-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -88,7 +88,7 @@ homepage: https://github.com/mnpopcenter/stats_package_syntax_file_generator
88
88
  licenses:
89
89
  - MPL-2.0
90
90
  metadata: {}
91
- post_install_message:
91
+ post_install_message:
92
92
  rdoc_options: []
93
93
  require_paths:
94
94
  - lib
@@ -103,8 +103,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
103
103
  - !ruby/object:Gem::Version
104
104
  version: '0'
105
105
  requirements: []
106
- rubygems_version: 3.1.4
107
- signing_key:
106
+ rubygems_version: 3.5.14
107
+ signing_key:
108
108
  specification_version: 4
109
109
  summary: Produces statistical package syntax files for fixed-column data.
110
110
  test_files: