fast_underscore 0.0.3 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/console CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'bundler/setup'
4
- require 'active_support'
5
5
  require 'fast_underscore'
6
6
 
7
7
  require 'irb'
@@ -1,34 +1,22 @@
1
1
  #include <ruby.h>
2
2
  #include <ruby/encoding.h>
3
- #include <stdlib.h>
4
3
 
5
- /**
6
- * true if the given codepoint is a lowercase ascii character.
7
- */
8
- static int
9
- character_is_lower(unsigned int character) {
4
+ // true if the given codepoint is a lowercase ascii character.
5
+ static int character_is_lower(unsigned int character) {
10
6
  return character >= 'a' && character <= 'z';
11
7
  }
12
8
 
13
- /**
14
- * true if the given codepoint is a uppercase ascii character.
15
- */
16
- static int
17
- character_is_upper(unsigned int character) {
9
+ // true if the given codepoint is an uppercase ASCII character.
10
+ static int character_is_upper(unsigned int character) {
18
11
  return character >= 'A' && character <= 'Z';
19
12
  }
20
13
 
21
- /**
22
- * true if the given codepoint is an ascii digit.
23
- */
24
- static int
25
- character_is_digit(unsigned int character) {
14
+ // true if the given codepoint is an ascii digit.
15
+ static int character_is_digit(unsigned int character) {
26
16
  return character >= '0' && character <= '9';
27
17
  }
28
18
 
29
- /**
30
- * Macros for extracting the character out of the `codepoint_t` struct.
31
- */
19
+ // Macros for extracting the character out of the `codepoint_t` struct.
32
20
  #define codepoint_is_lower(codepoint) character_is_lower(codepoint->character)
33
21
  #define codepoint_is_upper(codepoint) character_is_upper(codepoint->character)
34
22
  #define codepoint_is_digit(codepoint) character_is_digit(codepoint->character)
@@ -62,9 +50,64 @@ typedef struct codepoint {
62
50
  * A struct for tracking the built string as it gets converted. Maintains an
63
51
  * internal DFA for transitioning through various inputs to match certain
64
52
  * patterns that need to be separated with underscores.
53
+ *
54
+ * The internal DFA looks like:
55
+ *
56
+ * ┌ - ┐ ┌ * ┐
57
+ * │ v │ v
58
+ * ┌─────────────┐ ┌─────────────┐
59
+ * │ │──── : ───>│ │
60
+ * ──>│ DEFAULT │<─── : ────│ COLON │
61
+ * │ │<─── * ────│ │
62
+ * └─────────────┘ └─────────────┘
63
+ * │ ^ ^ ^
64
+ * │ │ │ └───── a-z ─────────────┐
65
+ * 0-9A-Z * │ │
66
+ * │ │ └───────── * ────────┐ │
67
+ * v │ │ │
68
+ * ┌─────────────┐ ┌─────────────┐
69
+ * │ │─── A-Z ──>│ │
70
+ * │ UPPER_START │ │ UPPER_END │
71
+ * │ │<── 0-9 ───│ │
72
+ * └─────────────┘ └─────────────┘
73
+ * │ ^ ^ │
74
+ * └ 0-9 ┘ └ A-Z ┘
75
+ *
76
+ * Transitions from DEFAULT:
77
+ * - On "-", push an "_" and stay on DEFAULT
78
+ * - On ":", go to COLON
79
+ * - On a digit or upper, start a buffer with the char and go to UPPER_START
80
+ * - On anything else, push the char and stay on DEFAULT
81
+ *
82
+ * Transitions from COLON:
83
+ * - On ":", push a "/" and go to DEFAULT
84
+ * - On anything else, push a ":" and the char and go to DEFAULT
85
+ *
86
+ * Transitions from UPPER_START:
87
+ * - On a digit, push the digit onto the buffer and stay on UPPER_START
88
+ * - On an upper, push the upper onto the buffer and go to UPPER_END
89
+ * - On anything else, push the buffer, go to DEFAULT, then handle the char
90
+ *
91
+ * Transitions from UPPER_END:
92
+ * - On a digit, push the digit onto the buffer and go to UPPER_START
93
+ * - On an upper, push the upper onto the buffer and stay on UPPER_END
94
+ * - On a lower, push the buffer up to the last char, push an "_", then push
95
+ * the last char of the buffer, go to DEFAULT, then handle the char
96
+ * - On anything else, push the buffer, go to DEFAULT, then handle the char
97
+ *
98
+ * These transitions allow us to accomplish the equivalent of the following code
99
+ * with one pass through the string:
100
+ *
101
+ * def underscore(word)
102
+ * word.gsub!('::', '/')
103
+ * word.gsub!(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2')
104
+ * word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
105
+ * word.tr!('-', '_')
106
+ * word.downcase!
107
+ * end
65
108
  */
66
109
  typedef struct builder {
67
- // The state of the DFA in which is the builder
110
+ // The state of the DFA that the builder is in
68
111
  enum state {
69
112
  STATE_DEFAULT,
70
113
  STATE_COLON,
@@ -82,78 +125,15 @@ typedef struct builder {
82
125
 
83
126
  // Whether or not the last pushed result character should cause the following
84
127
  // one to be spaced by an underscore
85
- int pushNext;
128
+ int push_next;
86
129
  } builder_t;
87
130
 
88
- /**
89
- * Allocate and initialize a `codepoint_t` struct.
90
- */
91
- static codepoint_t*
92
- codepoint_build(rb_encoding *encoding) {
93
- codepoint_t *codepoint;
94
-
95
- codepoint = (codepoint_t *) malloc(sizeof(codepoint_t));
96
- if (codepoint == NULL) {
97
- return NULL;
98
- }
99
-
100
- codepoint->encoding = encoding;
101
- return codepoint;
102
- }
103
-
104
- /**
105
- * Free a previously allocated `codepoint_t` struct.
106
- */
107
- static void
108
- codepoint_free(codepoint_t *codepoint) {
109
- free(codepoint);
110
- }
111
-
112
- /**
113
- * Allocate and initialize a `builder_t` struct.
114
- */
115
- static builder_t*
116
- builder_build(long str_len) {
117
- builder_t *builder;
118
-
119
- builder = (builder_t *) malloc(sizeof(builder_t));
120
- if (builder == NULL) {
121
- return NULL;
122
- }
123
-
124
- builder->state = STATE_DEFAULT;
125
- builder->segment = (char *) malloc(str_len * sizeof(unsigned int) * 2);
126
-
127
- if (builder->segment == NULL) {
128
- free(builder);
129
- return NULL;
130
- }
131
-
132
- builder->result = (char *) malloc(str_len * sizeof(unsigned int) * 2);
133
-
134
- if (builder->result == NULL) {
135
- free(builder->segment);
136
- free(builder);
137
- return NULL;
138
- }
139
-
140
- builder->segment_size = 0;
141
- builder->result_size = 0;
142
- builder->pushNext = 0;
143
-
144
- return builder;
145
- }
146
-
147
- /**
148
- * Push a character onto the resultant string using the given codepoint and
149
- * encoding.
150
- */
151
- static void
152
- builder_result_push_char(builder_t *builder, unsigned int character, int size,
153
- rb_encoding *encoding) {
131
+ // Push a character onto the resultant string using the given codepoint and
132
+ // encoding.
133
+ static void builder_result_push_char(builder_t *builder, unsigned int character, int size, rb_encoding *encoding) {
154
134
  if (character_is_upper(character)) {
155
- if (builder->pushNext == 1) {
156
- builder->pushNext = 0;
135
+ if (builder->push_next == 1) {
136
+ builder->push_next = 0;
157
137
  builder_result_push_literal(builder, '_');
158
138
  }
159
139
 
@@ -161,7 +141,7 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
161
141
  return;
162
142
  }
163
143
 
164
- builder->pushNext = (character_is_lower(character) || character_is_digit(character));
144
+ builder->push_next = (character_is_lower(character) || character_is_digit(character));
165
145
 
166
146
  if (encoding == NULL) {
167
147
  builder->result[builder->result_size++] = (char) character;
@@ -171,20 +151,14 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
171
151
  }
172
152
  }
173
153
 
174
- /**
175
- * Push the given codepoint onto the builder.
176
- */
177
- static void
178
- builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
154
+ // Push the given codepoint onto the builder.
155
+ static void builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
179
156
  builder->segment[builder->segment_size++] = (char) codepoint->character;
180
157
  }
181
158
 
182
- /**
183
- * Copy the given number of characters out of the segment cache onto the result
184
- * string.
185
- */
186
- static void
187
- builder_segment_copy(builder_t *builder, long size) {
159
+ // Copy the given number of characters out of the segment cache onto the result
160
+ // string.
161
+ static void builder_segment_copy(builder_t *builder, long size) {
188
162
  long idx;
189
163
 
190
164
  for (idx = 0; idx < size; idx++) {
@@ -192,24 +166,18 @@ builder_segment_copy(builder_t *builder, long size) {
192
166
  }
193
167
  }
194
168
 
195
- /**
196
- * Restart the `builder_t` back at the default state (because we've hit a
197
- * character for which we have no allowed transitions).
198
- */
199
- static void
200
- builder_restart(builder_t *builder) {
169
+ // Restart the `builder_t` back at the default state (because we've hit a
170
+ // character for which we have no allowed transitions).
171
+ static void builder_restart(builder_t *builder) {
201
172
  builder->state = STATE_DEFAULT;
202
173
  builder->segment_size = 0;
203
174
  }
204
175
 
205
176
  static void builder_next(builder_t *builder, codepoint_t *codepoint);
206
177
 
207
- /**
208
- * Pull the remaining content out of the cached segment in case we don't end
209
- * parsing while not in the default state.
210
- */
211
- static void
212
- builder_flush(builder_t *builder) {
178
+ // Pull the remaining content out of the cached segment in case we end
179
+ // parsing while not in the default state.
180
+ static void builder_flush(builder_t *builder) {
213
181
  switch (builder->state) {
214
182
  case STATE_DEFAULT: return;
215
183
  case STATE_COLON:
@@ -222,33 +190,30 @@ builder_flush(builder_t *builder) {
222
190
  }
223
191
  }
224
192
 
225
- /**
226
- * Perform transitions from the STATE_DEFAULT state.
227
- */
228
- static inline void
229
- builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
193
+ // Perform transitions from the STATE_DEFAULT state.
194
+ static inline void builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
230
195
  if (codepoint->character == '-') {
231
196
  builder_result_push_literal(builder, '_');
232
197
  return;
233
198
  }
199
+
234
200
  if (codepoint->character == ':') {
235
201
  builder->state = STATE_COLON;
236
202
  return;
237
203
  }
204
+
238
205
  if (codepoint_is_digit(codepoint) || codepoint_is_upper(codepoint)) {
239
206
  builder->segment[0] = (char) codepoint->character;
240
207
  builder->segment_size = 1;
241
208
  builder->state = STATE_UPPER_START;
242
209
  return;
243
210
  }
211
+
244
212
  builder_result_push(builder, codepoint);
245
213
  }
246
214
 
247
- /**
248
- * Perform transitions from the STATE_COLON state.
249
- */
250
- static inline void
251
- builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
215
+ // Perform transitions from the STATE_COLON state.
216
+ static inline void builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
252
217
  if (codepoint->character == ':') {
253
218
  builder_result_push_literal(builder, '/');
254
219
  builder_restart(builder);
@@ -260,15 +225,13 @@ builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
260
225
  builder_next(builder, codepoint);
261
226
  }
262
227
 
263
- /**
264
- * Perform transitions from the STATE_UPPER_START state.
265
- */
266
- static inline void
267
- builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
228
+ // Perform transitions from the STATE_UPPER_START state.
229
+ static inline void builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
268
230
  if (codepoint_is_digit(codepoint)) {
269
231
  builder_segment_push(builder, codepoint);
270
232
  return;
271
233
  }
234
+
272
235
  if (codepoint_is_upper(codepoint)) {
273
236
  builder_segment_push(builder, codepoint);
274
237
  builder->state = STATE_UPPER_END;
@@ -280,20 +243,19 @@ builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
280
243
  builder_next(builder, codepoint);
281
244
  }
282
245
 
283
- /**
284
- * Perform transitions from the STATE_UPPER_END state.
285
- */
286
- static inline void
287
- builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
246
+ // Perform transitions from the STATE_UPPER_END state.
247
+ static inline void builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
288
248
  if (codepoint_is_digit(codepoint)) {
289
249
  builder_segment_push(builder, codepoint);
290
250
  builder->state = STATE_UPPER_START;
291
251
  return;
292
252
  }
253
+
293
254
  if (codepoint_is_upper(codepoint)) {
294
255
  builder_segment_push(builder, codepoint);
295
256
  return;
296
257
  }
258
+
297
259
  if (codepoint_is_lower(codepoint)) {
298
260
  builder_segment_copy(builder, builder->segment_size - 1);
299
261
  builder_result_push_literal(builder, '_');
@@ -308,12 +270,8 @@ builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
308
270
  builder_next(builder, codepoint);
309
271
  }
310
272
 
311
- /**
312
- * Accept the next codepoint, which will move the `builder_t` struct into the
313
- * next state.
314
- */
315
- static void
316
- builder_next(builder_t *builder, codepoint_t *codepoint) {
273
+ // Accept the next codepoint, which will move the `builder_t` struct into the next state.
274
+ static void builder_next(builder_t *builder, codepoint_t *codepoint) {
317
275
  switch (builder->state) {
318
276
  case STATE_DEFAULT:
319
277
  return builder_default_transition(builder, codepoint);
@@ -326,16 +284,6 @@ builder_next(builder_t *builder, codepoint_t *codepoint) {
326
284
  }
327
285
  }
328
286
 
329
- /**
330
- * Frees a previously allocated `builder_t` struct.
331
- */
332
- static void
333
- builder_free(builder_t *builder) {
334
- free(builder->segment);
335
- free(builder->result);
336
- free(builder);
337
- }
338
-
339
287
  /**
340
288
  * Makes an underscored, lowercase form from the expression in the string.
341
289
  *
@@ -349,43 +297,46 @@ builder_free(builder_t *builder) {
349
297
  *
350
298
  * camelize(underscore('SSLError')) # => "SslError"
351
299
  */
352
- static VALUE
353
- rb_str_underscore(VALUE self, VALUE rb_string) {
354
- VALUE resultant;
355
- rb_encoding *encoding;
356
-
357
- char *string;
358
- char *end;
359
-
360
- builder_t *builder;
361
- codepoint_t *codepoint;
362
-
363
- encoding = rb_enc_from_index(ENCODING_GET(rb_string));
364
- string = RSTRING_PTR(rb_string);
365
- end = RSTRING_END(rb_string);
366
-
367
- builder = builder_build(RSTRING_LEN(rb_string) * 2);
368
- codepoint = codepoint_build(encoding);
369
-
370
- while (string < end) {
371
- codepoint->character = rb_enc_codepoint_len(string, end, &codepoint->size, encoding);
372
- builder_next(builder, codepoint);
373
- string += codepoint->size;
300
+ static VALUE underscore(VALUE string) {
301
+ char segment[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
302
+ char result[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
303
+
304
+ builder_t builder = {
305
+ .state = STATE_DEFAULT,
306
+ .segment = segment,
307
+ .result = result,
308
+ .segment_size = 0,
309
+ .result_size = 0,
310
+ .push_next = 0
311
+ };
312
+
313
+ codepoint_t codepoint = {
314
+ .encoding = rb_enc_from_index(ENCODING_GET(string)),
315
+ .character = 0,
316
+ .size = 0
317
+ };
318
+
319
+ char *pointer = RSTRING_PTR(string);
320
+ char *end = RSTRING_END(string);
321
+
322
+ while (pointer < end) {
323
+ codepoint.character = rb_enc_codepoint_len(pointer, end, &codepoint.size, codepoint.encoding);
324
+ builder_next(&builder, &codepoint);
325
+ pointer += codepoint.size;
374
326
  }
375
- builder_flush(builder);
376
327
 
377
- resultant = rb_enc_str_new(builder->result, builder->result_size, encoding);
378
- builder_free(builder);
379
- codepoint_free(codepoint);
328
+ builder_flush(&builder);
329
+ return rb_enc_str_new(builder.result, builder.result_size, codepoint.encoding);
330
+ }
380
331
 
381
- return resultant;
332
+ // FastUnderscore::underscore
333
+ static VALUE fast_underscore(VALUE self, VALUE string) {
334
+ return underscore(string);
382
335
  }
383
336
 
384
- /**
385
- * Hook into Ruby and define the `FastUnderscore::underscore`.
386
- */
387
- void
388
- Init_fast_underscore(void) {
337
+ // Hook into Ruby and define FastUnderscore::underscore and String#underscore
338
+ void Init_fast_underscore(void) {
389
339
  VALUE rb_cFastUnderscore = rb_define_module("FastUnderscore");
390
- rb_define_singleton_method(rb_cFastUnderscore, "underscore", rb_str_underscore, 1);
340
+ rb_define_singleton_method(rb_cFastUnderscore, "underscore", fast_underscore, 1);
341
+ rb_define_method(rb_cString, "underscore", underscore, 0);
391
342
  }