fast_underscore 0.0.3 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
data/bin/console CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'bundler/setup'
4
- require 'active_support'
5
5
  require 'fast_underscore'
6
6
 
7
7
  require 'irb'
@@ -1,34 +1,22 @@
1
1
  #include <ruby.h>
2
2
  #include <ruby/encoding.h>
3
- #include <stdlib.h>
4
3
 
5
- /**
6
- * true if the given codepoint is a lowercase ascii character.
7
- */
8
- static int
9
- character_is_lower(unsigned int character) {
4
+ // true if the given codepoint is a lowercase ascii character.
5
+ static int character_is_lower(unsigned int character) {
10
6
  return character >= 'a' && character <= 'z';
11
7
  }
12
8
 
13
- /**
14
- * true if the given codepoint is a uppercase ascii character.
15
- */
16
- static int
17
- character_is_upper(unsigned int character) {
9
+ // true if the given codepoint is an uppercase ascii character.
10
+ static int character_is_upper(unsigned int character) {
18
11
  return character >= 'A' && character <= 'Z';
19
12
  }
20
13
 
21
- /**
22
- * true if the given codepoint is an ascii digit.
23
- */
24
- static int
25
- character_is_digit(unsigned int character) {
14
+ // true if the given codepoint is an ascii digit.
15
+ static int character_is_digit(unsigned int character) {
26
16
  return character >= '0' && character <= '9';
27
17
  }
28
18
 
29
- /**
30
- * Macros for extracting the character out of the `codepoint_t` struct.
31
- */
19
+ // Macros for extracting the character out of the `codepoint_t` struct.
32
20
  #define codepoint_is_lower(codepoint) character_is_lower(codepoint->character)
33
21
  #define codepoint_is_upper(codepoint) character_is_upper(codepoint->character)
34
22
  #define codepoint_is_digit(codepoint) character_is_digit(codepoint->character)
@@ -62,9 +50,64 @@ typedef struct codepoint {
62
50
  * A struct for tracking the built string as it gets converted. Maintains an
63
51
  * internal DFA for transitioning through various inputs to match certain
64
52
  * patterns that need to be separated with underscores.
53
+ *
54
+ * The internal DFA looks like:
55
+ *
56
+ * ┌ - ┐ ┌ * ┐
57
+ * │ v │ v
58
+ * ┌─────────────┐ ┌─────────────┐
59
+ * │ │──── : ───>│ │
60
+ * ──>│ DEFAULT │<─── : ────│ COLON │
61
+ * │ │<─── * ────│ │
62
+ * └─────────────┘ └─────────────┘
63
+ * │ ^ ^ ^
64
+ * │ │ │ └───── a-z ─────────────┐
65
+ * 0-9A-Z * │ │
66
+ * │ │ └───────── * ────────┐ │
67
+ * v │ │ │
68
+ * ┌─────────────┐ ┌─────────────┐
69
+ * │ │─── A-Z ──>│ │
70
+ * │ UPPER_START │ │ UPPER_END │
71
+ * │ │<── 0-9 ───│ │
72
+ * └─────────────┘ └─────────────┘
73
+ * │ ^ ^ │
74
+ * └ 0-9 ┘ └ A-Z ┘
75
+ *
76
+ * Transitions from DEFAULT:
77
+ * - On "-", push an "_" and stay on DEFAULT
78
+ * - On ":", go to COLON
79
+ * - On a digit or upper, start a buffer with the char and go to UPPER_START
80
+ * - On anything else, push the char and stay on DEFAULT
81
+ *
82
+ * Transitions from COLON:
83
+ * - On ":", push a "/" and go to DEFAULT
84
+ * - On anything else, push a ":" and the char and go to DEFAULT
85
+ *
86
+ * Transitions from UPPER_START:
87
+ * - On a digit, push the digit onto the buffer and stay on UPPER_START
88
+ * - On an upper, push the upper onto the buffer and go to UPPER_END
89
+ * - On anything else, push the buffer, go to DEFAULT, then handle the char
90
+ *
91
+ * Transitions from UPPER_END:
92
+ * - On a digit, push the digit onto the buffer and go to UPPER_START
93
+ * - On an upper, push the upper onto the buffer and stay on UPPER_END
94
+ * - On a lower, push the buffer up to the last char, push an "_", then push
95
+ * the last char of the buffer, go to DEFAULT, then handle the char
96
+ * - On anything else, push the buffer, go to DEFAULT, then handle the char
97
+ *
98
+ * These transitions allow us to accomplish the equivalent of the following code
99
+ * with one pass through the string:
100
+ *
101
+ * def underscore(word)
102
+ * word.gsub!('::', '/')
103
+ * word.gsub!(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2')
104
+ * word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
105
+ * word.tr!('-', '_')
106
+ * word.downcase!
107
+ * end
65
108
  */
66
109
  typedef struct builder {
67
- // The state of the DFA in which is the builder
110
+ // The state of the DFA that the builder is in
68
111
  enum state {
69
112
  STATE_DEFAULT,
70
113
  STATE_COLON,
@@ -82,78 +125,15 @@ typedef struct builder {
82
125
 
83
126
  // Whether or not the last pushed result character should cause the following
84
127
  // one to be spaced by an underscore
85
- int pushNext;
128
+ int push_next;
86
129
  } builder_t;
87
130
 
88
- /**
89
- * Allocate and initialize a `codepoint_t` struct.
90
- */
91
- static codepoint_t*
92
- codepoint_build(rb_encoding *encoding) {
93
- codepoint_t *codepoint;
94
-
95
- codepoint = (codepoint_t *) malloc(sizeof(codepoint_t));
96
- if (codepoint == NULL) {
97
- return NULL;
98
- }
99
-
100
- codepoint->encoding = encoding;
101
- return codepoint;
102
- }
103
-
104
- /**
105
- * Free a previously allocated `codepoint_t` struct.
106
- */
107
- static void
108
- codepoint_free(codepoint_t *codepoint) {
109
- free(codepoint);
110
- }
111
-
112
- /**
113
- * Allocate and initialize a `builder_t` struct.
114
- */
115
- static builder_t*
116
- builder_build(long str_len) {
117
- builder_t *builder;
118
-
119
- builder = (builder_t *) malloc(sizeof(builder_t));
120
- if (builder == NULL) {
121
- return NULL;
122
- }
123
-
124
- builder->state = STATE_DEFAULT;
125
- builder->segment = (char *) malloc(str_len * sizeof(unsigned int) * 2);
126
-
127
- if (builder->segment == NULL) {
128
- free(builder);
129
- return NULL;
130
- }
131
-
132
- builder->result = (char *) malloc(str_len * sizeof(unsigned int) * 2);
133
-
134
- if (builder->result == NULL) {
135
- free(builder->segment);
136
- free(builder);
137
- return NULL;
138
- }
139
-
140
- builder->segment_size = 0;
141
- builder->result_size = 0;
142
- builder->pushNext = 0;
143
-
144
- return builder;
145
- }
146
-
147
- /**
148
- * Push a character onto the resultant string using the given codepoint and
149
- * encoding.
150
- */
151
- static void
152
- builder_result_push_char(builder_t *builder, unsigned int character, int size,
153
- rb_encoding *encoding) {
131
+ // Push a character onto the resultant string using the given codepoint and
132
+ // encoding.
133
+ static void builder_result_push_char(builder_t *builder, unsigned int character, int size, rb_encoding *encoding) {
154
134
  if (character_is_upper(character)) {
155
- if (builder->pushNext == 1) {
156
- builder->pushNext = 0;
135
+ if (builder->push_next == 1) {
136
+ builder->push_next = 0;
157
137
  builder_result_push_literal(builder, '_');
158
138
  }
159
139
 
@@ -161,7 +141,7 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
161
141
  return;
162
142
  }
163
143
 
164
- builder->pushNext = (character_is_lower(character) || character_is_digit(character));
144
+ builder->push_next = (character_is_lower(character) || character_is_digit(character));
165
145
 
166
146
  if (encoding == NULL) {
167
147
  builder->result[builder->result_size++] = (char) character;
@@ -171,20 +151,14 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
171
151
  }
172
152
  }
173
153
 
174
- /**
175
- * Push the given codepoint onto the builder.
176
- */
177
- static void
178
- builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
154
+ // Push the given codepoint onto the builder.
155
+ static void builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
179
156
  builder->segment[builder->segment_size++] = (char) codepoint->character;
180
157
  }
181
158
 
182
- /**
183
- * Copy the given number of characters out of the segment cache onto the result
184
- * string.
185
- */
186
- static void
187
- builder_segment_copy(builder_t *builder, long size) {
159
+ // Copy the given number of characters out of the segment cache onto the result
160
+ // string.
161
+ static void builder_segment_copy(builder_t *builder, long size) {
188
162
  long idx;
189
163
 
190
164
  for (idx = 0; idx < size; idx++) {
@@ -192,24 +166,18 @@ builder_segment_copy(builder_t *builder, long size) {
192
166
  }
193
167
  }
194
168
 
195
- /**
196
- * Restart the `builder_t` back at the default state (because we've hit a
197
- * character for which we have no allowed transitions).
198
- */
199
- static void
200
- builder_restart(builder_t *builder) {
169
+ // Restart the `builder_t` back at the default state (because we've hit a
170
+ // character for which we have no allowed transitions).
171
+ static void builder_restart(builder_t *builder) {
201
172
  builder->state = STATE_DEFAULT;
202
173
  builder->segment_size = 0;
203
174
  }
204
175
 
205
176
  static void builder_next(builder_t *builder, codepoint_t *codepoint);
206
177
 
207
- /**
208
- * Pull the remaining content out of the cached segment in case we don't end
209
- * parsing while not in the default state.
210
- */
211
- static void
212
- builder_flush(builder_t *builder) {
178
+ // Pull the remaining content out of the cached segment in case we end
179
+ // parsing while not in the default state.
180
+ static void builder_flush(builder_t *builder) {
213
181
  switch (builder->state) {
214
182
  case STATE_DEFAULT: return;
215
183
  case STATE_COLON:
@@ -222,33 +190,30 @@ builder_flush(builder_t *builder) {
222
190
  }
223
191
  }
224
192
 
225
- /**
226
- * Perform transitions from the STATE_DEFAULT state.
227
- */
228
- static inline void
229
- builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
193
+ // Perform transitions from the STATE_DEFAULT state.
194
+ static inline void builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
230
195
  if (codepoint->character == '-') {
231
196
  builder_result_push_literal(builder, '_');
232
197
  return;
233
198
  }
199
+
234
200
  if (codepoint->character == ':') {
235
201
  builder->state = STATE_COLON;
236
202
  return;
237
203
  }
204
+
238
205
  if (codepoint_is_digit(codepoint) || codepoint_is_upper(codepoint)) {
239
206
  builder->segment[0] = (char) codepoint->character;
240
207
  builder->segment_size = 1;
241
208
  builder->state = STATE_UPPER_START;
242
209
  return;
243
210
  }
211
+
244
212
  builder_result_push(builder, codepoint);
245
213
  }
246
214
 
247
- /**
248
- * Perform transitions from the STATE_COLON state.
249
- */
250
- static inline void
251
- builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
215
+ // Perform transitions from the STATE_COLON state.
216
+ static inline void builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
252
217
  if (codepoint->character == ':') {
253
218
  builder_result_push_literal(builder, '/');
254
219
  builder_restart(builder);
@@ -260,15 +225,13 @@ builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
260
225
  builder_next(builder, codepoint);
261
226
  }
262
227
 
263
- /**
264
- * Perform transitions from the STATE_UPPER_START state.
265
- */
266
- static inline void
267
- builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
228
+ // Perform transitions from the STATE_UPPER_START state.
229
+ static inline void builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
268
230
  if (codepoint_is_digit(codepoint)) {
269
231
  builder_segment_push(builder, codepoint);
270
232
  return;
271
233
  }
234
+
272
235
  if (codepoint_is_upper(codepoint)) {
273
236
  builder_segment_push(builder, codepoint);
274
237
  builder->state = STATE_UPPER_END;
@@ -280,20 +243,19 @@ builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
280
243
  builder_next(builder, codepoint);
281
244
  }
282
245
 
283
- /**
284
- * Perform transitions from the STATE_UPPER_END state.
285
- */
286
- static inline void
287
- builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
246
+ // Perform transitions from the STATE_UPPER_END state.
247
+ static inline void builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
288
248
  if (codepoint_is_digit(codepoint)) {
289
249
  builder_segment_push(builder, codepoint);
290
250
  builder->state = STATE_UPPER_START;
291
251
  return;
292
252
  }
253
+
293
254
  if (codepoint_is_upper(codepoint)) {
294
255
  builder_segment_push(builder, codepoint);
295
256
  return;
296
257
  }
258
+
297
259
  if (codepoint_is_lower(codepoint)) {
298
260
  builder_segment_copy(builder, builder->segment_size - 1);
299
261
  builder_result_push_literal(builder, '_');
@@ -308,12 +270,8 @@ builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
308
270
  builder_next(builder, codepoint);
309
271
  }
310
272
 
311
- /**
312
- * Accept the next codepoint, which will move the `builder_t` struct into the
313
- * next state.
314
- */
315
- static void
316
- builder_next(builder_t *builder, codepoint_t *codepoint) {
273
+ // Accept the next codepoint, which will move the `builder_t` struct into the next state.
274
+ static void builder_next(builder_t *builder, codepoint_t *codepoint) {
317
275
  switch (builder->state) {
318
276
  case STATE_DEFAULT:
319
277
  return builder_default_transition(builder, codepoint);
@@ -326,16 +284,6 @@ builder_next(builder_t *builder, codepoint_t *codepoint) {
326
284
  }
327
285
  }
328
286
 
329
- /**
330
- * Frees a previously allocated `builder_t` struct.
331
- */
332
- static void
333
- builder_free(builder_t *builder) {
334
- free(builder->segment);
335
- free(builder->result);
336
- free(builder);
337
- }
338
-
339
287
  /**
340
288
  * Makes an underscored, lowercase form from the expression in the string.
341
289
  *
@@ -349,43 +297,46 @@ builder_free(builder_t *builder) {
349
297
  *
350
298
  * camelize(underscore('SSLError')) # => "SslError"
351
299
  */
352
- static VALUE
353
- rb_str_underscore(VALUE self, VALUE rb_string) {
354
- VALUE resultant;
355
- rb_encoding *encoding;
356
-
357
- char *string;
358
- char *end;
359
-
360
- builder_t *builder;
361
- codepoint_t *codepoint;
362
-
363
- encoding = rb_enc_from_index(ENCODING_GET(rb_string));
364
- string = RSTRING_PTR(rb_string);
365
- end = RSTRING_END(rb_string);
366
-
367
- builder = builder_build(RSTRING_LEN(rb_string) * 2);
368
- codepoint = codepoint_build(encoding);
369
-
370
- while (string < end) {
371
- codepoint->character = rb_enc_codepoint_len(string, end, &codepoint->size, encoding);
372
- builder_next(builder, codepoint);
373
- string += codepoint->size;
300
+ static VALUE underscore(VALUE string) {
301
+ char segment[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
302
+ char result[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
303
+
304
+ builder_t builder = {
305
+ .state = STATE_DEFAULT,
306
+ .segment = segment,
307
+ .result = result,
308
+ .segment_size = 0,
309
+ .result_size = 0,
310
+ .push_next = 0
311
+ };
312
+
313
+ codepoint_t codepoint = {
314
+ .encoding = rb_enc_from_index(ENCODING_GET(string)),
315
+ .character = 0,
316
+ .size = 0
317
+ };
318
+
319
+ char *pointer = RSTRING_PTR(string);
320
+ char *end = RSTRING_END(string);
321
+
322
+ while (pointer < end) {
323
+ codepoint.character = rb_enc_codepoint_len(pointer, end, &codepoint.size, codepoint.encoding);
324
+ builder_next(&builder, &codepoint);
325
+ pointer += codepoint.size;
374
326
  }
375
- builder_flush(builder);
376
327
 
377
- resultant = rb_enc_str_new(builder->result, builder->result_size, encoding);
378
- builder_free(builder);
379
- codepoint_free(codepoint);
328
+ builder_flush(&builder);
329
+ return rb_enc_str_new(builder.result, builder.result_size, codepoint.encoding);
330
+ }
380
331
 
381
- return resultant;
332
+ // FastUnderscore::underscore
333
+ static VALUE fast_underscore(VALUE self, VALUE string) {
334
+ return underscore(string);
382
335
  }
383
336
 
384
- /**
385
- * Hook into Ruby and define the `FastUnderscore::underscore`.
386
- */
387
- void
388
- Init_fast_underscore(void) {
337
+ // Hook into Ruby and define FastUnderscore::underscore and String#underscore
338
+ void Init_fast_underscore(void) {
389
339
  VALUE rb_cFastUnderscore = rb_define_module("FastUnderscore");
390
- rb_define_singleton_method(rb_cFastUnderscore, "underscore", rb_str_underscore, 1);
340
+ rb_define_singleton_method(rb_cFastUnderscore, "underscore", fast_underscore, 1);
341
+ rb_define_method(rb_cString, "underscore", underscore, 0);
391
342
  }