fast_underscore 0.0.3 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.github/dependabot.yml +6 -0
- data/.github/workflows/main.yml +44 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +24 -5
- data/CHANGELOG.md +44 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +58 -31
- data/{LICENSE.txt → LICENSE} +1 -1
- data/README.md +16 -25
- data/Steepfile +6 -0
- data/bin/{benchmark → bench} +3 -2
- data/bin/console +1 -1
- data/ext/fast_underscore/fast_underscore.c +133 -182
- data/fast_underscore.gemspec +6 -5
- data/gemfiles/5.1.gemfile +7 -0
- data/gemfiles/5.2.gemfile +7 -0
- data/gemfiles/6.0.gemfile +7 -0
- data/gemfiles/6.1.gemfile +7 -0
- data/lib/fast_underscore.rb +71 -14
- data/lib/fast_underscore/version.rb +1 -1
- data/sig/fast_underscore.rbs +10 -0
- metadata +40 -20
- data/.travis.yml +0 -6
- data/bin/rake +0 -29
- data/bin/rubocop +0 -29
data/bin/console
CHANGED
@@ -1,34 +1,22 @@
|
|
1
1
|
#include <ruby.h>
|
2
2
|
#include <ruby/encoding.h>
|
3
|
-
#include <stdlib.h>
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
*/
|
8
|
-
static int
|
9
|
-
character_is_lower(unsigned int character) {
|
4
|
+
// true if the given codepoint is a lowercase ascii character.
|
5
|
+
static int character_is_lower(unsigned int character) {
|
10
6
|
return character >= 'a' && character <= 'z';
|
11
7
|
}
|
12
8
|
|
13
|
-
|
14
|
-
|
15
|
-
*/
|
16
|
-
static int
|
17
|
-
character_is_upper(unsigned int character) {
|
9
|
+
// true if the given codepoint is an uppercase ascii character.
|
10
|
+
static int character_is_upper(unsigned int character) {
|
18
11
|
return character >= 'A' && character <= 'Z';
|
19
12
|
}
|
20
13
|
|
21
|
-
|
22
|
-
|
23
|
-
*/
|
24
|
-
static int
|
25
|
-
character_is_digit(unsigned int character) {
|
14
|
+
// true if the given codepoint is an ascii digit.
|
15
|
+
static int character_is_digit(unsigned int character) {
|
26
16
|
return character >= '0' && character <= '9';
|
27
17
|
}
|
28
18
|
|
29
|
-
|
30
|
-
* Macros for extracting the character out of the `codepoint_t` struct.
|
31
|
-
*/
|
19
|
+
// Macros for extracting the character out of the `codepoint_t` struct.
|
32
20
|
#define codepoint_is_lower(codepoint) character_is_lower(codepoint->character)
|
33
21
|
#define codepoint_is_upper(codepoint) character_is_upper(codepoint->character)
|
34
22
|
#define codepoint_is_digit(codepoint) character_is_digit(codepoint->character)
|
@@ -62,9 +50,64 @@ typedef struct codepoint {
|
|
62
50
|
* A struct for tracking the built string as it gets converted. Maintains an
|
63
51
|
* internal DFA for transitioning through various inputs to match certain
|
64
52
|
* patterns that need to be separated with underscores.
|
53
|
+
*
|
54
|
+
* The internal DFA looks like:
|
55
|
+
*
|
56
|
+
* ┌ - ┐ ┌ * ┐
|
57
|
+
* │ v │ v
|
58
|
+
* ┌─────────────┐ ┌─────────────┐
|
59
|
+
* │ │──── : ───>│ │
|
60
|
+
* ──>│ DEFAULT │<─── : ────│ COLON │
|
61
|
+
* │ │<─── * ────│ │
|
62
|
+
* └─────────────┘ └─────────────┘
|
63
|
+
* │ ^ ^ ^
|
64
|
+
* │ │ │ └───── a-z ─────────────┐
|
65
|
+
* 0-9A-Z * │ │
|
66
|
+
* │ │ └───────── * ────────┐ │
|
67
|
+
* v │ │ │
|
68
|
+
* ┌─────────────┐ ┌─────────────┐
|
69
|
+
* │ │─── A-Z ──>│ │
|
70
|
+
* │ UPPER_START │ │ UPPER_END │
|
71
|
+
* │ │<── 0-9 ───│ │
|
72
|
+
* └─────────────┘ └─────────────┘
|
73
|
+
* │ ^ ^ │
|
74
|
+
* └ 0-9 ┘ └ A-Z ┘
|
75
|
+
*
|
76
|
+
* Transitions from DEFAULT:
|
77
|
+
* - On "-", push an "_" and stay on DEFAULT
|
78
|
+
* - On ":", go to COLON
|
79
|
+
* - On a digit or upper, start a buffer with the char and go to UPPER_START
|
80
|
+
* - On anything else, push the char and stay on DEFAULT
|
81
|
+
*
|
82
|
+
* Transitions from COLON:
|
83
|
+
* - On ":", push a "/" and go to DEFAULT
|
84
|
+
* - On anything else, push a ":" and the char and go to DEFAULT
|
85
|
+
*
|
86
|
+
* Transitions from UPPER_START:
|
87
|
+
* - On a digit, push the digit onto the buffer and stay on UPPER_START
|
88
|
+
* - On an upper, push the upper onto the buffer and go to UPPER_END
|
89
|
+
* - On anything else, push the buffer, go to DEFAULT, then handle the char
|
90
|
+
*
|
91
|
+
* Transitions from UPPER_END:
|
92
|
+
* - On a digit, push the digit onto the buffer and go to UPPER_START
|
93
|
+
* - On an upper, push the upper onto the buffer and stay on UPPER_END
|
94
|
+
* - On a lower, push the buffer up to the last char, push an "_", then push
|
95
|
+
* the last char of the buffer, go to DEFAULT, then handle the char
|
96
|
+
* - On anything else, push the buffer, go to DEFAULT, then handle the char
|
97
|
+
*
|
98
|
+
* These transitions allow us to accomplish the equivalent of the following code
|
99
|
+
* with one pass through the string:
|
100
|
+
*
|
101
|
+
* def underscore(word)
|
102
|
+
* word.gsub!('::', '/')
|
103
|
+
* word.gsub!(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2')
|
104
|
+
* word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
|
105
|
+
* word.tr!('-', '_')
|
106
|
+
* word.downcase!
|
107
|
+
* end
|
65
108
|
*/
|
66
109
|
typedef struct builder {
|
67
|
-
// The state of the DFA
|
110
|
+
// The state of the DFA that the builder is in
|
68
111
|
enum state {
|
69
112
|
STATE_DEFAULT,
|
70
113
|
STATE_COLON,
|
@@ -82,78 +125,15 @@ typedef struct builder {
|
|
82
125
|
|
83
126
|
// Whether or not the last pushed result character should cause the following
|
84
127
|
// one to be spaced by an underscore
|
85
|
-
int
|
128
|
+
int push_next;
|
86
129
|
} builder_t;
|
87
130
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
static codepoint_t*
|
92
|
-
codepoint_build(rb_encoding *encoding) {
|
93
|
-
codepoint_t *codepoint;
|
94
|
-
|
95
|
-
codepoint = (codepoint_t *) malloc(sizeof(codepoint_t));
|
96
|
-
if (codepoint == NULL) {
|
97
|
-
return NULL;
|
98
|
-
}
|
99
|
-
|
100
|
-
codepoint->encoding = encoding;
|
101
|
-
return codepoint;
|
102
|
-
}
|
103
|
-
|
104
|
-
/**
|
105
|
-
* Free a previously allocated `codepoint_t` struct.
|
106
|
-
*/
|
107
|
-
static void
|
108
|
-
codepoint_free(codepoint_t *codepoint) {
|
109
|
-
free(codepoint);
|
110
|
-
}
|
111
|
-
|
112
|
-
/**
|
113
|
-
* Allocate and initialize a `builder_t` struct.
|
114
|
-
*/
|
115
|
-
static builder_t*
|
116
|
-
builder_build(long str_len) {
|
117
|
-
builder_t *builder;
|
118
|
-
|
119
|
-
builder = (builder_t *) malloc(sizeof(builder_t));
|
120
|
-
if (builder == NULL) {
|
121
|
-
return NULL;
|
122
|
-
}
|
123
|
-
|
124
|
-
builder->state = STATE_DEFAULT;
|
125
|
-
builder->segment = (char *) malloc(str_len * sizeof(unsigned int) * 2);
|
126
|
-
|
127
|
-
if (builder->segment == NULL) {
|
128
|
-
free(builder);
|
129
|
-
return NULL;
|
130
|
-
}
|
131
|
-
|
132
|
-
builder->result = (char *) malloc(str_len * sizeof(unsigned int) * 2);
|
133
|
-
|
134
|
-
if (builder->result == NULL) {
|
135
|
-
free(builder->segment);
|
136
|
-
free(builder);
|
137
|
-
return NULL;
|
138
|
-
}
|
139
|
-
|
140
|
-
builder->segment_size = 0;
|
141
|
-
builder->result_size = 0;
|
142
|
-
builder->pushNext = 0;
|
143
|
-
|
144
|
-
return builder;
|
145
|
-
}
|
146
|
-
|
147
|
-
/**
|
148
|
-
* Push a character onto the resultant string using the given codepoint and
|
149
|
-
* encoding.
|
150
|
-
*/
|
151
|
-
static void
|
152
|
-
builder_result_push_char(builder_t *builder, unsigned int character, int size,
|
153
|
-
rb_encoding *encoding) {
|
131
|
+
// Push a character onto the resultant string using the given codepoint and
|
132
|
+
// encoding.
|
133
|
+
static void builder_result_push_char(builder_t *builder, unsigned int character, int size, rb_encoding *encoding) {
|
154
134
|
if (character_is_upper(character)) {
|
155
|
-
if (builder->
|
156
|
-
builder->
|
135
|
+
if (builder->push_next == 1) {
|
136
|
+
builder->push_next = 0;
|
157
137
|
builder_result_push_literal(builder, '_');
|
158
138
|
}
|
159
139
|
|
@@ -161,7 +141,7 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
|
|
161
141
|
return;
|
162
142
|
}
|
163
143
|
|
164
|
-
builder->
|
144
|
+
builder->push_next = (character_is_lower(character) || character_is_digit(character));
|
165
145
|
|
166
146
|
if (encoding == NULL) {
|
167
147
|
builder->result[builder->result_size++] = (char) character;
|
@@ -171,20 +151,14 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
|
|
171
151
|
}
|
172
152
|
}
|
173
153
|
|
174
|
-
|
175
|
-
|
176
|
-
*/
|
177
|
-
static void
|
178
|
-
builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
|
154
|
+
// Push the given codepoint onto the builder's segment cache.
|
155
|
+
static void builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
|
179
156
|
builder->segment[builder->segment_size++] = (char) codepoint->character;
|
180
157
|
}
|
181
158
|
|
182
|
-
|
183
|
-
|
184
|
-
*
|
185
|
-
*/
|
186
|
-
static void
|
187
|
-
builder_segment_copy(builder_t *builder, long size) {
|
159
|
+
// Copy the given number of characters out of the segment cache onto the result
|
160
|
+
// string.
|
161
|
+
static void builder_segment_copy(builder_t *builder, long size) {
|
188
162
|
long idx;
|
189
163
|
|
190
164
|
for (idx = 0; idx < size; idx++) {
|
@@ -192,24 +166,18 @@ builder_segment_copy(builder_t *builder, long size) {
|
|
192
166
|
}
|
193
167
|
}
|
194
168
|
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
*/
|
199
|
-
static void
|
200
|
-
builder_restart(builder_t *builder) {
|
169
|
+
// Restart the `builder_t` back at the default state (because we've hit a
|
170
|
+
// character for which we have no allowed transitions).
|
171
|
+
static void builder_restart(builder_t *builder) {
|
201
172
|
builder->state = STATE_DEFAULT;
|
202
173
|
builder->segment_size = 0;
|
203
174
|
}
|
204
175
|
|
205
176
|
static void builder_next(builder_t *builder, codepoint_t *codepoint);
|
206
177
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
*/
|
211
|
-
static void
|
212
|
-
builder_flush(builder_t *builder) {
|
178
|
+
// Pull the remaining content out of the cached segment in case we end
|
179
|
+
// parsing while not in the default state.
|
180
|
+
static void builder_flush(builder_t *builder) {
|
213
181
|
switch (builder->state) {
|
214
182
|
case STATE_DEFAULT: return;
|
215
183
|
case STATE_COLON:
|
@@ -222,33 +190,30 @@ builder_flush(builder_t *builder) {
|
|
222
190
|
}
|
223
191
|
}
|
224
192
|
|
225
|
-
|
226
|
-
|
227
|
-
*/
|
228
|
-
static inline void
|
229
|
-
builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
|
193
|
+
// Perform transitions from the STATE_DEFAULT state.
|
194
|
+
static inline void builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
|
230
195
|
if (codepoint->character == '-') {
|
231
196
|
builder_result_push_literal(builder, '_');
|
232
197
|
return;
|
233
198
|
}
|
199
|
+
|
234
200
|
if (codepoint->character == ':') {
|
235
201
|
builder->state = STATE_COLON;
|
236
202
|
return;
|
237
203
|
}
|
204
|
+
|
238
205
|
if (codepoint_is_digit(codepoint) || codepoint_is_upper(codepoint)) {
|
239
206
|
builder->segment[0] = (char) codepoint->character;
|
240
207
|
builder->segment_size = 1;
|
241
208
|
builder->state = STATE_UPPER_START;
|
242
209
|
return;
|
243
210
|
}
|
211
|
+
|
244
212
|
builder_result_push(builder, codepoint);
|
245
213
|
}
|
246
214
|
|
247
|
-
|
248
|
-
|
249
|
-
*/
|
250
|
-
static inline void
|
251
|
-
builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
|
215
|
+
// Perform transitions from the STATE_COLON state.
|
216
|
+
static inline void builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
|
252
217
|
if (codepoint->character == ':') {
|
253
218
|
builder_result_push_literal(builder, '/');
|
254
219
|
builder_restart(builder);
|
@@ -260,15 +225,13 @@ builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
|
|
260
225
|
builder_next(builder, codepoint);
|
261
226
|
}
|
262
227
|
|
263
|
-
|
264
|
-
|
265
|
-
*/
|
266
|
-
static inline void
|
267
|
-
builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
|
228
|
+
// Perform transitions from the STATE_UPPER_START state.
|
229
|
+
static inline void builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
|
268
230
|
if (codepoint_is_digit(codepoint)) {
|
269
231
|
builder_segment_push(builder, codepoint);
|
270
232
|
return;
|
271
233
|
}
|
234
|
+
|
272
235
|
if (codepoint_is_upper(codepoint)) {
|
273
236
|
builder_segment_push(builder, codepoint);
|
274
237
|
builder->state = STATE_UPPER_END;
|
@@ -280,20 +243,19 @@ builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
|
|
280
243
|
builder_next(builder, codepoint);
|
281
244
|
}
|
282
245
|
|
283
|
-
|
284
|
-
|
285
|
-
*/
|
286
|
-
static inline void
|
287
|
-
builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
|
246
|
+
// Perform transitions from the STATE_UPPER_END state.
|
247
|
+
static inline void builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
|
288
248
|
if (codepoint_is_digit(codepoint)) {
|
289
249
|
builder_segment_push(builder, codepoint);
|
290
250
|
builder->state = STATE_UPPER_START;
|
291
251
|
return;
|
292
252
|
}
|
253
|
+
|
293
254
|
if (codepoint_is_upper(codepoint)) {
|
294
255
|
builder_segment_push(builder, codepoint);
|
295
256
|
return;
|
296
257
|
}
|
258
|
+
|
297
259
|
if (codepoint_is_lower(codepoint)) {
|
298
260
|
builder_segment_copy(builder, builder->segment_size - 1);
|
299
261
|
builder_result_push_literal(builder, '_');
|
@@ -308,12 +270,8 @@ builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
|
|
308
270
|
builder_next(builder, codepoint);
|
309
271
|
}
|
310
272
|
|
311
|
-
|
312
|
-
|
313
|
-
* next state.
|
314
|
-
*/
|
315
|
-
static void
|
316
|
-
builder_next(builder_t *builder, codepoint_t *codepoint) {
|
273
|
+
// Accept the next codepoint, which will move the `builder_t` struct into the next state.
|
274
|
+
static void builder_next(builder_t *builder, codepoint_t *codepoint) {
|
317
275
|
switch (builder->state) {
|
318
276
|
case STATE_DEFAULT:
|
319
277
|
return builder_default_transition(builder, codepoint);
|
@@ -326,16 +284,6 @@ builder_next(builder_t *builder, codepoint_t *codepoint) {
|
|
326
284
|
}
|
327
285
|
}
|
328
286
|
|
329
|
-
/**
|
330
|
-
* Frees a previously allocated `builder_t` struct.
|
331
|
-
*/
|
332
|
-
static void
|
333
|
-
builder_free(builder_t *builder) {
|
334
|
-
free(builder->segment);
|
335
|
-
free(builder->result);
|
336
|
-
free(builder);
|
337
|
-
}
|
338
|
-
|
339
287
|
/**
|
340
288
|
* Makes an underscored, lowercase form from the expression in the string.
|
341
289
|
*
|
@@ -349,43 +297,46 @@ builder_free(builder_t *builder) {
|
|
349
297
|
*
|
350
298
|
* camelize(underscore('SSLError')) # => "SslError"
|
351
299
|
*/
|
352
|
-
static VALUE
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
300
|
+
static VALUE underscore(VALUE string) {
|
301
|
+
char segment[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
|
302
|
+
char result[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
|
303
|
+
|
304
|
+
builder_t builder = {
|
305
|
+
.state = STATE_DEFAULT,
|
306
|
+
.segment = segment,
|
307
|
+
.result = result,
|
308
|
+
.segment_size = 0,
|
309
|
+
.result_size = 0,
|
310
|
+
.push_next = 0
|
311
|
+
};
|
312
|
+
|
313
|
+
codepoint_t codepoint = {
|
314
|
+
.encoding = rb_enc_from_index(ENCODING_GET(string)),
|
315
|
+
.character = 0,
|
316
|
+
.size = 0
|
317
|
+
};
|
318
|
+
|
319
|
+
char *pointer = RSTRING_PTR(string);
|
320
|
+
char *end = RSTRING_END(string);
|
321
|
+
|
322
|
+
while (pointer < end) {
|
323
|
+
codepoint.character = rb_enc_codepoint_len(pointer, end, &codepoint.size, codepoint.encoding);
|
324
|
+
builder_next(&builder, &codepoint);
|
325
|
+
pointer += codepoint.size;
|
374
326
|
}
|
375
|
-
builder_flush(builder);
|
376
327
|
|
377
|
-
|
378
|
-
|
379
|
-
|
328
|
+
builder_flush(&builder);
|
329
|
+
return rb_enc_str_new(builder.result, builder.result_size, codepoint.encoding);
|
330
|
+
}
|
380
331
|
|
381
|
-
|
332
|
+
// FastUnderscore::underscore
|
333
|
+
static VALUE fast_underscore(VALUE self, VALUE string) {
|
334
|
+
return underscore(string);
|
382
335
|
}
|
383
336
|
|
384
|
-
|
385
|
-
|
386
|
-
*/
|
387
|
-
void
|
388
|
-
Init_fast_underscore(void) {
|
337
|
+
// Hook into Ruby and define FastUnderscore::underscore and String#underscore
|
338
|
+
void Init_fast_underscore(void) {
|
389
339
|
VALUE rb_cFastUnderscore = rb_define_module("FastUnderscore");
|
390
|
-
rb_define_singleton_method(rb_cFastUnderscore, "underscore",
|
340
|
+
rb_define_singleton_method(rb_cFastUnderscore, "underscore", fast_underscore, 1);
|
341
|
+
rb_define_method(rb_cString, "underscore", underscore, 0);
|
391
342
|
}
|