fast_underscore 0.0.3 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.github/dependabot.yml +6 -0
- data/.github/workflows/main.yml +44 -0
- data/.gitignore +2 -0
- data/.rubocop.yml +24 -5
- data/CHANGELOG.md +44 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +58 -31
- data/{LICENSE.txt → LICENSE} +1 -1
- data/README.md +16 -25
- data/Steepfile +6 -0
- data/bin/{benchmark → bench} +3 -2
- data/bin/console +1 -1
- data/ext/fast_underscore/fast_underscore.c +133 -182
- data/fast_underscore.gemspec +6 -5
- data/gemfiles/5.1.gemfile +7 -0
- data/gemfiles/5.2.gemfile +7 -0
- data/gemfiles/6.0.gemfile +7 -0
- data/gemfiles/6.1.gemfile +7 -0
- data/lib/fast_underscore.rb +71 -14
- data/lib/fast_underscore/version.rb +1 -1
- data/sig/fast_underscore.rbs +10 -0
- metadata +40 -20
- data/.travis.yml +0 -6
- data/bin/rake +0 -29
- data/bin/rubocop +0 -29
data/ext/fast_underscore/fast_underscore.c
CHANGED
@@ -1,34 +1,22 @@
|
|
1
1
|
#include <ruby.h>
|
2
2
|
#include <ruby/encoding.h>
|
3
|
-
#include <stdlib.h>
|
4
3
|
|
5
|
-
|
6
|
-
|
7
|
-
*/
|
8
|
-
static int
|
9
|
-
character_is_lower(unsigned int character) {
|
4
|
+
// true if the given codepoint is a lowercase ascii character.
|
5
|
+
static int character_is_lower(unsigned int character) {
|
10
6
|
return character >= 'a' && character <= 'z';
|
11
7
|
}
|
12
8
|
|
13
|
-
|
14
|
-
|
15
|
-
*/
|
16
|
-
static int
|
17
|
-
character_is_upper(unsigned int character) {
|
9
|
+
// true if the given codepoint is an uppercase ascii character.
|
10
|
+
static int character_is_upper(unsigned int character) {
|
18
11
|
return character >= 'A' && character <= 'Z';
|
19
12
|
}
|
20
13
|
|
21
|
-
|
22
|
-
|
23
|
-
*/
|
24
|
-
static int
|
25
|
-
character_is_digit(unsigned int character) {
|
14
|
+
// true if the given codepoint is an ascii digit.
|
15
|
+
static int character_is_digit(unsigned int character) {
|
26
16
|
return character >= '0' && character <= '9';
|
27
17
|
}
|
28
18
|
|
29
|
-
|
30
|
-
* Macros for extracting the character out of the `codepoint_t` struct.
|
31
|
-
*/
|
19
|
+
// Macros for extracting the character out of the `codepoint_t` struct.
|
32
20
|
#define codepoint_is_lower(codepoint) character_is_lower(codepoint->character)
|
33
21
|
#define codepoint_is_upper(codepoint) character_is_upper(codepoint->character)
|
34
22
|
#define codepoint_is_digit(codepoint) character_is_digit(codepoint->character)
|
@@ -62,9 +50,64 @@ typedef struct codepoint {
|
|
62
50
|
* A struct for tracking the built string as it gets converted. Maintains an
|
63
51
|
* internal DFA for transitioning through various inputs to match certain
|
64
52
|
* patterns that need to be separated with underscores.
|
53
|
+
*
|
54
|
+
* The internal DFA looks like:
|
55
|
+
*
|
56
|
+
* ┌ - ┐ ┌ * ┐
|
57
|
+
* │ v │ v
|
58
|
+
* ┌─────────────┐ ┌─────────────┐
|
59
|
+
* │ │──── : ───>│ │
|
60
|
+
* ──>│ DEFAULT │<─── : ────│ COLON │
|
61
|
+
* │ │<─── * ────│ │
|
62
|
+
* └─────────────┘ └─────────────┘
|
63
|
+
* │ ^ ^ ^
|
64
|
+
* │ │ │ └───── a-z ─────────────┐
|
65
|
+
* 0-9A-Z * │ │
|
66
|
+
* │ │ └───────── * ────────┐ │
|
67
|
+
* v │ │ │
|
68
|
+
* ┌─────────────┐ ┌─────────────┐
|
69
|
+
* │ │─── A-Z ──>│ │
|
70
|
+
* │ UPPER_START │ │ UPPER_END │
|
71
|
+
* │ │<── 0-9 ───│ │
|
72
|
+
* └─────────────┘ └─────────────┘
|
73
|
+
* │ ^ ^ │
|
74
|
+
* └ 0-9 ┘ └ A-Z ┘
|
75
|
+
*
|
76
|
+
* Transitions from DEFAULT:
|
77
|
+
* - On "-", push an "_" and stay on DEFAULT
|
78
|
+
* - On ":", go to COLON
|
79
|
+
* - On a digit or upper, start a buffer with the char and go to UPPER_START
|
80
|
+
* - On anything else, push the char and stay on DEFAULT
|
81
|
+
*
|
82
|
+
* Transitions from COLON:
|
83
|
+
* - On ":", push a "/" and go to DEFAULT
|
84
|
+
* - On anything else, push a ":" and the char and go to DEFAULT
|
85
|
+
*
|
86
|
+
* Transitions from UPPER_START:
|
87
|
+
* - On a digit, push the digit onto the buffer and stay on UPPER_START
|
88
|
+
* - On an upper, push the upper onto the buffer and go to UPPER_END
|
89
|
+
* - On anything else, push the buffer, go to DEFAULT, then handle the char
|
90
|
+
*
|
91
|
+
* Transitions from UPPER_END:
|
92
|
+
* - On a digit, push the digit onto the buffer and go to UPPER_START
|
93
|
+
* - On an upper, push the upper onto the buffer and stay on UPPER_END
|
94
|
+
* - On a lower, push the buffer up to the last char, push an "_", then push
|
95
|
+
* the last char of the buffer, go to DEFAULT, then handle the char
|
96
|
+
* - On anything else, push the buffer, go to DEFAULT, then handle the char
|
97
|
+
*
|
98
|
+
* These transitions allow us to accomplish the equivalent of the following code
|
99
|
+
* with one pass through the string:
|
100
|
+
*
|
101
|
+
* def underscore(word)
|
102
|
+
* word.gsub!('::', '/')
|
103
|
+
* word.gsub!(/([A-Z\d]+)([A-Z][a-z])/, '\1_\2')
|
104
|
+
* word.gsub!(/([a-z\d])([A-Z])/, '\1_\2')
|
105
|
+
* word.tr!('-', '_')
|
106
|
+
* word.downcase!
|
107
|
+
* end
|
65
108
|
*/
|
66
109
|
typedef struct builder {
|
67
|
-
// The state of the DFA
|
110
|
+
// The state of the DFA that the builder is in
|
68
111
|
enum state {
|
69
112
|
STATE_DEFAULT,
|
70
113
|
STATE_COLON,
|
@@ -82,78 +125,15 @@ typedef struct builder {
|
|
82
125
|
|
83
126
|
// Whether or not the last pushed result character should cause the following
|
84
127
|
// one to be spaced by an underscore
|
85
|
-
int
|
128
|
+
int push_next;
|
86
129
|
} builder_t;
|
87
130
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
static codepoint_t*
|
92
|
-
codepoint_build(rb_encoding *encoding) {
|
93
|
-
codepoint_t *codepoint;
|
94
|
-
|
95
|
-
codepoint = (codepoint_t *) malloc(sizeof(codepoint_t));
|
96
|
-
if (codepoint == NULL) {
|
97
|
-
return NULL;
|
98
|
-
}
|
99
|
-
|
100
|
-
codepoint->encoding = encoding;
|
101
|
-
return codepoint;
|
102
|
-
}
|
103
|
-
|
104
|
-
/**
|
105
|
-
* Free a previously allocated `codepoint_t` struct.
|
106
|
-
*/
|
107
|
-
static void
|
108
|
-
codepoint_free(codepoint_t *codepoint) {
|
109
|
-
free(codepoint);
|
110
|
-
}
|
111
|
-
|
112
|
-
/**
|
113
|
-
* Allocate and initialize a `builder_t` struct.
|
114
|
-
*/
|
115
|
-
static builder_t*
|
116
|
-
builder_build(long str_len) {
|
117
|
-
builder_t *builder;
|
118
|
-
|
119
|
-
builder = (builder_t *) malloc(sizeof(builder_t));
|
120
|
-
if (builder == NULL) {
|
121
|
-
return NULL;
|
122
|
-
}
|
123
|
-
|
124
|
-
builder->state = STATE_DEFAULT;
|
125
|
-
builder->segment = (char *) malloc(str_len * sizeof(unsigned int) * 2);
|
126
|
-
|
127
|
-
if (builder->segment == NULL) {
|
128
|
-
free(builder);
|
129
|
-
return NULL;
|
130
|
-
}
|
131
|
-
|
132
|
-
builder->result = (char *) malloc(str_len * sizeof(unsigned int) * 2);
|
133
|
-
|
134
|
-
if (builder->result == NULL) {
|
135
|
-
free(builder->segment);
|
136
|
-
free(builder);
|
137
|
-
return NULL;
|
138
|
-
}
|
139
|
-
|
140
|
-
builder->segment_size = 0;
|
141
|
-
builder->result_size = 0;
|
142
|
-
builder->pushNext = 0;
|
143
|
-
|
144
|
-
return builder;
|
145
|
-
}
|
146
|
-
|
147
|
-
/**
|
148
|
-
* Push a character onto the resultant string using the given codepoint and
|
149
|
-
* encoding.
|
150
|
-
*/
|
151
|
-
static void
|
152
|
-
builder_result_push_char(builder_t *builder, unsigned int character, int size,
|
153
|
-
rb_encoding *encoding) {
|
131
|
+
// Push a character onto the resultant string using the given codepoint and
|
132
|
+
// encoding.
|
133
|
+
static void builder_result_push_char(builder_t *builder, unsigned int character, int size, rb_encoding *encoding) {
|
154
134
|
if (character_is_upper(character)) {
|
155
|
-
if (builder->
|
156
|
-
builder->
|
135
|
+
if (builder->push_next == 1) {
|
136
|
+
builder->push_next = 0;
|
157
137
|
builder_result_push_literal(builder, '_');
|
158
138
|
}
|
159
139
|
|
@@ -161,7 +141,7 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
|
|
161
141
|
return;
|
162
142
|
}
|
163
143
|
|
164
|
-
builder->
|
144
|
+
builder->push_next = (character_is_lower(character) || character_is_digit(character));
|
165
145
|
|
166
146
|
if (encoding == NULL) {
|
167
147
|
builder->result[builder->result_size++] = (char) character;
|
@@ -171,20 +151,14 @@ builder_result_push_char(builder_t *builder, unsigned int character, int size,
|
|
171
151
|
}
|
172
152
|
}
|
173
153
|
|
174
|
-
|
175
|
-
|
176
|
-
*/
|
177
|
-
static void
|
178
|
-
builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
|
154
|
+
// Push the given codepoint onto the builder.
|
155
|
+
static void builder_segment_push(builder_t *builder, codepoint_t *codepoint) {
|
179
156
|
builder->segment[builder->segment_size++] = (char) codepoint->character;
|
180
157
|
}
|
181
158
|
|
182
|
-
|
183
|
-
|
184
|
-
*
|
185
|
-
*/
|
186
|
-
static void
|
187
|
-
builder_segment_copy(builder_t *builder, long size) {
|
159
|
+
// Copy the given number of characters out of the segment cache onto the result
|
160
|
+
// string.
|
161
|
+
static void builder_segment_copy(builder_t *builder, long size) {
|
188
162
|
long idx;
|
189
163
|
|
190
164
|
for (idx = 0; idx < size; idx++) {
|
@@ -192,24 +166,18 @@ builder_segment_copy(builder_t *builder, long size) {
|
|
192
166
|
}
|
193
167
|
}
|
194
168
|
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
*/
|
199
|
-
static void
|
200
|
-
builder_restart(builder_t *builder) {
|
169
|
+
// Restart the `builder_t` back at the default state (because we've hit a
|
170
|
+
// character for which we have no allowed transitions).
|
171
|
+
static void builder_restart(builder_t *builder) {
|
201
172
|
builder->state = STATE_DEFAULT;
|
202
173
|
builder->segment_size = 0;
|
203
174
|
}
|
204
175
|
|
205
176
|
static void builder_next(builder_t *builder, codepoint_t *codepoint);
|
206
177
|
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
*/
|
211
|
-
static void
|
212
|
-
builder_flush(builder_t *builder) {
|
178
|
+
// Pull the remaining content out of the cached segment in case we end
|
179
|
+
// parsing while not in the default state.
|
180
|
+
static void builder_flush(builder_t *builder) {
|
213
181
|
switch (builder->state) {
|
214
182
|
case STATE_DEFAULT: return;
|
215
183
|
case STATE_COLON:
|
@@ -222,33 +190,30 @@ builder_flush(builder_t *builder) {
|
|
222
190
|
}
|
223
191
|
}
|
224
192
|
|
225
|
-
|
226
|
-
|
227
|
-
*/
|
228
|
-
static inline void
|
229
|
-
builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
|
193
|
+
// Perform transitions from the STATE_DEFAULT state.
|
194
|
+
static inline void builder_default_transition(builder_t *builder, codepoint_t *codepoint) {
|
230
195
|
if (codepoint->character == '-') {
|
231
196
|
builder_result_push_literal(builder, '_');
|
232
197
|
return;
|
233
198
|
}
|
199
|
+
|
234
200
|
if (codepoint->character == ':') {
|
235
201
|
builder->state = STATE_COLON;
|
236
202
|
return;
|
237
203
|
}
|
204
|
+
|
238
205
|
if (codepoint_is_digit(codepoint) || codepoint_is_upper(codepoint)) {
|
239
206
|
builder->segment[0] = (char) codepoint->character;
|
240
207
|
builder->segment_size = 1;
|
241
208
|
builder->state = STATE_UPPER_START;
|
242
209
|
return;
|
243
210
|
}
|
211
|
+
|
244
212
|
builder_result_push(builder, codepoint);
|
245
213
|
}
|
246
214
|
|
247
|
-
|
248
|
-
|
249
|
-
*/
|
250
|
-
static inline void
|
251
|
-
builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
|
215
|
+
// Perform transitions from the STATE_COLON state.
|
216
|
+
static inline void builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
|
252
217
|
if (codepoint->character == ':') {
|
253
218
|
builder_result_push_literal(builder, '/');
|
254
219
|
builder_restart(builder);
|
@@ -260,15 +225,13 @@ builder_colon_transition(builder_t *builder, codepoint_t *codepoint) {
|
|
260
225
|
builder_next(builder, codepoint);
|
261
226
|
}
|
262
227
|
|
263
|
-
|
264
|
-
|
265
|
-
*/
|
266
|
-
static inline void
|
267
|
-
builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
|
228
|
+
// Perform transitions from the STATE_UPPER_START state.
|
229
|
+
static inline void builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
|
268
230
|
if (codepoint_is_digit(codepoint)) {
|
269
231
|
builder_segment_push(builder, codepoint);
|
270
232
|
return;
|
271
233
|
}
|
234
|
+
|
272
235
|
if (codepoint_is_upper(codepoint)) {
|
273
236
|
builder_segment_push(builder, codepoint);
|
274
237
|
builder->state = STATE_UPPER_END;
|
@@ -280,20 +243,19 @@ builder_upper_start_transition(builder_t *builder, codepoint_t *codepoint) {
|
|
280
243
|
builder_next(builder, codepoint);
|
281
244
|
}
|
282
245
|
|
283
|
-
|
284
|
-
|
285
|
-
*/
|
286
|
-
static inline void
|
287
|
-
builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
|
246
|
+
// Perform transitions from the STATE_UPPER_END state.
|
247
|
+
static inline void builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
|
288
248
|
if (codepoint_is_digit(codepoint)) {
|
289
249
|
builder_segment_push(builder, codepoint);
|
290
250
|
builder->state = STATE_UPPER_START;
|
291
251
|
return;
|
292
252
|
}
|
253
|
+
|
293
254
|
if (codepoint_is_upper(codepoint)) {
|
294
255
|
builder_segment_push(builder, codepoint);
|
295
256
|
return;
|
296
257
|
}
|
258
|
+
|
297
259
|
if (codepoint_is_lower(codepoint)) {
|
298
260
|
builder_segment_copy(builder, builder->segment_size - 1);
|
299
261
|
builder_result_push_literal(builder, '_');
|
@@ -308,12 +270,8 @@ builder_upper_end_transition(builder_t *builder, codepoint_t *codepoint) {
|
|
308
270
|
builder_next(builder, codepoint);
|
309
271
|
}
|
310
272
|
|
311
|
-
|
312
|
-
|
313
|
-
* next state.
|
314
|
-
*/
|
315
|
-
static void
|
316
|
-
builder_next(builder_t *builder, codepoint_t *codepoint) {
|
273
|
+
// Accept the next codepoint, which will move the `builder_t` struct into the next state.
|
274
|
+
static void builder_next(builder_t *builder, codepoint_t *codepoint) {
|
317
275
|
switch (builder->state) {
|
318
276
|
case STATE_DEFAULT:
|
319
277
|
return builder_default_transition(builder, codepoint);
|
@@ -326,16 +284,6 @@ builder_next(builder_t *builder, codepoint_t *codepoint) {
|
|
326
284
|
}
|
327
285
|
}
|
328
286
|
|
329
|
-
/**
|
330
|
-
* Frees a previously allocated `builder_t` struct.
|
331
|
-
*/
|
332
|
-
static void
|
333
|
-
builder_free(builder_t *builder) {
|
334
|
-
free(builder->segment);
|
335
|
-
free(builder->result);
|
336
|
-
free(builder);
|
337
|
-
}
|
338
|
-
|
339
287
|
/**
|
340
288
|
* Makes an underscored, lowercase form from the expression in the string.
|
341
289
|
*
|
@@ -349,43 +297,46 @@ builder_free(builder_t *builder) {
|
|
349
297
|
*
|
350
298
|
* camelize(underscore('SSLError')) # => "SslError"
|
351
299
|
*/
|
352
|
-
static VALUE
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
300
|
+
static VALUE underscore(VALUE string) {
|
301
|
+
char segment[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
|
302
|
+
char result[RSTRING_LEN(string) * 2 * sizeof(unsigned int) * 2];
|
303
|
+
|
304
|
+
builder_t builder = {
|
305
|
+
.state = STATE_DEFAULT,
|
306
|
+
.segment = segment,
|
307
|
+
.result = result,
|
308
|
+
.segment_size = 0,
|
309
|
+
.result_size = 0,
|
310
|
+
.push_next = 0
|
311
|
+
};
|
312
|
+
|
313
|
+
codepoint_t codepoint = {
|
314
|
+
.encoding = rb_enc_from_index(ENCODING_GET(string)),
|
315
|
+
.character = 0,
|
316
|
+
.size = 0
|
317
|
+
};
|
318
|
+
|
319
|
+
char *pointer = RSTRING_PTR(string);
|
320
|
+
char *end = RSTRING_END(string);
|
321
|
+
|
322
|
+
while (pointer < end) {
|
323
|
+
codepoint.character = rb_enc_codepoint_len(pointer, end, &codepoint.size, codepoint.encoding);
|
324
|
+
builder_next(&builder, &codepoint);
|
325
|
+
pointer += codepoint.size;
|
374
326
|
}
|
375
|
-
builder_flush(builder);
|
376
327
|
|
377
|
-
|
378
|
-
|
379
|
-
|
328
|
+
builder_flush(&builder);
|
329
|
+
return rb_enc_str_new(builder.result, builder.result_size, codepoint.encoding);
|
330
|
+
}
|
380
331
|
|
381
|
-
|
332
|
+
// FastUnderscore::underscore
|
333
|
+
static VALUE fast_underscore(VALUE self, VALUE string) {
|
334
|
+
return underscore(string);
|
382
335
|
}
|
383
336
|
|
384
|
-
|
385
|
-
|
386
|
-
*/
|
387
|
-
void
|
388
|
-
Init_fast_underscore(void) {
|
337
|
+
// Hook into Ruby and define FastUnderscore::underscore and String#underscore
|
338
|
+
void Init_fast_underscore(void) {
|
389
339
|
VALUE rb_cFastUnderscore = rb_define_module("FastUnderscore");
|
390
|
-
rb_define_singleton_method(rb_cFastUnderscore, "underscore",
|
340
|
+
rb_define_singleton_method(rb_cFastUnderscore, "underscore", fast_underscore, 1);
|
341
|
+
rb_define_method(rb_cString, "underscore", underscore, 0);
|
391
342
|
}
|