string-scrub 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +2 -0
- data/.travis.yml +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +1 -0
- data/ext/string/extconf.rb +2 -0
- data/ext/string/scrub.c +366 -0
- data/string-scrub.gemspec +24 -0
- data/test/test_scrub.rb +71 -0
- metadata +86 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9eb9dbb53755095ddf297dcadd31df955a8d40aa
|
4
|
+
data.tar.gz: ff3692ef538237cfe5cd8da80136744b933102e5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 367c9b6389befefdf01757bed6b0cf8e6e6100155a0a5ca886299be886391f78d48765b7d3e6bd13555805d620a51d57e643e1e2fc735297492be5c03b6bffa2
|
7
|
+
data.tar.gz: c76f4019b64dcdd6794124015351b6109ffd1fb247a5b36ea4e9aaef4fa5c6c58732a47f2c01ddf68c3b0aefc6cbb8d28964e4c57815c76f06317127b95abf52
|
data/.gitignore
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
test/tmp
|
15
|
+
test/version_tmp
|
16
|
+
tmp
|
17
|
+
ext/string/*.o
|
18
|
+
ext/string/Makefile
|
19
|
+
ext/string/*.bundle
|
20
|
+
ext/string/*.so
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 SHIBATA Hiroshi
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# String::Scrub
|
2
|
+
|
3
|
+
String#scrub for Ruby 2.0.0
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'string-scrub'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install string-scrub
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
see [testcase](https://github.com/hsbt/string-scrub/blob/master/test/test_scrub.rb)
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/ext/string/scrub.c
ADDED
@@ -0,0 +1,366 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
3
|
+
|
4
|
+
#ifndef TRUE
|
5
|
+
#define TRUE 1
|
6
|
+
#endif
|
7
|
+
#ifndef FALSE
|
8
|
+
#define FALSE 0
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
|
12
|
+
|
13
|
+
/*
 * Scan the byte range [p, e) and return a pointer to the first byte for
 * which ISASCII() is false, or NULL when every byte passes.
 *
 * When NONASCII_MASK is defined (SIZEOF_VALUE is 8 or 4) and the range is
 * longer than two VALUE words, the VALUE-aligned middle of the range is
 * tested one word at a time against the mask; the unaligned head and the
 * remaining bytes are tested one byte at a time.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* p rounded up to a multiple of sizeof(VALUE) */
        const VALUE *word = (const VALUE *)(((VALUE)p + align_mask) & ~align_mask);
        /* e rounded down to a multiple of sizeof(VALUE) */
        const VALUE *word_end = (const VALUE *)((VALUE)e & ~align_mask);

        /* unaligned head: byte by byte */
        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        /* aligned middle: stop at the first word with any masked bit set */
        while (word < word_end && !(*word & NONASCII_MASK))
            word++;
        /* the byte loop below pinpoints the exact byte within that word */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
|
49
|
+
|
50
|
+
static VALUE
|
51
|
+
str_compat_and_valid(VALUE str, rb_encoding *enc)
|
52
|
+
{
|
53
|
+
int cr;
|
54
|
+
str = StringValue(str);
|
55
|
+
cr = rb_enc_str_coderange(str);
|
56
|
+
if (cr == ENC_CODERANGE_BROKEN) {
|
57
|
+
rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
|
58
|
+
}
|
59
|
+
else if (cr == ENC_CODERANGE_7BIT) {
|
60
|
+
rb_encoding *e = STR_ENC_GET(str);
|
61
|
+
if (!rb_enc_asciicompat(enc)) {
|
62
|
+
rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
|
63
|
+
rb_enc_name(enc), rb_enc_name(e));
|
64
|
+
}
|
65
|
+
}
|
66
|
+
else { /* ENC_CODERANGE_VALID */
|
67
|
+
rb_encoding *e = STR_ENC_GET(str);
|
68
|
+
if (enc != e) {
|
69
|
+
rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
|
70
|
+
rb_enc_name(enc), rb_enc_name(e));
|
71
|
+
}
|
72
|
+
}
|
73
|
+
return str;
|
74
|
+
}
|
75
|
+
|
76
|
+
/**
|
77
|
+
* @param repl the replacement character
|
78
|
+
* @return If given string is invalid, returns a new string. Otherwise, returns Qnil.
|
79
|
+
*/
|
80
|
+
static VALUE
|
81
|
+
str_scrub0(int argc, VALUE *argv, VALUE str)
|
82
|
+
{
|
83
|
+
int cr = ENC_CODERANGE(str);
|
84
|
+
rb_encoding *enc;
|
85
|
+
int encidx;
|
86
|
+
VALUE repl;
|
87
|
+
|
88
|
+
if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
|
89
|
+
return Qnil;
|
90
|
+
|
91
|
+
enc = STR_ENC_GET(str);
|
92
|
+
rb_scan_args(argc, argv, "01", &repl);
|
93
|
+
if (argc != 0) {
|
94
|
+
repl = str_compat_and_valid(repl, enc);
|
95
|
+
}
|
96
|
+
|
97
|
+
if (rb_enc_dummy_p(enc)) {
|
98
|
+
return Qnil;
|
99
|
+
}
|
100
|
+
encidx = rb_enc_to_index(enc);
|
101
|
+
|
102
|
+
#define DEFAULT_REPLACE_CHAR(str) do { \
|
103
|
+
static const char replace[sizeof(str)-1] = str; \
|
104
|
+
rep = replace; replen = (int)sizeof(replace); \
|
105
|
+
} while (0)
|
106
|
+
|
107
|
+
if (rb_enc_asciicompat(enc)) {
|
108
|
+
const char *p = RSTRING_PTR(str);
|
109
|
+
const char *e = RSTRING_END(str);
|
110
|
+
const char *p1 = p;
|
111
|
+
const char *rep;
|
112
|
+
long replen;
|
113
|
+
int rep7bit_p;
|
114
|
+
VALUE buf = Qnil;
|
115
|
+
if (rb_block_given_p()) {
|
116
|
+
rep = NULL;
|
117
|
+
replen = 0;
|
118
|
+
rep7bit_p = FALSE;
|
119
|
+
}
|
120
|
+
else if (!NIL_P(repl)) {
|
121
|
+
rep = RSTRING_PTR(repl);
|
122
|
+
replen = RSTRING_LEN(repl);
|
123
|
+
rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
|
124
|
+
}
|
125
|
+
else if (encidx == rb_utf8_encindex()) {
|
126
|
+
DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
|
127
|
+
rep7bit_p = FALSE;
|
128
|
+
}
|
129
|
+
else {
|
130
|
+
DEFAULT_REPLACE_CHAR("?");
|
131
|
+
rep7bit_p = TRUE;
|
132
|
+
}
|
133
|
+
cr = ENC_CODERANGE_7BIT;
|
134
|
+
|
135
|
+
p = search_nonascii(p, e);
|
136
|
+
if (!p) {
|
137
|
+
p = e;
|
138
|
+
}
|
139
|
+
while (p < e) {
|
140
|
+
int ret = rb_enc_precise_mbclen(p, e, enc);
|
141
|
+
if (MBCLEN_NEEDMORE_P(ret)) {
|
142
|
+
break;
|
143
|
+
}
|
144
|
+
else if (MBCLEN_CHARFOUND_P(ret)) {
|
145
|
+
cr = ENC_CODERANGE_VALID;
|
146
|
+
p += MBCLEN_CHARFOUND_LEN(ret);
|
147
|
+
}
|
148
|
+
else if (MBCLEN_INVALID_P(ret)) {
|
149
|
+
/*
|
150
|
+
* p1~p: valid ascii/multibyte chars
|
151
|
+
* p ~e: invalid bytes + unknown bytes
|
152
|
+
*/
|
153
|
+
long clen = rb_enc_mbmaxlen(enc);
|
154
|
+
if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
|
155
|
+
if (p > p1) {
|
156
|
+
rb_str_buf_cat(buf, p1, p - p1);
|
157
|
+
}
|
158
|
+
|
159
|
+
if (e - p < clen) clen = e - p;
|
160
|
+
if (clen <= 2) {
|
161
|
+
clen = 1;
|
162
|
+
}
|
163
|
+
else {
|
164
|
+
const char *q = p;
|
165
|
+
clen--;
|
166
|
+
for (; clen > 1; clen--) {
|
167
|
+
ret = rb_enc_precise_mbclen(q, q + clen, enc);
|
168
|
+
if (MBCLEN_NEEDMORE_P(ret)) break;
|
169
|
+
if (MBCLEN_INVALID_P(ret)) continue;
|
170
|
+
UNREACHABLE;
|
171
|
+
}
|
172
|
+
}
|
173
|
+
if (rep) {
|
174
|
+
rb_str_buf_cat(buf, rep, replen);
|
175
|
+
if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
|
176
|
+
}
|
177
|
+
else {
|
178
|
+
repl = rb_yield(rb_enc_str_new(p1, clen, enc));
|
179
|
+
repl = str_compat_and_valid(repl, enc);
|
180
|
+
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
|
181
|
+
if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
|
182
|
+
cr = ENC_CODERANGE_VALID;
|
183
|
+
}
|
184
|
+
p += clen;
|
185
|
+
p1 = p;
|
186
|
+
p = search_nonascii(p, e);
|
187
|
+
if (!p) {
|
188
|
+
p = e;
|
189
|
+
break;
|
190
|
+
}
|
191
|
+
}
|
192
|
+
else {
|
193
|
+
UNREACHABLE;
|
194
|
+
}
|
195
|
+
}
|
196
|
+
if (NIL_P(buf)) {
|
197
|
+
if (p == e) {
|
198
|
+
ENC_CODERANGE_SET(str, cr);
|
199
|
+
return Qnil;
|
200
|
+
}
|
201
|
+
buf = rb_str_buf_new(RSTRING_LEN(str));
|
202
|
+
}
|
203
|
+
if (p1 < p) {
|
204
|
+
rb_str_buf_cat(buf, p1, p - p1);
|
205
|
+
}
|
206
|
+
if (p < e) {
|
207
|
+
if (rep) {
|
208
|
+
rb_str_buf_cat(buf, rep, replen);
|
209
|
+
if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
|
210
|
+
}
|
211
|
+
else {
|
212
|
+
repl = rb_yield(rb_enc_str_new(p, e-p, enc));
|
213
|
+
repl = str_compat_and_valid(repl, enc);
|
214
|
+
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
|
215
|
+
if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
|
216
|
+
cr = ENC_CODERANGE_VALID;
|
217
|
+
}
|
218
|
+
}
|
219
|
+
ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
|
220
|
+
return buf;
|
221
|
+
}
|
222
|
+
else {
|
223
|
+
/* ASCII incompatible */
|
224
|
+
const char *p = RSTRING_PTR(str);
|
225
|
+
const char *e = RSTRING_END(str);
|
226
|
+
const char *p1 = p;
|
227
|
+
VALUE buf = Qnil;
|
228
|
+
const char *rep;
|
229
|
+
long replen;
|
230
|
+
long mbminlen = rb_enc_mbminlen(enc);
|
231
|
+
if (!NIL_P(repl)) {
|
232
|
+
rep = RSTRING_PTR(repl);
|
233
|
+
replen = RSTRING_LEN(repl);
|
234
|
+
}
|
235
|
+
else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) {
|
236
|
+
DEFAULT_REPLACE_CHAR("\xFF\xFD");
|
237
|
+
}
|
238
|
+
else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) {
|
239
|
+
DEFAULT_REPLACE_CHAR("\xFD\xFF");
|
240
|
+
}
|
241
|
+
else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) {
|
242
|
+
DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
|
243
|
+
}
|
244
|
+
else if (!strcasecmp(rb_enc_name(enc), "UTF-32lE")) {
|
245
|
+
DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
|
246
|
+
}
|
247
|
+
else {
|
248
|
+
DEFAULT_REPLACE_CHAR("?");
|
249
|
+
}
|
250
|
+
|
251
|
+
while (p < e) {
|
252
|
+
int ret = rb_enc_precise_mbclen(p, e, enc);
|
253
|
+
if (MBCLEN_NEEDMORE_P(ret)) {
|
254
|
+
break;
|
255
|
+
}
|
256
|
+
else if (MBCLEN_CHARFOUND_P(ret)) {
|
257
|
+
p += MBCLEN_CHARFOUND_LEN(ret);
|
258
|
+
}
|
259
|
+
else if (MBCLEN_INVALID_P(ret)) {
|
260
|
+
const char *q = p;
|
261
|
+
long clen = rb_enc_mbmaxlen(enc);
|
262
|
+
if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
|
263
|
+
if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
|
264
|
+
|
265
|
+
if (e - p < clen) clen = e - p;
|
266
|
+
if (clen <= mbminlen * 2) {
|
267
|
+
clen = mbminlen;
|
268
|
+
}
|
269
|
+
else {
|
270
|
+
clen -= mbminlen;
|
271
|
+
for (; clen > mbminlen; clen-=mbminlen) {
|
272
|
+
ret = rb_enc_precise_mbclen(q, q + clen, enc);
|
273
|
+
if (MBCLEN_NEEDMORE_P(ret)) break;
|
274
|
+
if (MBCLEN_INVALID_P(ret)) continue;
|
275
|
+
UNREACHABLE;
|
276
|
+
}
|
277
|
+
}
|
278
|
+
if (rep) {
|
279
|
+
rb_str_buf_cat(buf, rep, replen);
|
280
|
+
}
|
281
|
+
else {
|
282
|
+
repl = rb_yield(rb_enc_str_new(p, e-p, enc));
|
283
|
+
repl = str_compat_and_valid(repl, enc);
|
284
|
+
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
|
285
|
+
}
|
286
|
+
p += clen;
|
287
|
+
p1 = p;
|
288
|
+
}
|
289
|
+
else {
|
290
|
+
UNREACHABLE;
|
291
|
+
}
|
292
|
+
}
|
293
|
+
if (NIL_P(buf)) {
|
294
|
+
if (p == e) {
|
295
|
+
ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
|
296
|
+
return Qnil;
|
297
|
+
}
|
298
|
+
buf = rb_str_buf_new(RSTRING_LEN(str));
|
299
|
+
}
|
300
|
+
if (p1 < p) {
|
301
|
+
rb_str_buf_cat(buf, p1, p - p1);
|
302
|
+
}
|
303
|
+
if (p < e) {
|
304
|
+
if (rep) {
|
305
|
+
rb_str_buf_cat(buf, rep, replen);
|
306
|
+
}
|
307
|
+
else {
|
308
|
+
repl = rb_yield(rb_enc_str_new(p, e-p, enc));
|
309
|
+
repl = str_compat_and_valid(repl, enc);
|
310
|
+
rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
|
311
|
+
}
|
312
|
+
}
|
313
|
+
ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
|
314
|
+
return buf;
|
315
|
+
}
|
316
|
+
}
|
317
|
+
|
318
|
+
/*
|
319
|
+
* call-seq:
|
320
|
+
* str.scrub -> new_str
|
321
|
+
* str.scrub(repl) -> new_str
|
322
|
+
* str.scrub{|bytes|} -> new_str
|
323
|
+
*
|
324
|
+
* If the string is invalid byte sequence then replace invalid bytes with given replacement
|
325
|
+
* character, else returns self.
|
326
|
+
* If block is given, replace invalid bytes with returned value of the block.
|
327
|
+
*
|
328
|
+
* "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
|
329
|
+
* "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
|
330
|
+
* "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
|
331
|
+
*/
|
332
|
+
VALUE
|
333
|
+
rb_str_scrub(int argc, VALUE *argv, VALUE str)
|
334
|
+
{
|
335
|
+
VALUE new = str_scrub0(argc, argv, str);
|
336
|
+
return NIL_P(new) ? rb_str_dup(str): new;
|
337
|
+
}
|
338
|
+
|
339
|
+
/*
|
340
|
+
* call-seq:
|
341
|
+
* str.scrub! -> str
|
342
|
+
* str.scrub!(repl) -> str
|
343
|
+
* str.scrub!{|bytes|} -> str
|
344
|
+
*
|
345
|
+
* If the string is invalid byte sequence then replace invalid bytes with given replacement
|
346
|
+
* character, else returns self.
|
347
|
+
* If block is given, replace invalid bytes with returned value of the block.
|
348
|
+
*
|
349
|
+
* "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
|
350
|
+
* "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
|
351
|
+
* "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
|
352
|
+
*/
|
353
|
+
static VALUE
|
354
|
+
str_scrub_bang(int argc, VALUE *argv, VALUE str)
|
355
|
+
{
|
356
|
+
VALUE new = str_scrub0(argc, argv, str);
|
357
|
+
if (!NIL_P(new)) rb_str_replace(str, new);
|
358
|
+
return str;
|
359
|
+
}
|
360
|
+
|
361
|
+
void
|
362
|
+
Init_scrub(void)
|
363
|
+
{
|
364
|
+
rb_define_method(rb_cString, "scrub", rb_str_scrub, -1);
|
365
|
+
rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
|
366
|
+
}
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# Absolute path of the "lib" directory next to this gemspec.
# NOTE(review): `lib` is not referenced again in this file as shown — confirm
# whether a $LOAD_PATH setup line was intended.
lib = File.expand_path('../lib', __FILE__)

# Gem specification for string-scrub: a C extension providing String#scrub.
Gem::Specification.new do |spec|
  spec.name          = "string-scrub"
  spec.version       = "0.0.1"
  spec.authors       = ["SHIBATA Hiroshi"]
  spec.email         = ["shibata.hiroshi@gmail.com"]
  spec.summary       = %q{String#scrub for Ruby 2.0.0}
  spec.description   = %q{String#scrub for Ruby 2.0.0}
  spec.homepage      = "https://github.com/hsbt/string-scrub"
  spec.license       = "MIT"

  # Package every file tracked by git.
  spec.files         = `git ls-files`.split($/)
  # Native extension build script.
  spec.extensions    = ["ext/string/extconf.rb"]
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.required_ruby_version = '>= 2.0.0'

  spec.add_development_dependency "bundler"
  spec.add_development_dependency "rake"
end
|
data/test/test_scrub.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# coding: US-ASCII
|
2
|
+
require 'test/unit'
|
3
|
+
require_relative '../ext/string/scrub'
|
4
|
+
|
5
|
+
# Tests for String#scrub and String#scrub! as provided by the extension.
class TestScrub < Test::Unit::TestCase
  # Helpers that return a copy of the given string tagged with a specific
  # encoding (the bytes are not converted).
  module AESU
    def ua(str) str.dup.force_encoding("US-ASCII") end
    def a(str) str.dup.force_encoding("ASCII-8BIT") end
    def e(str) str.dup.force_encoding("EUC-JP") end
    def s(str) str.dup.force_encoding("Windows-31J") end
    def u(str) str.dup.force_encoding("UTF-8") end
  end
  include AESU

  def test_scrub
    # scrub returns a different object than the receiver, even when nothing is replaced
    str = "\u3042\u3044"
    assert_not_same(str, str.scrub)
    str.force_encoding(Encoding::ISO_2022_JP) # dummy encoding
    assert_not_same(str, str.scrub)

    # default replacement for UTF-8 is U+FFFD
    assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub)
    assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub)

    # examples in Unicode 6.1.0 D93b
    assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41",
                 u("\x41\xC0\xAF\x41\xF4\x80\x80\x41").scrub)
    assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41",
                 u("\x41\xE0\x9F\x80\x41").scrub)
    assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
                 u("\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub)
    assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064",
                 u("abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub)

    # explicit replacement argument, including rejected replacements
    assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013"))
    assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2")) }
    assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub(1) }
    assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81")) }
    assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE")))

    # replacement supplied by a block, with the same rejection cases
    assert_equal("\u3042<e381>", u("\xE3\x81\x82\xE3\x81").scrub{|x|'<'+x.unpack('H*')[0]+'>'})
    assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub{e("\xA4\xA2")} }
    assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub{1} }
    assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub{u("\x81")} }
    assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub{e("\xA2\xAE")})

    # UTF-16 / UTF-32 strings use U+FFFD in the matching encoding
    assert_equal("\uFFFD\u3042".encode("UTF-16BE"),
                 "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE).
                 scrub)
    assert_equal("\uFFFD\u3042".encode("UTF-16LE"),
                 "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE).
                 scrub)
    assert_equal("\uFFFD".encode("UTF-32BE"),
                 "\xff".force_encoding(Encoding::UTF_32BE).
                 scrub)
    assert_equal("\uFFFD".encode("UTF-32LE"),
                 "\xff".force_encoding(Encoding::UTF_32LE).
                 scrub)
  end

  def test_scrub_bang
    # scrub! always returns the receiver itself
    str = "\u3042\u3044"
    assert_same(str, str.scrub!)
    str.force_encoding(Encoding::ISO_2022_JP) # dummy encoding
    assert_same(str, str.scrub!)

    # the receiver is modified in place
    str = u("\x80\x80\x80")
    str.scrub!
    assert_same(str, str.scrub!)
    assert_equal("\uFFFD\uFFFD\uFFFD", str)
  end
end
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: string-scrub
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- SHIBATA Hiroshi
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-11-01 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: String#scrub for Ruby 2.0.0
|
42
|
+
email:
|
43
|
+
- shibata.hiroshi@gmail.com
|
44
|
+
executables: []
|
45
|
+
extensions:
|
46
|
+
- ext/string/extconf.rb
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- ".gitignore"
|
50
|
+
- ".rspec"
|
51
|
+
- ".travis.yml"
|
52
|
+
- Gemfile
|
53
|
+
- LICENSE.txt
|
54
|
+
- README.md
|
55
|
+
- Rakefile
|
56
|
+
- ext/string/extconf.rb
|
57
|
+
- ext/string/scrub.c
|
58
|
+
- string-scrub.gemspec
|
59
|
+
- test/test_scrub.rb
|
60
|
+
homepage: https://github.com/hsbt/string-scrub
|
61
|
+
licenses:
|
62
|
+
- MIT
|
63
|
+
metadata: {}
|
64
|
+
post_install_message:
|
65
|
+
rdoc_options: []
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 2.0.0
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
requirements: []
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 2.1.10
|
81
|
+
signing_key:
|
82
|
+
specification_version: 4
|
83
|
+
summary: String#scrub for Ruby 2.0.0
|
84
|
+
test_files:
|
85
|
+
- test/test_scrub.rb
|
86
|
+
has_rdoc:
|