charlock_holmes 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/ext/charlock_holmes/common.h +41 -0
- data/ext/charlock_holmes/converter.c +54 -0
- data/ext/charlock_holmes/{charlock_holmes.c → encoding_detector.c} +3 -27
- data/ext/charlock_holmes/ext.c +13 -0
- data/lib/charlock_holmes/version.rb +1 -1
- data/spec/converter_spec.rb +29 -0
- metadata +79 -45
data/Gemfile.lock
CHANGED
@@ -0,0 +1,41 @@
|
|
1
|
+
#ifndef CHARLOCK_COMMON_H
|
2
|
+
#define CHARLOCK_COMMON_H
|
3
|
+
|
4
|
+
// tell rbx not to use its caching compat layer
|
5
|
+
// by doing this we're making a promise to RBX that
|
6
|
+
// we'll never modify the pointers we get back from RSTRING_PTR
|
7
|
+
#define RSTRING_NOT_MODIFIED
|
8
|
+
|
9
|
+
#include <ruby.h>
|
10
|
+
#ifdef HAVE_RUBY_ENCODING_H
|
11
|
+
#include <ruby/encoding.h>
|
12
|
+
#endif
|
13
|
+
|
14
|
+
/*
 * Build a Ruby String from the `len` bytes starting at `str`.
 *
 * When HAVE_RUBY_ENCODING_H is defined, `encoding` is cast to an
 * rb_encoding pointer and the string is created with
 * rb_external_str_new_with_enc(). Otherwise `encoding` is ignored and the
 * string is created with rb_str_new().
 *
 * NOTE(review): `encoding` is presumably typed `void *` so this signature
 * still compiles when <ruby/encoding.h> is not included - confirm.
 */
static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new(str, len);
#else
	rb_encoding *enc = (rb_encoding *)encoding;

	return rb_external_str_new_with_enc(str, len, enc);
#endif
}
|
22
|
+
|
23
|
+
/*
 * Build a Ruby String from the `len` bytes starting at `str`.
 *
 * When HAVE_RUBY_ENCODING_H is defined, the string is created with
 * rb_external_str_new_with_enc() using rb_utf8_encoding(). Otherwise it is
 * created with rb_str_new().
 */
static VALUE charlock_new_str(const char *str, size_t len)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new(str, len);
#else
	rb_encoding *utf8 = rb_utf8_encoding();

	return rb_external_str_new_with_enc(str, len, utf8);
#endif
}
|
31
|
+
|
32
|
+
/*
 * Build a Ruby String from the NUL-terminated C string `str`.
 *
 * When HAVE_RUBY_ENCODING_H is defined, the length is taken with strlen()
 * and the string is created with rb_external_str_new_with_enc() using
 * rb_utf8_encoding(). Otherwise it is created with rb_str_new2().
 */
static VALUE charlock_new_str2(const char *str)
{
#ifndef HAVE_RUBY_ENCODING_H
	return rb_str_new2(str);
#else
	size_t len = strlen(str);
	rb_encoding *utf8 = rb_utf8_encoding();

	return rb_external_str_new_with_enc(str, len, utf8);
#endif
}
|
40
|
+
|
41
|
+
#endif
|
@@ -0,0 +1,54 @@
|
|
1
|
+
#include "unicode/ucnv.h"
|
2
|
+
#include "common.h"
|
3
|
+
|
4
|
+
extern VALUE rb_mCharlockHolmes;
|
5
|
+
static VALUE rb_cConverter;
|
6
|
+
|
7
|
+
/*
 * CharlockHolmes::Converter.convert(text, source_encoding, target_encoding)
 *
 * Converts the bytes of rb_txt from the ICU converter named by rb_src_enc to
 * the ICU converter named by rb_dst_enc using ucnv_convert(), and returns the
 * converted bytes as a new Ruby String built by charlock_new_enc_str().
 *
 * rb_txt     - String holding the bytes to convert.
 * rb_src_enc - String naming the source converter.
 * rb_dst_enc - String naming the destination converter.
 *
 * Raises TypeError when an argument cannot be used as a String, and
 * ArgumentError when an encoding name contains a NUL byte, when the input is
 * longer than ICU's int32_t length allows, or when ICU reports a failure (the
 * message is the ICU error name). Empty input yields an empty String.
 */
static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
	VALUE rb_out;
	const char *src_enc;
	const char *dst_enc;
	const char *src_txt;
	char *out_buf = NULL;
	void *rb_enc = NULL;
	long txt_len;
	int32_t src_len;
	int32_t out_len;
	UErrorCode status = U_ZERO_ERROR;

	// validate the arguments before touching their internals;
	// StringValueCStr also guarantees the NUL termination ICU expects for names
	StringValue(rb_txt);
	src_enc = StringValueCStr(rb_src_enc);
	dst_enc = StringValueCStr(rb_dst_enc);

	src_txt = RSTRING_PTR(rb_txt);
	txt_len = RSTRING_LEN(rb_txt);
	if (txt_len > INT32_MAX) {
		// ucnv_convert takes an int32_t length; refuse instead of truncating
		rb_raise(rb_eArgError, "input is too large to convert");
	}
	src_len = (int32_t)txt_len;

	// first determine the size of the output buffer
	out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);

	if (status == U_BUFFER_OVERFLOW_ERROR) {
		out_buf = xmalloc(out_len);

		// now do the actual conversion
		status = U_ZERO_ERROR;
		out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
		if (U_FAILURE(status)) {
			xfree(out_buf);
			// pass the name as an argument, never as the format string
			rb_raise(rb_eArgError, "%s", u_errorName(status));
		}
	} else if (U_FAILURE(status)) {
		rb_raise(rb_eArgError, "%s", u_errorName(status));
	} else {
		// the sizing pass did not overflow a zero-capacity buffer, so there
		// is nothing to convert (e.g. empty input)
		out_len = 0;
	}

#ifdef HAVE_RUBY_ENCODING_H
	rb_enc = (void *)rb_enc_find(dst_enc);
#endif

	// the bytes are handed to charlock_new_enc_str() before the buffer is released
	rb_out = charlock_new_enc_str(out_buf != NULL ? out_buf : "", out_len, rb_enc);

	if (out_buf != NULL) {
		xfree(out_buf);
	}

	return rb_out;
}
|
48
|
+
|
49
|
+
void _init_charlock_converter() {
|
50
|
+
rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
|
51
|
+
|
52
|
+
// rb_define_alloc_func(rb_cConverter, rb_converter__alloc);
|
53
|
+
rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
|
54
|
+
}
|
@@ -1,31 +1,9 @@
|
|
1
1
|
#include "unicode/ucsdet.h"
|
2
|
+
#include "common.h"
|
2
3
|
|
3
|
-
|
4
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
5
|
-
#include <ruby/encoding.h>
|
6
|
-
#endif
|
7
|
-
|
8
|
-
static VALUE rb_mCharlockHolmes;
|
4
|
+
extern VALUE rb_mCharlockHolmes;
|
9
5
|
static VALUE rb_cEncodingDetector;
|
10
6
|
|
11
|
-
static VALUE charlock_new_str(const char *str, size_t len)
|
12
|
-
{
|
13
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
14
|
-
return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
|
15
|
-
#else
|
16
|
-
return rb_str_new(str, len);
|
17
|
-
#endif
|
18
|
-
}
|
19
|
-
|
20
|
-
static VALUE charlock_new_str2(const char *str)
|
21
|
-
{
|
22
|
-
#ifdef HAVE_RUBY_ENCODING_H
|
23
|
-
return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
|
24
|
-
#else
|
25
|
-
return rb_str_new2(str);
|
26
|
-
#endif
|
27
|
-
}
|
28
|
-
|
29
7
|
static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
30
8
|
{
|
31
9
|
UErrorCode status = U_ZERO_ERROR;
|
@@ -229,10 +207,8 @@ static VALUE rb_encdec__alloc(VALUE klass)
|
|
229
207
|
return Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)csd);
|
230
208
|
}
|
231
209
|
|
232
|
-
void
|
210
|
+
void _init_charlock_encoding_detector()
|
233
211
|
{
|
234
|
-
rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
|
235
|
-
|
236
212
|
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
|
237
213
|
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
238
214
|
rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#include "common.h"
|
2
|
+
|
3
|
+
extern void _init_charlock_encoding_detector();
|
4
|
+
extern void _init_charlock_converter();
|
5
|
+
|
6
|
+
VALUE rb_mCharlockHolmes;
|
7
|
+
|
8
|
+
void Init_charlock_holmes() {
|
9
|
+
rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
|
10
|
+
|
11
|
+
_init_charlock_encoding_detector();
|
12
|
+
_init_charlock_converter();
|
13
|
+
}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe CharlockHolmes::Converter do
  # ASCII text converted to UTF-16 must grow and differ from the original;
  # converting it back to ISO-8859-1 must reproduce the original.
  test 'is able to convert regular ascii content from ISO-8859-1 to UTF-16, and back again' do
    original = 'test'

    utf16 = CharlockHolmes::Converter.convert(original, 'ISO-8859-1', 'UTF-16')
    assert utf16.bytesize > original.bytesize
    assert original != utf16

    restored = CharlockHolmes::Converter.convert(utf16, 'UTF-16', 'ISO-8859-1')
    assert restored.bytesize == original.bytesize
    assert original == restored
  end

  # The same round trip with non-ASCII (multi-byte UTF-8) content.
  test 'is able to convert UTF-8 content from UTF-8 to UTF-16, and back again' do
    original = 'λ, λ, λ'

    utf16 = CharlockHolmes::Converter.convert(original, 'UTF-8', 'UTF-16')
    assert utf16.bytesize > original.bytesize
    assert original != utf16

    restored = CharlockHolmes::Converter.convert(utf16, 'UTF-16', 'UTF-8')
    assert restored.bytesize == original.bytesize
    assert original == restored
  end
end
|
metadata
CHANGED
@@ -1,58 +1,79 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 11
|
5
5
|
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 5
|
9
|
+
- 0
|
10
|
+
version: 0.5.0
|
6
11
|
platform: ruby
|
7
|
-
authors:
|
12
|
+
authors:
|
8
13
|
- Brian Lopez
|
9
|
-
- Vicent
|
14
|
+
- "Vicent Mart\xC3\xAD"
|
10
15
|
autorequire:
|
11
16
|
bindir: bin
|
12
17
|
cert_chain: []
|
13
|
-
|
18
|
+
|
19
|
+
date: 2011-08-26 00:00:00 -07:00
|
14
20
|
default_executable:
|
15
|
-
dependencies:
|
16
|
-
- !ruby/object:Gem::Dependency
|
21
|
+
dependencies:
|
22
|
+
- !ruby/object:Gem::Dependency
|
17
23
|
name: rake-compiler
|
18
|
-
|
24
|
+
prerelease: false
|
25
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
19
26
|
none: false
|
20
|
-
requirements:
|
21
|
-
- -
|
22
|
-
- !ruby/object:Gem::Version
|
27
|
+
requirements:
|
28
|
+
- - ">="
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
hash: 9
|
31
|
+
segments:
|
32
|
+
- 0
|
33
|
+
- 7
|
34
|
+
- 5
|
23
35
|
version: 0.7.5
|
24
36
|
type: :development
|
25
|
-
|
26
|
-
|
27
|
-
- !ruby/object:Gem::Dependency
|
37
|
+
version_requirements: *id001
|
38
|
+
- !ruby/object:Gem::Dependency
|
28
39
|
name: rspec
|
29
|
-
|
40
|
+
prerelease: false
|
41
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
30
42
|
none: false
|
31
|
-
requirements:
|
32
|
-
- -
|
33
|
-
- !ruby/object:Gem::Version
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
hash: 15
|
47
|
+
segments:
|
48
|
+
- 2
|
49
|
+
- 0
|
50
|
+
- 0
|
34
51
|
version: 2.0.0
|
35
52
|
type: :development
|
36
|
-
|
37
|
-
|
38
|
-
- !ruby/object:Gem::Dependency
|
53
|
+
version_requirements: *id002
|
54
|
+
- !ruby/object:Gem::Dependency
|
39
55
|
name: chardet
|
40
|
-
|
56
|
+
prerelease: false
|
57
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
41
58
|
none: false
|
42
|
-
requirements:
|
43
|
-
- -
|
44
|
-
- !ruby/object:Gem::Version
|
45
|
-
|
59
|
+
requirements:
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
hash: 3
|
63
|
+
segments:
|
64
|
+
- 0
|
65
|
+
version: "0"
|
46
66
|
type: :development
|
47
|
-
|
48
|
-
version_requirements: *70218114471840
|
67
|
+
version_requirements: *id003
|
49
68
|
description:
|
50
69
|
email: seniorlopez@gmail.com
|
51
70
|
executables: []
|
52
|
-
|
71
|
+
|
72
|
+
extensions:
|
53
73
|
- ext/charlock_holmes/extconf.rb
|
54
74
|
extra_rdoc_files: []
|
55
|
-
|
75
|
+
|
76
|
+
files:
|
56
77
|
- .gitignore
|
57
78
|
- .rspec
|
58
79
|
- Gemfile
|
@@ -63,12 +84,16 @@ files:
|
|
63
84
|
- benchmark/detection.rb
|
64
85
|
- benchmark/test.txt
|
65
86
|
- charlock_holmes.gemspec
|
66
|
-
- ext/charlock_holmes/
|
87
|
+
- ext/charlock_holmes/common.h
|
88
|
+
- ext/charlock_holmes/converter.c
|
89
|
+
- ext/charlock_holmes/encoding_detector.c
|
90
|
+
- ext/charlock_holmes/ext.c
|
67
91
|
- ext/charlock_holmes/extconf.rb
|
68
92
|
- lib/charlock_holmes.rb
|
69
93
|
- lib/charlock_holmes/encoding_detector.rb
|
70
94
|
- lib/charlock_holmes/string.rb
|
71
95
|
- lib/charlock_holmes/version.rb
|
96
|
+
- spec/converter_spec.rb
|
72
97
|
- spec/encoding_detector_spec.rb
|
73
98
|
- spec/fixtures/AnsiGraph.psm1
|
74
99
|
- spec/fixtures/TwigExtensionsDate.es.yml
|
@@ -81,31 +106,40 @@ files:
|
|
81
106
|
has_rdoc: true
|
82
107
|
homepage: http://github.com/brianmario/charlock_holmes
|
83
108
|
licenses: []
|
109
|
+
|
84
110
|
post_install_message:
|
85
|
-
rdoc_options:
|
111
|
+
rdoc_options:
|
86
112
|
- --charset=UTF-8
|
87
|
-
require_paths:
|
113
|
+
require_paths:
|
88
114
|
- lib
|
89
115
|
- ext
|
90
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
116
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
91
117
|
none: false
|
92
|
-
requirements:
|
93
|
-
- -
|
94
|
-
- !ruby/object:Gem::Version
|
95
|
-
|
96
|
-
|
118
|
+
requirements:
|
119
|
+
- - ">="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
version: "0"
|
125
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
126
|
none: false
|
98
|
-
requirements:
|
99
|
-
- -
|
100
|
-
- !ruby/object:Gem::Version
|
101
|
-
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
hash: 3
|
131
|
+
segments:
|
132
|
+
- 0
|
133
|
+
version: "0"
|
102
134
|
requirements: []
|
135
|
+
|
103
136
|
rubyforge_project:
|
104
137
|
rubygems_version: 1.6.2
|
105
138
|
signing_key:
|
106
139
|
specification_version: 3
|
107
140
|
summary: Character encoding detection, brought to you by ICU
|
108
|
-
test_files:
|
141
|
+
test_files:
|
142
|
+
- spec/converter_spec.rb
|
109
143
|
- spec/encoding_detector_spec.rb
|
110
144
|
- spec/fixtures/AnsiGraph.psm1
|
111
145
|
- spec/fixtures/TwigExtensionsDate.es.yml
|