charlock_holmes 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- charlock_holmes (0.4.0)
4
+ charlock_holmes (0.5.0)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -0,0 +1,41 @@
1
+ #ifndef CHARLOCK_COMMON_H
2
+ #define CHARLOCK_COMMON_H
3
+
4
+ // tell rbx not to use its caching compat layer
5
+ // by doing this we're making a promise to RBX that
6
+ // we'll never modify the pointers we get back from RSTRING_PTR
7
+ #define RSTRING_NOT_MODIFIED
8
+
9
+ #include <ruby.h>
10
+ #ifdef HAVE_RUBY_ENCODING_H
11
+ #include <ruby/encoding.h>
12
+ #endif
13
+ // Build a Ruby string from `len` bytes at `str`, using the caller-supplied encoding when encoding support is compiled in.
14
+ static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding)
15
+ {
16
+ #ifdef HAVE_RUBY_ENCODING_H
17
+ return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding); // `encoding` is passed as void * and cast back to rb_encoding * here
18
+ #else
19
+ return rb_str_new(str, len); // no encoding header available: `encoding` is ignored
20
+ #endif
21
+ }
22
+ // Build a Ruby string from `len` bytes at `str`; uses the UTF-8 encoding when encoding support is compiled in.
23
+ static VALUE charlock_new_str(const char *str, size_t len)
24
+ {
25
+ #ifdef HAVE_RUBY_ENCODING_H
26
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
27
+ #else
28
+ return rb_str_new(str, len); // no encoding header available: plain string
29
+ #endif
30
+ }
31
+ // Same as charlock_new_str, but for a NUL-terminated `str` whose length is taken with strlen.
32
+ static VALUE charlock_new_str2(const char *str)
33
+ {
34
+ #ifdef HAVE_RUBY_ENCODING_H
35
+ return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
36
+ #else
37
+ return rb_str_new2(str); // no encoding header available: plain string
38
+ #endif
39
+ }
40
+
41
+ #endif
@@ -0,0 +1,54 @@
1
+ #include "unicode/ucnv.h"
2
+ #include "common.h"
3
+
4
+ extern VALUE rb_mCharlockHolmes;
5
+ static VALUE rb_cConverter;
6
+
7
+ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) {
8
+ VALUE rb_out;
9
+ const char *src_enc;
10
+ const char *dst_enc;
11
+ const char *src_txt;
12
+ char *out_buf;
13
+ void *rb_enc = NULL;
14
+ int32_t src_len;
15
+ int32_t out_len;
16
+ UErrorCode status = U_ZERO_ERROR;
17
+
18
+ src_txt = RSTRING_PTR(rb_txt);
19
+ src_len = RSTRING_LEN(rb_txt);
20
+ src_enc = RSTRING_PTR(rb_src_enc);
21
+ dst_enc = RSTRING_PTR(rb_dst_enc);
22
+
23
+ // first determin the size of the output buffer
24
+ out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status);
25
+ if (status != U_BUFFER_OVERFLOW_ERROR) {
26
+ rb_raise(rb_eArgError, u_errorName(status));
27
+ }
28
+ out_buf = xmalloc(out_len);
29
+
30
+ // now do the actual conversion
31
+ status = U_ZERO_ERROR;
32
+ out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status);
33
+ if (U_FAILURE(status)) {
34
+ xfree(out_buf);
35
+ rb_raise(rb_eArgError, u_errorName(status));
36
+ }
37
+
38
+ xfree(out_buf);
39
+
40
+ #ifdef HAVE_RUBY_ENCODING_H
41
+ (rb_encoding *)rb_enc = rb_enc_find(dst_enc);
42
+ #endif
43
+
44
+ rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc);
45
+
46
+ return rb_out;
47
+ }
48
+ // Defines the Converter class under rb_mCharlockHolmes and exposes rb_converter_convert as its singleton method "convert" (3 arguments).
49
+ void _init_charlock_converter() {
50
+ rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
51
+
52
+ // rb_define_alloc_func(rb_cConverter, rb_converter__alloc);
53
+ rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
54
+ }
@@ -1,31 +1,9 @@
1
1
  #include "unicode/ucsdet.h"
2
+ #include "common.h"
2
3
 
3
- #include <ruby.h>
4
- #ifdef HAVE_RUBY_ENCODING_H
5
- #include <ruby/encoding.h>
6
- #endif
7
-
8
- static VALUE rb_mCharlockHolmes;
4
+ extern VALUE rb_mCharlockHolmes;
9
5
  static VALUE rb_cEncodingDetector;
10
6
 
11
- static VALUE charlock_new_str(const char *str, size_t len)
12
- {
13
- #ifdef HAVE_RUBY_ENCODING_H
14
- return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
15
- #else
16
- return rb_str_new(str, len);
17
- #endif
18
- }
19
-
20
- static VALUE charlock_new_str2(const char *str)
21
- {
22
- #ifdef HAVE_RUBY_ENCODING_H
23
- return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding());
24
- #else
25
- return rb_str_new2(str);
26
- #endif
27
- }
28
-
29
7
  static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
30
8
  {
31
9
  UErrorCode status = U_ZERO_ERROR;
@@ -229,10 +207,8 @@ static VALUE rb_encdec__alloc(VALUE klass)
229
207
  return Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)csd);
230
208
  }
231
209
 
232
- void Init_charlock_holmes()
210
+ void _init_charlock_encoding_detector()
233
211
  {
234
- rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
235
-
236
212
  rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
237
213
  rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
238
214
  rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
@@ -0,0 +1,13 @@
1
+ #include "common.h"
2
+
3
+ extern void _init_charlock_encoding_detector();
4
+ extern void _init_charlock_converter();
5
+
6
+ VALUE rb_mCharlockHolmes;
7
+ // Defines the CharlockHolmes module, then runs the encoding-detector and converter init functions declared above.
8
+ void Init_charlock_holmes() {
9
+ rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
10
+ // the module must exist before the init functions run: the converter init defines its class under rb_mCharlockHolmes
11
+ _init_charlock_encoding_detector();
12
+ _init_charlock_converter();
13
+ }
@@ -1,3 +1,3 @@
1
1
  module CharlockHolmes
2
- VERSION = "0.4.1"
2
+ VERSION = "0.5.0"
3
3
  end
@@ -0,0 +1,29 @@
1
+ # encoding: utf-8
2
+
3
+ require 'spec_helper'
4
+ # Round-trip specs for CharlockHolmes::Converter.convert(text, source_encoding, destination_encoding).
5
+ describe CharlockHolmes::Converter do
6
+ test 'is able to convert regular ascii content from ISO-8859-1 to UTF-16, and back again' do
7
+ input = 'test'
8
+ # forward conversion: the UTF-16 result is expected to be larger in bytes and to differ from the input
9
+ output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16'
10
+ assert input.bytesize < output.bytesize
11
+ assert input != output
12
+ # converting back must reproduce the original
13
+ output = CharlockHolmes::Converter.convert output, 'UTF-16', 'ISO-8859-1'
14
+ assert input.bytesize == output.bytesize
15
+ assert input == output
16
+ end
17
+
18
+ test 'is able to convert UTF-8 content from UTF-8 to UTF-16, and back again' do
19
+ input = 'λ, λ, λ'
20
+ # same round trip with non-ASCII (multi-byte UTF-8) input
21
+ output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16'
22
+ assert input.bytesize < output.bytesize
23
+ assert input != output
24
+
25
+ output = CharlockHolmes::Converter.convert output, 'UTF-16', 'UTF-8'
26
+ assert input.bytesize == output.bytesize
27
+ assert input == output
28
+ end
29
+ end
metadata CHANGED
@@ -1,58 +1,79 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: charlock_holmes
3
- version: !ruby/object:Gem::Version
4
- version: 0.4.1
3
+ version: !ruby/object:Gem::Version
4
+ hash: 11
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 5
9
+ - 0
10
+ version: 0.5.0
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Brian Lopez
9
- - Vicent Martí
14
+ - "Vicent Mart\xC3\xAD"
10
15
  autorequire:
11
16
  bindir: bin
12
17
  cert_chain: []
13
- date: 2011-08-25 00:00:00.000000000 -07:00
18
+
19
+ date: 2011-08-26 00:00:00 -07:00
14
20
  default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
21
+ dependencies:
22
+ - !ruby/object:Gem::Dependency
17
23
  name: rake-compiler
18
- requirement: &70218132232740 !ruby/object:Gem::Requirement
24
+ prerelease: false
25
+ requirement: &id001 !ruby/object:Gem::Requirement
19
26
  none: false
20
- requirements:
21
- - - ! '>='
22
- - !ruby/object:Gem::Version
27
+ requirements:
28
+ - - ">="
29
+ - !ruby/object:Gem::Version
30
+ hash: 9
31
+ segments:
32
+ - 0
33
+ - 7
34
+ - 5
23
35
  version: 0.7.5
24
36
  type: :development
25
- prerelease: false
26
- version_requirements: *70218132232740
27
- - !ruby/object:Gem::Dependency
37
+ version_requirements: *id001
38
+ - !ruby/object:Gem::Dependency
28
39
  name: rspec
29
- requirement: &70218114472220 !ruby/object:Gem::Requirement
40
+ prerelease: false
41
+ requirement: &id002 !ruby/object:Gem::Requirement
30
42
  none: false
31
- requirements:
32
- - - ! '>='
33
- - !ruby/object:Gem::Version
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ hash: 15
47
+ segments:
48
+ - 2
49
+ - 0
50
+ - 0
34
51
  version: 2.0.0
35
52
  type: :development
36
- prerelease: false
37
- version_requirements: *70218114472220
38
- - !ruby/object:Gem::Dependency
53
+ version_requirements: *id002
54
+ - !ruby/object:Gem::Dependency
39
55
  name: chardet
40
- requirement: &70218114471840 !ruby/object:Gem::Requirement
56
+ prerelease: false
57
+ requirement: &id003 !ruby/object:Gem::Requirement
41
58
  none: false
42
- requirements:
43
- - - ! '>='
44
- - !ruby/object:Gem::Version
45
- version: '0'
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
46
66
  type: :development
47
- prerelease: false
48
- version_requirements: *70218114471840
67
+ version_requirements: *id003
49
68
  description:
50
69
  email: seniorlopez@gmail.com
51
70
  executables: []
52
- extensions:
71
+
72
+ extensions:
53
73
  - ext/charlock_holmes/extconf.rb
54
74
  extra_rdoc_files: []
55
- files:
75
+
76
+ files:
56
77
  - .gitignore
57
78
  - .rspec
58
79
  - Gemfile
@@ -63,12 +84,16 @@ files:
63
84
  - benchmark/detection.rb
64
85
  - benchmark/test.txt
65
86
  - charlock_holmes.gemspec
66
- - ext/charlock_holmes/charlock_holmes.c
87
+ - ext/charlock_holmes/common.h
88
+ - ext/charlock_holmes/converter.c
89
+ - ext/charlock_holmes/encoding_detector.c
90
+ - ext/charlock_holmes/ext.c
67
91
  - ext/charlock_holmes/extconf.rb
68
92
  - lib/charlock_holmes.rb
69
93
  - lib/charlock_holmes/encoding_detector.rb
70
94
  - lib/charlock_holmes/string.rb
71
95
  - lib/charlock_holmes/version.rb
96
+ - spec/converter_spec.rb
72
97
  - spec/encoding_detector_spec.rb
73
98
  - spec/fixtures/AnsiGraph.psm1
74
99
  - spec/fixtures/TwigExtensionsDate.es.yml
@@ -81,31 +106,40 @@ files:
81
106
  has_rdoc: true
82
107
  homepage: http://github.com/brianmario/charlock_holmes
83
108
  licenses: []
109
+
84
110
  post_install_message:
85
- rdoc_options:
111
+ rdoc_options:
86
112
  - --charset=UTF-8
87
- require_paths:
113
+ require_paths:
88
114
  - lib
89
115
  - ext
90
- required_ruby_version: !ruby/object:Gem::Requirement
116
+ required_ruby_version: !ruby/object:Gem::Requirement
91
117
  none: false
92
- requirements:
93
- - - ! '>='
94
- - !ruby/object:Gem::Version
95
- version: '0'
96
- required_rubygems_version: !ruby/object:Gem::Requirement
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
124
+ version: "0"
125
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
126
  none: false
98
- requirements:
99
- - - ! '>='
100
- - !ruby/object:Gem::Version
101
- version: '0'
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ hash: 3
131
+ segments:
132
+ - 0
133
+ version: "0"
102
134
  requirements: []
135
+
103
136
  rubyforge_project:
104
137
  rubygems_version: 1.6.2
105
138
  signing_key:
106
139
  specification_version: 3
107
140
  summary: Character encoding detection, brought to you by ICU
108
- test_files:
141
+ test_files:
142
+ - spec/converter_spec.rb
109
143
  - spec/encoding_detector_spec.rb
110
144
  - spec/fixtures/AnsiGraph.psm1
111
145
  - spec/fixtures/TwigExtensionsDate.es.yml