charlock_holmes 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- charlock_holmes (0.5.0)
4
+ charlock_holmes (0.6.0)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
data/README.md CHANGED
@@ -10,12 +10,12 @@ First you'll need to require it
10
10
  require 'charlock_holmes'
11
11
  ```
12
12
 
13
- Now if you want to detect the encoding of a set of bytes:
13
+ ## Encoding detection
14
14
 
15
15
  ``` ruby
16
16
  contents = File.read('test.xml')
17
17
  detection = CharlockHolmes::EncodingDetector.detect(contents)
18
- # => {:encoding => 'UTF-8', :confidence => 100}
18
+ # => {:encoding => 'UTF-8', :confidence => 100, :type => :text}
19
19
 
20
20
  # optionally there will be a :language key as well, but
21
21
  # that's mostly only returned for legacy encodings like ISO-8859-1
@@ -23,6 +23,8 @@ detection = CharlockHolmes::EncodingDetector.detect(contents)
23
23
 
24
24
  NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
25
25
 
26
+ For binary content, `:type` will be set to `:binary`
27
+
26
28
  Though it's more efficient to reuse one detector instance:
27
29
 
28
30
  ``` ruby
@@ -34,7 +36,7 @@ detection2 = detector.detect(File.read('test2.json'))
34
36
  # and so on...
35
37
  ```
36
38
 
37
- ## String monkey patch
39
+ ### String monkey patch
38
40
 
39
41
  Alternatively, you can just use the `detect_encoding` method on the `String` class
40
42
 
@@ -46,7 +48,7 @@ contents = File.read('test.xml')
46
48
  detection = contents.detect_encoding
47
49
  ```
48
50
 
49
- ## Ruby 1.9 specific
51
+ ### Ruby 1.9 specific
50
52
 
51
53
  NOTE: This method only exists on Ruby 1.9+
52
54
 
@@ -57,18 +59,30 @@ require 'charlock_holmes/string'
57
59
 
58
60
  contents = File.read('test.xml')
59
61
 
60
- # this will detect and set the encoding of `contents`
62
+ # this will detect and set the encoding of `contents`, then return self
61
63
  contents.detect_encoding!
62
64
  ```
63
65
 
66
+ ## Transcoding
67
+
68
+ Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.
69
+
70
+ ``` ruby
71
+ content = File.read('test2.txt')
72
+ detection = CharlockHolmes::EncodingDetector.detect(content)
73
+ utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
74
+ ```
75
+
76
+ The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.
77
+
64
78
  ## Installing
65
79
 
66
- If the traditional `gem install charlock_holmes` doesn't work, you may need to specify the path to your installation of ICU using the `--with-icu-dir` option during the gem install.
80
+ If the traditional `gem install charlock_holmes` doesn't work, you may need to specify the paths to your installations of ICU and libmagic using the `--with-icu-dir` and/or `--with-magic-dir` options during the gem install.
67
81
 
68
- At the time of writing, if you installed ICU via homebrew on OSX your gem install may look something like this:
82
+ At the time of writing, Homebrew for OSX installs ICU (icu4c is the package name) and libmagic as keg-only installs so you'll have to specify the location during the gem install:
69
83
 
70
- `gem install charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
84
+ `gem install charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1 --with-magic-dir=/usr/local/Cellar/libmagic/5.04`
71
85
 
72
- If you're using Bundler and need to specify a custom path, you can do so with the `bundle config` command:
86
+ If you're using Bundler and need to specify one or more custom paths, you can do so with the `bundle config` command:
73
87
 
74
- `bundle config build.charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
88
+ `bundle config build.charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1 --with-magic-dir=/usr/local/Cellar/libmagic/5.04`
@@ -16,14 +16,14 @@ DETECTOR = CharlockHolmes::EncodingDetector.new
16
16
 
17
17
  Benchmark.bmbm do |x|
18
18
  # new detector every iteration
19
- x.report 'class-level detect' do
19
+ x.report 'singleton call' do
20
20
  TIMES.times do
21
21
  CharlockHolmes::EncodingDetector.detect CONTENT
22
22
  end
23
23
  end
24
24
 
25
25
  # shared detector for all iterations
26
- x.report 'instance-level detect' do
26
+ x.report 'reusing a single detector' do
27
27
  TIMES.times do
28
28
  DETECTOR.detect CONTENT
29
29
  end
@@ -49,6 +49,5 @@ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VA
49
49
  void _init_charlock_converter() {
50
50
  rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
51
51
 
52
- // rb_define_alloc_func(rb_cConverter, rb_converter__alloc);
53
52
  rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
54
53
  }
@@ -1,9 +1,15 @@
1
1
  #include "unicode/ucsdet.h"
2
+ #include "magic.h"
2
3
  #include "common.h"
3
4
 
4
5
  extern VALUE rb_mCharlockHolmes;
5
6
  static VALUE rb_cEncodingDetector;
6
7
 
8
+ typedef struct {
9
+ UCharsetDetector *csd;
10
+ magic_t magic;
11
+ } charlock_detector_t;
12
+
7
13
  static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
8
14
  {
9
15
  UErrorCode status = U_ZERO_ERROR;
@@ -21,6 +27,7 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
21
27
 
22
28
  rb_match = rb_hash_new();
23
29
 
30
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text")));
24
31
  rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname));
25
32
  rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence));
26
33
 
@@ -30,6 +37,36 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
30
37
  return rb_match;
31
38
  }
32
39
 
40
+ static VALUE rb_encdec_binarymatch() {
41
+ VALUE rb_match;
42
+
43
+ rb_match = rb_hash_new();
44
+
45
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary")));
46
+ rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100));
47
+
48
+ return rb_match;
49
+ }
50
+
51
+ static int detect_binary_content(charlock_detector_t *detector, VALUE rb_str) {
52
+ const char *binary_result;
53
+
54
+ binary_result = magic_buffer(detector->magic, RSTRING_PTR(rb_str), RSTRING_LEN(rb_str));
55
+
56
+ if (binary_result) {
57
+ if (strstr(binary_result, "library") ||
58
+ strstr(binary_result, "bundle") ||
59
+ strstr(binary_result, "archive") ||
60
+ (!strstr(binary_result, "text") && strstr(binary_result, "executable")) ||
61
+ strstr(binary_result, "data"))
62
+ return 1;
63
+ } else {
64
+ rb_raise(rb_eStandardError, magic_error(detector->magic));
65
+ }
66
+
67
+ return 0;
68
+ }
69
+
33
70
  /*
34
71
  * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
35
72
  *
@@ -39,28 +76,35 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
39
76
  * hint_enc - an optional String (like "UTF-8"), the encoding name which will
40
77
  * be used as an additional hint to the charset detector
41
78
  *
42
- * Returns: a Hash with :encoding, :language and :confidence
79
+ * Returns: a Hash with :encoding, :language, :type and :confidence
43
80
  */
44
81
  static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
45
82
  {
46
83
  UErrorCode status = U_ZERO_ERROR;
47
- UCharsetDetector *csd;
84
+ charlock_detector_t *detector;
48
85
  VALUE rb_str;
49
86
  VALUE rb_enc_hint;
50
87
 
51
88
  rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
52
89
 
53
90
  Check_Type(rb_str, T_STRING);
54
- Data_Get_Struct(self, UCharsetDetector, csd);
91
+ Data_Get_Struct(self, charlock_detector_t, detector);
55
92
 
56
- ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
93
+ // first lets see if this is binary content
94
+ if (detect_binary_content(detector, rb_str)) {
95
+ return rb_encdec_binarymatch();
96
+ }
97
+
98
+ // if we got here - the data doesn't look like binary
99
+ // lets try to figure out what encoding the text is in
100
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
57
101
 
58
102
  if (!NIL_P(rb_enc_hint)) {
59
103
  Check_Type(rb_enc_hint, T_STRING);
60
- ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
104
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
61
105
  }
62
106
 
63
- return rb_encdec_buildmatch(ucsdet_detect(csd, &status));
107
+ return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
64
108
  }
65
109
 
66
110
 
@@ -76,38 +120,48 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
76
120
  * be used as an additional hint to the charset detector
77
121
  *
78
122
  * Returns: an Array with zero or more Hashes,
79
- * each one of them with with :encoding, :language and :confidence
123
+ * each one of them with :encoding, :language, :type and :confidence
80
124
  */
81
125
  static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
82
126
  {
83
127
  UErrorCode status = U_ZERO_ERROR;
84
- UCharsetDetector *csd;
128
+ charlock_detector_t *detector;
85
129
  const UCharsetMatch **csm;
86
130
  VALUE rb_ret;
87
131
  int i, match_count;
88
132
  VALUE rb_str;
89
133
  VALUE rb_enc_hint;
134
+ VALUE binary_match;
90
135
 
91
136
  rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
92
137
 
93
138
  Check_Type(rb_str, T_STRING);
94
- Data_Get_Struct(self, UCharsetDetector, csd);
139
+ Data_Get_Struct(self, charlock_detector_t, detector);
95
140
 
96
141
  rb_ret = rb_ary_new();
97
142
 
98
- ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
143
+ // first lets see if this is binary content
144
+ binary_match = Qnil;
145
+ if (detect_binary_content(detector, rb_str)) {
146
+ binary_match = rb_encdec_binarymatch();
147
+ }
148
+
149
+ ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
99
150
 
100
151
  if (!NIL_P(rb_enc_hint)) {
101
152
  Check_Type(rb_enc_hint, T_STRING);
102
- ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
153
+ ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
103
154
  }
104
155
 
105
- csm = ucsdet_detectAll(csd, &match_count, &status);
156
+ csm = ucsdet_detectAll(detector->csd, &match_count, &status);
106
157
 
107
158
  for (i = 0; i < match_count; ++i) {
108
159
  rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
109
160
  }
110
161
 
162
+ if (!NIL_P(binary_match))
163
+ rb_ary_unshift(rb_ret, binary_match);
164
+
111
165
  return rb_ret;
112
166
  }
113
167
 
@@ -120,13 +174,13 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
120
174
  */
121
175
  static VALUE rb_get_strip_tags(VALUE self)
122
176
  {
123
- UCharsetDetector *csd;
177
+ charlock_detector_t *detector;
124
178
  UBool val;
125
179
  VALUE rb_val;
126
180
 
127
- Data_Get_Struct(self, UCharsetDetector, csd);
181
+ Data_Get_Struct(self, charlock_detector_t, detector);
128
182
 
129
- val = ucsdet_isInputFilterEnabled(csd);
183
+ val = ucsdet_isInputFilterEnabled(detector->csd);
130
184
 
131
185
  rb_val = val == 1 ? Qtrue : Qfalse;
132
186
 
@@ -143,14 +197,14 @@ static VALUE rb_get_strip_tags(VALUE self)
143
197
  */
144
198
  static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
145
199
  {
146
- UCharsetDetector *csd;
200
+ charlock_detector_t *detector;
147
201
  UBool val;
148
202
 
149
- Data_Get_Struct(self, UCharsetDetector, csd);
203
+ Data_Get_Struct(self, charlock_detector_t, detector);
150
204
 
151
205
  val = rb_val == Qtrue ? 1 : 0;
152
206
 
153
- ucsdet_enableInputFilter(csd, val);
207
+ ucsdet_enableInputFilter(detector->csd, val);
154
208
 
155
209
  return rb_val;
156
210
  }
@@ -195,16 +249,42 @@ static VALUE rb_get_supported_encodings(VALUE klass)
195
249
  return rb_encoding_list;
196
250
  }
197
251
 
198
- static void rb_encdec__free(void *csd)
252
+ static void rb_encdec__free(void *obj)
199
253
  {
200
- ucsdet_close((UCharsetDetector *)csd);
254
+ charlock_detector_t *detector;
255
+
256
+ detector = (charlock_detector_t *)obj;
257
+
258
+ if (detector->csd)
259
+ ucsdet_close(detector->csd);
260
+
261
+ if (detector->magic)
262
+ magic_close(detector->magic);
201
263
  }
202
264
 
203
265
  static VALUE rb_encdec__alloc(VALUE klass)
204
266
  {
267
+ charlock_detector_t *detector;
205
268
  UErrorCode status = U_ZERO_ERROR;
206
- UCharsetDetector *csd = ucsdet_open(&status);
207
- return Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)csd);
269
+ VALUE obj;
270
+
271
+ obj = Data_Make_Struct(klass, charlock_detector_t, NULL, rb_encdec__free, (void *)detector);
272
+
273
+ detector->csd = ucsdet_open(&status);
274
+ if (U_FAILURE(status)) {
275
+ rb_raise(rb_eStandardError, u_errorName(status));
276
+ }
277
+
278
+ detector->magic = magic_open(0);
279
+ if (detector->magic == NULL) {
280
+ rb_raise(rb_eStandardError, magic_error(detector->magic));
281
+ }
282
+
283
+ // load the libmagic database
284
+ // NULL means use the default or whatever is specified by the MAGIC env var
285
+ magic_load(detector->magic, NULL);
286
+
287
+ return obj;
208
288
  }
209
289
 
210
290
  void _init_charlock_encoding_detector()
@@ -3,11 +3,14 @@ require 'mkmf'
3
3
  $CFLAGS << ' -Wall -funroll-loops'
4
4
  $CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
5
5
 
6
+ dir_config 'icu'
6
7
  $CFLAGS << ' -I/usr/local/Cellar/icu4c/4.4.1/include'
7
8
  $LDFLAGS << ' -L/usr/local/Cellar/icu4c/4.4.1/lib'
8
-
9
- dir_config 'icu'
10
-
11
9
  have_library 'icui18n'
12
10
 
11
+ dir_config 'magic'
12
+ $LDFLAGS << ' -L/usr/local/Cellar/libmagic/5.04/lib'
13
+ $CFLAGS << ' -I/usr/local/Cellar/libmagic/5.04/include'
14
+ have_library 'magic'
15
+
13
16
  create_makefile 'charlock_holmes'
@@ -10,7 +10,7 @@ module CharlockHolmes
10
10
  # hint_enc - an optional String (like "UTF-8"), the encoding name which will
11
11
  # be used as an additional hint to the charset detector
12
12
  #
13
- # Returns: a Hash with :encoding, :language and :confidence
13
+ # Returns: a Hash with :encoding, :language, :type and :confidence
14
14
  def self.detect(str, hint_enc=nil)
15
15
  new.detect(str, hint_enc)
16
16
  end
@@ -25,7 +25,7 @@ module CharlockHolmes
25
25
  # be used as an additional hint to the charset detector
26
26
  #
27
27
  # Returns: an Array with zero or more Hashes,
28
- # each one of them with with :encoding, :language and :confidence
28
+ # each one of them with :encoding, :language, :type and :confidence
29
29
  def self.detect_all(str, hint_enc=nil)
30
30
  new.detect_all(str, hint_enc)
31
31
  end
@@ -3,7 +3,7 @@ require 'charlock_holmes' unless defined? CharlockHolmes
3
3
  class String
4
4
  # Attempt to detect the encoding of this string
5
5
  #
6
- # Returns: a Hash with :encoding, :language and :confidence
6
+ # Returns: a Hash with :encoding, :language, :type and :confidence
7
7
  def detect_encoding(hint_enc=nil)
8
8
  encoding_detector.detect(self, hint_enc)
9
9
  end
@@ -12,7 +12,7 @@ class String
12
12
  # a list with all the possible encodings that match it.
13
13
  #
14
14
  # Returns: an Array with zero or more Hashes,
15
- # each one of them with with :encoding, :language and :confidence
15
+ # each one of them with :encoding, :language, :type and :confidence
16
16
  def detect_encodings(hint_enc=nil)
17
17
  encoding_detector.detect_all(self, hint_enc)
18
18
  end
@@ -21,12 +21,12 @@ class String
21
21
  # Attempt to detect the encoding of this string
22
22
  # then set the encoding to what was detected ala `force_encoding`
23
23
  #
24
- # Returns: a Hash with :encoding, :language and :confidence
24
+ # Returns: self
25
25
  def detect_encoding!(hint_enc=nil)
26
26
  if detected = self.detect_encoding(hint_enc)
27
27
  self.force_encoding detected[:encoding]
28
- detected
29
28
  end
29
+ self
30
30
  end
31
31
  end
32
32
 
@@ -1,3 +1,3 @@
1
1
  module CharlockHolmes
2
- VERSION = "0.5.0"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -92,16 +92,17 @@ describe CharlockHolmes::EncodingDetector do
92
92
 
93
93
  context 'encoding detection' do
94
94
  MAPPING = [
95
- ['repl2.cljs', 'ISO-8859-1'],
96
- ['core.rkt', 'UTF-8'],
97
- ['cl-messagepack.lisp', 'ISO-8859-1'],
98
- ['TwigExtensionsDate.es.yml', 'UTF-8'],
99
- ['AnsiGraph.psm1', 'UTF-16LE'],
100
- ['laholator.py', 'UTF-8']
95
+ ['repl2.cljs', 'ISO-8859-1', :text],
96
+ ['core.rkt', 'UTF-8', :text],
97
+ ['cl-messagepack.lisp', 'ISO-8859-1', :text],
98
+ ['TwigExtensionsDate.es.yml', 'UTF-8', :text],
99
+ ['AnsiGraph.psm1', 'UTF-16LE', :text],
100
+ ['laholator.py', 'UTF-8', :text],
101
+ ['hello_world', nil, :binary]
101
102
  ]
102
103
 
103
104
  MAPPING.each do |mapping|
104
- file, encoding = mapping
105
+ file, encoding, type = mapping
105
106
 
106
107
  test "#{file} should be detected as #{encoding}" do
107
108
  path = File.expand_path "../fixtures/#{file}", __FILE__
@@ -109,6 +110,7 @@ describe CharlockHolmes::EncodingDetector do
109
110
  guessed = @detector.detect content
110
111
 
111
112
  assert_equal encoding, guessed[:encoding]
113
+ assert_equal type, guessed[:type]
112
114
 
113
115
  if content.respond_to? :force_encoding
114
116
  content.force_encoding guessed[:encoding]
Binary file
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: charlock_holmes
3
3
  version: !ruby/object:Gem::Version
4
- hash: 11
4
+ hash: 7
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 5
8
+ - 6
9
9
  - 0
10
- version: 0.5.0
10
+ version: 0.6.0
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Lopez
@@ -16,7 +16,7 @@ autorequire:
16
16
  bindir: bin
17
17
  cert_chain: []
18
18
 
19
- date: 2011-08-26 00:00:00 -07:00
19
+ date: 2011-08-27 00:00:00 -07:00
20
20
  default_executable:
21
21
  dependencies:
22
22
  - !ruby/object:Gem::Dependency
@@ -99,6 +99,7 @@ files:
99
99
  - spec/fixtures/TwigExtensionsDate.es.yml
100
100
  - spec/fixtures/cl-messagepack.lisp
101
101
  - spec/fixtures/core.rkt
102
+ - spec/fixtures/hello_world
102
103
  - spec/fixtures/laholator.py
103
104
  - spec/fixtures/repl2.cljs
104
105
  - spec/spec_helper.rb
@@ -145,6 +146,7 @@ test_files:
145
146
  - spec/fixtures/TwigExtensionsDate.es.yml
146
147
  - spec/fixtures/cl-messagepack.lisp
147
148
  - spec/fixtures/core.rkt
149
+ - spec/fixtures/hello_world
148
150
  - spec/fixtures/laholator.py
149
151
  - spec/fixtures/repl2.cljs
150
152
  - spec/spec_helper.rb