charlock_holmes 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -21,6 +21,8 @@ detection = CharlockHolmes::EncodingDetector.detect(contents)
21
21
  # that's mostly only returned for legacy encodings like ISO-8859-1
22
22
  ```
23
23
 
24
+ NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
25
+
24
26
  Though it's more efficient to reuse one detector instance:
25
27
 
26
28
  ``` ruby
@@ -66,3 +68,7 @@ If the traditional `gem install charlock_holmes` doesn't work, you may need to s
66
68
  At the time of writing, if you installed ICU via homebrew on OSX your gem install may look something like this:
67
69
 
68
70
  `gem install charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
71
+
72
+ If you're using Bundler and need to specify a custom path, you can do so with the `bundle config` command:
73
+
74
+ `bundle config build.charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
@@ -44,41 +44,64 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
44
44
  }
45
45
 
46
46
  /*
47
- * call-seq: detection_hash = EncodingDetector.detect "some string"
47
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
48
48
  *
49
49
  * Attempt to detect the encoding of this string
50
50
  *
51
+ * str - a String, what you want to detect the encoding of
52
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
53
+ * be used as an additional hint to the charset detector
54
+ *
51
55
  * Returns: a Hash with :encoding, :language and :confidence
52
56
  */
53
- static VALUE rb_encdec_detect(VALUE self, VALUE rb_str)
57
+ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
54
58
  {
55
59
  UErrorCode status = U_ZERO_ERROR;
56
60
  UCharsetDetector *csd;
61
+ VALUE rb_str;
62
+ VALUE rb_enc_hint;
63
+
64
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
57
65
 
58
66
  Check_Type(rb_str, T_STRING);
59
67
  Data_Get_Struct(self, UCharsetDetector, csd);
60
68
 
61
69
  ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
70
+
71
+ if (!NIL_P(rb_enc_hint)) {
72
+ Check_Type(rb_enc_hint, T_STRING);
73
+ ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
74
+ }
75
+
62
76
  return rb_encdec_buildmatch(ucsdet_detect(csd, &status));
63
77
  }
64
78
 
65
79
 
66
80
  /*
67
- * call-seq: detection_hash_array = EncodingDetector.detect_all "some string"
81
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
68
82
  *
69
83
  * Attempt to detect the encoding of this string, and return
70
84
  * a list with all the possible encodings that match it.
71
85
  *
72
- * Returns: a List with zero or more Hashes,
86
+ *
87
+ * str - a String, what you want to detect the encoding of
88
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
89
+ * be used as an additional hint to the charset detector
90
+ *
91
+ * Returns: an Array with zero or more Hashes,
73
92
  * each one of them with :encoding, :language and :confidence
74
93
  */
75
- static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
94
+ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
76
95
  {
77
96
  UErrorCode status = U_ZERO_ERROR;
78
97
  UCharsetDetector *csd;
79
- const UCharsetMatch **csm;
98
+ const UCharsetMatch **csm;
80
99
  VALUE rb_ret;
81
100
  int i, match_count;
101
+ VALUE rb_str;
102
+ VALUE rb_enc_hint;
103
+
104
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
82
105
 
83
106
  Check_Type(rb_str, T_STRING);
84
107
  Data_Get_Struct(self, UCharsetDetector, csd);
@@ -86,7 +109,13 @@ static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
86
109
  rb_ret = rb_ary_new();
87
110
 
88
111
  ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
89
- csm = ucsdet_detectAll(csd, &match_count, &status);
112
+
113
+ if (!NIL_P(rb_enc_hint)) {
114
+ Check_Type(rb_enc_hint, T_STRING);
115
+ ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
116
+ }
117
+
118
+ csm = ucsdet_detectAll(csd, &match_count, &status);
90
119
 
91
120
  for (i = 0; i < match_count; ++i) {
92
121
  rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
@@ -114,6 +143,6 @@ void Init_charlock_holmes()
114
143
 
115
144
  rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
116
145
  rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
117
- rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, 1);
118
- rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, 1);
146
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
147
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
119
148
  }
@@ -3,6 +3,9 @@ require 'mkmf'
3
3
  $CFLAGS << ' -Wall -funroll-loops'
4
4
  $CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
5
5
 
6
+ $CFLAGS << ' -I/usr/local/Cellar/icu4c/4.4.1/include'
7
+ $LDFLAGS << ' -L/usr/local/Cellar/icu4c/4.4.1/lib'
8
+
6
9
  dir_config 'icu'
7
10
 
8
11
  have_library 'icui18n'
@@ -4,9 +4,28 @@ module CharlockHolmes
4
4
  #
5
5
  # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
6
6
  #
7
+ # str - a String, what you want to detect the encoding of
8
+ # hint_enc - an optional String (like "UTF-8"), the encoding name which will
9
+ # be used as an additional hint to the charset detector
10
+ #
7
11
  # Returns: a Hash with :encoding, :language and :confidence
8
- def self.detect(str)
9
- new.detect(str)
12
+ def self.detect(str, hint_enc=nil)
13
+ new.detect(str, hint_enc)
14
+ end
15
+
16
+ # Attempt to detect the encoding of this string, and return
17
+ # a list with all the possible encodings that match it.
18
+ #
19
+ # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
20
+ #
21
+ # str - a String, what you want to detect the encoding of
22
+ # hint_enc - an optional String (like "UTF-8"), the encoding name which will
23
+ # be used as an additional hint to the charset detector
24
+ #
25
+ # Returns: an Array with zero or more Hashes,
26
+ # each one of them with :encoding, :language and :confidence
27
+ def self.detect_all(str, hint_enc=nil)
28
+ new.detect_all(str, hint_enc)
10
29
  end
11
30
  end
12
31
  end
@@ -4,8 +4,17 @@ class String
4
4
  # Attempt to detect the encoding of this string
5
5
  #
6
6
  # Returns: a Hash with :encoding, :language and :confidence
7
- def detect_encoding
8
- encoding_detector.detect(self)
7
+ def detect_encoding(hint_enc=nil)
8
+ encoding_detector.detect(self, hint_enc)
9
+ end
10
+
11
+ # Attempt to detect the encoding of this string, and return
12
+ # a list with all the possible encodings that match it.
13
+ #
14
+ # Returns: an Array with zero or more Hashes,
15
+ # each one of them with :encoding, :language and :confidence
16
+ def detect_encodings(hint_enc=nil)
17
+ encoding_detector.detect_all(self, hint_enc)
9
18
  end
10
19
 
11
20
  if RUBY_VERSION =~ /1.9/
@@ -13,8 +22,8 @@ class String
13
22
  # then set the encoding to what was detected ala `force_encoding`
14
23
  #
15
24
  # Returns: a Hash with :encoding, :language and :confidence
16
- def detect_encoding!
17
- if detected = self.detect_encoding
25
+ def detect_encoding!(hint_enc=nil)
26
+ if detected = self.detect_encoding(hint_enc)
18
27
  self.force_encoding detected[:encoding]
19
28
  detected
20
29
  end
@@ -1,3 +1,3 @@
1
1
  module CharlockHolmes
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -5,21 +5,62 @@ describe CharlockHolmes::EncodingDetector do
5
5
  @detector = CharlockHolmes::EncodingDetector.new
6
6
  end
7
7
 
8
- test 'has a detect class-level method' do
8
+ test 'has a class-level detect method' do
9
9
  CharlockHolmes::EncodingDetector.respond_to? :detect
10
10
  detected = CharlockHolmes::EncodingDetector.detect 'test'
11
11
  assert_equal 'ISO-8859-1', detected[:encoding]
12
12
  end
13
13
 
14
+ test 'has a class-level detect method that accepts an encoding hint' do
15
+ CharlockHolmes::EncodingDetector.respond_to? :detect
16
+ detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
17
+ assert_equal 'ISO-8859-1', detected[:encoding]
18
+ end
19
+
20
+ test 'has a class-level detect_all method' do
21
+ CharlockHolmes::EncodingDetector.respond_to? :detect_all
22
+ detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
23
+ assert detected_list.is_a? Array
24
+
25
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
26
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
27
+ end
28
+
29
+ test 'has a class-level detect_all method that accepts an encoding hint' do
30
+ CharlockHolmes::EncodingDetector.respond_to? :detect_all
31
+ detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
32
+ assert detected_list.is_a? Array
33
+
34
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
35
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
36
+ end
37
+
14
38
  test 'has a detect method' do
15
39
  @detector.respond_to? :detect
16
40
  detected = @detector.detect 'test'
17
41
  assert_equal 'ISO-8859-1', detected[:encoding]
18
42
  end
19
43
 
44
+ test 'has a detect method that accepts an encoding hint' do
45
+ @detector.respond_to? :detect
46
+ detected = @detector.detect 'test', 'UTF-8'
47
+ assert_equal 'ISO-8859-1', detected[:encoding]
48
+ end
49
+
20
50
  test 'has a detect_all method' do
21
51
  @detector.respond_to? :detect_all
22
52
  detected_list = @detector.detect_all 'test'
53
+ assert detected_list.is_a? Array
54
+
55
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
56
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
57
+ end
58
+
59
+ test 'has a detect_all method that accepts an encoding hint' do
60
+ @detector.respond_to? :detect_all
61
+ detected_list = @detector.detect_all 'test', 'UTF-8'
62
+ assert detected_list.is_a? Array
63
+
23
64
  encoding_list = detected_list.map {|d| d[:encoding]}.sort
24
65
  assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
25
66
  end
@@ -10,6 +10,36 @@ describe String do
10
10
  assert_equal 'ISO-8859-1', detected[:encoding]
11
11
  end
12
12
 
13
+ test 'has a detect_encoding method that accepts an encoding hint' do
14
+ str = 'test'
15
+ str.respond_to? :detect_encoding
16
+
17
+ detected = str.detect_encoding 'UTF-8'
18
+ assert_equal 'ISO-8859-1', detected[:encoding]
19
+ end
20
+
21
+ test 'has a detect_encodings method' do
22
+ str = 'test'
23
+ str.respond_to? :detect_encodings
24
+
25
+ detected_list = str.detect_encodings
26
+ assert detected_list.is_a? Array
27
+
28
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
29
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
30
+ end
31
+
32
+ test 'has a detect_encodings method that accepts an encoding hint' do
33
+ str = 'test'
34
+ str.respond_to? :detect_encodings
35
+
36
+ detected_list = str.detect_encodings 'UTF-8'
37
+ assert detected_list.is_a? Array
38
+
39
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
40
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
41
+ end
42
+
13
43
  if RUBY_VERSION =~ /1.9/
14
44
  test 'has a detect_encoding! method' do
15
45
  str = 'test'
metadata CHANGED
@@ -1,58 +1,79 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: charlock_holmes
3
- version: !ruby/object:Gem::Version
4
- version: 0.2.0
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Brian Lopez
9
- - Vicent Martí
14
+ - "Vicent Mart\xC3\xAD"
10
15
  autorequire:
11
16
  bindir: bin
12
17
  cert_chain: []
13
- date: 2011-08-24 00:00:00.000000000 -07:00
18
+
19
+ date: 2011-08-25 00:00:00 -07:00
14
20
  default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
21
+ dependencies:
22
+ - !ruby/object:Gem::Dependency
17
23
  name: rake-compiler
18
- requirement: &70293348071280 !ruby/object:Gem::Requirement
24
+ prerelease: false
25
+ requirement: &id001 !ruby/object:Gem::Requirement
19
26
  none: false
20
- requirements:
21
- - - ! '>='
22
- - !ruby/object:Gem::Version
27
+ requirements:
28
+ - - ">="
29
+ - !ruby/object:Gem::Version
30
+ hash: 9
31
+ segments:
32
+ - 0
33
+ - 7
34
+ - 5
23
35
  version: 0.7.5
24
36
  type: :development
25
- prerelease: false
26
- version_requirements: *70293348071280
27
- - !ruby/object:Gem::Dependency
37
+ version_requirements: *id001
38
+ - !ruby/object:Gem::Dependency
28
39
  name: rspec
29
- requirement: &70293348070780 !ruby/object:Gem::Requirement
40
+ prerelease: false
41
+ requirement: &id002 !ruby/object:Gem::Requirement
30
42
  none: false
31
- requirements:
32
- - - ! '>='
33
- - !ruby/object:Gem::Version
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ hash: 15
47
+ segments:
48
+ - 2
49
+ - 0
50
+ - 0
34
51
  version: 2.0.0
35
52
  type: :development
36
- prerelease: false
37
- version_requirements: *70293348070780
38
- - !ruby/object:Gem::Dependency
53
+ version_requirements: *id002
54
+ - !ruby/object:Gem::Dependency
39
55
  name: chardet
40
- requirement: &70293348070400 !ruby/object:Gem::Requirement
56
+ prerelease: false
57
+ requirement: &id003 !ruby/object:Gem::Requirement
41
58
  none: false
42
- requirements:
43
- - - ! '>='
44
- - !ruby/object:Gem::Version
45
- version: '0'
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
46
66
  type: :development
47
- prerelease: false
48
- version_requirements: *70293348070400
67
+ version_requirements: *id003
49
68
  description:
50
69
  email: seniorlopez@gmail.com
51
70
  executables: []
52
- extensions:
71
+
72
+ extensions:
53
73
  - ext/charlock_holmes/extconf.rb
54
74
  extra_rdoc_files: []
55
- files:
75
+
76
+ files:
56
77
  - .gitignore
57
78
  - .rspec
58
79
  - Gemfile
@@ -81,31 +102,39 @@ files:
81
102
  has_rdoc: true
82
103
  homepage: http://github.com/brianmario/charlock_holmes
83
104
  licenses: []
105
+
84
106
  post_install_message:
85
- rdoc_options:
107
+ rdoc_options:
86
108
  - --charset=UTF-8
87
- require_paths:
109
+ require_paths:
88
110
  - lib
89
111
  - ext
90
- required_ruby_version: !ruby/object:Gem::Requirement
112
+ required_ruby_version: !ruby/object:Gem::Requirement
91
113
  none: false
92
- requirements:
93
- - - ! '>='
94
- - !ruby/object:Gem::Version
95
- version: '0'
96
- required_rubygems_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ hash: 3
118
+ segments:
119
+ - 0
120
+ version: "0"
121
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
122
  none: false
98
- requirements:
99
- - - ! '>='
100
- - !ruby/object:Gem::Version
101
- version: '0'
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ hash: 3
127
+ segments:
128
+ - 0
129
+ version: "0"
102
130
  requirements: []
131
+
103
132
  rubyforge_project:
104
133
  rubygems_version: 1.6.2
105
134
  signing_key:
106
135
  specification_version: 3
107
136
  summary: Character encoding detection, brought to you by ICU
108
- test_files:
137
+ test_files:
109
138
  - spec/encoding_detector_spec.rb
110
139
  - spec/fixtures/AnsiGraph.psm1
111
140
  - spec/fixtures/TwigExtensionsDate.es.yml