charlock_holmes 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -21,6 +21,8 @@ detection = CharlockHolmes::EncodingDetector.detect(contents)
21
21
  # that's mostly only returned for legacy encodings like ISO-8859-1
22
22
  ```
23
23
 
24
+ NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding.
25
+
24
26
  Though it's more efficient to reuse one detector instance:
25
27
 
26
28
  ``` ruby
@@ -66,3 +68,7 @@ If the traditional `gem install charlock_holmes` doesn't work, you may need to s
66
68
  At the time of writing, if you installed ICU via homebrew on OSX your gem install may look something like this:
67
69
 
68
70
  `gem install charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
71
+
72
+ If you're using Bundler and need to specify a custom path, you can do so with the `bundle config` command:
73
+
74
+ `bundle config build.charlock_holmes --with-icu-dir=/usr/local/Cellar/icu4c/4.4.1`
@@ -44,41 +44,64 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
44
44
  }
45
45
 
46
46
  /*
47
- * call-seq: detection_hash = EncodingDetector.detect "some string"
47
+ * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc]
48
48
  *
49
49
  * Attempt to detect the encoding of this string
50
50
  *
51
+ * str - a String, what you want to detect the encoding of
52
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
53
+ * be used as an additional hint to the charset detector
54
+ *
51
55
  * Returns: a Hash with :encoding, :language and :confidence
52
56
  */
53
- static VALUE rb_encdec_detect(VALUE self, VALUE rb_str)
57
+ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
54
58
  {
55
59
  UErrorCode status = U_ZERO_ERROR;
56
60
  UCharsetDetector *csd;
61
+ VALUE rb_str;
62
+ VALUE rb_enc_hint;
63
+
64
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
57
65
 
58
66
  Check_Type(rb_str, T_STRING);
59
67
  Data_Get_Struct(self, UCharsetDetector, csd);
60
68
 
61
69
  ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
70
+
71
+ if (!NIL_P(rb_enc_hint)) {
72
+ Check_Type(rb_enc_hint, T_STRING);
73
+ ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
74
+ }
75
+
62
76
  return rb_encdec_buildmatch(ucsdet_detect(csd, &status));
63
77
  }
64
78
 
65
79
 
66
80
  /*
67
- * call-seq: detection_hash_array = EncodingDetector.detect_all "some string"
81
+ * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc]
68
82
  *
69
83
  * Attempt to detect the encoding of this string, and return
70
84
  * a list with all the possible encodings that match it.
71
85
  *
72
- * Returns: a List with zero or more Hashes,
86
+ *
87
+ * str - a String, what you want to detect the encoding of
88
+ * hint_enc - an optional String (like "UTF-8"), the encoding name which will
89
+ * be used as an additional hint to the charset detector
90
+ *
91
+ * Returns: an Array with zero or more Hashes,
73
92
  * each one of them with :encoding, :language and :confidence
74
93
  */
75
- static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
94
+ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
76
95
  {
77
96
  UErrorCode status = U_ZERO_ERROR;
78
97
  UCharsetDetector *csd;
79
- const UCharsetMatch **csm;
98
+ const UCharsetMatch **csm;
80
99
  VALUE rb_ret;
81
100
  int i, match_count;
101
+ VALUE rb_str;
102
+ VALUE rb_enc_hint;
103
+
104
+ rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
82
105
 
83
106
  Check_Type(rb_str, T_STRING);
84
107
  Data_Get_Struct(self, UCharsetDetector, csd);
@@ -86,7 +109,13 @@ static VALUE rb_encdec_detect_all(VALUE self, VALUE rb_str)
86
109
  rb_ret = rb_ary_new();
87
110
 
88
111
  ucsdet_setText(csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status);
89
- csm = ucsdet_detectAll(csd, &match_count, &status);
112
+
113
+ if (!NIL_P(rb_enc_hint)) {
114
+ Check_Type(rb_enc_hint, T_STRING);
115
+ ucsdet_setDeclaredEncoding(csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
116
+ }
117
+
118
+ csm = ucsdet_detectAll(csd, &match_count, &status);
90
119
 
91
120
  for (i = 0; i < match_count; ++i) {
92
121
  rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i]));
@@ -114,6 +143,6 @@ void Init_charlock_holmes()
114
143
 
115
144
  rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
116
145
  rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
117
- rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, 1);
118
- rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, 1);
146
+ rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
147
+ rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
119
148
  }
@@ -3,6 +3,9 @@ require 'mkmf'
3
3
  $CFLAGS << ' -Wall -funroll-loops'
4
4
  $CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
5
5
 
6
+ $CFLAGS << ' -I/usr/local/Cellar/icu4c/4.4.1/include'
7
+ $LDFLAGS << ' -L/usr/local/Cellar/icu4c/4.4.1/lib'
8
+
6
9
  dir_config 'icu'
7
10
 
8
11
  have_library 'icui18n'
@@ -4,9 +4,28 @@ module CharlockHolmes
4
4
  #
5
5
  # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
6
6
  #
7
+ # str - a String, what you want to detect the encoding of
8
+ # hint_enc - an optional String (like "UTF-8"), the encoding name which will
9
+ # be used as an additional hint to the charset detector
10
+ #
7
11
  # Returns: a Hash with :encoding, :language and :confidence
8
- def self.detect(str)
9
- new.detect(str)
12
+ def self.detect(str, hint_enc=nil)
13
+ new.detect(str, hint_enc)
14
+ end
15
+
16
+ # Attempt to detect the encoding of this string, and return
17
+ # a list with all the possible encodings that match it.
18
+ #
19
+ # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
20
+ #
21
+ # str - a String, what you want to detect the encoding of
22
+ # hint_enc - an optional String (like "UTF-8"), the encoding name which will
23
+ # be used as an additional hint to the charset detector
24
+ #
25
+ # Returns: an Array with zero or more Hashes,
26
+ # each one of them with :encoding, :language and :confidence
27
+ def self.detect_all(str, hint_enc=nil)
28
+ new.detect_all(str, hint_enc)
10
29
  end
11
30
  end
12
31
  end
@@ -4,8 +4,17 @@ class String
4
4
  # Attempt to detect the encoding of this string
5
5
  #
6
6
  # Returns: a Hash with :encoding, :language and :confidence
7
- def detect_encoding
8
- encoding_detector.detect(self)
7
+ def detect_encoding(hint_enc=nil)
8
+ encoding_detector.detect(self, hint_enc)
9
+ end
10
+
11
+ # Attempt to detect the encoding of this string, and return
12
+ # a list with all the possible encodings that match it.
13
+ #
14
+ # Returns: an Array with zero or more Hashes,
15
+ # each one of them with :encoding, :language and :confidence
16
+ def detect_encodings(hint_enc=nil)
17
+ encoding_detector.detect_all(self, hint_enc)
9
18
  end
10
19
 
11
20
  if RUBY_VERSION =~ /1.9/
@@ -13,8 +22,8 @@ class String
13
22
  # then set the encoding to what was detected ala `force_encoding`
14
23
  #
15
24
  # Returns: a Hash with :encoding, :language and :confidence
16
- def detect_encoding!
17
- if detected = self.detect_encoding
25
+ def detect_encoding!(hint_enc=nil)
26
+ if detected = self.detect_encoding(hint_enc)
18
27
  self.force_encoding detected[:encoding]
19
28
  detected
20
29
  end
@@ -1,3 +1,3 @@
1
1
  module CharlockHolmes
2
- VERSION = "0.2.0"
2
+ VERSION = "0.3.0"
3
3
  end
@@ -5,21 +5,62 @@ describe CharlockHolmes::EncodingDetector do
5
5
  @detector = CharlockHolmes::EncodingDetector.new
6
6
  end
7
7
 
8
- test 'has a detect class-level method' do
8
+ test 'has a class-level detect method' do
9
9
  CharlockHolmes::EncodingDetector.respond_to? :detect
10
10
  detected = CharlockHolmes::EncodingDetector.detect 'test'
11
11
  assert_equal 'ISO-8859-1', detected[:encoding]
12
12
  end
13
13
 
14
+ test 'has a class-level detect method that accepts an encoding hint' do
15
+ CharlockHolmes::EncodingDetector.respond_to? :detect
16
+ detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8'
17
+ assert_equal 'ISO-8859-1', detected[:encoding]
18
+ end
19
+
20
+ test 'has a class-level detect_all method' do
21
+ CharlockHolmes::EncodingDetector.respond_to? :detect_all
22
+ detected_list = CharlockHolmes::EncodingDetector.detect_all 'test'
23
+ assert detected_list.is_a? Array
24
+
25
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
26
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
27
+ end
28
+
29
+ test 'has a class-level detect_all method that accepts an encoding hint' do
30
+ CharlockHolmes::EncodingDetector.respond_to? :detect_all
31
+ detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8'
32
+ assert detected_list.is_a? Array
33
+
34
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
35
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
36
+ end
37
+
14
38
  test 'has a detect method' do
15
39
  @detector.respond_to? :detect
16
40
  detected = @detector.detect 'test'
17
41
  assert_equal 'ISO-8859-1', detected[:encoding]
18
42
  end
19
43
 
44
+ test 'has a detect method that accepts an encoding hint' do
45
+ @detector.respond_to? :detect
46
+ detected = @detector.detect 'test', 'UTF-8'
47
+ assert_equal 'ISO-8859-1', detected[:encoding]
48
+ end
49
+
20
50
  test 'has a detect_all method' do
21
51
  @detector.respond_to? :detect_all
22
52
  detected_list = @detector.detect_all 'test'
53
+ assert detected_list.is_a? Array
54
+
55
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
56
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
57
+ end
58
+
59
+ test 'has a detect_all method that accepts an encoding hint' do
60
+ @detector.respond_to? :detect_all
61
+ detected_list = @detector.detect_all 'test', 'UTF-8'
62
+ assert detected_list.is_a? Array
63
+
23
64
  encoding_list = detected_list.map {|d| d[:encoding]}.sort
24
65
  assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
25
66
  end
@@ -10,6 +10,36 @@ describe String do
10
10
  assert_equal 'ISO-8859-1', detected[:encoding]
11
11
  end
12
12
 
13
+ test 'has a detect_encoding method that accepts an encoding hint' do
14
+ str = 'test'
15
+ str.respond_to? :detect_encoding
16
+
17
+ detected = str.detect_encoding 'UTF-8'
18
+ assert_equal 'ISO-8859-1', detected[:encoding]
19
+ end
20
+
21
+ test 'has a detect_encodings method' do
22
+ str = 'test'
23
+ str.respond_to? :detect_encodings
24
+
25
+ detected_list = str.detect_encodings
26
+ assert detected_list.is_a? Array
27
+
28
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
29
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
30
+ end
31
+
32
+ test 'has a detect_encodings method that accepts an encoding hint' do
33
+ str = 'test'
34
+ str.respond_to? :detect_encodings
35
+
36
+ detected_list = str.detect_encodings 'UTF-8'
37
+ assert detected_list.is_a? Array
38
+
39
+ encoding_list = detected_list.map {|d| d[:encoding]}.sort
40
+ assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
41
+ end
42
+
13
43
  if RUBY_VERSION =~ /1.9/
14
44
  test 'has a detect_encoding! method' do
15
45
  str = 'test'
metadata CHANGED
@@ -1,58 +1,79 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: charlock_holmes
3
- version: !ruby/object:Gem::Version
4
- version: 0.2.0
3
+ version: !ruby/object:Gem::Version
4
+ hash: 19
5
5
  prerelease:
6
+ segments:
7
+ - 0
8
+ - 3
9
+ - 0
10
+ version: 0.3.0
6
11
  platform: ruby
7
- authors:
12
+ authors:
8
13
  - Brian Lopez
9
- - Vicent Martí
14
+ - "Vicent Mart\xC3\xAD"
10
15
  autorequire:
11
16
  bindir: bin
12
17
  cert_chain: []
13
- date: 2011-08-24 00:00:00.000000000 -07:00
18
+
19
+ date: 2011-08-25 00:00:00 -07:00
14
20
  default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
21
+ dependencies:
22
+ - !ruby/object:Gem::Dependency
17
23
  name: rake-compiler
18
- requirement: &70293348071280 !ruby/object:Gem::Requirement
24
+ prerelease: false
25
+ requirement: &id001 !ruby/object:Gem::Requirement
19
26
  none: false
20
- requirements:
21
- - - ! '>='
22
- - !ruby/object:Gem::Version
27
+ requirements:
28
+ - - ">="
29
+ - !ruby/object:Gem::Version
30
+ hash: 9
31
+ segments:
32
+ - 0
33
+ - 7
34
+ - 5
23
35
  version: 0.7.5
24
36
  type: :development
25
- prerelease: false
26
- version_requirements: *70293348071280
27
- - !ruby/object:Gem::Dependency
37
+ version_requirements: *id001
38
+ - !ruby/object:Gem::Dependency
28
39
  name: rspec
29
- requirement: &70293348070780 !ruby/object:Gem::Requirement
40
+ prerelease: false
41
+ requirement: &id002 !ruby/object:Gem::Requirement
30
42
  none: false
31
- requirements:
32
- - - ! '>='
33
- - !ruby/object:Gem::Version
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ hash: 15
47
+ segments:
48
+ - 2
49
+ - 0
50
+ - 0
34
51
  version: 2.0.0
35
52
  type: :development
36
- prerelease: false
37
- version_requirements: *70293348070780
38
- - !ruby/object:Gem::Dependency
53
+ version_requirements: *id002
54
+ - !ruby/object:Gem::Dependency
39
55
  name: chardet
40
- requirement: &70293348070400 !ruby/object:Gem::Requirement
56
+ prerelease: false
57
+ requirement: &id003 !ruby/object:Gem::Requirement
41
58
  none: false
42
- requirements:
43
- - - ! '>='
44
- - !ruby/object:Gem::Version
45
- version: '0'
59
+ requirements:
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ hash: 3
63
+ segments:
64
+ - 0
65
+ version: "0"
46
66
  type: :development
47
- prerelease: false
48
- version_requirements: *70293348070400
67
+ version_requirements: *id003
49
68
  description:
50
69
  email: seniorlopez@gmail.com
51
70
  executables: []
52
- extensions:
71
+
72
+ extensions:
53
73
  - ext/charlock_holmes/extconf.rb
54
74
  extra_rdoc_files: []
55
- files:
75
+
76
+ files:
56
77
  - .gitignore
57
78
  - .rspec
58
79
  - Gemfile
@@ -81,31 +102,39 @@ files:
81
102
  has_rdoc: true
82
103
  homepage: http://github.com/brianmario/charlock_holmes
83
104
  licenses: []
105
+
84
106
  post_install_message:
85
- rdoc_options:
107
+ rdoc_options:
86
108
  - --charset=UTF-8
87
- require_paths:
109
+ require_paths:
88
110
  - lib
89
111
  - ext
90
- required_ruby_version: !ruby/object:Gem::Requirement
112
+ required_ruby_version: !ruby/object:Gem::Requirement
91
113
  none: false
92
- requirements:
93
- - - ! '>='
94
- - !ruby/object:Gem::Version
95
- version: '0'
96
- required_rubygems_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ hash: 3
118
+ segments:
119
+ - 0
120
+ version: "0"
121
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
122
  none: false
98
- requirements:
99
- - - ! '>='
100
- - !ruby/object:Gem::Version
101
- version: '0'
123
+ requirements:
124
+ - - ">="
125
+ - !ruby/object:Gem::Version
126
+ hash: 3
127
+ segments:
128
+ - 0
129
+ version: "0"
102
130
  requirements: []
131
+
103
132
  rubyforge_project:
104
133
  rubygems_version: 1.6.2
105
134
  signing_key:
106
135
  specification_version: 3
107
136
  summary: Character encoding detection, brought to you by ICU
108
- test_files:
137
+ test_files:
109
138
  - spec/encoding_detector_spec.rb
110
139
  - spec/fixtures/AnsiGraph.psm1
111
140
  - spec/fixtures/TwigExtensionsDate.es.yml