charlock_holmes 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- charlock_holmes (0.2.0)
4
+ charlock_holmes (0.3.0)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -8,6 +8,15 @@
8
8
  static VALUE rb_mCharlockHolmes;
9
9
  static VALUE rb_cEncodingDetector;
10
10
 
11
+ static VALUE charlock_new_str(const char *str, size_t len)
12
+ {
13
+ #ifdef HAVE_RUBY_ENCODING_H
14
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
15
+ #else
16
+ return rb_str_new(str, len);
17
+ #endif
18
+ }
19
+
11
20
  static VALUE charlock_new_str2(const char *str)
12
21
  {
13
22
  #ifdef HAVE_RUBY_ENCODING_H
@@ -89,7 +98,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
89
98
  * be used as an additional hint to the charset detector
90
99
  *
91
100
  * Returns: an Array with zero or more Hashes,
92
- * each one of them with with :encoding, :language and :confidence
101
+ * each one of them with :encoding, :language and :confidence
93
102
  */
94
103
  static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
95
104
  {
@@ -124,6 +133,90 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
124
133
  return rb_ret;
125
134
  }
126
135
 
136
+ /*
137
+ * call-seq: EncodingDetector#strip_tags?
138
+ *
139
+ * Returns whether or not the strip_tags flag is set on this detector
140
+ *
141
+ * Returns: Boolean
142
+ */
143
+ static VALUE rb_get_strip_tags(VALUE self)
144
+ {
145
+ UCharsetDetector *csd;
146
+ UBool val;
147
+ VALUE rb_val;
148
+
149
+ Data_Get_Struct(self, UCharsetDetector, csd);
150
+
151
+ val = ucsdet_isInputFilterEnabled(csd);
152
+
153
+ rb_val = val == 1 ? Qtrue : Qfalse;
154
+
155
+ return rb_val;
156
+ }
157
+
158
+ /*
159
+ * call-seq: EncodingDetector#strip_tags = true
160
+ *
161
+ * Enable or disable the stripping of HTML/XML tags from the input before
162
+ * attempting any detection
163
+ *
164
+ * Returns: Boolean, the value passed
165
+ */
166
+ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
167
+ {
168
+ UCharsetDetector *csd;
169
+ UBool val;
170
+
171
+ Data_Get_Struct(self, UCharsetDetector, csd);
172
+
173
+ val = rb_val == Qtrue ? 1 : 0;
174
+
175
+ ucsdet_enableInputFilter(csd, val);
176
+
177
+ return rb_val;
178
+ }
179
+
180
+ /*
181
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
182
+ *
183
+ * The list of detectable encodings supported by this library
184
+ *
185
+ * Returns: an Array of Strings
186
+ */
187
+ static VALUE rb_get_supported_encodings(VALUE klass)
188
+ {
189
+ UCharsetDetector *csd;
190
+ UErrorCode status = U_ZERO_ERROR;
191
+ UEnumeration *encoding_list;
192
+ VALUE rb_encoding_list;
193
+ int32_t enc_count;
194
+ int32_t i;
195
+ const char *enc_name;
196
+ int32_t enc_name_len;
197
+
198
+ csd = ucsdet_open(&status);
199
+
200
+ rb_encoding_list = rb_iv_get(klass, "encoding_list");
201
+
202
+ // lazily populate the list
203
+ if (NIL_P(rb_encoding_list)) {
204
+ encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
205
+ rb_encoding_list = rb_ary_new();
206
+ enc_count = uenum_count(encoding_list, &status);
207
+
208
+ for(i=0; i < enc_count; i++) {
209
+ enc_name = uenum_next(encoding_list, &enc_name_len, &status);
210
+ rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
211
+ }
212
+
213
+ rb_iv_set(klass, "encoding_list", rb_encoding_list);
214
+ }
215
+
216
+ ucsdet_close(csd);
217
+
218
+ return rb_encoding_list;
219
+ }
127
220
 
128
221
  static void rb_encdec__free(void *csd)
129
222
  {
@@ -145,4 +238,8 @@ void Init_charlock_holmes()
145
238
  rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
146
239
  rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
147
240
  rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
241
+ rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
242
+ rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
243
+
244
+ rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
148
245
  }
@@ -1,5 +1,7 @@
1
1
  module CharlockHolmes
2
2
  class EncodingDetector
3
+ alias :strip_tags? :strip_tags
4
+
3
5
  # Attempt to detect the encoding of this string
4
6
  #
5
7
  # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
@@ -1,3 +1,3 @@
1
1
  module CharlockHolmes
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'spec_helper'
2
4
 
3
5
  describe CharlockHolmes::EncodingDetector do
@@ -65,6 +67,29 @@ describe CharlockHolmes::EncodingDetector do
65
67
  assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
66
68
  end
67
69
 
70
+ test 'has a strip_tags flag' do
71
+ detector = CharlockHolmes::EncodingDetector.new
72
+ detector.strip_tags = true
73
+ assert detector.strip_tags
74
+
75
+ detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
76
+ assert_equal 'UTF-8', detection[:encoding]
77
+
78
+ detector.strip_tags = false
79
+ assert !detector.strip_tags
80
+
81
+ detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
82
+ assert_equal 'UTF-8', detection[:encoding]
83
+ end
84
+
85
+ test 'has a list of supported encodings' do
86
+ CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
87
+ supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
88
+
89
+ assert supported_encodings.is_a?(Array)
90
+ assert supported_encodings.include? 'UTF-8'
91
+ end
92
+
68
93
  context 'encoding detection' do
69
94
  MAPPING = [
70
95
  ['repl2.cljs', 'ISO-8859-1'],
metadata CHANGED
@@ -1,79 +1,58 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: charlock_holmes
3
- version: !ruby/object:Gem::Version
4
- hash: 19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 3
9
- - 0
10
- version: 0.3.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Brian Lopez
14
- - "Vicent Mart\xC3\xAD"
9
+ - Vicent Martí
15
10
  autorequire:
16
11
  bindir: bin
17
12
  cert_chain: []
18
-
19
- date: 2011-08-25 00:00:00 -07:00
13
+ date: 2011-08-25 00:00:00.000000000 -07:00
20
14
  default_executable:
21
- dependencies:
22
- - !ruby/object:Gem::Dependency
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
23
17
  name: rake-compiler
24
- prerelease: false
25
- requirement: &id001 !ruby/object:Gem::Requirement
18
+ requirement: &70365951363440 !ruby/object:Gem::Requirement
26
19
  none: false
27
- requirements:
28
- - - ">="
29
- - !ruby/object:Gem::Version
30
- hash: 9
31
- segments:
32
- - 0
33
- - 7
34
- - 5
20
+ requirements:
21
+ - - ! '>='
22
+ - !ruby/object:Gem::Version
35
23
  version: 0.7.5
36
24
  type: :development
37
- version_requirements: *id001
38
- - !ruby/object:Gem::Dependency
39
- name: rspec
40
25
  prerelease: false
41
- requirement: &id002 !ruby/object:Gem::Requirement
26
+ version_requirements: *70365951363440
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: &70365951362940 !ruby/object:Gem::Requirement
42
30
  none: false
43
- requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- hash: 15
47
- segments:
48
- - 2
49
- - 0
50
- - 0
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
51
34
  version: 2.0.0
52
35
  type: :development
53
- version_requirements: *id002
54
- - !ruby/object:Gem::Dependency
55
- name: chardet
56
36
  prerelease: false
57
- requirement: &id003 !ruby/object:Gem::Requirement
37
+ version_requirements: *70365951362940
38
+ - !ruby/object:Gem::Dependency
39
+ name: chardet
40
+ requirement: &70365951394040 !ruby/object:Gem::Requirement
58
41
  none: false
59
- requirements:
60
- - - ">="
61
- - !ruby/object:Gem::Version
62
- hash: 3
63
- segments:
64
- - 0
65
- version: "0"
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
66
46
  type: :development
67
- version_requirements: *id003
47
+ prerelease: false
48
+ version_requirements: *70365951394040
68
49
  description:
69
50
  email: seniorlopez@gmail.com
70
51
  executables: []
71
-
72
- extensions:
52
+ extensions:
73
53
  - ext/charlock_holmes/extconf.rb
74
54
  extra_rdoc_files: []
75
-
76
- files:
55
+ files:
77
56
  - .gitignore
78
57
  - .rspec
79
58
  - Gemfile
@@ -102,39 +81,31 @@ files:
102
81
  has_rdoc: true
103
82
  homepage: http://github.com/brianmario/charlock_holmes
104
83
  licenses: []
105
-
106
84
  post_install_message:
107
- rdoc_options:
85
+ rdoc_options:
108
86
  - --charset=UTF-8
109
- require_paths:
87
+ require_paths:
110
88
  - lib
111
89
  - ext
112
- required_ruby_version: !ruby/object:Gem::Requirement
90
+ required_ruby_version: !ruby/object:Gem::Requirement
113
91
  none: false
114
- requirements:
115
- - - ">="
116
- - !ruby/object:Gem::Version
117
- hash: 3
118
- segments:
119
- - 0
120
- version: "0"
121
- required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ! '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
122
97
  none: false
123
- requirements:
124
- - - ">="
125
- - !ruby/object:Gem::Version
126
- hash: 3
127
- segments:
128
- - 0
129
- version: "0"
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
130
102
  requirements: []
131
-
132
103
  rubyforge_project:
133
104
  rubygems_version: 1.6.2
134
105
  signing_key:
135
106
  specification_version: 3
136
107
  summary: Character encoding detection, brought to you by ICU
137
- test_files:
108
+ test_files:
138
109
  - spec/encoding_detector_spec.rb
139
110
  - spec/fixtures/AnsiGraph.psm1
140
111
  - spec/fixtures/TwigExtensionsDate.es.yml