charlock_holmes 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- charlock_holmes (0.2.0)
4
+ charlock_holmes (0.3.0)
5
5
 
6
6
  GEM
7
7
  remote: http://rubygems.org/
@@ -8,6 +8,15 @@
8
8
  static VALUE rb_mCharlockHolmes;
9
9
  static VALUE rb_cEncodingDetector;
10
10
 
11
+ static VALUE charlock_new_str(const char *str, size_t len)
12
+ {
13
+ #ifdef HAVE_RUBY_ENCODING_H
14
+ return rb_external_str_new_with_enc(str, len, rb_utf8_encoding());
15
+ #else
16
+ return rb_str_new(str, len);
17
+ #endif
18
+ }
19
+
11
20
  static VALUE charlock_new_str2(const char *str)
12
21
  {
13
22
  #ifdef HAVE_RUBY_ENCODING_H
@@ -89,7 +98,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
89
98
  * be used as an additional hint to the charset detector
90
99
  *
91
100
  * Returns: an Array with zero or more Hashes,
92
- * each one of them with with :encoding, :language and :confidence
101
+ * each one of them with with :encoding, :language and :confidence
93
102
  */
94
103
  static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
95
104
  {
@@ -124,6 +133,90 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
124
133
  return rb_ret;
125
134
  }
126
135
 
136
+ /*
137
+ * call-seq: EncodingDetector#strip_tags?
138
+ *
139
+ * Returns whether or not the strip_tags flag is set on this detector
140
+ *
141
+ * Returns: Boolean
142
+ */
143
+ static VALUE rb_get_strip_tags(VALUE self)
144
+ {
145
+ UCharsetDetector *csd;
146
+ UBool val;
147
+ VALUE rb_val;
148
+
149
+ Data_Get_Struct(self, UCharsetDetector, csd);
150
+
151
+ val = ucsdet_isInputFilterEnabled(csd);
152
+
153
+ rb_val = val == 1 ? Qtrue : Qfalse;
154
+
155
+ return rb_val;
156
+ }
157
+
158
+ /*
159
+ * call-seq: EncodingDetector#strip_tags = true
160
+ *
161
+ * Enable or disable the stripping of HTML/XML tags from the input before
162
+ * attempting any detection
163
+ *
164
+ * Returns: Boolean, the value passed
165
+ */
166
+ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
167
+ {
168
+ UCharsetDetector *csd;
169
+ UBool val;
170
+
171
+ Data_Get_Struct(self, UCharsetDetector, csd);
172
+
173
+ val = rb_val == Qtrue ? 1 : 0;
174
+
175
+ ucsdet_enableInputFilter(csd, val);
176
+
177
+ return rb_val;
178
+ }
179
+
180
+ /*
181
+ * call-seq: detectable_encodings = EncodingDetector.supported_encodings
182
+ *
183
+ * The list of detectable encodings supported by this library
184
+ *
185
+ * Returns: an Array of Strings
186
+ */
187
+ static VALUE rb_get_supported_encodings(VALUE klass)
188
+ {
189
+ UCharsetDetector *csd;
190
+ UErrorCode status = U_ZERO_ERROR;
191
+ UEnumeration *encoding_list;
192
+ VALUE rb_encoding_list;
193
+ int32_t enc_count;
194
+ int32_t i;
195
+ const char *enc_name;
196
+ int32_t enc_name_len;
197
+
198
+ csd = ucsdet_open(&status);
199
+
200
+ rb_encoding_list = rb_iv_get(klass, "encoding_list");
201
+
202
+ // lazily populate the list
203
+ if (NIL_P(rb_encoding_list)) {
204
+ encoding_list = ucsdet_getAllDetectableCharsets(csd, &status);
205
+ rb_encoding_list = rb_ary_new();
206
+ enc_count = uenum_count(encoding_list, &status);
207
+
208
+ for(i=0; i < enc_count; i++) {
209
+ enc_name = uenum_next(encoding_list, &enc_name_len, &status);
210
+ rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len));
211
+ }
212
+
213
+ rb_iv_set(klass, "encoding_list", rb_encoding_list);
214
+ }
215
+
216
+ ucsdet_close(csd);
217
+
218
+ return rb_encoding_list;
219
+ }
127
220
 
128
221
  static void rb_encdec__free(void *csd)
129
222
  {
@@ -145,4 +238,8 @@ void Init_charlock_holmes()
145
238
  rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
146
239
  rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1);
147
240
  rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1);
241
+ rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0);
242
+ rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1);
243
+
244
+ rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0);
148
245
  }
@@ -1,5 +1,7 @@
1
1
  module CharlockHolmes
2
2
  class EncodingDetector
3
+ alias :strip_tags? :strip_tags
4
+
3
5
  # Attempt to detect the encoding of this string
4
6
  #
5
7
  # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call
@@ -1,3 +1,3 @@
1
1
  module CharlockHolmes
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
@@ -1,3 +1,5 @@
1
+ # encoding: utf-8
2
+
1
3
  require 'spec_helper'
2
4
 
3
5
  describe CharlockHolmes::EncodingDetector do
@@ -65,6 +67,29 @@ describe CharlockHolmes::EncodingDetector do
65
67
  assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list
66
68
  end
67
69
 
70
+ test 'has a strip_tags flag' do
71
+ detector = CharlockHolmes::EncodingDetector.new
72
+ detector.strip_tags = true
73
+ assert detector.strip_tags
74
+
75
+ detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
76
+ assert_equal 'UTF-8', detection[:encoding]
77
+
78
+ detector.strip_tags = false
79
+ assert !detector.strip_tags
80
+
81
+ detection = detector.detect "<div ascii_attribute='some more ascii'>λ, λ, λ</div>"
82
+ assert_equal 'UTF-8', detection[:encoding]
83
+ end
84
+
85
+ test 'has a list of supported encodings' do
86
+ CharlockHolmes::EncodingDetector.respond_to? :supported_encodings
87
+ supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings
88
+
89
+ assert supported_encodings.is_a?(Array)
90
+ assert supported_encodings.include? 'UTF-8'
91
+ end
92
+
68
93
  context 'encoding detection' do
69
94
  MAPPING = [
70
95
  ['repl2.cljs', 'ISO-8859-1'],
metadata CHANGED
@@ -1,79 +1,58 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: charlock_holmes
3
- version: !ruby/object:Gem::Version
4
- hash: 19
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 3
9
- - 0
10
- version: 0.3.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Brian Lopez
14
- - "Vicent Mart\xC3\xAD"
9
+ - Vicent Martí
15
10
  autorequire:
16
11
  bindir: bin
17
12
  cert_chain: []
18
-
19
- date: 2011-08-25 00:00:00 -07:00
13
+ date: 2011-08-25 00:00:00.000000000 -07:00
20
14
  default_executable:
21
- dependencies:
22
- - !ruby/object:Gem::Dependency
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
23
17
  name: rake-compiler
24
- prerelease: false
25
- requirement: &id001 !ruby/object:Gem::Requirement
18
+ requirement: &70365951363440 !ruby/object:Gem::Requirement
26
19
  none: false
27
- requirements:
28
- - - ">="
29
- - !ruby/object:Gem::Version
30
- hash: 9
31
- segments:
32
- - 0
33
- - 7
34
- - 5
20
+ requirements:
21
+ - - ! '>='
22
+ - !ruby/object:Gem::Version
35
23
  version: 0.7.5
36
24
  type: :development
37
- version_requirements: *id001
38
- - !ruby/object:Gem::Dependency
39
- name: rspec
40
25
  prerelease: false
41
- requirement: &id002 !ruby/object:Gem::Requirement
26
+ version_requirements: *70365951363440
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: &70365951362940 !ruby/object:Gem::Requirement
42
30
  none: false
43
- requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- hash: 15
47
- segments:
48
- - 2
49
- - 0
50
- - 0
31
+ requirements:
32
+ - - ! '>='
33
+ - !ruby/object:Gem::Version
51
34
  version: 2.0.0
52
35
  type: :development
53
- version_requirements: *id002
54
- - !ruby/object:Gem::Dependency
55
- name: chardet
56
36
  prerelease: false
57
- requirement: &id003 !ruby/object:Gem::Requirement
37
+ version_requirements: *70365951362940
38
+ - !ruby/object:Gem::Dependency
39
+ name: chardet
40
+ requirement: &70365951394040 !ruby/object:Gem::Requirement
58
41
  none: false
59
- requirements:
60
- - - ">="
61
- - !ruby/object:Gem::Version
62
- hash: 3
63
- segments:
64
- - 0
65
- version: "0"
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
66
46
  type: :development
67
- version_requirements: *id003
47
+ prerelease: false
48
+ version_requirements: *70365951394040
68
49
  description:
69
50
  email: seniorlopez@gmail.com
70
51
  executables: []
71
-
72
- extensions:
52
+ extensions:
73
53
  - ext/charlock_holmes/extconf.rb
74
54
  extra_rdoc_files: []
75
-
76
- files:
55
+ files:
77
56
  - .gitignore
78
57
  - .rspec
79
58
  - Gemfile
@@ -102,39 +81,31 @@ files:
102
81
  has_rdoc: true
103
82
  homepage: http://github.com/brianmario/charlock_holmes
104
83
  licenses: []
105
-
106
84
  post_install_message:
107
- rdoc_options:
85
+ rdoc_options:
108
86
  - --charset=UTF-8
109
- require_paths:
87
+ require_paths:
110
88
  - lib
111
89
  - ext
112
- required_ruby_version: !ruby/object:Gem::Requirement
90
+ required_ruby_version: !ruby/object:Gem::Requirement
113
91
  none: false
114
- requirements:
115
- - - ">="
116
- - !ruby/object:Gem::Version
117
- hash: 3
118
- segments:
119
- - 0
120
- version: "0"
121
- required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ! '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
122
97
  none: false
123
- requirements:
124
- - - ">="
125
- - !ruby/object:Gem::Version
126
- hash: 3
127
- segments:
128
- - 0
129
- version: "0"
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
130
102
  requirements: []
131
-
132
103
  rubyforge_project:
133
104
  rubygems_version: 1.6.2
134
105
  signing_key:
135
106
  specification_version: 3
136
107
  summary: Character encoding detection, brought to you by ICU
137
- test_files:
108
+ test_files:
138
109
  - spec/encoding_detector_spec.rb
139
110
  - spec/fixtures/AnsiGraph.psm1
140
111
  - spec/fixtures/TwigExtensionsDate.es.yml