charlock_holmes 0.7.8 → 0.7.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/charlock_holmes/common.h +15 -0
- data/ext/charlock_holmes/converter.c +3 -3
- data/ext/charlock_holmes/encoding_detector.c +29 -22
- data/ext/charlock_holmes/ext.c +2 -6
- data/ext/charlock_holmes/extconf.rb +89 -0
- data/ext/charlock_holmes/transliterator.cpp +1 -1
- data/lib/charlock_holmes/version.rb +1 -1
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 11393fb92c8ecb1d18d4741c9915c0293fde4381ccfeae3e929c84970b6779f4
|
4
|
+
data.tar.gz: addcb3bdf4fc04e53f7a287c483d6510205b317d980f3a8b21a26940dc5a198d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1729db533d5d41aa4708888763270e3f346552a01a8f29371a010253db3bb0cc4f58a7ed628b735829e7f5d9763d35de20a17855dc9c393d4244d1acf30f0dbe
|
7
|
+
data.tar.gz: 5033611bfe43c98fd8d88a7141b94676750a2c7fe49a44f5e621d61a4bfc4eef8e627b478471c0cce7dc128b4fc3ad6922129a73c48274681522ff639e82dd75
|
data/ext/charlock_holmes/common.h
CHANGED
@@ -38,4 +38,19 @@ static inline VALUE charlock_new_str2(const char *str)
|
|
38
38
|
#endif
|
39
39
|
}
|
40
40
|
|
41
|
+
|
42
|
+
#ifdef __cplusplus
|
43
|
+
extern "C"
|
44
|
+
{
|
45
|
+
#endif
|
46
|
+
|
47
|
+
extern void Init_charlock_holmes(void);
|
48
|
+
extern void _init_charlock_encoding_detector(void);
|
49
|
+
extern void _init_charlock_converter(void);
|
50
|
+
extern void _init_charlock_transliterator(void);
|
51
|
+
|
52
|
+
#ifdef __cplusplus
|
53
|
+
}
|
54
|
+
#endif
|
55
|
+
|
41
56
|
#endif
|
data/ext/charlock_holmes/converter.c
CHANGED
@@ -20,7 +20,7 @@ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VA
|
|
20
20
|
Check_Type(rb_dst_enc, T_STRING);
|
21
21
|
|
22
22
|
src_txt = RSTRING_PTR(rb_txt);
|
23
|
-
src_len = RSTRING_LEN(rb_txt);
|
23
|
+
src_len = (int32_t)RSTRING_LEN(rb_txt);
|
24
24
|
src_enc = RSTRING_PTR(rb_src_enc);
|
25
25
|
dst_enc = RSTRING_PTR(rb_dst_enc);
|
26
26
|
|
@@ -29,7 +29,7 @@ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VA
|
|
29
29
|
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
30
30
|
rb_raise(rb_eArgError, "%s", u_errorName(status));
|
31
31
|
}
|
32
|
-
out_buf = malloc(out_len);
|
32
|
+
out_buf = (char *) malloc(out_len);
|
33
33
|
|
34
34
|
// now do the actual conversion
|
35
35
|
status = U_ZERO_ERROR;
|
@@ -50,7 +50,7 @@ static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VA
|
|
50
50
|
return rb_out;
|
51
51
|
}
|
52
52
|
|
53
|
-
void _init_charlock_converter() {
|
53
|
+
void _init_charlock_converter(void) {
|
54
54
|
rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject);
|
55
55
|
|
56
56
|
rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3);
|
data/ext/charlock_holmes/encoding_detector.c
CHANGED
@@ -8,6 +8,25 @@ typedef struct {
|
|
8
8
|
UCharsetDetector *csd;
|
9
9
|
} charlock_detector_t;
|
10
10
|
|
11
|
+
static void rb_encdec__free(void *obj)
|
12
|
+
{
|
13
|
+
charlock_detector_t *detector;
|
14
|
+
|
15
|
+
detector = (charlock_detector_t *)obj;
|
16
|
+
|
17
|
+
if (detector->csd)
|
18
|
+
ucsdet_close(detector->csd);
|
19
|
+
|
20
|
+
free(detector);
|
21
|
+
}
|
22
|
+
|
23
|
+
static const rb_data_type_t charlock_detector_type = {
|
24
|
+
"Charlock/Detector",
|
25
|
+
{ 0, rb_encdec__free, 0, },
|
26
|
+
0, 0,
|
27
|
+
RUBY_TYPED_FREE_IMMEDIATELY,
|
28
|
+
};
|
29
|
+
|
11
30
|
static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
12
31
|
{
|
13
32
|
UErrorCode status = U_ZERO_ERROR;
|
@@ -47,7 +66,7 @@ static VALUE rb_encdec_buildmatch(const UCharsetMatch *match)
|
|
47
66
|
return rb_match;
|
48
67
|
}
|
49
68
|
|
50
|
-
static VALUE rb_encdec_binarymatch() {
|
69
|
+
static VALUE rb_encdec_binarymatch(void) {
|
51
70
|
VALUE rb_match;
|
52
71
|
|
53
72
|
rb_match = rb_hash_new();
|
@@ -167,7 +186,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
|
|
167
186
|
rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
|
168
187
|
|
169
188
|
Check_Type(rb_str, T_STRING);
|
170
|
-
|
189
|
+
TypedData_Get_Struct(self, charlock_detector_t, &charlock_detector_type, detector);
|
171
190
|
|
172
191
|
// first lets see if this is binary content
|
173
192
|
if (detect_binary_content(self, rb_str)) {
|
@@ -180,7 +199,7 @@ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self)
|
|
180
199
|
|
181
200
|
if (!NIL_P(rb_enc_hint)) {
|
182
201
|
Check_Type(rb_enc_hint, T_STRING);
|
183
|
-
ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
|
202
|
+
ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), (int32_t)RSTRING_LEN(rb_enc_hint), &status);
|
184
203
|
}
|
185
204
|
|
186
205
|
return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status));
|
@@ -215,7 +234,7 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
|
215
234
|
rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint);
|
216
235
|
|
217
236
|
Check_Type(rb_str, T_STRING);
|
218
|
-
|
237
|
+
TypedData_Get_Struct(self, charlock_detector_t, &charlock_detector_type, detector);
|
219
238
|
|
220
239
|
rb_ret = rb_ary_new();
|
221
240
|
|
@@ -229,7 +248,7 @@ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self)
|
|
229
248
|
|
230
249
|
if (!NIL_P(rb_enc_hint)) {
|
231
250
|
Check_Type(rb_enc_hint, T_STRING);
|
232
|
-
ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status);
|
251
|
+
ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), (int32_t)RSTRING_LEN(rb_enc_hint), &status);
|
233
252
|
}
|
234
253
|
|
235
254
|
csm = ucsdet_detectAll(detector->csd, &match_count, &status);
|
@@ -257,7 +276,7 @@ static VALUE rb_get_strip_tags(VALUE self)
|
|
257
276
|
UBool val;
|
258
277
|
VALUE rb_val;
|
259
278
|
|
260
|
-
|
279
|
+
TypedData_Get_Struct(self, charlock_detector_t, &charlock_detector_type, detector);
|
261
280
|
|
262
281
|
val = ucsdet_isInputFilterEnabled(detector->csd);
|
263
282
|
|
@@ -279,7 +298,7 @@ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val)
|
|
279
298
|
charlock_detector_t *detector;
|
280
299
|
UBool val;
|
281
300
|
|
282
|
-
|
301
|
+
TypedData_Get_Struct(self, charlock_detector_t, &charlock_detector_type, detector);
|
283
302
|
|
284
303
|
val = rb_val == Qtrue ? 1 : 0;
|
285
304
|
|
@@ -334,26 +353,14 @@ static VALUE rb_get_supported_encodings(VALUE klass)
|
|
334
353
|
return rb_encoding_list;
|
335
354
|
}
|
336
355
|
|
337
|
-
static void rb_encdec__free(void *obj)
|
338
|
-
{
|
339
|
-
charlock_detector_t *detector;
|
340
|
-
|
341
|
-
detector = (charlock_detector_t *)obj;
|
342
|
-
|
343
|
-
if (detector->csd)
|
344
|
-
ucsdet_close(detector->csd);
|
345
|
-
|
346
|
-
free(detector);
|
347
|
-
}
|
348
|
-
|
349
356
|
static VALUE rb_encdec__alloc(VALUE klass)
|
350
357
|
{
|
351
358
|
charlock_detector_t *detector;
|
352
359
|
UErrorCode status = U_ZERO_ERROR;
|
353
360
|
VALUE obj;
|
354
361
|
|
355
|
-
detector = calloc(1, sizeof(charlock_detector_t));
|
356
|
-
|
362
|
+
detector = (charlock_detector_t *) calloc(1, sizeof(charlock_detector_t));
|
363
|
+
obj = TypedData_Wrap_Struct(klass, &charlock_detector_type, (void *)detector);
|
357
364
|
|
358
365
|
detector->csd = ucsdet_open(&status);
|
359
366
|
if (U_FAILURE(status)) {
|
@@ -363,7 +370,7 @@ static VALUE rb_encdec__alloc(VALUE klass)
|
|
363
370
|
return obj;
|
364
371
|
}
|
365
372
|
|
366
|
-
void _init_charlock_encoding_detector()
|
373
|
+
void _init_charlock_encoding_detector(void)
|
367
374
|
{
|
368
375
|
rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject);
|
369
376
|
rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc);
|
data/ext/charlock_holmes/ext.c
CHANGED
@@ -1,15 +1,11 @@
|
|
1
1
|
#include "common.h"
|
2
2
|
|
3
|
-
extern void _init_charlock_encoding_detector();
|
4
|
-
extern void _init_charlock_converter();
|
5
|
-
extern void _init_charlock_transliterator();
|
6
|
-
|
7
3
|
VALUE rb_mCharlockHolmes;
|
8
4
|
|
9
|
-
void Init_charlock_holmes() {
|
5
|
+
void Init_charlock_holmes(void) {
|
10
6
|
rb_mCharlockHolmes = rb_define_module("CharlockHolmes");
|
11
7
|
|
12
8
|
_init_charlock_encoding_detector();
|
13
9
|
_init_charlock_converter();
|
14
10
|
_init_charlock_transliterator();
|
15
|
-
}
|
11
|
+
}
|
data/ext/charlock_holmes/extconf.rb
CHANGED
@@ -49,4 +49,93 @@ have_library 'icudata' or abort 'libicudata missing'
|
|
49
49
|
$CFLAGS << ' -Wall -funroll-loops'
|
50
50
|
$CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG']
|
51
51
|
|
52
|
+
minimal_program = <<~SRC
|
53
|
+
#include <unicode/translit.h>
|
54
|
+
int main() { return 0; }
|
55
|
+
SRC
|
56
|
+
|
57
|
+
# Pass -x c++ to force gcc to compile the test program
|
58
|
+
# as C++ (as it will end in .c by default).
|
59
|
+
compile_options = +"-x c++"
|
60
|
+
|
61
|
+
icu_requires_version_flag = checking_for("icu that requires explicit C++ version flag") do
|
62
|
+
!try_compile(minimal_program, compile_options)
|
63
|
+
end
|
64
|
+
|
65
|
+
if icu_requires_version_flag
|
66
|
+
abort "Cannot compile icu with your compiler: recent versions require C++17 support." unless %w[c++20 c++17 c++11 c++0x].any? do |std|
|
67
|
+
checking_for("icu that compiles with #{std} standard") do
|
68
|
+
flags = compile_options + " -std=#{std}"
|
69
|
+
if try_compile(minimal_program, flags)
|
70
|
+
$CPPFLAGS << flags
|
71
|
+
|
72
|
+
true
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def libflag_to_filename(ldflag)
|
79
|
+
case ldflag
|
80
|
+
when /\A-l(.+)/
|
81
|
+
"lib#{Regexp.last_match(1)}.#{$LIBEXT}"
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def resolve_static_library(libflag, dirs)
|
86
|
+
filename = libflag_to_filename(libflag)
|
87
|
+
|
88
|
+
dir = dirs.find { |path| File.exist?(File.join(path, filename)) }
|
89
|
+
|
90
|
+
raise "Unable to find #{filename} in #{dirs}" unless dir
|
91
|
+
|
92
|
+
File.join(dir, filename)
|
93
|
+
end
|
94
|
+
|
95
|
+
def substitute_static_libs(packages)
|
96
|
+
packages.each do |pkg|
|
97
|
+
unless pkg_config(pkg)
|
98
|
+
message = <<~MSG
|
99
|
+
Unable to run `pkg-config #{pkg}`.
|
100
|
+
|
101
|
+
Check that PKG_CONFIG_PATH includes #{pkg}.pc (or unset it if it's already set).
|
102
|
+
|
103
|
+
Current environment:
|
104
|
+
PKG_CONFIG_PATH=#{ENV['PKG_CONFIG_PATH']}
|
105
|
+
MSG
|
106
|
+
|
107
|
+
raise message
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
# First, find all the -l<lib> flags added by pkg-config. We want to drop
|
112
|
+
# these dynamically linked libraries and substitute them with the static libraries.
|
113
|
+
libflags = packages.map do |pkg|
|
114
|
+
pkg_config(pkg, 'libs-only-l')&.strip&.split(' ')
|
115
|
+
end.flatten.uniq
|
116
|
+
|
117
|
+
# To find where the static libraries live, we need to search the
|
118
|
+
# library paths given by the -L flag from pkg-config.
|
119
|
+
lib_paths = packages.map do |pkg|
|
120
|
+
include_path = pkg_config(pkg, 'libs-only-L')&.strip
|
121
|
+
include_path&.split(' ')&.map { |lib| lib.gsub(/^-L/, '') }
|
122
|
+
end.flatten.uniq
|
123
|
+
|
124
|
+
# Drop the -l<lib> flags and add in the static libraries.
|
125
|
+
new_libs = $libs.shellsplit
|
126
|
+
new_libs.reject! { |arg| libflags.include?(arg) }
|
127
|
+
libflags.each { |flag| new_libs << resolve_static_library(flag, lib_paths) }
|
128
|
+
$libs = new_libs.uniq.shelljoin
|
129
|
+
end
|
130
|
+
|
131
|
+
static_p = enable_config('static', false)
|
132
|
+
message "Static linking is #{static_p ? 'enabled' : 'disabled'}.\n"
|
133
|
+
|
134
|
+
if static_p
|
135
|
+
$CXXFLAGS << ' -fPIC'
|
136
|
+
ENV['PKG_CONFIG_ALLOW_SYSTEM_LIBS'] = '1'
|
137
|
+
|
138
|
+
substitute_static_libs(%w[icu-i18n icu-io icu-uc])
|
139
|
+
end
|
140
|
+
|
52
141
|
create_makefile 'charlock_holmes/charlock_holmes'
|
data/ext/charlock_holmes/transliterator.cpp
CHANGED
@@ -116,7 +116,7 @@ static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_
|
|
116
116
|
return rb_out;
|
117
117
|
}
|
118
118
|
|
119
|
-
void _init_charlock_transliterator() {
|
119
|
+
void _init_charlock_transliterator(void) {
|
120
120
|
#ifdef HAVE_RUBY_ENCODING_H
|
121
121
|
rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError"));
|
122
122
|
#endif
|
metadata
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: charlock_holmes
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.8
|
4
|
+
version: 0.7.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Lopez
|
8
8
|
- Vicent Martí
|
9
|
+
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date: 2024-
|
12
|
+
date: 2024-07-10 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
15
|
name: rake-compiler
|
@@ -74,6 +75,7 @@ homepage: https://github.com/brianmario/charlock_holmes
|
|
74
75
|
licenses:
|
75
76
|
- MIT
|
76
77
|
metadata: {}
|
78
|
+
post_install_message:
|
77
79
|
rdoc_options:
|
78
80
|
- "--charset=UTF-8"
|
79
81
|
require_paths:
|
@@ -89,7 +91,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
89
91
|
- !ruby/object:Gem::Version
|
90
92
|
version: '0'
|
91
93
|
requirements: []
|
92
|
-
rubygems_version: 3.
|
94
|
+
rubygems_version: 3.0.3.1
|
95
|
+
signing_key:
|
93
96
|
specification_version: 4
|
94
97
|
summary: Character encoding detection, brought to you by ICU
|
95
98
|
test_files: []
|