compact_enc_det 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef UTIL_STRING_UTIL_H_
|
18
|
+
#define UTIL_STRING_UTIL_H_
|
19
|
+
|
20
|
+
#include <string.h>
|
21
|
+
|
22
|
+
namespace base {

// Case-insensitive comparison of the NUL-terminated strings s1 and s2.
// Forwards to _stricmp on Windows and to the global ::strcasecmp elsewhere;
// the result is whatever that underlying routine returns (zero when the
// strings compare equal).
inline int strcasecmp(const char* s1, const char* s2) {
#if defined(_WIN32)
  return _stricmp(s1, s2);
#else
  return ::strcasecmp(s1, s2);
#endif
}

// Same as strcasecmp above, but looks at no more than the first n characters.
// Forwards to _strnicmp on Windows and to the global ::strncasecmp elsewhere.
inline int strncasecmp(const char* s1, const char* s2, size_t n) {
#if defined(_WIN32)
  return _strnicmp(s1, s2, n);
#else
  return ::strncasecmp(s1, s2, n);
#endif
}

}  // namespace base
|
43
|
+
|
44
|
+
#ifndef HAVE_MEMRCHR
|
45
|
+
#if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 2)))
|
46
|
+
#define HAVE_MEMRCHR
|
47
|
+
#endif
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#ifndef HAVE_MEMRCHR
|
51
|
+
// Fallback used when the C library does not provide memrchr(): scans the
// first n bytes of s from the end towards the start and returns a pointer to
// the last byte equal to c, or NULL when no byte matches (always NULL for
// n == 0).
//
// As with memchr(), c is converted to unsigned char before comparing, so a
// negative value (e.g. a sign-extended char such as 0xE9) or any int outside
// 0..255 still matches the corresponding byte instead of never matching.
inline void* memrchr(const void* s, int c, size_t n) {
  const unsigned char* p = (const unsigned char*) s;
  const unsigned char target = (unsigned char) c;
  // Start one past the last byte and pre-decrement, so p never moves before s.
  for (p += n; n > 0; n--) {
    if (*--p == target)
      return (void*) p;
  }
  return NULL;
}
|
59
|
+
#endif
|
60
|
+
|
61
|
+
#endif // UTIL_STRING_UTIL_H_
|
@@ -0,0 +1,66 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef UTIL_VARSETTER_H_
|
18
|
+
#define UTIL_VARSETTER_H_
|
19
|
+
|
20
|
+
//
|
21
|
+
// Use a VarSetter object to temporarily set an object of some sort to
|
22
|
+
// a particular value. When the VarSetter object is destructed, the
|
23
|
+
// underlying object will revert to its former value.
|
24
|
+
//
|
25
|
+
// Sample code:
|
26
|
+
//
|
27
|
+
#if 0
|
28
|
+
{
|
29
|
+
bool b = true;
|
30
|
+
{
|
31
|
+
VarSetter<bool> bool_setter(&b, false);
|
32
|
+
// Now b == false.
|
33
|
+
}
|
34
|
+
// Now b == true again.
|
35
|
+
}
|
36
|
+
#endif
|
37
|
+
|
38
|
+
// Scoped override of a variable: construction stores the variable's current
// value and assigns a replacement; destruction assigns the stored value back.
template <class C>
class VarSetter {
 public:
  // Remembers the current value of *object, then overwrites it with value.
  VarSetter(C* object, const C& value)
      : object_(object),
        old_value_(*object) {
    *object_ = value;
  }

  // Writes the remembered value back into the target.
  ~VarSetter() {
    *object_ = old_value_;
  }

 private:
  C* const object_;  // The variable being temporarily overridden.
  C old_value_;      // What the variable held when this setter was built.

  // Copying is disallowed: both members are private and have no body here.
  VarSetter(const VarSetter&);
  VarSetter& operator=(const VarSetter&);

  // VarSetters always live on the stack: the allocation and deallocation
  // operators are private so outside code cannot new/delete one.
  static void* operator new(size_t);
  static void* operator new[](size_t);  // Redundant, no default ctor
  static void operator delete(void*);
  static void operator delete[](void*);
};
|
65
|
+
|
66
|
+
#endif // UTIL_VARSETTER_H_
|
@@ -0,0 +1,100 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include "compact_enc_det/compact_enc_det/compact_enc_det.h"
|
3
|
+
#include "compact_enc_det/util/encodings/encodings.h"
|
4
|
+
|
5
|
+
// Define custom Ruby class CompactEncDet::DetectEncodingResult
|
6
|
+
// for the result of CompactEncDet.detect_encoding
|
7
|
+
void Init_detect_encoding_result(VALUE rb_mCompactEncDet);
|
8
|
+
static VALUE rb_cDetectEncodingResult;
|
9
|
+
|
10
|
+
// Reader behind DetectEncodingResult#encoding: returns the receiver's
// @encoding instance variable.
static VALUE detect_encoding_result_encoding(VALUE self)
{
  const VALUE stored_encoding = rb_iv_get(self, "@encoding");
  return stored_encoding;
}
|
14
|
+
|
15
|
+
// Reader behind DetectEncodingResult#bytes_consumed: returns the receiver's
// @bytes_consumed instance variable.
static VALUE detect_encoding_result_bytes_consumed(VALUE self)
{
  const VALUE stored_bytes_consumed = rb_iv_get(self, "@bytes_consumed");
  return stored_bytes_consumed;
}
|
19
|
+
|
20
|
+
// Reader behind DetectEncodingResult#is_reliable?: returns the receiver's
// @is_reliable instance variable.
static VALUE detect_encoding_result_is_reliable(VALUE self)
{
  const VALUE stored_is_reliable = rb_iv_get(self, "@is_reliable");
  return stored_is_reliable;
}
|
24
|
+
|
25
|
+
// Defines the DetectEncodingResult class under the given module, stores it in
// rb_cDetectEncodingResult, and attaches its three zero-argument readers.
void Init_detect_encoding_result(VALUE rb_mCompactEncDet)
{
  const VALUE result_class =
      rb_define_class_under(rb_mCompactEncDet, "DetectEncodingResult", rb_cObject);
  rb_cDetectEncodingResult = result_class;

  rb_define_method(result_class, "encoding",
                   RUBY_METHOD_FUNC(detect_encoding_result_encoding), 0);
  rb_define_method(result_class, "bytes_consumed",
                   RUBY_METHOD_FUNC(detect_encoding_result_bytes_consumed), 0);
  rb_define_method(result_class, "is_reliable?",
                   RUBY_METHOD_FUNC(detect_encoding_result_is_reliable), 0);
}
|
32
|
+
|
33
|
+
// Ruby wrapper CompactEncDet.detect_encoding
|
34
|
+
// for the CompactEncDet::DetectEncoding C++ function
|
35
|
+
// Implements CompactEncDet.detect_encoding(text, text_length, url_hint = nil,
//   http_charset_hint = nil, meta_charset_hint = nil, encoding_hint = nil,
//   language_hint = nil, corpus_type = nil, ignore_7bit_mail_encodings = nil).
//
// Runs CompactEncDet::DetectEncoding over the first text_length bytes of text
// and returns a DetectEncodingResult whose @encoding is the Ruby Encoding
// found via Encoding.find(<MIME name>), together with @bytes_consumed and
// @is_reliable.
//
// Raises ArgumentError when text_length is negative or larger than the byte
// size of text. Any exception raised by Encoding.find (for a name Ruby does
// not know) propagates to the caller unchanged.
static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
{
  VALUE ruby_text,
        ruby_text_length,
        url_hint,
        http_charset_hint,
        meta_charset_hint,
        encoding_hint,
        language_hint,
        corpus_type,
        ignore_7bit_mail_encodings;

  // Parse the Ruby arguments: 2 required, 7 optional (nil when omitted)
  rb_scan_args(argc, argv, "27",
               &ruby_text,
               &ruby_text_length,
               &url_hint,
               &http_charset_hint,
               &meta_charset_hint,
               &encoding_hint,
               &language_hint,
               &corpus_type,
               &ignore_7bit_mail_encodings);

  // Convert the hints and the length first. These conversions may call back
  // into Ruby (implicit #to_str / #to_int), so they are finished before a raw
  // pointer into the text buffer is taken below.
  const char *url_hint_cstr =
      NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint);
  const char *http_charset_hint_cstr =
      NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint);
  const char *meta_charset_hint_cstr =
      NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint);
  const int encoding_hint_value =
      NIL_P(encoding_hint) ? UNKNOWN_ENCODING : NUM2INT(encoding_hint);
  const Language language_hint_value =
      NIL_P(language_hint) ? UNKNOWN_LANGUAGE
                           : static_cast<Language>(NUM2INT(language_hint));
  const CompactEncDet::TextCorpusType corpus_type_value =
      NIL_P(corpus_type) ? CompactEncDet::WEB_CORPUS
                         : static_cast<CompactEncDet::TextCorpusType>(NUM2INT(corpus_type));
  const bool ignore_7bit_value =
      NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings);
  const int text_length = NUM2INT(ruby_text_length);

  // The text is treated as a raw byte buffer: UTF-16/UTF-32 input legitimately
  // contains NUL bytes, which StringValueCStr would reject with ArgumentError.
  StringValue(ruby_text);
  const long text_byte_size = RSTRING_LEN(ruby_text);

  // Never let the detector read outside the Ruby string's buffer.
  if (text_length < 0 || text_length > text_byte_size) {
    rb_raise(rb_eArgError,
             "text_length (%d) must be between 0 and the byte size of text (%ld)",
             text_length, text_byte_size);
  }
  const char *text = RSTRING_PTR(ruby_text);

  // Output variables, given defined values in case the detector leaves them
  // untouched.
  int bytes_consumed = 0;
  bool is_reliable = false;

  // Detect the encoding using CompactEncDet::DetectEncoding
  Encoding encoding = CompactEncDet::DetectEncoding(
      text, text_length,
      url_hint_cstr,
      http_charset_hint_cstr,
      meta_charset_hint_cstr,
      encoding_hint_value,
      language_hint_value,
      corpus_type_value,
      ignore_7bit_value,
      &bytes_consumed,
      &is_reliable);

  // Convert the encoding enum to string using MimeEncodingName
  const char* encoding_mime_name = MimeEncodingName(encoding);
  VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);

  // Look up the matching Ruby Encoding object by its MIME name
  VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);

  // Package everything into a DetectEncodingResult instance
  VALUE result = rb_class_new_instance(0, NULL, rb_cDetectEncodingResult);
  rb_iv_set(result, "@encoding", rb_encoding);
  rb_iv_set(result, "@bytes_consumed", rb_int_new(bytes_consumed));
  rb_iv_set(result, "@is_reliable", is_reliable ? Qtrue : Qfalse);
  return result;
}
|
94
|
+
|
95
|
+
extern "C" void Init_compact_enc_det()
|
96
|
+
{
|
97
|
+
VALUE rb_mCompactEncDet = rb_define_module("CompactEncDet");
|
98
|
+
Init_detect_encoding_result(rb_mCompactEncDet);
|
99
|
+
rb_define_module_function(rb_mCompactEncDet, "detect_encoding", RUBY_METHOD_FUNC(detect_encoding), -1);
|
100
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "mkmf"
require "rbconfig"

# Directory that holds the bundled Compact Encoding Detection sources and
# their autogen.sh build script.
compact_enc_det_path = File.expand_path("../compact_enc_det/compact_enc_det", __dir__)

host_cpu = RbConfig::CONFIG['host_cpu']
is_amd64 = host_cpu == 'x86_64' || host_cpu == 'amd64'

# On x86-64 the library is built with CXXFLAGS=-fPIC (presumably so it can be
# linked into the shared extension object -- confirm against autogen.sh).
compact_enc_det_build_env = is_amd64 ? { "CXXFLAGS" => "-fPIC" } : {}

# Run autogen.sh from inside the source directory. The directory is passed via
# :chdir instead of being interpolated into a shell "cd ... &&" string, so
# install paths containing spaces or shell metacharacters keep working.
unless system(compact_enc_det_build_env, "./autogen.sh", chdir: compact_enc_det_path)
  raise "Failed to build the compact_enc_det library"
end

$INCFLAGS << " -I$(srcdir)/compact_enc_det"
$LDFLAGS << " -L$(srcdir)/compact_enc_det/lib -lced"

create_makefile("compact_enc_det/compact_enc_det")
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: compact_enc_det
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Cloudaper
|
8
|
+
- Kryštof Korb
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2024-02-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: minitest
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '5.0'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '5.0'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rake-compiler
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '1.0'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '1.0'
|
42
|
+
description: Ruby bindings for Google's Compact Encoding Detection C++ library
|
43
|
+
email:
|
44
|
+
- hey@cloudaper.dev
|
45
|
+
- krystof@korb.cz
|
46
|
+
executables: []
|
47
|
+
extensions:
|
48
|
+
- ext/compact_enc_det/extconf.rb
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- ext/compact_enc_det/compact_enc_det.cc
|
52
|
+
- ext/compact_enc_det/compact_enc_det/CMakeLists.txt
|
53
|
+
- ext/compact_enc_det/compact_enc_det/LICENSE
|
54
|
+
- ext/compact_enc_det/compact_enc_det/README.md
|
55
|
+
- ext/compact_enc_det/compact_enc_det/autogen.sh
|
56
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc
|
57
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h
|
58
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc
|
59
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h
|
60
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h
|
61
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc
|
62
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h
|
63
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc
|
64
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc
|
65
|
+
- ext/compact_enc_det/compact_enc_det/util/basictypes.h
|
66
|
+
- ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h
|
67
|
+
- ext/compact_enc_det/compact_enc_det/util/commandlineflags.h
|
68
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc
|
69
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h
|
70
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h
|
71
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc
|
72
|
+
- ext/compact_enc_det/compact_enc_det/util/languages/languages.cc
|
73
|
+
- ext/compact_enc_det/compact_enc_det/util/languages/languages.h
|
74
|
+
- ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h
|
75
|
+
- ext/compact_enc_det/compact_enc_det/util/logging.h
|
76
|
+
- ext/compact_enc_det/compact_enc_det/util/port.h
|
77
|
+
- ext/compact_enc_det/compact_enc_det/util/string_util.h
|
78
|
+
- ext/compact_enc_det/compact_enc_det/util/varsetter.h
|
79
|
+
- ext/compact_enc_det/extconf.rb
|
80
|
+
- lib/compact_enc_det.rb
|
81
|
+
- lib/compact_enc_det/version.rb
|
82
|
+
homepage: https://github.com/cloudaper/compact_enc_det
|
83
|
+
licenses:
|
84
|
+
- MIT
|
85
|
+
- Apache-2.0
|
86
|
+
metadata: {}
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '2.7'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubygems_version: 3.5.3
|
103
|
+
signing_key:
|
104
|
+
specification_version: 4
|
105
|
+
summary: Compact Encoding Detection
|
106
|
+
test_files: []
|