compact_enc_det 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef UTIL_STRING_UTIL_H_
|
18
|
+
#define UTIL_STRING_UTIL_H_
|
19
|
+
|
20
|
+
#include <string.h>
|
21
|
+
|
22
|
+
namespace base {

// Case-insensitive comparison of the NUL-terminated strings s1 and s2.
// Forwards to _stricmp on Windows and to the global ::strcasecmp elsewhere;
// the result of the platform routine is returned unchanged (0 means the
// strings compared equal).
inline int strcasecmp(const char* s1, const char* s2) {
#if defined(_WIN32)
  const int order = _stricmp(s1, s2);
#else
  const int order = ::strcasecmp(s1, s2);
#endif
  return order;
}

// Same as strcasecmp above, but looks at no more than the first n
// characters. Forwards to _strnicmp on Windows and to the global
// ::strncasecmp elsewhere.
inline int strncasecmp(const char* s1, const char* s2, size_t n) {
#if defined(_WIN32)
  const int order = _strnicmp(s1, s2, n);
#else
  const int order = ::strncasecmp(s1, s2, n);
#endif
  return order;
}

}  // namespace base
|
43
|
+
|
44
|
+
#ifndef HAVE_MEMRCHR
|
45
|
+
#if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 2)))
|
46
|
+
#define HAVE_MEMRCHR
|
47
|
+
#endif
|
48
|
+
#endif
|
49
|
+
|
50
|
+
#ifndef HAVE_MEMRCHR
|
51
|
+
// Fallback used when the platform does not supply memrchr().
//
// Scans the first n bytes of s from the end towards the start and returns a
// pointer to the last byte equal to c, or NULL when no byte matches (which
// includes n == 0).
//
// c is converted to unsigned char before comparing, the same way memchr()
// treats its argument. Without the conversion a caller passing a plain
// (signed) char holding a byte >= 0x80 would hand in a negative int that can
// never equal an unsigned char, and the byte would not be found.
inline void* memrchr(const void* s, int c, size_t n) {
  const unsigned char* p = (const unsigned char*) s;
  const unsigned char target = (unsigned char) c;
  for (p += n; n > 0; n--) {
    if (*--p == target)
      return (void*) p;
  }
  return NULL;
}
|
59
|
+
#endif
|
60
|
+
|
61
|
+
#endif // UTIL_STRING_UTIL_H_
|
@@ -0,0 +1,66 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef UTIL_VARSETTER_H_
|
18
|
+
#define UTIL_VARSETTER_H_
|
19
|
+
|
20
|
+
//
|
21
|
+
// Use a VarSetter object to temporarily set an object of some sort to
|
22
|
+
// a particular value. When the VarSetter object is destructed, the
|
23
|
+
// underlying object will revert to its former value.
|
24
|
+
//
|
25
|
+
// Sample code:
|
26
|
+
//
|
27
|
+
#if 0
|
28
|
+
{
|
29
|
+
bool b = true;
|
30
|
+
{
|
31
|
+
VarSetter<bool> bool_setter(&b, false);
|
32
|
+
// Now b == false.
|
33
|
+
}
|
34
|
+
// Now b == true again.
|
35
|
+
}
|
36
|
+
#endif
|
37
|
+
|
38
|
+
template <typename C>
class VarSetter {
 public:
  // Saves a copy of the current value of *object, then overwrites *object
  // with value.
  VarSetter(C* object, const C& value)
      : object_(object), old_value_(*object) {
    *object_ = value;
  }

  // Writes the saved value back into the target object.
  ~VarSetter() {
    *object_ = old_value_;
  }

 private:
  C* const object_;  // The object being temporarily overridden.
  C old_value_;      // Copy of the object's value taken at construction.

  // Copying is disallowed: both operations are private and not defined here.
  VarSetter(const VarSetter&);
  VarSetter& operator=(const VarSetter&);

  // VarSetters always live on the stack, so the heap allocation and
  // deallocation operators are private as well.
  static void* operator new(size_t);
  static void* operator new[](size_t);  // Redundant, no default ctor
  static void operator delete(void*);
  static void operator delete[](void*);
};
|
65
|
+
|
66
|
+
#endif // UTIL_VARSETTER_H_
|
@@ -0,0 +1,100 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include "compact_enc_det/compact_enc_det/compact_enc_det.h"
|
3
|
+
#include "compact_enc_det/util/encodings/encodings.h"
|
4
|
+
|
5
|
+
// Define custom Ruby class CompactEncDet::DetectEncodingResult
|
6
|
+
// for the result of CompactEncDet.detect_encoding
|
7
|
+
void Init_detect_encoding_result(VALUE rb_mCompactEncDet);
|
8
|
+
static VALUE rb_cDetectEncodingResult;
|
9
|
+
|
10
|
+
// Reader method: returns the value of the receiver's @encoding instance
// variable.
static VALUE detect_encoding_result_encoding(VALUE self)
{
  const VALUE stored_encoding = rb_iv_get(self, "@encoding");
  return stored_encoding;
}
|
14
|
+
|
15
|
+
// Reader method: returns the value of the receiver's @bytes_consumed
// instance variable.
static VALUE detect_encoding_result_bytes_consumed(VALUE self)
{
  const VALUE stored_bytes_consumed = rb_iv_get(self, "@bytes_consumed");
  return stored_bytes_consumed;
}
|
19
|
+
|
20
|
+
// Reader method: returns the value of the receiver's @is_reliable instance
// variable.
static VALUE detect_encoding_result_is_reliable(VALUE self)
{
  const VALUE stored_is_reliable = rb_iv_get(self, "@is_reliable");
  return stored_is_reliable;
}
|
24
|
+
|
25
|
+
// Defines the DetectEncodingResult class (a subclass of Object) inside the
// given module, remembers it in rb_cDetectEncodingResult, and registers its
// three zero-argument reader methods.
void Init_detect_encoding_result(VALUE rb_mCompactEncDet)
{
  const VALUE result_class =
      rb_define_class_under(rb_mCompactEncDet, "DetectEncodingResult", rb_cObject);
  rb_cDetectEncodingResult = result_class;

  rb_define_method(result_class, "encoding",
                   RUBY_METHOD_FUNC(detect_encoding_result_encoding), 0);
  rb_define_method(result_class, "bytes_consumed",
                   RUBY_METHOD_FUNC(detect_encoding_result_bytes_consumed), 0);
  rb_define_method(result_class, "is_reliable?",
                   RUBY_METHOD_FUNC(detect_encoding_result_is_reliable), 0);
}
|
32
|
+
|
33
|
+
// Ruby wrapper CompactEncDet.detect_encoding
|
34
|
+
// for the CompactEncDet::DetectEncoding C++ function
|
35
|
+
static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
|
36
|
+
{
|
37
|
+
VALUE ruby_text,
|
38
|
+
ruby_text_length,
|
39
|
+
url_hint,
|
40
|
+
http_charset_hint,
|
41
|
+
meta_charset_hint,
|
42
|
+
encoding_hint,
|
43
|
+
language_hint,
|
44
|
+
corpus_type,
|
45
|
+
ignore_7bit_mail_encodings;
|
46
|
+
|
47
|
+
// Parse the Ruby arguments
|
48
|
+
rb_scan_args(argc, argv, "27",
|
49
|
+
&ruby_text,
|
50
|
+
&ruby_text_length,
|
51
|
+
&url_hint,
|
52
|
+
&http_charset_hint,
|
53
|
+
&meta_charset_hint,
|
54
|
+
&encoding_hint,
|
55
|
+
&language_hint,
|
56
|
+
&corpus_type,
|
57
|
+
&ignore_7bit_mail_encodings);
|
58
|
+
|
59
|
+
// Convert the Ruby values to C types
|
60
|
+
const char *text = StringValueCStr(ruby_text);
|
61
|
+
const int text_length = NUM2INT(ruby_text_length);
|
62
|
+
|
63
|
+
// Declare the output variables
|
64
|
+
int bytes_consumed;
|
65
|
+
bool is_reliable;
|
66
|
+
|
67
|
+
// Detect the encoding using CompactEncDet::DetectEncoding
|
68
|
+
Encoding encoding = CompactEncDet::DetectEncoding(
|
69
|
+
text, text_length,
|
70
|
+
NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint),
|
71
|
+
NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint),
|
72
|
+
NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint),
|
73
|
+
NIL_P(encoding_hint) ? UNKNOWN_ENCODING : NUM2INT(encoding_hint),
|
74
|
+
NIL_P(language_hint) ? UNKNOWN_LANGUAGE : static_cast<Language>(NUM2INT(language_hint)),
|
75
|
+
NIL_P(corpus_type) ? CompactEncDet::WEB_CORPUS : static_cast<CompactEncDet::TextCorpusType>(NUM2INT(corpus_type)),
|
76
|
+
NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings),
|
77
|
+
&bytes_consumed,
|
78
|
+
&is_reliable);
|
79
|
+
|
80
|
+
// Convert the encoding enum to string using MimeEncodingName
|
81
|
+
const char* encoding_mime_name = MimeEncodingName(encoding);
|
82
|
+
VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);
|
83
|
+
|
84
|
+
// Find the Ruby Encoding class
|
85
|
+
VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);
|
86
|
+
|
87
|
+
// Return the detected encoding as a Ruby class
|
88
|
+
VALUE result = rb_class_new_instance(0, NULL, rb_cDetectEncodingResult);
|
89
|
+
rb_iv_set(result, "@encoding", rb_encoding);
|
90
|
+
rb_iv_set(result, "@bytes_consumed", rb_int_new(bytes_consumed));
|
91
|
+
rb_iv_set(result, "@is_reliable", is_reliable ? Qtrue : Qfalse);
|
92
|
+
return result;
|
93
|
+
}
|
94
|
+
|
95
|
+
extern "C" void Init_compact_enc_det()
|
96
|
+
{
|
97
|
+
VALUE rb_mCompactEncDet = rb_define_module("CompactEncDet");
|
98
|
+
Init_detect_encoding_result(rb_mCompactEncDet);
|
99
|
+
rb_define_module_function(rb_mCompactEncDet, "detect_encoding", RUBY_METHOD_FUNC(detect_encoding), -1);
|
100
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require "mkmf"
require "rbconfig"

# Directory of the bundled compact_enc_det sources (where autogen.sh lives).
compact_enc_det_path = File.expand_path("../compact_enc_det/compact_enc_det", __dir__)

host_cpu = RbConfig::CONFIG['host_cpu']
is_amd64 = host_cpu == 'x86_64' || host_cpu == 'amd64'

# Extra environment for the build script: -fPIC is requested on amd64 hosts.
compact_enc_det_build_env = is_amd64 ? { "CXXFLAGS" => "-fPIC" } : {}

# Run autogen.sh from inside the source directory. The directory is entered
# with Dir.chdir and the environment is passed as a hash instead of building
# a "cd <path> && VAR=... ./autogen.sh" shell string, so install paths that
# contain spaces or shell metacharacters keep working.
compact_enc_det_built = Dir.chdir(compact_enc_det_path) do
  system(compact_enc_det_build_env, "./autogen.sh")
end

unless compact_enc_det_built
  raise "Failed to build the compact_enc_det library"
end

# Let the extension find the bundled headers and link against the built
# ced library.
$INCFLAGS << " -I$(srcdir)/compact_enc_det"
$LDFLAGS << " -L$(srcdir)/compact_enc_det/lib -lced"

create_makefile("compact_enc_det/compact_enc_det")
|
metadata
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: compact_enc_det
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Cloudaper
|
8
|
+
- Kryštof Korb
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2024-02-04 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: minitest
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - "~>"
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: '5.0'
|
21
|
+
type: :development
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - "~>"
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: '5.0'
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: rake-compiler
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: '1.0'
|
35
|
+
type: :development
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: '1.0'
|
42
|
+
description: Ruby bindings for Google's Compact Encoding Detection C++ library
|
43
|
+
email:
|
44
|
+
- hey@cloudaper.dev
|
45
|
+
- krystof@korb.cz
|
46
|
+
executables: []
|
47
|
+
extensions:
|
48
|
+
- ext/compact_enc_det/extconf.rb
|
49
|
+
extra_rdoc_files: []
|
50
|
+
files:
|
51
|
+
- ext/compact_enc_det/compact_enc_det.cc
|
52
|
+
- ext/compact_enc_det/compact_enc_det/CMakeLists.txt
|
53
|
+
- ext/compact_enc_det/compact_enc_det/LICENSE
|
54
|
+
- ext/compact_enc_det/compact_enc_det/README.md
|
55
|
+
- ext/compact_enc_det/compact_enc_det/autogen.sh
|
56
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc
|
57
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h
|
58
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc
|
59
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h
|
60
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h
|
61
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc
|
62
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h
|
63
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc
|
64
|
+
- ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc
|
65
|
+
- ext/compact_enc_det/compact_enc_det/util/basictypes.h
|
66
|
+
- ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h
|
67
|
+
- ext/compact_enc_det/compact_enc_det/util/commandlineflags.h
|
68
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc
|
69
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h
|
70
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h
|
71
|
+
- ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc
|
72
|
+
- ext/compact_enc_det/compact_enc_det/util/languages/languages.cc
|
73
|
+
- ext/compact_enc_det/compact_enc_det/util/languages/languages.h
|
74
|
+
- ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h
|
75
|
+
- ext/compact_enc_det/compact_enc_det/util/logging.h
|
76
|
+
- ext/compact_enc_det/compact_enc_det/util/port.h
|
77
|
+
- ext/compact_enc_det/compact_enc_det/util/string_util.h
|
78
|
+
- ext/compact_enc_det/compact_enc_det/util/varsetter.h
|
79
|
+
- ext/compact_enc_det/extconf.rb
|
80
|
+
- lib/compact_enc_det.rb
|
81
|
+
- lib/compact_enc_det/version.rb
|
82
|
+
homepage: https://github.com/cloudaper/compact_enc_det
|
83
|
+
licenses:
|
84
|
+
- MIT
|
85
|
+
- Apache-2.0
|
86
|
+
metadata: {}
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options: []
|
89
|
+
require_paths:
|
90
|
+
- lib
|
91
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '2.7'
|
96
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
|
+
requirements:
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: '0'
|
101
|
+
requirements: []
|
102
|
+
rubygems_version: 3.5.3
|
103
|
+
signing_key:
|
104
|
+
specification_version: 4
|
105
|
+
summary: Compact Encoding Detection
|
106
|
+
test_files: []
|