compact_enc_det 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,61 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_STRING_UTIL_H_
18
+ #define UTIL_STRING_UTIL_H_
19
+
20
+ #include <string.h>
21
+
22
namespace base {

// Portable, case-insensitive comparison of two NUL-terminated strings.
// Windows builds forward to _stricmp; every other build forwards to the
// global ::strcasecmp.  The result is exactly what the underlying routine
// returns: 0 when the strings match ignoring case, otherwise a value whose
// sign gives the lexicographic order of s1 relative to s2.
inline int strcasecmp(const char* s1, const char* s2) {
#if defined(_WIN32)
  return _stricmp(s1, s2);
#else
  return ::strcasecmp(s1, s2);
#endif
}

// Same comparison as above, limited to at most the first n characters.
// Windows builds forward to _strnicmp; every other build forwards to the
// global ::strncasecmp.
inline int strncasecmp(const char* s1, const char* s2, size_t n) {
#if defined(_WIN32)
  return _strnicmp(s1, s2, n);
#else
  return ::strncasecmp(s1, s2, n);
#endif
}

}  // namespace base
43
+
44
// glibc 2.2 and later already ship memrchr; only supply the fallback below
// when no implementation has been announced via HAVE_MEMRCHR.
#ifndef HAVE_MEMRCHR
#if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 2)))
#define HAVE_MEMRCHR
#endif
#endif

#ifndef HAVE_MEMRCHR
// Fallback memrchr: scans the n bytes starting at s from the end towards the
// beginning and returns a pointer to the last byte equal to c, or NULL when
// no byte matches (including when n is 0).
//
// c is converted to unsigned char before comparing, as the libc routine does.
// Comparing the promoted byte (0..255) against the raw int would never match
// a negative c (for example a sign-extended char such as '\xFF') or a value
// above 255.
inline void* memrchr(const void* s, int c, size_t n) {
  const unsigned char target = static_cast<unsigned char>(c);
  const unsigned char* p = static_cast<const unsigned char*>(s) + n;
  for (; n > 0; --n) {
    if (*--p == target)
      return const_cast<unsigned char*>(p);
  }
  return NULL;
}
#endif
60
+
61
+ #endif // UTIL_STRING_UTIL_H_
@@ -0,0 +1,66 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_VARSETTER_H_
18
+ #define UTIL_VARSETTER_H_
19
+
20
+ //
21
+ // Use a VarSetter object to temporarily set an object of some sort to
22
+ // a particular value. When the VarSetter object is destructed, the
23
+ // underlying object will revert to its former value.
24
+ //
25
+ // Sample code:
26
+ //
27
+ #if 0
28
+ {
29
+ bool b = true;
30
+ {
31
+ VarSetter<bool> bool_setter(&b, false);
32
+ // Now b == false.
33
+ }
34
+ // Now b == true again.
35
+ }
36
+ #endif
37
+
38
// Scoped override of a variable: construction stores `value` into *object
// after saving the previous contents, destruction writes the saved contents
// back.
template <typename C>
class VarSetter {
 public:
  // Saves the current *object, then overwrites it with `value`.
  VarSetter(C* object, const C& value)
      : object_(object), old_value_(*object) {
    *object_ = value;
  }

  // Puts the saved value back into the target.
  ~VarSetter() { *object_ = old_value_; }

 private:
  // Copying is not allowed: declared here, never defined.
  VarSetter(const VarSetter&);
  VarSetter& operator=(const VarSetter&);

  // Heap allocation is not allowed either; instances are meant to be
  // stack objects.  The array form is redundant because there is no
  // default constructor.
  static void* operator new(size_t);
  static void* operator new[](size_t);
  static void operator delete(void*);
  static void operator delete[](void*);

  C* const object_;  // variable being overridden
  C old_value_;      // contents to restore on destruction
};
65
+
66
+ #endif // UTIL_VARSETTER_H_
@@ -0,0 +1,100 @@
1
+ #include <ruby.h>
2
+ #include "compact_enc_det/compact_enc_det/compact_enc_det.h"
3
+ #include "compact_enc_det/util/encodings/encodings.h"
4
+
5
+ // Define custom Ruby class CompactEncDet::DetectEncodingResult
6
+ // for the result of CompactEncDet.detect_encoding
7
+ void Init_detect_encoding_result(VALUE rb_mCompactEncDet);
8
+ static VALUE rb_cDetectEncodingResult;
9
+
10
+ static VALUE detect_encoding_result_encoding(VALUE self)
11
+ {
12
+ return rb_iv_get(self, "@encoding");
13
+ }
14
+
15
+ static VALUE detect_encoding_result_bytes_consumed(VALUE self)
16
+ {
17
+ return rb_iv_get(self, "@bytes_consumed");
18
+ }
19
+
20
+ static VALUE detect_encoding_result_is_reliable(VALUE self)
21
+ {
22
+ return rb_iv_get(self, "@is_reliable");
23
+ }
24
+
25
+ void Init_detect_encoding_result(VALUE rb_mCompactEncDet)
26
+ {
27
+ rb_cDetectEncodingResult = rb_define_class_under(rb_mCompactEncDet, "DetectEncodingResult", rb_cObject);
28
+ rb_define_method(rb_cDetectEncodingResult, "encoding", RUBY_METHOD_FUNC(detect_encoding_result_encoding), 0);
29
+ rb_define_method(rb_cDetectEncodingResult, "bytes_consumed", RUBY_METHOD_FUNC(detect_encoding_result_bytes_consumed), 0);
30
+ rb_define_method(rb_cDetectEncodingResult, "is_reliable?", RUBY_METHOD_FUNC(detect_encoding_result_is_reliable), 0);
31
+ }
32
+
33
+ // Ruby wrapper CompactEncDet.detect_encoding
34
+ // for the CompactEncDet::DetectEncoding C++ function
35
+ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
36
+ {
37
+ VALUE ruby_text,
38
+ ruby_text_length,
39
+ url_hint,
40
+ http_charset_hint,
41
+ meta_charset_hint,
42
+ encoding_hint,
43
+ language_hint,
44
+ corpus_type,
45
+ ignore_7bit_mail_encodings;
46
+
47
+ // Parse the Ruby arguments
48
+ rb_scan_args(argc, argv, "27",
49
+ &ruby_text,
50
+ &ruby_text_length,
51
+ &url_hint,
52
+ &http_charset_hint,
53
+ &meta_charset_hint,
54
+ &encoding_hint,
55
+ &language_hint,
56
+ &corpus_type,
57
+ &ignore_7bit_mail_encodings);
58
+
59
+ // Convert the Ruby values to C types
60
+ const char *text = StringValueCStr(ruby_text);
61
+ const int text_length = NUM2INT(ruby_text_length);
62
+
63
+ // Declare the output variables
64
+ int bytes_consumed;
65
+ bool is_reliable;
66
+
67
+ // Detect the encoding using CompactEncDet::DetectEncoding
68
+ Encoding encoding = CompactEncDet::DetectEncoding(
69
+ text, text_length,
70
+ NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint),
71
+ NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint),
72
+ NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint),
73
+ NIL_P(encoding_hint) ? UNKNOWN_ENCODING : NUM2INT(encoding_hint),
74
+ NIL_P(language_hint) ? UNKNOWN_LANGUAGE : static_cast<Language>(NUM2INT(language_hint)),
75
+ NIL_P(corpus_type) ? CompactEncDet::WEB_CORPUS : static_cast<CompactEncDet::TextCorpusType>(NUM2INT(corpus_type)),
76
+ NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings),
77
+ &bytes_consumed,
78
+ &is_reliable);
79
+
80
+ // Convert the encoding enum to string using MimeEncodingName
81
+ const char* encoding_mime_name = MimeEncodingName(encoding);
82
+ VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);
83
+
84
+ // Find the Ruby Encoding class
85
+ VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);
86
+
87
+ // Return the detected encoding as a Ruby class
88
+ VALUE result = rb_class_new_instance(0, NULL, rb_cDetectEncodingResult);
89
+ rb_iv_set(result, "@encoding", rb_encoding);
90
+ rb_iv_set(result, "@bytes_consumed", rb_int_new(bytes_consumed));
91
+ rb_iv_set(result, "@is_reliable", is_reliable ? Qtrue : Qfalse);
92
+ return result;
93
+ }
94
+
95
+ extern "C" void Init_compact_enc_det()
96
+ {
97
+ VALUE rb_mCompactEncDet = rb_define_module("CompactEncDet");
98
+ Init_detect_encoding_result(rb_mCompactEncDet);
99
+ rb_define_module_function(rb_mCompactEncDet, "detect_encoding", RUBY_METHOD_FUNC(detect_encoding), -1);
100
+ }
@@ -0,0 +1,20 @@
1
require "mkmf"
require "rbconfig"

# Directory that holds the bundled compact_enc_det sources and autogen.sh.
compact_enc_det_path = File.expand_path("../compact_enc_det/compact_enc_det", __dir__)

host_cpu = RbConfig::CONFIG['host_cpu']
is_amd64 = host_cpu == 'x86_64' || host_cpu == 'amd64'

# On x86_64/amd64 hosts the library is built with CXXFLAGS="-fPIC";
# other hosts get no extra environment.
build_env = is_amd64 ? { "CXXFLAGS" => "-fPIC" } : {}

# Run autogen.sh inside the source directory without assembling a shell
# command line. The previous "cd #{path} && ./autogen.sh" string broke when
# the install path contained spaces or shell metacharacters.
unless system(build_env, "./autogen.sh", chdir: compact_enc_det_path)
  raise "Failed to build the compact_enc_det library"
end

# Point the extension build at the bundled headers and the built library.
$INCFLAGS << " -I$(srcdir)/compact_enc_det"
$LDFLAGS << " -L$(srcdir)/compact_enc_det/lib -lced"

create_makefile("compact_enc_det/compact_enc_det")
@@ -0,0 +1,3 @@
1
module CompactEncDet
  # Version string of the gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,2 @@
1
+ require "compact_enc_det/compact_enc_det"
2
+ require "compact_enc_det/version"
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compact_enc_det
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Cloudaper
8
+ - Kryštof Korb
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2024-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: minitest
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '5.0'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '5.0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake-compiler
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '1.0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '1.0'
42
+ description: Ruby bindings for Google's Compact Encoding Detection C++ library
43
+ email:
44
+ - hey@cloudaper.dev
45
+ - krystof@korb.cz
46
+ executables: []
47
+ extensions:
48
+ - ext/compact_enc_det/extconf.rb
49
+ extra_rdoc_files: []
50
+ files:
51
+ - ext/compact_enc_det/compact_enc_det.cc
52
+ - ext/compact_enc_det/compact_enc_det/CMakeLists.txt
53
+ - ext/compact_enc_det/compact_enc_det/LICENSE
54
+ - ext/compact_enc_det/compact_enc_det/README.md
55
+ - ext/compact_enc_det/compact_enc_det/autogen.sh
56
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc
57
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h
58
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc
59
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h
60
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h
61
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc
62
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h
63
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc
64
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc
65
+ - ext/compact_enc_det/compact_enc_det/util/basictypes.h
66
+ - ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h
67
+ - ext/compact_enc_det/compact_enc_det/util/commandlineflags.h
68
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc
69
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h
70
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h
71
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc
72
+ - ext/compact_enc_det/compact_enc_det/util/languages/languages.cc
73
+ - ext/compact_enc_det/compact_enc_det/util/languages/languages.h
74
+ - ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h
75
+ - ext/compact_enc_det/compact_enc_det/util/logging.h
76
+ - ext/compact_enc_det/compact_enc_det/util/port.h
77
+ - ext/compact_enc_det/compact_enc_det/util/string_util.h
78
+ - ext/compact_enc_det/compact_enc_det/util/varsetter.h
79
+ - ext/compact_enc_det/extconf.rb
80
+ - lib/compact_enc_det.rb
81
+ - lib/compact_enc_det/version.rb
82
+ homepage: https://github.com/cloudaper/compact_enc_det
83
+ licenses:
84
+ - MIT
85
+ - Apache-2.0
86
+ metadata: {}
87
+ post_install_message:
88
+ rdoc_options: []
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '2.7'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ requirements: []
102
+ rubygems_version: 3.5.3
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Compact Encoding Detection
106
+ test_files: []