compact_enc_det 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,61 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_STRING_UTIL_H_
18
+ #define UTIL_STRING_UTIL_H_
19
+
20
+ #include <string.h>
21
+
22
namespace base {

// Case-insensitive C-string comparison under one portable name. Each helper
// forwards to the platform routine: _stricmp/_strnicmp on Windows, and the
// global strcasecmp/strncasecmp everywhere else. The result is whatever that
// routine returns; 0 means the two strings compare equal.

// Compares s1 and s2 without regard to case.
inline int strcasecmp(const char* s1, const char* s2) {
#if defined(_WIN32)
  return _stricmp(s1, s2);
#else
  return ::strcasecmp(s1, s2);
#endif
}

// Same as strcasecmp, but hands n to the platform routine as the length limit.
inline int strncasecmp(const char* s1, const char* s2, size_t n) {
#if defined(_WIN32)
  return _strnicmp(s1, s2, n);
#else
  return ::strncasecmp(s1, s2, n);
#endif
}

}  // namespace base
43
+
44
+ #ifndef HAVE_MEMRCHR
45
+ #if defined(__GLIBC__) && ((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ >= 2)))
46
+ #define HAVE_MEMRCHR
47
+ #endif
48
+ #endif
49
+
50
+ #ifndef HAVE_MEMRCHR
51
// Fallback memrchr: scans the first n bytes of s backwards and returns a
// pointer to the last byte equal to c, or NULL when no byte matches
// (including n == 0).
//
// c is converted to unsigned char before comparing. Without the conversion a
// negative c (for example a signed char holding a byte >= 0x80) or a c above
// 255 could never equal a byte, which is always read here as 0..255.
inline void* memrchr(const void* s, int c, size_t n) {
  const unsigned char* bytes = (const unsigned char*) s;
  const unsigned char wanted = (unsigned char) c;
  // Index from the end instead of forming bytes + n up front, so no pointer
  // arithmetic happens at all when n == 0.
  while (n > 0) {
    --n;
    if (bytes[n] == wanted)
      return (void*) (bytes + n);
  }
  return NULL;
}
59
+ #endif
60
+
61
+ #endif // UTIL_STRING_UTIL_H_
@@ -0,0 +1,66 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef UTIL_VARSETTER_H_
18
+ #define UTIL_VARSETTER_H_
19
+
20
+ //
21
+ // Use a VarSetter object to temporarily set an object of some sort to
22
+ // a particular value. When the VarSetter object is destructed, the
23
+ // underlying object will revert to its former value.
24
+ //
25
+ // Sample code:
26
+ //
27
+ #if 0
28
+ {
29
+ bool b = true;
30
+ {
31
+ VarSetter<bool> bool_setter(&b, false);
32
+ // Now b == false.
33
+ }
34
+ // Now b == true again.
35
+ }
36
+ #endif
37
+
38
// Scoped override of a variable: the constructor remembers the current value
// of *object and overwrites it with value; the destructor writes the
// remembered value back.
template <class C>
class VarSetter {
 public:
  // Saves *object, then assigns value to it.
  VarSetter(C* object, const C& value)
      : target_(object),
        saved_(*object) {
    *target_ = value;
  }

  // Puts the saved value back into the target.
  ~VarSetter() {
    *target_ = saved_;
  }

 private:
  // Copying is disallowed: declared private and never defined here.
  VarSetter(const VarSetter&);
  VarSetter& operator=(const VarSetter&);

  // VarSetters are meant to live on the stack, so the allocation operators
  // are private as well.
  static void* operator new(size_t);
  static void* operator new[](size_t);  // Redundant, no default ctor
  static void operator delete(void*);
  static void operator delete[](void*);

  C* const target_;  // Variable being overridden.
  C saved_;          // Its value at construction time.
};
65
+
66
+ #endif // UTIL_VARSETTER_H_
@@ -0,0 +1,100 @@
1
+ #include <ruby.h>
2
+ #include "compact_enc_det/compact_enc_det/compact_enc_det.h"
3
+ #include "compact_enc_det/util/encodings/encodings.h"
4
+
5
+ // Define custom Ruby class CompactEncDet::DetectEncodingResult
6
+ // for the result of CompactEncDet.detect_encoding
7
+ void Init_detect_encoding_result(VALUE rb_mCompactEncDet);
8
+ static VALUE rb_cDetectEncodingResult;
9
+
10
+ static VALUE detect_encoding_result_encoding(VALUE self)
11
+ {
12
+ return rb_iv_get(self, "@encoding");
13
+ }
14
+
15
+ static VALUE detect_encoding_result_bytes_consumed(VALUE self)
16
+ {
17
+ return rb_iv_get(self, "@bytes_consumed");
18
+ }
19
+
20
+ static VALUE detect_encoding_result_is_reliable(VALUE self)
21
+ {
22
+ return rb_iv_get(self, "@is_reliable");
23
+ }
24
+
25
+ void Init_detect_encoding_result(VALUE rb_mCompactEncDet)
26
+ {
27
+ rb_cDetectEncodingResult = rb_define_class_under(rb_mCompactEncDet, "DetectEncodingResult", rb_cObject);
28
+ rb_define_method(rb_cDetectEncodingResult, "encoding", RUBY_METHOD_FUNC(detect_encoding_result_encoding), 0);
29
+ rb_define_method(rb_cDetectEncodingResult, "bytes_consumed", RUBY_METHOD_FUNC(detect_encoding_result_bytes_consumed), 0);
30
+ rb_define_method(rb_cDetectEncodingResult, "is_reliable?", RUBY_METHOD_FUNC(detect_encoding_result_is_reliable), 0);
31
+ }
32
+
33
+ // Ruby wrapper CompactEncDet.detect_encoding
34
+ // for the CompactEncDet::DetectEncoding C++ function
35
+ static VALUE detect_encoding(int argc, VALUE *argv, VALUE self)
36
+ {
37
+ VALUE ruby_text,
38
+ ruby_text_length,
39
+ url_hint,
40
+ http_charset_hint,
41
+ meta_charset_hint,
42
+ encoding_hint,
43
+ language_hint,
44
+ corpus_type,
45
+ ignore_7bit_mail_encodings;
46
+
47
+ // Parse the Ruby arguments
48
+ rb_scan_args(argc, argv, "27",
49
+ &ruby_text,
50
+ &ruby_text_length,
51
+ &url_hint,
52
+ &http_charset_hint,
53
+ &meta_charset_hint,
54
+ &encoding_hint,
55
+ &language_hint,
56
+ &corpus_type,
57
+ &ignore_7bit_mail_encodings);
58
+
59
+ // Convert the Ruby values to C types
60
+ const char *text = StringValueCStr(ruby_text);
61
+ const int text_length = NUM2INT(ruby_text_length);
62
+
63
+ // Declare the output variables
64
+ int bytes_consumed;
65
+ bool is_reliable;
66
+
67
+ // Detect the encoding using CompactEncDet::DetectEncoding
68
+ Encoding encoding = CompactEncDet::DetectEncoding(
69
+ text, text_length,
70
+ NIL_P(url_hint) ? nullptr : StringValueCStr(url_hint),
71
+ NIL_P(http_charset_hint) ? nullptr : StringValueCStr(http_charset_hint),
72
+ NIL_P(meta_charset_hint) ? nullptr : StringValueCStr(meta_charset_hint),
73
+ NIL_P(encoding_hint) ? UNKNOWN_ENCODING : NUM2INT(encoding_hint),
74
+ NIL_P(language_hint) ? UNKNOWN_LANGUAGE : static_cast<Language>(NUM2INT(language_hint)),
75
+ NIL_P(corpus_type) ? CompactEncDet::WEB_CORPUS : static_cast<CompactEncDet::TextCorpusType>(NUM2INT(corpus_type)),
76
+ NIL_P(ignore_7bit_mail_encodings) ? false : RTEST(ignore_7bit_mail_encodings),
77
+ &bytes_consumed,
78
+ &is_reliable);
79
+
80
+ // Convert the encoding enum to string using MimeEncodingName
81
+ const char* encoding_mime_name = MimeEncodingName(encoding);
82
+ VALUE rb_encoding_mime_name = rb_str_new_cstr(encoding_mime_name);
83
+
84
+ // Find the Ruby Encoding class
85
+ VALUE rb_encoding = rb_funcall(rb_cEncoding, rb_intern("find"), 1, rb_encoding_mime_name);
86
+
87
+ // Return the detected encoding as a Ruby class
88
+ VALUE result = rb_class_new_instance(0, NULL, rb_cDetectEncodingResult);
89
+ rb_iv_set(result, "@encoding", rb_encoding);
90
+ rb_iv_set(result, "@bytes_consumed", rb_int_new(bytes_consumed));
91
+ rb_iv_set(result, "@is_reliable", is_reliable ? Qtrue : Qfalse);
92
+ return result;
93
+ }
94
+
95
+ extern "C" void Init_compact_enc_det()
96
+ {
97
+ VALUE rb_mCompactEncDet = rb_define_module("CompactEncDet");
98
+ Init_detect_encoding_result(rb_mCompactEncDet);
99
+ rb_define_module_function(rb_mCompactEncDet, "detect_encoding", RUBY_METHOD_FUNC(detect_encoding), -1);
100
+ }
@@ -0,0 +1,20 @@
1
require "mkmf"
require "rbconfig"

# Directory of the bundled compact_enc_det C++ sources, resolved relative to
# this file.
compact_enc_det_path = File.expand_path("../compact_enc_det/compact_enc_det", __dir__)

host_cpu = RbConfig::CONFIG["host_cpu"]
is_amd64 = %w[x86_64 amd64].include?(host_cpu)

# Extra environment for the build script: position-independent code is
# requested on 64-bit x86 hosts only.
build_env = is_amd64 ? { "CXXFLAGS" => "-fPIC" } : {}

# Run ./autogen.sh inside the source directory. The directory goes in via
# :chdir and the flags via an env hash rather than being interpolated into a
# shell command string, so an install path containing spaces or shell
# metacharacters cannot break (or alter) the command.
unless system(build_env, "./autogen.sh", chdir: compact_enc_det_path)
  raise "Failed to build the compact_enc_det library"
end

# Point the extension build at the bundled headers and the built library.
$INCFLAGS << " -I$(srcdir)/compact_enc_det"
$LDFLAGS << " -L$(srcdir)/compact_enc_det/lib -lced"

create_makefile("compact_enc_det/compact_enc_det")
@@ -0,0 +1,3 @@
1
# Top-level namespace of the compact_enc_det gem.
module CompactEncDet
  # Version string of this gem release.
  VERSION = "0.1.0"
end
@@ -0,0 +1,2 @@
1
+ require "compact_enc_det/compact_enc_det"
2
+ require "compact_enc_det/version"
metadata ADDED
@@ -0,0 +1,106 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: compact_enc_det
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Cloudaper
8
+ - Kryštof Korb
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2024-02-04 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: minitest
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '5.0'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '5.0'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake-compiler
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '1.0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '1.0'
42
+ description: Ruby bindings for Google's Compact Encoding Detection C++ library
43
+ email:
44
+ - hey@cloudaper.dev
45
+ - krystof@korb.cz
46
+ executables: []
47
+ extensions:
48
+ - ext/compact_enc_det/extconf.rb
49
+ extra_rdoc_files: []
50
+ files:
51
+ - ext/compact_enc_det/compact_enc_det.cc
52
+ - ext/compact_enc_det/compact_enc_det/CMakeLists.txt
53
+ - ext/compact_enc_det/compact_enc_det/LICENSE
54
+ - ext/compact_enc_det/compact_enc_det/README.md
55
+ - ext/compact_enc_det/compact_enc_det/autogen.sh
56
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc
57
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h
58
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc
59
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h
60
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h
61
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc
62
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h
63
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc
64
+ - ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc
65
+ - ext/compact_enc_det/compact_enc_det/util/basictypes.h
66
+ - ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h
67
+ - ext/compact_enc_det/compact_enc_det/util/commandlineflags.h
68
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc
69
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h
70
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h
71
+ - ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc
72
+ - ext/compact_enc_det/compact_enc_det/util/languages/languages.cc
73
+ - ext/compact_enc_det/compact_enc_det/util/languages/languages.h
74
+ - ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h
75
+ - ext/compact_enc_det/compact_enc_det/util/logging.h
76
+ - ext/compact_enc_det/compact_enc_det/util/port.h
77
+ - ext/compact_enc_det/compact_enc_det/util/string_util.h
78
+ - ext/compact_enc_det/compact_enc_det/util/varsetter.h
79
+ - ext/compact_enc_det/extconf.rb
80
+ - lib/compact_enc_det.rb
81
+ - lib/compact_enc_det/version.rb
82
+ homepage: https://github.com/cloudaper/compact_enc_det
83
+ licenses:
84
+ - MIT
85
+ - Apache-2.0
86
+ metadata: {}
87
+ post_install_message:
88
+ rdoc_options: []
89
+ require_paths:
90
+ - lib
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '2.7'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ requirements: []
102
+ rubygems_version: 3.5.3
103
+ signing_key:
104
+ specification_version: 4
105
+ summary: Compact Encoding Detection
106
+ test_files: []