compact_enc_det 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,83 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
18
+ #define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
19
+
20
+ #include "util/encodings/encodings.h" // for Encoding
21
+ #include "util/languages/languages.h" // for Language
22
+
23
+ #include <string.h>
24
+
25
+ namespace CompactEncDet {
26
+ // We may want different statistics, depending on whether the text being
27
+ // identified is from the web, from email, etc. This is currently ignored,
28
+ // except WEB_CORPUS enables ignoring chars inside tags.
29
+ enum TextCorpusType {
30
+ WEB_CORPUS, // text from the web; enables ignoring chars inside tags
31
+ XML_CORPUS,
32
+ QUERY_CORPUS, // Use this for vanilla plaintext
33
+ EMAIL_CORPUS, // text from email
34
+ NUM_CORPA, // always last, so its value is the number of corpus types
35
+ };
36
+
37
+ // Scan raw bytes and detect most likely encoding
38
+ // Design goals:
39
+ // Skip over big initial stretches of seven-bit ASCII bytes very quickly
40
+ // Thread safe
41
+ // Works equally well on
42
+ // 50-byte queries,
43
+ // 5000-byte email and
44
+ // 50000-byte web pages
45
+ // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
46
+ //
47
+ // Inputs: text and text_length
48
+ // web page's url (preferred) or just
49
+ // top-level domain name (e.g. "com") or NULL as a hint
50
+ // web page's HTTPheader charset= string (e.g. "Latin1") or NULL as a hint
51
+ // web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
52
+ // an Encoding or UNKNOWN_ENCODING as a hint
53
+ // a Language or UNKNOWN_LANGUAGE as a hint
54
+ // corpus type from the list above. Currently ignored; may select
55
+ // different probability tables in the future
56
+ // ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit encodings
57
+ // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
58
+ // This may save a little scoring time on pure printable ASCII input text
59
+ // Outputs: bytes_consumed says how much of text_length was actually examined
60
+ // is_reliable set true if the returned encoding is at least 2**10 times more
60
+ // probable than the second-best encoding
62
+ // Return value: the most likely encoding for the input text
63
+ //
64
+ // Setting ignore_7bit_mail_encodings effectively turns off detection of
65
+ // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
66
+ // when corpus_type is QUERY_CORPUS.
67
+ Encoding DetectEncoding(
68
+ const char* text, int text_length, const char* url_hint, // raw bytes, byte count, URL or TLD hint (or NULL)
69
+ const char* http_charset_hint, const char* meta_charset_hint, // charset= strings, each may be NULL
70
+ const int encoding_hint, // an Encoding value or UNKNOWN_ENCODING, passed as int
71
+ const Language language_hint, // User interface lang
72
+ const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings, // true: do not return ISO-2022-xx, HZ or UTF-7
73
+ int* bytes_consumed, bool* is_reliable); // outputs: bytes examined, and whether the result is reliable
74
+
75
+ // Support functions for unit test program
76
+ int BackmapEncodingToRankedEncoding(Encoding enc); // NOTE(review): presumably the detector's internal ranked index for enc -- confirm in compact_enc_det.cc
76
+ Encoding TopEncodingOfLangHint(const char* name); // NOTE(review): presumably the best encoding for a language-hint name -- confirm
77
+ Encoding TopEncodingOfTLDHint(const char* name); // NOTE(review): presumably the best encoding for a top-level-domain name -- confirm
78
+ Encoding TopEncodingOfCharsetHint(const char* name); // NOTE(review): presumably the best encoding for a charset= string -- confirm
79
+ const char* Version(void); // NOTE(review): presumably a version string for the detector -- confirm
81
+ } // End namespace CompactEncDet
82
+
83
+ #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_
@@ -0,0 +1,54 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include <stddef.h>
18
+ #include <stdlib.h>
19
+ #include <memory>
20
+
21
+ #include "compact_enc_det/compact_enc_det.h"
22
+ #include "util/encodings/encodings.h"
23
+ #include "util/languages/languages.h"
24
+ #include "util/port.h"
25
+ #include "gtest/gtest.h"
26
+
27
+ namespace {
28
+
29
+ class CompactEncDetFuzzTest : public testing::Test {}; // empty fixture: adds no state, only names the suite used by TEST_F
30
+
31
+ TEST_F(CompactEncDetFuzzTest, TestRandom) { // feeds pseudo-random buffers to DetectEncoding; contains no assertions
32
+ for (size_t i = 0; i < 16384; ++i) { // one buffer per seed value
33
+ unsigned int seed = i;
34
+ srand(seed); // reseeding with i makes this iteration's rand() sequence depend only on i
35
+ size_t length = static_cast<size_t>(rand()) % 1024; // buffer length in [0, 1023]
36
+ std::unique_ptr<char[]> text(new char[length]); // contents are uninitialized until the fill loop below
37
+
38
+ for (size_t j = 0; j < length; ++j) text[j] = rand(); // the int from rand() is implicitly narrowed to char
39
+
40
+ int bytes_consumed; // output parameter of DetectEncoding; never read afterwards
41
+ bool is_reliable; // output parameter of DetectEncoding; never read afterwards
42
+
43
+ CompactEncDet::DetectEncoding(text.get(), length, nullptr, // URL hint
44
+ nullptr, // HTTP hint
45
+ nullptr, // Meta hint
46
+ UNKNOWN_ENCODING,
47
+ UNKNOWN_LANGUAGE,
48
+ CompactEncDet::WEB_CORPUS,
49
+ false, // ignore_7bit_mail_encodings: false, so 7-bit encodings are not excluded
50
+ &bytes_consumed, &is_reliable); // the returned Encoding is discarded
51
+ }
52
+ }
53
+
54
+ } // namespace