compact_enc_det 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,83 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
18
+ #define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
19
+
20
+ #include "util/encodings/encodings.h" // for Encoding
21
+ #include "util/languages/languages.h" // for Language
22
+
23
+ #include <string.h>
24
+
25
+ namespace CompactEncDet {
26
+ // We may want different statistics, depending on whether the text being
27
+ // identified is from the web, from email, etc. This is currently ignored,
28
+ // except WEB_CORPUS enables ignoring chars inside tags.
29
+ enum TextCorpusType {
30
+ WEB_CORPUS, // Enables ignoring chars inside tags
31
+ XML_CORPUS,
32
+ QUERY_CORPUS, // Use this for vanilla plaintext
33
+ EMAIL_CORPUS,
34
+ NUM_CORPA, // always last, so its value equals the number of corpus types above
35
+ };
36
+
37
+ // Scan raw bytes and detect the most likely encoding
38
+ // Design goals:
39
+ // Skip over big initial stretches of seven-bit ASCII bytes very quickly
40
+ // Thread safe
41
+ // Works equally well on
42
+ // 50-byte queries,
43
+ // 5000-byte email and
44
+ // 50000-byte web pages
45
+ // Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
46
+ //
47
+ // Inputs: text and text_length: the raw bytes to scan and their count
48
+ // url_hint: web page's url (preferred) or just
49
+ // top-level domain name (e.g. "com") or NULL as a hint
50
+ // http_charset_hint: web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint
51
+ // meta_charset_hint: web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
52
+ // encoding_hint: an Encoding or UNKNOWN_ENCODING as a hint (declared as int)
53
+ // language_hint: a Language or UNKNOWN_LANGUAGE as a hint
54
+ // corpus_type: corpus type from the list above. Currently ignored, except that
55
+ // WEB_CORPUS enables ignoring chars inside tags; may select different probability tables in the future
56
+ // ignore_7bit_mail_encodings: if true says to NOT return the pure seven-bit encodings
57
+ // ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
58
+ // This may save a little scoring time on pure printable ASCII input text
59
+ // Outputs: bytes_consumed says how much of text_length was actually examined
60
+ // is_reliable set true if the returned encoding is at least 2**10 times more
61
+ // probable than the second-best encoding
62
+ // Return value: the most likely encoding for the input text
63
+ //
64
+ // Setting ignore_7bit_mail_encodings effectively turns off detection of
65
+ // UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
66
+ // when corpus_type is QUERY_CORPUS.
67
+ Encoding DetectEncoding(
68
+ const char* text, int text_length, const char* url_hint,
69
+ const char* http_charset_hint, const char* meta_charset_hint,
70
+ const int encoding_hint,
71
+ const Language language_hint, // User interface lang
72
+ const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
73
+ int* bytes_consumed, bool* is_reliable);
74
+
75
+ // Support functions for unit test program (declared here only; NOTE(review): notes below are inferred from names — confirm against the definitions)
76
+ int BackmapEncodingToRankedEncoding(Encoding enc); // presumably maps enc to the detector's internal ranked-encoding index — TODO confirm
77
+ Encoding TopEncodingOfLangHint(const char* name); // presumably the top-ranked encoding for a language-name hint — TODO confirm
78
+ Encoding TopEncodingOfTLDHint(const char* name); // presumably the top-ranked encoding for a top-level-domain hint — TODO confirm
79
+ Encoding TopEncodingOfCharsetHint(const char* name); // presumably the top-ranked encoding for a charset-name hint — TODO confirm
80
+ const char* Version(void); // returns a C string; NOTE(review): ownership/lifetime not visible here — confirm
81
+ } // End namespace CompactEncDet
82
+
83
+ #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_
@@ -0,0 +1,54 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include <stddef.h>
18
+ #include <stdlib.h>
19
+ #include <memory>
20
+
21
+ #include "compact_enc_det/compact_enc_det.h"
22
+ #include "util/encodings/encodings.h"
23
+ #include "util/languages/languages.h"
24
+ #include "util/port.h"
25
+ #include "gtest/gtest.h"
26
+
27
+ namespace {
28
+
29
+ class CompactEncDetFuzzTest : public testing::Test {};
30
+ // Runs DetectEncoding over 16384 seeded pseudo-random buffers of 0..1023 bytes; the test contains no assertions on the results.
31
+ TEST_F(CompactEncDetFuzzTest, TestRandom) {
32
+ for (size_t i = 0; i < 16384; ++i) {
33
+ unsigned int seed = i;
34
+ srand(seed); // reseed every iteration so buffer i depends only on i
35
+ size_t length = static_cast<size_t>(rand()) % 1024; // 0..1023 bytes
36
+ std::unique_ptr<char[]> text(new char[length]);
37
+ // Fill the buffer; each int from rand() is implicitly narrowed to char.
38
+ for (size_t j = 0; j < length; ++j) text[j] = rand();
39
+ // Out-parameters required by the API; their values are never inspected here.
40
+ int bytes_consumed;
41
+ bool is_reliable;
42
+ // No hints are supplied and the return value is discarded.
43
+ CompactEncDet::DetectEncoding(text.get(), length, nullptr, // URL hint
44
+ nullptr, // HTTP hint
45
+ nullptr, // Meta hint
46
+ UNKNOWN_ENCODING,
47
+ UNKNOWN_LANGUAGE,
48
+ CompactEncDet::WEB_CORPUS,
49
+ false, // ignore_7bit_mail_encodings = false, so 7-bit encodings stay enabled
50
+ &bytes_consumed, &is_reliable);
51
+ }
52
+ }
53
+
54
+ } // namespace