compact_enc_det 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,83 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|
18
|
+
#define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|
19
|
+
|
20
|
+
#include "util/encodings/encodings.h" // for Encoding
|
21
|
+
#include "util/languages/languages.h" // for Language
|
22
|
+
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
namespace CompactEncDet {
|
26
|
+
// We may want different statistics, depending on whether the text being
|
27
|
+
// identified is from the web, from email, etc. This is currently ignored,
|
28
|
+
// except WEB_CORPUS enables ignoring chars inside tags.
|
29
|
+
enum TextCorpusType {
  WEB_CORPUS = 0,
  XML_CORPUS = 1,
  QUERY_CORPUS = 2,  // Use this for vanilla plaintext
  EMAIL_CORPUS = 3,
  NUM_CORPA = 4,     // Number of corpus types above; always last
};
|
36
|
+
|
37
|
+
// Scan raw bytes and detect most likely encoding
|
38
|
+
// Design goals:
|
39
|
+
// Skip over big initial stretches of seven-bit ASCII bytes very quickly
|
40
|
+
// Thread safe
|
41
|
+
// Works equally well on
|
42
|
+
// 50-byte queries,
|
43
|
+
// 5000-byte email and
|
44
|
+
// 50000-byte web pages
|
45
|
+
// Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
|
46
|
+
//
|
47
|
+
// Inputs: text and text_length
|
48
|
+
// web page's url (preferred) or just
|
49
|
+
// top-level domain name (e.g. "com") or NULL as a hint
|
50
|
+
// web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint
|
51
|
+
// web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
|
52
|
+
// an Encoding or UNKNOWN_ENCODING as a hint
|
53
|
+
// a Language or UNKNOWN_LANGUAGE as a hint
|
54
|
+
// corpus type from the list above. Currently ignored; may select
|
55
|
+
// different probability tables in the future
|
56
|
+
// ignore_7bit_mail_encodings, if true, says to NOT return the pure seven-bit encodings
|
57
|
+
// ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
|
58
|
+
// This may save a little scoring time on pure printable ASCII input text
|
59
|
+
// Outputs: bytes_consumed says how much of text_length was actually examined
|
60
|
+
// is_reliable set true if the returned encoding is at least 2**10 times more
|
61
|
+
// probable than the second-best encoding
|
62
|
+
// Return value: the most likely encoding for the input text
|
63
|
+
//
|
64
|
+
// Setting ignore_7bit_mail_encodings effectively turns off detection of
|
65
|
+
// UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
|
66
|
+
// when corpus_type is QUERY_CORPUS.
|
67
|
+
Encoding DetectEncoding(
    const char* text,                  // raw bytes to scan
    int text_length,                   // number of bytes in text
    const char* url_hint,              // page URL or top-level domain, or NULL
    const char* http_charset_hint,     // HTTP header charset= string, or NULL
    const char* meta_charset_hint,     // <meta> tag charset= string, or NULL
    const int encoding_hint,           // an Encoding, or UNKNOWN_ENCODING
    const Language language_hint,      // User interface lang, or UNKNOWN_LANGUAGE
    const TextCorpusType corpus_type,  // one of the TextCorpusType values
    bool ignore_7bit_mail_encodings,   // true: do not return UTF-7, HZ, ISO-2022-xx
    int* bytes_consumed,               // out: how much of text was examined
    bool* is_reliable);                // out: true if the winner is at least 2**10
                                       // times more probable than the runner-up
|
74
|
+
|
75
|
+
// Support functions for unit test program
|
76
|
+
int BackmapEncodingToRankedEncoding(Encoding enc);
|
77
|
+
Encoding TopEncodingOfLangHint(const char* name);
|
78
|
+
Encoding TopEncodingOfTLDHint(const char* name);
|
79
|
+
Encoding TopEncodingOfCharsetHint(const char* name);
|
80
|
+
const char* Version(void);
|
81
|
+
} // End namespace CompactEncDet
|
82
|
+
|
83
|
+
#endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|
@@ -0,0 +1,54 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#include <stddef.h>
|
18
|
+
#include <stdlib.h>
|
19
|
+
#include <memory>
|
20
|
+
|
21
|
+
#include "compact_enc_det/compact_enc_det.h"
|
22
|
+
#include "util/encodings/encodings.h"
|
23
|
+
#include "util/languages/languages.h"
|
24
|
+
#include "util/port.h"
|
25
|
+
#include "gtest/gtest.h"
|
26
|
+
|
27
|
+
namespace {
|
28
|
+
|
29
|
+
// Empty fixture; it adds no state or setup and exists so the fuzz test below
// can be declared with TEST_F.
class CompactEncDetFuzzTest : public testing::Test {};
|
30
|
+
|
31
|
+
// Smoke test on pseudo-random input: for each of 16384 rand() seeds, builds a
// buffer of fewer than 1024 random bytes and runs DetectEncoding on it with no
// hints. Nothing is asserted about the result; the test only exercises the
// call on arbitrary byte sequences.
TEST_F(CompactEncDetFuzzTest, TestRandom) {
  for (size_t iteration = 0; iteration < 16384; ++iteration) {
    // Reseeding with the loop index makes each buffer reproducible.
    srand(static_cast<unsigned int>(iteration));

    const size_t buffer_size = static_cast<size_t>(rand()) % 1024;
    std::unique_ptr<char[]> buffer(new char[buffer_size]);

    size_t pos = 0;
    while (pos < buffer_size) {
      buffer[pos] = static_cast<char>(rand());
      ++pos;
    }

    // Output slots; their values are not inspected.
    int consumed;
    bool reliable;

    CompactEncDet::DetectEncoding(
        buffer.get(), static_cast<int>(buffer_size),
        nullptr,                    // URL hint
        nullptr,                    // HTTP hint
        nullptr,                    // Meta hint
        UNKNOWN_ENCODING,
        UNKNOWN_LANGUAGE,
        CompactEncDet::WEB_CORPUS,
        false,                      // ignore_7bit_mail_encodings: off
        &consumed, &reliable);
  }
}
|
53
|
+
|
54
|
+
} // namespace
|