compact_enc_det 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
- data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
- data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
- data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
- data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
- data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
- data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
- data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
- data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
- data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
- data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
- data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
- data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
- data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
- data/ext/compact_enc_det/compact_enc_det.cc +100 -0
- data/ext/compact_enc_det/extconf.rb +20 -0
- data/lib/compact_enc_det/version.rb +3 -0
- data/lib/compact_enc_det.rb +2 -0
- metadata +106 -0
@@ -0,0 +1,83 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|
18
|
+
#define COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|
19
|
+
|
20
|
+
#include "util/encodings/encodings.h" // for Encoding
|
21
|
+
#include "util/languages/languages.h" // for Language
|
22
|
+
|
23
|
+
#include <string.h>
|
24
|
+
|
25
|
+
namespace CompactEncDet {
|
26
|
+
// We may want different statistics, depending on whether the text being
|
27
|
+
// identified is from the web, from email, etc. This is currently ignored,
|
28
|
+
// except WEB_CORPUS enables ignoring chars inside tags.
|
29
|
+
enum TextCorpusType {
|
30
|
+
WEB_CORPUS,
|
31
|
+
XML_CORPUS,
|
32
|
+
QUERY_CORPUS, // Use this for vanilla plaintext
|
33
|
+
EMAIL_CORPUS,
|
34
|
+
NUM_CORPA, // always last
|
35
|
+
};
|
36
|
+
|
37
|
+
// Scan raw bytes and detect most likely encoding
|
38
|
+
// Design goals:
|
39
|
+
// Skip over big initial stretches of seven-bit ASCII bytes very quickly
|
40
|
+
// Thread safe
|
41
|
+
// Works equally well on
|
42
|
+
// 50-byte queries,
|
43
|
+
// 5000-byte email and
|
44
|
+
// 50000-byte web pages
|
45
|
+
// Length 0 input returns ASCII (aka ISO-8859-1 or Latin1)
|
46
|
+
//
|
47
|
+
// Inputs: text and text_length
|
48
|
+
// web page's url (preferred) or just
|
49
|
+
// top-level domain name (e.g. "com") or NULL as a hint
|
50
|
+
// web page's HTTP header charset= string (e.g. "Latin1") or NULL as a hint
|
51
|
+
// web page's <meta> tag charset= string (e.g. "utf-8") or NULL as a hint
|
52
|
+
// an Encoding or UNKNOWN_ENCODING as a hint
|
53
|
+
// a Language or UNKNOWN_LANGUAGE as a hint
|
54
|
+
// corpus type from the list above. Currently ignored; may select
|
55
|
+
// different probability tables in the future
|
56
|
+
// ignore_7bit_mail_encodings if true says to NOT return the pure seven-bit encodings
|
57
|
+
// ISO-2022-JP (aka JIS), ISO-2022-CN, ISO-2022-KR, HZ, and UTF-7.
|
58
|
+
// This may save a little scoring time on pure printable ASCII input text
|
59
|
+
// Outputs: bytes_consumed says how much of text_length was actually examined
|
60
|
+
// is_reliable set true if the returned encoding is at least 2**10 times more
|
61
|
+
// probable than the second-best encoding
|
62
|
+
// Return value: the most likely encoding for the input text
|
63
|
+
//
|
64
|
+
// Setting ignore_7bit_mail_encodings effectively turns off detection of
|
65
|
+
// UTF-7, HZ, and ISO-2022-xx. It is recommended that this flag be true
|
66
|
+
// when corpus_type is QUERY_CORPUS.
|
67
|
+
Encoding DetectEncoding(
|
68
|
+
const char* text, int text_length, const char* url_hint,
|
69
|
+
const char* http_charset_hint, const char* meta_charset_hint,
|
70
|
+
const int encoding_hint,
|
71
|
+
const Language language_hint, // User interface lang
|
72
|
+
const TextCorpusType corpus_type, bool ignore_7bit_mail_encodings,
|
73
|
+
int* bytes_consumed, bool* is_reliable);
|
74
|
+
|
75
|
+
// Support functions for unit test program
|
76
|
+
int BackmapEncodingToRankedEncoding(Encoding enc);
|
77
|
+
Encoding TopEncodingOfLangHint(const char* name);
|
78
|
+
Encoding TopEncodingOfTLDHint(const char* name);
|
79
|
+
Encoding TopEncodingOfCharsetHint(const char* name);
|
80
|
+
const char* Version(void);
|
81
|
+
} // End namespace CompactEncDet
|
82
|
+
|
83
|
+
#endif // COMPACT_ENC_DET_COMPACT_ENC_DET_H_
|
@@ -0,0 +1,54 @@
|
|
1
|
+
// Copyright 2016 Google Inc.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
////////////////////////////////////////////////////////////////////////////////
|
16
|
+
|
17
|
+
#include <stddef.h>
|
18
|
+
#include <stdlib.h>
|
19
|
+
#include <memory>
|
20
|
+
|
21
|
+
#include "compact_enc_det/compact_enc_det.h"
|
22
|
+
#include "util/encodings/encodings.h"
|
23
|
+
#include "util/languages/languages.h"
|
24
|
+
#include "util/port.h"
|
25
|
+
#include "gtest/gtest.h"
|
26
|
+
|
27
|
+
namespace {
|
28
|
+
|
29
|
+
class CompactEncDetFuzzTest : public testing::Test {};
|
30
|
+
|
31
|
+
TEST_F(CompactEncDetFuzzTest, TestRandom) {
|
32
|
+
for (size_t i = 0; i < 16384; ++i) {
|
33
|
+
unsigned int seed = i;
|
34
|
+
srand(seed);
|
35
|
+
size_t length = static_cast<size_t>(rand()) % 1024;
|
36
|
+
std::unique_ptr<char[]> text(new char[length]);
|
37
|
+
|
38
|
+
for (size_t j = 0; j < length; ++j) text[j] = rand();
|
39
|
+
|
40
|
+
int bytes_consumed;
|
41
|
+
bool is_reliable;
|
42
|
+
|
43
|
+
CompactEncDet::DetectEncoding(text.get(), length, nullptr, // URL hint
|
44
|
+
nullptr, // HTTP hint
|
45
|
+
nullptr, // Meta hint
|
46
|
+
UNKNOWN_ENCODING,
|
47
|
+
UNKNOWN_LANGUAGE,
|
48
|
+
CompactEncDet::WEB_CORPUS,
|
49
|
+
false, // Include 7-bit encodings?
|
50
|
+
&bytes_consumed, &is_reliable);
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
} // namespace
|