compact_enc_det 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,169 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "compact_enc_det/compact_enc_det_hint_code.h"
18
+
19
+ #include <ctype.h> // for isalpha
20
+ #include <string.h> // for NULL, memchr, strlen, etc
21
+
22
+ #include "util/basictypes.h" // for uint8, uint32
23
+ #include "util/string_util.h"
24
+
25
// Charset-name folding table, indexed by unsigned byte value (ASCII):
//   'A'..'Z' and 'a'..'z' yield the lowercase letter,
//   '0'..'9' yield themselves,
//   every other byte, including 0x80..0xff, yields '-' (0x2d).
static const char kCharsetToLowerTbl[256] = {
  // 0x00..0x2f: controls, space, punctuation
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  // 0x30..0x3f: digits are kept
  '0','1','2','3','4','5','6','7', '8','9','-','-','-','-','-','-',

  // 0x40..0x5f: uppercase letters fold to lowercase
  '-','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z','-','-','-','-','-',
  // 0x60..0x7f: lowercase letters are kept
  '-','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o',
  'p','q','r','s','t','u','v','w', 'x','y','z','-','-','-','-','-',

  // 0x80..0xbf: non-ASCII bytes
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',

  // 0xc0..0xff: non-ASCII bytes
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
  '-','-','-','-','-','-','-','-', '-','-','-','-','-','-','-','-',
};
47
+
48
+
49
// Letter-membership table, indexed by unsigned byte value:
// 1 for 0x41..0x5a ('A'..'Z') and 0x61..0x7a ('a'..'z'), 0 otherwise.
static const char kIsAlpha[256] = {
  // 0x00..0x3f: no letters
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  // 0x40..0x5f: '@', then 'A'..'Z', then five punctuation bytes
  0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
  // 0x60..0x7f: '`', then 'a'..'z', then five trailing bytes
  0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
  // 0x80..0xff: omitted initializers are zero-initialized
};
60
+
61
// Digit-membership table, indexed by unsigned byte value:
// 1 for 0x30..0x39 ('0'..'9'), 0 otherwise.
static const char kIsDigit[256] = {
  // 0x00..0x2f: no digits
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  // 0x30..0x3f: '0'..'9', then six punctuation bytes
  1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
  // 0x40..0xff: omitted initializers are zero-initialized
};
72
+
73
// Placeholder names for encoding values 100..119; entry [k] names value
// 100 + k. MyEncodingName() indexes this table with (enc - 100).
static const char* kFakeEncodingName[] = {
  "FakeEnc100", "FakeEnc101", "FakeEnc102", "FakeEnc103",
  "FakeEnc104", "FakeEnc105", "FakeEnc106", "FakeEnc107",
  "FakeEnc108", "FakeEnc109", "FakeEnc110", "FakeEnc111",
  "FakeEnc112", "FakeEnc113", "FakeEnc114", "FakeEnc115",
  "FakeEnc116", "FakeEnc117", "FakeEnc118", "FakeEnc119",
};
79
// Placeholder names for encoding values just past the real ones; entry [k]
// names value NUM_ENCODINGS + k in MyEncodingName().
static const char* kFakeEncodingName2[] = {
  "FakeEnc_0",
  "FakeEnc_1",
  "FakeEnc_2",
  "FakeEnc_3",
  "FakeEnc_4",
};
82
+
83
+ // Return name for extended encoding
84
+ const char* MyEncodingName(Encoding enc) {
85
+ if (enc < 0) {
86
+ return "~";
87
+ }
88
+ if (enc == ISO_8859_1) {
89
+ return "Latin1"; // I can't stand "ASCII" for this
90
+ }
91
+ if (enc < NUM_ENCODINGS) {
92
+ return EncodingName(enc);
93
+ }
94
+ // allow fake names, for exploration
95
+ if ((NUM_ENCODINGS <= enc) && (enc < (NUM_ENCODINGS + 4))) {
96
+ return kFakeEncodingName2[enc - NUM_ENCODINGS];
97
+ }
98
+ if ((100 <= enc) && (enc < 120)) {
99
+ return kFakeEncodingName[enc - 100];
100
+ }
101
+ return "~";
102
+ }
103
+
104
+
105
+ // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
106
+ // Letters are forced to lowercase ASCII
107
+ // Used to normalize charset= values
108
+ string MakeChar44(const string& str) {
109
+ string res("________"); // eight underscores
110
+ int l_ptr = 0;
111
+ int d_ptr = 0;
112
+ for (uint32 i = 0; i < str.size(); ++i) {
113
+ uint8 uc = static_cast<uint8>(str[i]);
114
+ if (kIsAlpha[uc]) {
115
+ if (l_ptr < 4) { // Else ignore
116
+ res[l_ptr] = kCharsetToLowerTbl[uc];
117
+ l_ptr++;
118
+ }
119
+ } else if (kIsDigit[uc]) {
120
+ if (d_ptr < 4) {
121
+ res[4 + d_ptr] = kCharsetToLowerTbl[uc];
122
+ } else {
123
+ // Keep last 4 digits by shifting left
124
+ res[4] = res[5];
125
+ res[5] = res[6];
126
+ res[6] = res[7];
127
+ res[7] = kCharsetToLowerTbl[uc];
128
+ }
129
+ d_ptr++;
130
+ } // If neither letter nor digit, drop entirely
131
+ }
132
+ return res;
133
+ }
134
+
135
+ // Normalize ASCII string to first 8 alphabetic/digit chars
136
+ // Letters are forced to lowercase ASCII
137
+ // Used to normalize TLD values
138
+ string MakeChar4(const string& str) {
139
+ string res("____"); // four underscores
140
+ int l_ptr = 0;
141
+ for (uint32 i = 0; i < str.size(); ++i) {
142
+ uint8 uc = static_cast<uint8>(str[i]);
143
+ if (kIsAlpha[uc] | kIsDigit[uc]) {
144
+ if (l_ptr < 4) { // Else ignore
145
+ res[l_ptr] = kCharsetToLowerTbl[uc];
146
+ l_ptr++;
147
+ }
148
+ }
149
+ }
150
+ return res;
151
+ }
152
+
153
+ // Normalize ASCII string to first 8 alphabetic/digit chars
154
+ // Letters are forced to lowercase ASCII
155
+ // Used to normalize TLD values
156
+ string MakeChar8(const string& str) {
157
+ string res("________"); // eight dots
158
+ int l_ptr = 0;
159
+ for (uint32 i = 0; i < str.size(); ++i) {
160
+ uint8 uc = static_cast<uint8>(str[i]);
161
+ if (kIsAlpha[uc] | kIsDigit[uc]) {
162
+ if (l_ptr < 8) { // Else ignore
163
+ res[l_ptr] = kCharsetToLowerTbl[uc];
164
+ l_ptr++;
165
+ }
166
+ }
167
+ }
168
+ return res;
169
+ }
@@ -0,0 +1,45 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_HINT_CODE_H_
18
+ #define COMPACT_ENC_DET_COMPACT_ENC_DET_HINT_CODE_H_
19
+
20
+ #include <string> // for string
21
+
22
+ #include "util/basictypes.h" // for uint32
23
+ #include "util/encodings/encodings.h" // for Encoding
24
+
25
+ using std::string;
26
+
27
+ // Return a printable name for enc, including values outside the normal Encoding range; "~" means no name is known
28
+ const char* MyEncodingName(Encoding enc);
29
+
30
+ // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
31
+ // Letters are forced to lowercase ASCII; result is always 8 chars, with '_' in unfilled positions
32
+ // Used to normalize charset= values, e.g. "UTF-8" gives "utf_8___"
33
+ string MakeChar44(const string& str);
34
+
35
+ // Normalize ASCII string to first 4 alphabetic/digit chars
36
+ // Letters are forced to lowercase ASCII; result is always 4 chars, with '_' in unfilled positions
37
+ // Used to normalize TLD values, e.g. "COM" gives "com_"
38
+ string MakeChar4(const string& str);
39
+
40
+ // Normalize ASCII string to first 8 alphabetic/digit chars
41
+ // Letters are forced to lowercase ASCII; result is always 8 chars, with '_' in unfilled positions
42
+ // Used to normalize other values, e.g. "English" gives "english_"
43
+ string MakeChar8(const string& str);
44
+
45
+ #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_HINT_CODE_H_