compact_enc_det 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +7 -0
  2. data/ext/compact_enc_det/compact_enc_det/CMakeLists.txt +103 -0
  3. data/ext/compact_enc_det/compact_enc_det/LICENSE +202 -0
  4. data/ext/compact_enc_det/compact_enc_det/README.md +46 -0
  5. data/ext/compact_enc_det/compact_enc_det/autogen.sh +74 -0
  6. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.cc +5719 -0
  7. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det.h +83 -0
  8. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_fuzz_test.cc +54 -0
  9. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables.h +6326 -0
  10. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_generated_tables2.h +856 -0
  11. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.cc +169 -0
  12. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_hint_code.h +45 -0
  13. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/compact_enc_det_unittest.cc +5260 -0
  14. data/ext/compact_enc_det/compact_enc_det/compact_enc_det/detail_head_string.inc +152 -0
  15. data/ext/compact_enc_det/compact_enc_det/util/basictypes.h +331 -0
  16. data/ext/compact_enc_det/compact_enc_det/util/case_insensitive_hash.h +88 -0
  17. data/ext/compact_enc_det/compact_enc_det/util/commandlineflags.h +39 -0
  18. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.cc +891 -0
  19. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.h +299 -0
  20. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings.pb.h +181 -0
  21. data/ext/compact_enc_det/compact_enc_det/util/encodings/encodings_unittest.cc +34 -0
  22. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.cc +349 -0
  23. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.h +381 -0
  24. data/ext/compact_enc_det/compact_enc_det/util/languages/languages.pb.h +191 -0
  25. data/ext/compact_enc_det/compact_enc_det/util/logging.h +25 -0
  26. data/ext/compact_enc_det/compact_enc_det/util/port.h +53 -0
  27. data/ext/compact_enc_det/compact_enc_det/util/string_util.h +61 -0
  28. data/ext/compact_enc_det/compact_enc_det/util/varsetter.h +66 -0
  29. data/ext/compact_enc_det/compact_enc_det.cc +100 -0
  30. data/ext/compact_enc_det/extconf.rb +20 -0
  31. data/lib/compact_enc_det/version.rb +3 -0
  32. data/lib/compact_enc_det.rb +2 -0
  33. metadata +106 -0
@@ -0,0 +1,169 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #include "compact_enc_det/compact_enc_det_hint_code.h"
18
+
19
+ #include <ctype.h> // for isalpha
20
+ #include <string.h> // for NULL, memchr, strlen, etc
21
+
22
+ #include "util/basictypes.h" // for uint8, uint32
23
+ #include "util/string_util.h"
24
+
25
+ // Byte-indexed map: ASCII A-Z -> a-z, a-z and 0-9 unchanged, every other byte (incl. 0x80-0xff) -> '-' (0x2d)
26
+ static const char kCharsetToLowerTbl[256] = {
27
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x00-0x0f
28
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x10-0x1f
29
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x20-0x2f
30
+ 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 0x38,0x39,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x30-0x3f: '0'-'9' kept
31
+
32
+ 0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,  // 0x40-0x4f: 'A'-'O' -> 'a'-'o'
33
+ 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x50-0x5f: 'P'-'Z' -> 'p'-'z'
34
+ 0x2d,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,  // 0x60-0x6f: 'a'-'o' kept
35
+ 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 0x78,0x79,0x7a,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x70-0x7f: 'p'-'z' kept
36
+
37
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x80-0x8f
38
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0x90-0x9f
39
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0xa0-0xaf
40
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0xb0-0xbf
41
+
42
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0xc0-0xcf
43
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0xd0-0xdf
44
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0xe0-0xef
45
+ 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d, 0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,0x2d,  // 0xf0-0xff
46
+ };
47
+
48
+
49
+ static const char kIsAlpha[256] = {  // Byte-indexed flag: 1 for ASCII 'A'-'Z' and 'a'-'z', 0 for every other byte
50
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1f
51
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x20-0x3f (digits are NOT alpha)
52
+ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,  // 0x40-0x5f: 'A'-'Z' set
53
+ 0,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,0,0,0,0,0,  // 0x60-0x7f: 'a'-'z' set
54
+
55
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80-0x9f
56
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xa0-0xbf
57
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xc0-0xdf
58
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xe0-0xff
59
+ };
60
+
61
+ static const char kIsDigit[256] = {  // Byte-indexed flag: 1 for ASCII '0'-'9', 0 for every other byte
62
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x00-0x1f
63
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1, 1,1,0,0,0,0,0,0,  // 0x20-0x3f: '0'-'9' set
64
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x40-0x5f
65
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x60-0x7f
66
+
67
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0x80-0x9f
68
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xa0-0xbf
69
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xc0-0xdf
70
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  // 0xe0-0xff
71
+ };
72
+
73
// Placeholder names for the exploratory encoding codes 100..119.
// MyEncodingName() indexes this array with (code - 100).
static const char* kFakeEncodingName[] = {
  "FakeEnc100",
  "FakeEnc101",
  "FakeEnc102",
  "FakeEnc103",
  "FakeEnc104",
  "FakeEnc105",
  "FakeEnc106",
  "FakeEnc107",
  "FakeEnc108",
  "FakeEnc109",
  "FakeEnc110",
  "FakeEnc111",
  "FakeEnc112",
  "FakeEnc113",
  "FakeEnc114",
  "FakeEnc115",
  "FakeEnc116",
  "FakeEnc117",
  "FakeEnc118",
  "FakeEnc119",
};

// Placeholder names for the codes that immediately follow the real encodings.
// MyEncodingName() indexes this array with (code - NUM_ENCODINGS).
static const char* kFakeEncodingName2[] = {
  "FakeEnc_0",
  "FakeEnc_1",
  "FakeEnc_2",
  "FakeEnc_3",
  "FakeEnc_4",
};
82
+
83
+ // Return name for extended encoding
84
+ const char* MyEncodingName(Encoding enc) {
85
+ if (enc < 0) {
86
+ return "~";
87
+ }
88
+ if (enc == ISO_8859_1) {
89
+ return "Latin1"; // I can't stand "ASCII" for this
90
+ }
91
+ if (enc < NUM_ENCODINGS) {
92
+ return EncodingName(enc);
93
+ }
94
+ // allow fake names, for exploration
95
+ if ((NUM_ENCODINGS <= enc) && (enc < (NUM_ENCODINGS + 4))) {
96
+ return kFakeEncodingName2[enc - NUM_ENCODINGS];
97
+ }
98
+ if ((100 <= enc) && (enc < 120)) {
99
+ return kFakeEncodingName[enc - 100];
100
+ }
101
+ return "~";
102
+ }
103
+
104
+
105
+ // Normalize ASCII string to first 4 alphabetic chars and last 4 digit chars
106
+ // Letters are forced to lowercase ASCII
107
+ // Used to normalize charset= values
108
+ string MakeChar44(const string& str) {
109
+ string res("________"); // eight underscores
110
+ int l_ptr = 0;
111
+ int d_ptr = 0;
112
+ for (uint32 i = 0; i < str.size(); ++i) {
113
+ uint8 uc = static_cast<uint8>(str[i]);
114
+ if (kIsAlpha[uc]) {
115
+ if (l_ptr < 4) { // Else ignore
116
+ res[l_ptr] = kCharsetToLowerTbl[uc];
117
+ l_ptr++;
118
+ }
119
+ } else if (kIsDigit[uc]) {
120
+ if (d_ptr < 4) {
121
+ res[4 + d_ptr] = kCharsetToLowerTbl[uc];
122
+ } else {
123
+ // Keep last 4 digits by shifting left
124
+ res[4] = res[5];
125
+ res[5] = res[6];
126
+ res[6] = res[7];
127
+ res[7] = kCharsetToLowerTbl[uc];
128
+ }
129
+ d_ptr++;
130
+ } // If neither letter nor digit, drop entirely
131
+ }
132
+ return res;
133
+ }
134
+
135
+ // Normalize ASCII string to first 8 alphabetic/digit chars
136
+ // Letters are forced to lowercase ASCII
137
+ // Used to normalize TLD values
138
+ string MakeChar4(const string& str) {
139
+ string res("____"); // four underscores
140
+ int l_ptr = 0;
141
+ for (uint32 i = 0; i < str.size(); ++i) {
142
+ uint8 uc = static_cast<uint8>(str[i]);
143
+ if (kIsAlpha[uc] | kIsDigit[uc]) {
144
+ if (l_ptr < 4) { // Else ignore
145
+ res[l_ptr] = kCharsetToLowerTbl[uc];
146
+ l_ptr++;
147
+ }
148
+ }
149
+ }
150
+ return res;
151
+ }
152
+
153
+ // Normalize ASCII string to first 8 alphabetic/digit chars
154
+ // Letters are forced to lowercase ASCII
155
+ // Used to normalize TLD values
156
+ string MakeChar8(const string& str) {
157
+ string res("________"); // eight dots
158
+ int l_ptr = 0;
159
+ for (uint32 i = 0; i < str.size(); ++i) {
160
+ uint8 uc = static_cast<uint8>(str[i]);
161
+ if (kIsAlpha[uc] | kIsDigit[uc]) {
162
+ if (l_ptr < 8) { // Else ignore
163
+ res[l_ptr] = kCharsetToLowerTbl[uc];
164
+ l_ptr++;
165
+ }
166
+ }
167
+ }
168
+ return res;
169
+ }
@@ -0,0 +1,45 @@
1
+ // Copyright 2016 Google Inc.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ ////////////////////////////////////////////////////////////////////////////////
16
+
17
+ #ifndef COMPACT_ENC_DET_COMPACT_ENC_DET_HINT_CODE_H_
18
+ #define COMPACT_ENC_DET_COMPACT_ENC_DET_HINT_CODE_H_
19
+
20
+ #include <string> // for string
21
+
22
+ #include "util/basictypes.h" // for uint32
23
+ #include "util/encodings/encodings.h" // for Encoding
24
+
25
+ using std::string;
26
+
27
+ // Returns a printable name for enc: "Latin1" for ISO_8859_1, EncodingName(enc) for other real encodings, a "FakeEnc..." placeholder for the extra exploratory codes, and "~" for anything else
28
+ const char* MyEncodingName(Encoding enc);
29
+
30
+ // Normalizes an ASCII string to an 8-char key: first 4 alphabetic chars, then last 4 digit chars; unfilled positions stay '_'
31
+ // Letters are forced to lowercase ASCII; bytes that are neither ASCII letters nor ASCII digits are dropped
32
+ // Used to normalize charset= values
33
+ string MakeChar44(const string& str);
34
+
35
+ // Normalizes an ASCII string to its first 4 alphabetic/digit chars, right-padded with '_' to exactly 4 chars
36
+ // Letters are forced to lowercase ASCII; bytes that are neither ASCII letters nor ASCII digits are dropped
37
+ // Used to normalize TLD values
38
+ string MakeChar4(const string& str);
39
+
40
+ // Normalizes an ASCII string to its first 8 alphabetic/digit chars, right-padded with '_' to exactly 8 chars
41
+ // Letters are forced to lowercase ASCII; bytes that are neither ASCII letters nor ASCII digits are dropped
42
+ // Used to normalize other values
43
+ string MakeChar8(const string& str);
44
+
45
+ #endif // COMPACT_ENC_DET_COMPACT_ENC_DET_HINT_CODE_H_