cld3 3.5.0 → 3.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +0 -8
- data/cld3.gemspec +6 -6
- data/ext/cld3/extconf.rb +1 -2
- data/ext/cld3/nnet_language_identifier_c.cc +163 -70
- data/lib/cld3.rb +14 -102
- data/sig/cld3.rbs +2 -0
- metadata +15 -77
- data/ext/cld3/Makefile +0 -268
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.def +0 -8
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +0 -69
- data/ext/cld3/script_span/generated_ulscript.h +0 -142
- data/ext/cld3/script_span/getonescriptspan.h +0 -124
- data/ext/cld3/script_span/integral_types.h +0 -37
- data/ext/cld3/script_span/offsetmap.h +0 -168
- data/ext/cld3/script_span/port.h +0 -143
- data/ext/cld3/script_span/stringpiece.h +0 -81
- data/ext/cld3/script_span/text_processing.h +0 -30
- data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
- data/ext/cld3/script_span/utf8statetable.h +0 -285
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3/unstable.rb +0 -58
@@ -1,142 +0,0 @@
|
|
1
|
-
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
|
15
|
-
// generated_ulscript.h
|
16
|
-
// Machine generated. Do Not Edit.
|
17
|
-
//
|
18
|
-
// Declarations for scripts recognized by CLD2
|
19
|
-
//
|
20
|
-
|
21
|
-
#ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
22
|
-
#define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
23
|
-
|
24
|
-
namespace chrome_lang_id {
|
25
|
-
namespace CLD2 {
|
26
|
-
|
27
|
-
typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
|
28
|
-
|
29
|
-
typedef struct {const char* s; int i;} CharIntPair;
|
30
|
-
|
31
|
-
typedef enum {
|
32
|
-
ULScript_Common = 0, // Zyyy
|
33
|
-
ULScript_Latin = 1, // Latn
|
34
|
-
ULScript_Greek = 2, // Grek
|
35
|
-
ULScript_Cyrillic = 3, // Cyrl
|
36
|
-
ULScript_Armenian = 4, // Armn
|
37
|
-
ULScript_Hebrew = 5, // Hebr
|
38
|
-
ULScript_Arabic = 6, // Arab
|
39
|
-
ULScript_Syriac = 7, // Syrc
|
40
|
-
ULScript_Thaana = 8, // Thaa
|
41
|
-
ULScript_Devanagari = 9, // Deva
|
42
|
-
ULScript_Bengali = 10, // Beng
|
43
|
-
ULScript_Gurmukhi = 11, // Guru
|
44
|
-
ULScript_Gujarati = 12, // Gujr
|
45
|
-
ULScript_Oriya = 13, // Orya
|
46
|
-
ULScript_Tamil = 14, // Taml
|
47
|
-
ULScript_Telugu = 15, // Telu
|
48
|
-
ULScript_Kannada = 16, // Knda
|
49
|
-
ULScript_Malayalam = 17, // Mlym
|
50
|
-
ULScript_Sinhala = 18, // Sinh
|
51
|
-
ULScript_Thai = 19, // Thai
|
52
|
-
ULScript_Lao = 20, // Laoo
|
53
|
-
ULScript_Tibetan = 21, // Tibt
|
54
|
-
ULScript_Myanmar = 22, // Mymr
|
55
|
-
ULScript_Georgian = 23, // Geor
|
56
|
-
ULScript_Hani = 24, // Hani
|
57
|
-
ULScript_Ethiopic = 25, // Ethi
|
58
|
-
ULScript_Cherokee = 26, // Cher
|
59
|
-
ULScript_Canadian_Aboriginal = 27, // Cans
|
60
|
-
ULScript_Ogham = 28, // Ogam
|
61
|
-
ULScript_Runic = 29, // Runr
|
62
|
-
ULScript_Khmer = 30, // Khmr
|
63
|
-
ULScript_Mongolian = 31, // Mong
|
64
|
-
ULScript_32 = 32, //
|
65
|
-
ULScript_33 = 33, //
|
66
|
-
ULScript_Bopomofo = 34, // Bopo
|
67
|
-
ULScript_35 = 35, //
|
68
|
-
ULScript_Yi = 36, // Yiii
|
69
|
-
ULScript_Old_Italic = 37, // Ital
|
70
|
-
ULScript_Gothic = 38, // Goth
|
71
|
-
ULScript_Deseret = 39, // Dsrt
|
72
|
-
ULScript_Inherited = 40, // Zinh
|
73
|
-
ULScript_Tagalog = 41, // Tglg
|
74
|
-
ULScript_Hanunoo = 42, // Hano
|
75
|
-
ULScript_Buhid = 43, // Buhd
|
76
|
-
ULScript_Tagbanwa = 44, // Tagb
|
77
|
-
ULScript_Limbu = 45, // Limb
|
78
|
-
ULScript_Tai_Le = 46, // Tale
|
79
|
-
ULScript_Linear_B = 47, // Linb
|
80
|
-
ULScript_Ugaritic = 48, // Ugar
|
81
|
-
ULScript_Shavian = 49, // Shaw
|
82
|
-
ULScript_Osmanya = 50, // Osma
|
83
|
-
ULScript_Cypriot = 51, // Cprt
|
84
|
-
ULScript_Braille = 52, // Brai
|
85
|
-
ULScript_Buginese = 53, // Bugi
|
86
|
-
ULScript_Coptic = 54, // Copt
|
87
|
-
ULScript_New_Tai_Lue = 55, // Talu
|
88
|
-
ULScript_Glagolitic = 56, // Glag
|
89
|
-
ULScript_Tifinagh = 57, // Tfng
|
90
|
-
ULScript_Syloti_Nagri = 58, // Sylo
|
91
|
-
ULScript_Old_Persian = 59, // Xpeo
|
92
|
-
ULScript_Kharoshthi = 60, // Khar
|
93
|
-
ULScript_Balinese = 61, // Bali
|
94
|
-
ULScript_Cuneiform = 62, // Xsux
|
95
|
-
ULScript_Phoenician = 63, // Phnx
|
96
|
-
ULScript_Phags_Pa = 64, // Phag
|
97
|
-
ULScript_Nko = 65, // Nkoo
|
98
|
-
ULScript_Sundanese = 66, // Sund
|
99
|
-
ULScript_Lepcha = 67, // Lepc
|
100
|
-
ULScript_Ol_Chiki = 68, // Olck
|
101
|
-
ULScript_Vai = 69, // Vaii
|
102
|
-
ULScript_Saurashtra = 70, // Saur
|
103
|
-
ULScript_Kayah_Li = 71, // Kali
|
104
|
-
ULScript_Rejang = 72, // Rjng
|
105
|
-
ULScript_Lycian = 73, // Lyci
|
106
|
-
ULScript_Carian = 74, // Cari
|
107
|
-
ULScript_Lydian = 75, // Lydi
|
108
|
-
ULScript_Cham = 76, // Cham
|
109
|
-
ULScript_Tai_Tham = 77, // Lana
|
110
|
-
ULScript_Tai_Viet = 78, // Tavt
|
111
|
-
ULScript_Avestan = 79, // Avst
|
112
|
-
ULScript_Egyptian_Hieroglyphs = 80, // Egyp
|
113
|
-
ULScript_Samaritan = 81, // Samr
|
114
|
-
ULScript_Lisu = 82, // Lisu
|
115
|
-
ULScript_Bamum = 83, // Bamu
|
116
|
-
ULScript_Javanese = 84, // Java
|
117
|
-
ULScript_Meetei_Mayek = 85, // Mtei
|
118
|
-
ULScript_Imperial_Aramaic = 86, // Armi
|
119
|
-
ULScript_Old_South_Arabian = 87, // Sarb
|
120
|
-
ULScript_Inscriptional_Parthian = 88, // Prti
|
121
|
-
ULScript_Inscriptional_Pahlavi = 89, // Phli
|
122
|
-
ULScript_Old_Turkic = 90, // Orkh
|
123
|
-
ULScript_Kaithi = 91, // Kthi
|
124
|
-
ULScript_Batak = 92, // Batk
|
125
|
-
ULScript_Brahmi = 93, // Brah
|
126
|
-
ULScript_Mandaic = 94, // Mand
|
127
|
-
ULScript_Chakma = 95, // Cakm
|
128
|
-
ULScript_Meroitic_Cursive = 96, // Merc
|
129
|
-
ULScript_Meroitic_Hieroglyphs = 97, // Mero
|
130
|
-
ULScript_Miao = 98, // Plrd
|
131
|
-
ULScript_Sharada = 99, // Shrd
|
132
|
-
ULScript_Sora_Sompeng = 100, // Sora
|
133
|
-
ULScript_Takri = 101, // Takr
|
134
|
-
NUM_ULSCRIPTS
|
135
|
-
} ULScript;
|
136
|
-
|
137
|
-
#define UNKNOWN_ULSCRIPT ULScript_Common
|
138
|
-
|
139
|
-
} // namespace CLD2
|
140
|
-
} // namespace chrome_lang_id
|
141
|
-
|
142
|
-
#endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
@@ -1,124 +0,0 @@
|
|
1
|
-
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
|
15
|
-
//
|
16
|
-
// Author: dsites@google.com (Dick Sites)
|
17
|
-
//
|
18
|
-
|
19
|
-
|
20
|
-
#ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
21
|
-
#define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
22
|
-
|
23
|
-
#include "generated_ulscript.h"
|
24
|
-
#include "integral_types.h"
|
25
|
-
#include "offsetmap.h"
|
26
|
-
|
27
|
-
namespace chrome_lang_id {
|
28
|
-
namespace CLD2 {
|
29
|
-
|
30
|
-
static const int kMaxScriptBuffer = 40960;
|
31
|
-
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
32
|
-
static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
33
|
-
static const int kWithinScriptTail = 32; // Stop at word space in last
|
34
|
-
// N bytes of script buffer
|
35
|
-
|
36
|
-
struct LangSpan {
|
37
|
-
char* text = nullptr; // Pointer to the span, somewhere
|
38
|
-
int text_bytes = 0; // Number of bytes of text in the span
|
39
|
-
int offset = 0; // Offset of start of span in original input buffer
|
40
|
-
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
41
|
-
bool truncated = false; // true if buffer filled up before a
|
42
|
-
// different script or EOF was found
|
43
|
-
};
|
44
|
-
|
45
|
-
static inline bool IsContinuationByte(char c) {
|
46
|
-
return static_cast<signed char>(c) < -64;
|
47
|
-
}
|
48
|
-
|
49
|
-
// Gets lscript number for letters; always returns
|
50
|
-
// 0 (common script) for non-letters
|
51
|
-
int GetUTF8LetterScriptNum(const char* src);
|
52
|
-
|
53
|
-
// Update src pointer to point to next quadgram, +2..+5
|
54
|
-
// Looks at src[0..4]
|
55
|
-
const char* AdvanceQuad(const char* src);
|
56
|
-
|
57
|
-
// Utility routine to search alphabetical tables
|
58
|
-
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
|
59
|
-
|
60
|
-
// Returns the length in bytes of the prefix of src that is all
|
61
|
-
// interchange valid UTF-8
|
62
|
-
int SpanInterchangeValid(const char* src, int byte_length);
|
63
|
-
|
64
|
-
class ScriptScanner {
|
65
|
-
public:
|
66
|
-
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
67
|
-
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
|
68
|
-
bool any_text, bool any_script);
|
69
|
-
~ScriptScanner();
|
70
|
-
|
71
|
-
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
72
|
-
bool GetOneScriptSpan(LangSpan* span);
|
73
|
-
|
74
|
-
// Force Latin and Cyrillic scripts to be lowercase
|
75
|
-
void LowerScriptSpan(LangSpan* span);
|
76
|
-
|
77
|
-
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
78
|
-
// Force Latin and Cyrillic scripts to be lowercase
|
79
|
-
bool GetOneScriptSpanLower(LangSpan* span);
|
80
|
-
|
81
|
-
// Copy next run of non-tag characters to buffer [NUL terminated]
|
82
|
-
// This just removes tags and removes entities
|
83
|
-
// Buffer has leading space
|
84
|
-
bool GetOneTextSpan(LangSpan* span);
|
85
|
-
|
86
|
-
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
87
|
-
// span->text [0..text_bytes] into an additional byte offset from
|
88
|
-
// span->offset, to get back to corresponding text in the original
|
89
|
-
// input buffer.
|
90
|
-
// text_offset must be the first byte
|
91
|
-
// of a UTF-8 character, or just beyond the last character. Normally this
|
92
|
-
// routine is called with the first byte of an interesting range and
|
93
|
-
// again with the first byte of the following range.
|
94
|
-
int MapBack(int text_offset);
|
95
|
-
|
96
|
-
const char* GetBufferStart() {return start_byte_;}
|
97
|
-
|
98
|
-
private:
|
99
|
-
// Skip over tags and non-letters
|
100
|
-
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
101
|
-
|
102
|
-
const char* start_byte_; // Starting byte of buffer to scan
|
103
|
-
const char* next_byte_; // First unscanned byte
|
104
|
-
int byte_length_; // Bytes left
|
105
|
-
|
106
|
-
bool is_plain_text_; // true for text, false for HTML
|
107
|
-
char* script_buffer_; // Holds text with expanded entities
|
108
|
-
char* script_buffer_lower_; // Holds lowercased text
|
109
|
-
bool letters_marks_only_; // To distinguish scriptspan of one
|
110
|
-
// letters/marks vs. any mixture of text
|
111
|
-
bool one_script_only_; // To distinguish scriptspan of one
|
112
|
-
// script vs. any mixture of scripts
|
113
|
-
int exit_state_; // For tag parser kTagParseTbl_0, based
|
114
|
-
// on letters_marks_only_
|
115
|
-
public :
|
116
|
-
// Expose for debugging
|
117
|
-
OffsetMap map2original_; // map from script_buffer_ to buffer
|
118
|
-
OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
|
119
|
-
};
|
120
|
-
|
121
|
-
} // namespace CLD2
|
122
|
-
} // namespace chrome_lang_id
|
123
|
-
|
124
|
-
#endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
@@ -1,37 +0,0 @@
|
|
1
|
-
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
|
15
|
-
#ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
16
|
-
#define SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
17
|
-
|
18
|
-
// Cheap version
|
19
|
-
namespace chrome_lang_id {
|
20
|
-
namespace CLD2 {
|
21
|
-
|
22
|
-
typedef unsigned char uint8;
|
23
|
-
typedef unsigned short uint16;
|
24
|
-
typedef unsigned int uint32;
|
25
|
-
typedef unsigned long long int uint64;
|
26
|
-
|
27
|
-
typedef signed char int8;
|
28
|
-
typedef signed short int16;
|
29
|
-
typedef signed int int32;
|
30
|
-
typedef signed long long int int64;
|
31
|
-
|
32
|
-
typedef int32 char32;
|
33
|
-
|
34
|
-
} // End namespace CLD2
|
35
|
-
} // End namespace chrome_lang_id
|
36
|
-
|
37
|
-
#endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
@@ -1,168 +0,0 @@
|
|
1
|
-
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
|
15
|
-
//
|
16
|
-
// Author: dsites@google.com (Dick Sites)
|
17
|
-
//
|
18
|
-
|
19
|
-
#ifndef SCRIPT_SPAN_OFFSETMAP_H_
|
20
|
-
#define SCRIPT_SPAN_OFFSETMAP_H_
|
21
|
-
|
22
|
-
#include <string> // for string
|
23
|
-
|
24
|
-
#include "integral_types.h" // for uint32
|
25
|
-
|
26
|
-
// ***************************** OffsetMap **************************
|
27
|
-
//
|
28
|
-
// An OffsetMap object is a container for a mapping from offsets in one text
|
29
|
-
// buffer A' to offsets in another text buffer A. It is most useful when A' is
|
30
|
-
// built from A via substitutions that occasionally do not preserve byte length.
|
31
|
-
//
|
32
|
-
// A series of operators are used to build the correspondence map, then
|
33
|
-
// calls can be made to map an offset in A' to an offset in A, or vice versa.
|
34
|
-
// The map starts with offset 0 in A corresponding to offset 0 in A'.
|
35
|
-
// The mapping is then built sequentially, adding on byte ranges that are
|
36
|
-
// identical in A and A', byte ranges that are inserted in A', and byte ranges
|
37
|
-
// that are deleted from A. All bytes beyond those specified when building the
|
38
|
-
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
|
39
|
-
// end of the map.
|
40
|
-
//
|
41
|
-
// The internal data structure records positions at which bytes are added or
|
42
|
-
// deleted. Using the map is O(1) when increasing the A' or A offset
|
43
|
-
// monotonically, and O(n) when accessing random offsets, where n is the
|
44
|
-
// number of differences.
|
45
|
-
//
|
46
|
-
|
47
|
-
namespace chrome_lang_id {
|
48
|
-
namespace CLD2 {
|
49
|
-
|
50
|
-
class OffsetMap {
|
51
|
-
public:
|
52
|
-
// Constructor, destructor
|
53
|
-
OffsetMap();
|
54
|
-
~OffsetMap();
|
55
|
-
|
56
|
-
// Clear the map
|
57
|
-
void Clear();
|
58
|
-
|
59
|
-
// Add to mapping from A to A', specifying how many next bytes correspond
|
60
|
-
// in A and A'
|
61
|
-
void Copy(int bytes);
|
62
|
-
|
63
|
-
// Add to mapping from A to A', specifying how many next bytes are
|
64
|
-
// inserted in A' while not advancing in A at all
|
65
|
-
void Insert(int bytes);
|
66
|
-
|
67
|
-
// Add to mapping from A to A', specifying how many next bytes are
|
68
|
-
// deleted from A while not advancing in A' at all
|
69
|
-
void Delete(int bytes);
|
70
|
-
|
71
|
-
// [Finish building map,] Re-position to offset 0
|
72
|
-
// This call is optional; MapForward and MapBack finish building the map
|
73
|
-
// if necessary
|
74
|
-
void Reset();
|
75
|
-
|
76
|
-
// Map an offset in A' to the corresponding offset in A
|
77
|
-
int MapBack(int aprimeoffset);
|
78
|
-
|
79
|
-
// Map an offset in A to the corresponding offset in A'
|
80
|
-
int MapForward(int aoffset);
|
81
|
-
|
82
|
-
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
|
83
|
-
// from A' to A'' and h is from A to A''.
|
84
|
-
//
|
85
|
-
// Note that g->MapForward(f->MapForward(aoffset)) always equals
|
86
|
-
// to h->MapForward(aoffset), while
|
87
|
-
// f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
|
88
|
-
// to h->MapBack(aprimeprimeoffset). This happens when deletion in
|
89
|
-
// f and insertion in g are at the same place. For example,
|
90
|
-
//
|
91
|
-
// A 1 2 3 4
|
92
|
-
// ^ | ^ ^
|
93
|
-
// | | / | f
|
94
|
-
// v vv v
|
95
|
-
// A' 1' 2' 3'
|
96
|
-
// ^ ^^ ^
|
97
|
-
// | | \ | g
|
98
|
-
// v | v v
|
99
|
-
// A'' 1'' 2'' 3'' 4''
|
100
|
-
//
|
101
|
-
// results in:
|
102
|
-
//
|
103
|
-
// A 1 2 3 4
|
104
|
-
// ^ ^\ ^ ^
|
105
|
-
// | | \ | | h
|
106
|
-
// v | vv v
|
107
|
-
// A'' 1'' 2'' 3'' 4''
|
108
|
-
//
|
109
|
-
// 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
|
110
|
-
// the latter figure.
|
111
|
-
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
|
112
|
-
|
113
|
-
// For testing only -- force a mapping
|
114
|
-
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
|
115
|
-
|
116
|
-
private:
|
117
|
-
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
|
118
|
-
|
119
|
-
void Flush();
|
120
|
-
void FlushAll();
|
121
|
-
void MaybeFlushAll();
|
122
|
-
void Emit(MapOp op, int len);
|
123
|
-
|
124
|
-
void SetLeft();
|
125
|
-
void SetRight();
|
126
|
-
|
127
|
-
// Back up over previous range, 1..5 bytes
|
128
|
-
// Return subscript at the beginning of that. Pins at 0
|
129
|
-
int Backup(int sub);
|
130
|
-
|
131
|
-
// Parse next range, 1..5 bytes
|
132
|
-
// Return subscript just off the end of that
|
133
|
-
int ParseNext(int sub, MapOp* op, int* length);
|
134
|
-
|
135
|
-
// Parse previous range, 1..5 bytes
|
136
|
-
// Return current subscript
|
137
|
-
int ParsePrevious(int sub, MapOp* op, int* length);
|
138
|
-
|
139
|
-
bool MoveRight(); // Returns true if OK
|
140
|
-
bool MoveLeft(); // Returns true if OK
|
141
|
-
|
142
|
-
// Copies insert operations from source to dest. Returns true if no
|
143
|
-
// other operations are found.
|
144
|
-
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
|
145
|
-
|
146
|
-
// Copies delete operations from source to dest. Returns true if no other
|
147
|
-
// operations are found.
|
148
|
-
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
|
149
|
-
|
150
|
-
std::string diffs_;
|
151
|
-
MapOp pending_op_;
|
152
|
-
uint32 pending_length_;
|
153
|
-
|
154
|
-
// Offsets in the ranges below correspond to each other, with A' = A + diff
|
155
|
-
int next_diff_sub_;
|
156
|
-
int current_lo_aoffset_;
|
157
|
-
int current_hi_aoffset_;
|
158
|
-
int current_lo_aprimeoffset_;
|
159
|
-
int current_hi_aprimeoffset_;
|
160
|
-
int current_diff_;
|
161
|
-
int max_aoffset_;
|
162
|
-
int max_aprimeoffset_;
|
163
|
-
};
|
164
|
-
|
165
|
-
} // namespace CLD2
|
166
|
-
} // namespace chrome_lang_id
|
167
|
-
|
168
|
-
#endif // SCRIPT_SPAN_OFFSETMAP_H_
|
data/ext/cld3/script_span/port.h
DELETED
@@ -1,143 +0,0 @@
|
|
1
|
-
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
|
15
|
-
//
|
16
|
-
// These are weird things we need to do to get this compiling on
|
17
|
-
// random systems [subset].
|
18
|
-
|
19
|
-
#ifndef SCRIPT_SPAN_PORT_H_
|
20
|
-
#define SCRIPT_SPAN_PORT_H_
|
21
|
-
|
22
|
-
#include <string.h> // for memcpy()
|
23
|
-
|
24
|
-
#include "integral_types.h"
|
25
|
-
|
26
|
-
namespace chrome_lang_id {
|
27
|
-
namespace CLD2 {
|
28
|
-
|
29
|
-
// Portable handling of unaligned loads, stores, and copies.
|
30
|
-
// On some platforms, like ARM, the copy functions can be more efficient
|
31
|
-
// than a load and a store.
|
32
|
-
|
33
|
-
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
|
34
|
-
|
35
|
-
// x86 and x86-64 can perform unaligned loads/stores directly;
|
36
|
-
// modern PowerPC hardware can also do unaligned integer loads and stores;
|
37
|
-
// but note: the FPU still sends unaligned loads and stores to a trap handler!
|
38
|
-
|
39
|
-
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
40
|
-
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
41
|
-
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
|
42
|
-
|
43
|
-
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
44
|
-
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
45
|
-
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
|
46
|
-
|
47
|
-
#elif defined(__arm__) && \
|
48
|
-
!defined(__ARM_ARCH_5__) && \
|
49
|
-
!defined(__ARM_ARCH_5T__) && \
|
50
|
-
!defined(__ARM_ARCH_5TE__) && \
|
51
|
-
!defined(__ARM_ARCH_5TEJ__) && \
|
52
|
-
!defined(__ARM_ARCH_6__) && \
|
53
|
-
!defined(__ARM_ARCH_6J__) && \
|
54
|
-
!defined(__ARM_ARCH_6K__) && \
|
55
|
-
!defined(__ARM_ARCH_6Z__) && \
|
56
|
-
!defined(__ARM_ARCH_6ZK__) && \
|
57
|
-
!defined(__ARM_ARCH_6T2__) && \
|
58
|
-
!defined(__ARM_ARCH_7__) && \
|
59
|
-
!defined(__ARM_ARCH_7A__) && \
|
60
|
-
!defined(__ARM_ARCH_7M__) && \
|
61
|
-
!defined(__ARM_ARCH_7R__) && \
|
62
|
-
!defined(__ARM_ARCH_8__) && \
|
63
|
-
!defined(__ARM_ARCH_8A__)
|
64
|
-
|
65
|
-
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
|
66
|
-
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
|
67
|
-
// do an unaligned read and rotate the words around a bit, or do the reads very
|
68
|
-
// slowly (trip through kernel mode). There's no simple #define that says just
|
69
|
-
// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
|
70
|
-
// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_UNALIGNED #define,
|
71
|
-
// so in time, maybe we can move on to that.
|
72
|
-
//
|
73
|
-
// Note that even if a chipset supports unaligned access, it might not be
|
74
|
-
// enabled in any given system, e.g.:
|
75
|
-
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
|
76
|
-
// Therefore, it's generally just not safe to allow unaligned access on any ARM
|
77
|
-
// variant.
|
78
|
-
//
|
79
|
-
// This is a mess, but there's not much we can do about it.
|
80
|
-
|
81
|
-
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
82
|
-
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
83
|
-
|
84
|
-
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
85
|
-
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
86
|
-
|
87
|
-
// TODO(sesse): NEON supports unaligned 64-bit loads and stores.
|
88
|
-
// See if that would be more efficient on platforms supporting it,
|
89
|
-
// at least for copies.
|
90
|
-
|
91
|
-
inline uint64 UNALIGNED_LOAD64(const void *p) {
|
92
|
-
uint64 t;
|
93
|
-
memcpy(&t, p, sizeof t);
|
94
|
-
return t;
|
95
|
-
}
|
96
|
-
|
97
|
-
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
98
|
-
memcpy(p, &v, sizeof v);
|
99
|
-
}
|
100
|
-
|
101
|
-
#else
|
102
|
-
|
103
|
-
#define NEED_ALIGNED_LOADS
|
104
|
-
|
105
|
-
// These functions are provided for architectures that don't support
|
106
|
-
// unaligned loads and stores.
|
107
|
-
|
108
|
-
inline uint16 UNALIGNED_LOAD16(const void *p) {
|
109
|
-
uint16 t;
|
110
|
-
memcpy(&t, p, sizeof t);
|
111
|
-
return t;
|
112
|
-
}
|
113
|
-
|
114
|
-
inline uint32 UNALIGNED_LOAD32(const void *p) {
|
115
|
-
uint32 t;
|
116
|
-
memcpy(&t, p, sizeof t);
|
117
|
-
return t;
|
118
|
-
}
|
119
|
-
|
120
|
-
inline uint64 UNALIGNED_LOAD64(const void *p) {
|
121
|
-
uint64 t;
|
122
|
-
memcpy(&t, p, sizeof t);
|
123
|
-
return t;
|
124
|
-
}
|
125
|
-
|
126
|
-
inline void UNALIGNED_STORE16(void *p, uint16 v) {
|
127
|
-
memcpy(p, &v, sizeof v);
|
128
|
-
}
|
129
|
-
|
130
|
-
inline void UNALIGNED_STORE32(void *p, uint32 v) {
|
131
|
-
memcpy(p, &v, sizeof v);
|
132
|
-
}
|
133
|
-
|
134
|
-
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
135
|
-
memcpy(p, &v, sizeof v);
|
136
|
-
}
|
137
|
-
|
138
|
-
#endif
|
139
|
-
|
140
|
-
} // End namespace CLD2
|
141
|
-
} // End namespace chrome_lang_id
|
142
|
-
|
143
|
-
#endif // SCRIPT_SPAN_PORT_H_
|