cld3 3.4.4 → 3.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +0 -1
- data/README.md +4 -7
- data/cld3.gemspec +5 -5
- data/ext/cld3/Makefile +17 -16
- data/ext/cld3/base.o +0 -0
- data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
- data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
- data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
- data/ext/cld3/embedding_feature_extractor.o +0 -0
- data/ext/cld3/embedding_network.o +0 -0
- data/ext/cld3/extconf.rb +1 -10
- data/ext/cld3/feature_extractor.o +0 -0
- data/ext/cld3/feature_types.o +0 -0
- data/ext/cld3/fixunicodevalue.o +0 -0
- data/ext/cld3/fml_parser.o +0 -0
- data/ext/cld3/generated_entities.o +0 -0
- data/ext/cld3/generated_ulscript.o +0 -0
- data/ext/cld3/getonescriptspan.cc +0 -2
- data/ext/cld3/getonescriptspan.o +0 -0
- data/ext/cld3/lang_id_nn_params.o +0 -0
- data/ext/cld3/language_identifier_features.o +0 -0
- data/ext/cld3/libcld3.so +0 -0
- data/ext/cld3/nnet_language_identifier.o +0 -0
- data/ext/cld3/nnet_language_identifier_c.o +0 -0
- data/ext/cld3/offsetmap.o +0 -0
- data/ext/cld3/registry.o +0 -0
- data/ext/cld3/relevant_script_feature.o +0 -0
- data/ext/cld3/script_span/fixunicodevalue.h +69 -0
- data/ext/cld3/script_span/generated_ulscript.h +142 -0
- data/ext/cld3/script_span/getonescriptspan.h +124 -0
- data/ext/cld3/script_span/integral_types.h +37 -0
- data/ext/cld3/script_span/offsetmap.h +168 -0
- data/ext/cld3/script_span/port.h +143 -0
- data/ext/cld3/script_span/stringpiece.h +81 -0
- data/ext/cld3/script_span/text_processing.h +30 -0
- data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
- data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/script_span/utf8statetable.h +285 -0
- data/ext/cld3/sentence_features.o +0 -0
- data/ext/cld3/task_context.o +0 -0
- data/ext/cld3/task_context_params.o +0 -0
- data/ext/cld3/text_processing.o +0 -0
- data/ext/cld3/unicodetext.o +0 -0
- data/ext/cld3/utf8statetable.o +0 -0
- data/ext/cld3/utils.o +0 -0
- data/ext/cld3/workspace.o +0 -0
- data/lib/cld3.rb +4 -1
- metadata +33 -25
- data/ext/cld3/feature_extractor.pb.o +0 -0
- data/ext/cld3/feature_extractor.proto +0 -50
- data/ext/cld3/mkmf.log +0 -37
- data/ext/cld3/sentence.pb.o +0 -0
- data/ext/cld3/sentence.proto +0 -77
- data/ext/cld3/task_spec.pb.o +0 -0
- data/ext/cld3/task_spec.proto +0 -98
- data/lib/a.rb +0 -24
@@ -0,0 +1,142 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
// generated_ulscript.h
|
16
|
+
// Machine generated. Do Not Edit.
|
17
|
+
//
|
18
|
+
// Declarations for scripts recognized by CLD2
|
19
|
+
//
|
20
|
+
|
21
|
+
#ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
22
|
+
#define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
23
|
+
|
24
|
+
namespace chrome_lang_id {
|
25
|
+
namespace CLD2 {
|
26
|
+
|
27
|
+
typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
|
28
|
+
|
29
|
+
typedef struct {const char* s; int i;} CharIntPair;
|
30
|
+
|
31
|
+
typedef enum {
|
32
|
+
ULScript_Common = 0, // Zyyy
|
33
|
+
ULScript_Latin = 1, // Latn
|
34
|
+
ULScript_Greek = 2, // Grek
|
35
|
+
ULScript_Cyrillic = 3, // Cyrl
|
36
|
+
ULScript_Armenian = 4, // Armn
|
37
|
+
ULScript_Hebrew = 5, // Hebr
|
38
|
+
ULScript_Arabic = 6, // Arab
|
39
|
+
ULScript_Syriac = 7, // Syrc
|
40
|
+
ULScript_Thaana = 8, // Thaa
|
41
|
+
ULScript_Devanagari = 9, // Deva
|
42
|
+
ULScript_Bengali = 10, // Beng
|
43
|
+
ULScript_Gurmukhi = 11, // Guru
|
44
|
+
ULScript_Gujarati = 12, // Gujr
|
45
|
+
ULScript_Oriya = 13, // Orya
|
46
|
+
ULScript_Tamil = 14, // Taml
|
47
|
+
ULScript_Telugu = 15, // Telu
|
48
|
+
ULScript_Kannada = 16, // Knda
|
49
|
+
ULScript_Malayalam = 17, // Mlym
|
50
|
+
ULScript_Sinhala = 18, // Sinh
|
51
|
+
ULScript_Thai = 19, // Thai
|
52
|
+
ULScript_Lao = 20, // Laoo
|
53
|
+
ULScript_Tibetan = 21, // Tibt
|
54
|
+
ULScript_Myanmar = 22, // Mymr
|
55
|
+
ULScript_Georgian = 23, // Geor
|
56
|
+
ULScript_Hani = 24, // Hani
|
57
|
+
ULScript_Ethiopic = 25, // Ethi
|
58
|
+
ULScript_Cherokee = 26, // Cher
|
59
|
+
ULScript_Canadian_Aboriginal = 27, // Cans
|
60
|
+
ULScript_Ogham = 28, // Ogam
|
61
|
+
ULScript_Runic = 29, // Runr
|
62
|
+
ULScript_Khmer = 30, // Khmr
|
63
|
+
ULScript_Mongolian = 31, // Mong
|
64
|
+
ULScript_32 = 32, //
|
65
|
+
ULScript_33 = 33, //
|
66
|
+
ULScript_Bopomofo = 34, // Bopo
|
67
|
+
ULScript_35 = 35, //
|
68
|
+
ULScript_Yi = 36, // Yiii
|
69
|
+
ULScript_Old_Italic = 37, // Ital
|
70
|
+
ULScript_Gothic = 38, // Goth
|
71
|
+
ULScript_Deseret = 39, // Dsrt
|
72
|
+
ULScript_Inherited = 40, // Zinh
|
73
|
+
ULScript_Tagalog = 41, // Tglg
|
74
|
+
ULScript_Hanunoo = 42, // Hano
|
75
|
+
ULScript_Buhid = 43, // Buhd
|
76
|
+
ULScript_Tagbanwa = 44, // Tagb
|
77
|
+
ULScript_Limbu = 45, // Limb
|
78
|
+
ULScript_Tai_Le = 46, // Tale
|
79
|
+
ULScript_Linear_B = 47, // Linb
|
80
|
+
ULScript_Ugaritic = 48, // Ugar
|
81
|
+
ULScript_Shavian = 49, // Shaw
|
82
|
+
ULScript_Osmanya = 50, // Osma
|
83
|
+
ULScript_Cypriot = 51, // Cprt
|
84
|
+
ULScript_Braille = 52, // Brai
|
85
|
+
ULScript_Buginese = 53, // Bugi
|
86
|
+
ULScript_Coptic = 54, // Copt
|
87
|
+
ULScript_New_Tai_Lue = 55, // Talu
|
88
|
+
ULScript_Glagolitic = 56, // Glag
|
89
|
+
ULScript_Tifinagh = 57, // Tfng
|
90
|
+
ULScript_Syloti_Nagri = 58, // Sylo
|
91
|
+
ULScript_Old_Persian = 59, // Xpeo
|
92
|
+
ULScript_Kharoshthi = 60, // Khar
|
93
|
+
ULScript_Balinese = 61, // Bali
|
94
|
+
ULScript_Cuneiform = 62, // Xsux
|
95
|
+
ULScript_Phoenician = 63, // Phnx
|
96
|
+
ULScript_Phags_Pa = 64, // Phag
|
97
|
+
ULScript_Nko = 65, // Nkoo
|
98
|
+
ULScript_Sundanese = 66, // Sund
|
99
|
+
ULScript_Lepcha = 67, // Lepc
|
100
|
+
ULScript_Ol_Chiki = 68, // Olck
|
101
|
+
ULScript_Vai = 69, // Vaii
|
102
|
+
ULScript_Saurashtra = 70, // Saur
|
103
|
+
ULScript_Kayah_Li = 71, // Kali
|
104
|
+
ULScript_Rejang = 72, // Rjng
|
105
|
+
ULScript_Lycian = 73, // Lyci
|
106
|
+
ULScript_Carian = 74, // Cari
|
107
|
+
ULScript_Lydian = 75, // Lydi
|
108
|
+
ULScript_Cham = 76, // Cham
|
109
|
+
ULScript_Tai_Tham = 77, // Lana
|
110
|
+
ULScript_Tai_Viet = 78, // Tavt
|
111
|
+
ULScript_Avestan = 79, // Avst
|
112
|
+
ULScript_Egyptian_Hieroglyphs = 80, // Egyp
|
113
|
+
ULScript_Samaritan = 81, // Samr
|
114
|
+
ULScript_Lisu = 82, // Lisu
|
115
|
+
ULScript_Bamum = 83, // Bamu
|
116
|
+
ULScript_Javanese = 84, // Java
|
117
|
+
ULScript_Meetei_Mayek = 85, // Mtei
|
118
|
+
ULScript_Imperial_Aramaic = 86, // Armi
|
119
|
+
ULScript_Old_South_Arabian = 87, // Sarb
|
120
|
+
ULScript_Inscriptional_Parthian = 88, // Prti
|
121
|
+
ULScript_Inscriptional_Pahlavi = 89, // Phli
|
122
|
+
ULScript_Old_Turkic = 90, // Orkh
|
123
|
+
ULScript_Kaithi = 91, // Kthi
|
124
|
+
ULScript_Batak = 92, // Batk
|
125
|
+
ULScript_Brahmi = 93, // Brah
|
126
|
+
ULScript_Mandaic = 94, // Mand
|
127
|
+
ULScript_Chakma = 95, // Cakm
|
128
|
+
ULScript_Meroitic_Cursive = 96, // Merc
|
129
|
+
ULScript_Meroitic_Hieroglyphs = 97, // Mero
|
130
|
+
ULScript_Miao = 98, // Plrd
|
131
|
+
ULScript_Sharada = 99, // Shrd
|
132
|
+
ULScript_Sora_Sompeng = 100, // Sora
|
133
|
+
ULScript_Takri = 101, // Takr
|
134
|
+
NUM_ULSCRIPTS
|
135
|
+
} ULScript;
|
136
|
+
|
137
|
+
#define UNKNOWN_ULSCRIPT ULScript_Common
|
138
|
+
|
139
|
+
} // namespace CLD2
|
140
|
+
} // namespace chrome_lang_id
|
141
|
+
|
142
|
+
#endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
|
@@ -0,0 +1,124 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Author: dsites@google.com (Dick Sites)
|
17
|
+
//
|
18
|
+
|
19
|
+
|
20
|
+
#ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
21
|
+
#define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
22
|
+
|
23
|
+
#include "generated_ulscript.h"
|
24
|
+
#include "integral_types.h"
|
25
|
+
#include "offsetmap.h"
|
26
|
+
|
27
|
+
namespace chrome_lang_id {
|
28
|
+
namespace CLD2 {
|
29
|
+
|
30
|
+
static const int kMaxScriptBuffer = 40960;
|
31
|
+
static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
|
32
|
+
static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
|
33
|
+
static const int kWithinScriptTail = 32; // Stop at word space in last
|
34
|
+
// N bytes of script buffer
|
35
|
+
|
36
|
+
struct LangSpan {
|
37
|
+
char* text = nullptr; // Pointer to the span, somewhere
|
38
|
+
int text_bytes = 0; // Number of bytes of text in the span
|
39
|
+
int offset = 0; // Offset of start of span in original input buffer
|
40
|
+
ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
|
41
|
+
bool truncated = false; // true if buffer filled up before a
|
42
|
+
// different script or EOF was found
|
43
|
+
};
|
44
|
+
|
45
|
+
static inline bool IsContinuationByte(char c) {
|
46
|
+
return static_cast<signed char>(c) < -64;
|
47
|
+
}
|
48
|
+
|
49
|
+
// Gets lscript number for letters; always returns
|
50
|
+
// 0 (common script) for non-letters
|
51
|
+
int GetUTF8LetterScriptNum(const char* src);
|
52
|
+
|
53
|
+
// Update src pointer to point to next quadgram, +2..+5
|
54
|
+
// Looks at src[0..4]
|
55
|
+
const char* AdvanceQuad(const char* src);
|
56
|
+
|
57
|
+
// Utility routine to search alphabetical tables
|
58
|
+
int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
|
59
|
+
|
60
|
+
// Returns the length in bytes of the prefix of src that is all
|
61
|
+
// interchange valid UTF-8
|
62
|
+
int SpanInterchangeValid(const char* src, int byte_length);
|
63
|
+
|
64
|
+
class ScriptScanner {
|
65
|
+
public:
|
66
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
|
67
|
+
ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
|
68
|
+
bool any_text, bool any_script);
|
69
|
+
~ScriptScanner();
|
70
|
+
|
71
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
72
|
+
bool GetOneScriptSpan(LangSpan* span);
|
73
|
+
|
74
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
75
|
+
void LowerScriptSpan(LangSpan* span);
|
76
|
+
|
77
|
+
// Copy next run of same-script non-tag letters to buffer [NUL terminated]
|
78
|
+
// Force Latin and Cyrillic scripts to be lowercase
|
79
|
+
bool GetOneScriptSpanLower(LangSpan* span);
|
80
|
+
|
81
|
+
// Copy next run of non-tag characters to buffer [NUL terminated]
|
82
|
+
// This just removes tags and removes entities
|
83
|
+
// Buffer has leading space
|
84
|
+
bool GetOneTextSpan(LangSpan* span);
|
85
|
+
|
86
|
+
// Maps byte offset in most recent GetOneScriptSpan/Lower
|
87
|
+
// span->text [0..text_bytes] into an additional byte offset from
|
88
|
+
// span->offset, to get back to corresponding text in the original
|
89
|
+
// input buffer.
|
90
|
+
// text_offset must be the first byte
|
91
|
+
// of a UTF-8 character, or just beyond the last character. Normally this
|
92
|
+
// routine is called with the first byte of an interesting range and
|
93
|
+
// again with the first byte of the following range.
|
94
|
+
int MapBack(int text_offset);
|
95
|
+
|
96
|
+
const char* GetBufferStart() {return start_byte_;}
|
97
|
+
|
98
|
+
private:
|
99
|
+
// Skip over tags and non-letters
|
100
|
+
int SkipToFrontOfSpan(const char* src, int len, int* script);
|
101
|
+
|
102
|
+
const char* start_byte_; // Starting byte of buffer to scan
|
103
|
+
const char* next_byte_; // First unscanned byte
|
104
|
+
int byte_length_; // Bytes left
|
105
|
+
|
106
|
+
bool is_plain_text_; // true for plain text, false for HTML
|
107
|
+
char* script_buffer_; // Holds text with expanded entities
|
108
|
+
char* script_buffer_lower_; // Holds lowercased text
|
109
|
+
bool letters_marks_only_; // To distinguish scriptspan of one
|
110
|
+
// letters/marks vs. any mixture of text
|
111
|
+
bool one_script_only_; // To distinguish scriptspan of one
|
112
|
+
// script vs. any mixture of scripts
|
113
|
+
int exit_state_; // For tag parser kTagParseTbl_0, based
|
114
|
+
// on letters_marks_only_
|
115
|
+
public :
|
116
|
+
// Expose for debugging
|
117
|
+
OffsetMap map2original_; // map from script_buffer_ to buffer
|
118
|
+
OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
|
119
|
+
};
|
120
|
+
|
121
|
+
} // namespace CLD2
|
122
|
+
} // namespace chrome_lang_id
|
123
|
+
|
124
|
+
#endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_
|
@@ -0,0 +1,37 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
#ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
16
|
+
#define SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
17
|
+
|
18
|
+
// Cheap version
|
19
|
+
namespace chrome_lang_id {
|
20
|
+
namespace CLD2 {
|
21
|
+
|
22
|
+
typedef unsigned char uint8;
|
23
|
+
typedef unsigned short uint16;
|
24
|
+
typedef unsigned int uint32;
|
25
|
+
typedef unsigned long long int uint64;
|
26
|
+
|
27
|
+
typedef signed char int8;
|
28
|
+
typedef signed short int16;
|
29
|
+
typedef signed int int32;
|
30
|
+
typedef signed long long int int64;
|
31
|
+
|
32
|
+
typedef int32 char32;
|
33
|
+
|
34
|
+
} // End namespace CLD2
|
35
|
+
} // End namespace chrome_lang_id
|
36
|
+
|
37
|
+
#endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_
|
@@ -0,0 +1,168 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Author: dsites@google.com (Dick Sites)
|
17
|
+
//
|
18
|
+
|
19
|
+
#ifndef SCRIPT_SPAN_OFFSETMAP_H_
|
20
|
+
#define SCRIPT_SPAN_OFFSETMAP_H_
|
21
|
+
|
22
|
+
#include <string> // for string
|
23
|
+
|
24
|
+
#include "integral_types.h" // for uint32
|
25
|
+
|
26
|
+
// ***************************** OffsetMap **************************
|
27
|
+
//
|
28
|
+
// An OffsetMap object is a container for a mapping from offsets in one text
|
29
|
+
// buffer A' to offsets in another text buffer A. It is most useful when A' is
|
30
|
+
// built from A via substitutions that occasionally do not preserve byte length.
|
31
|
+
//
|
32
|
+
// A series of operators are used to build the correspondence map, then
|
33
|
+
// calls can be made to map an offset in A' to an offset in A, or vice versa.
|
34
|
+
// The map starts with offset 0 in A corresponding to offset 0 in A'.
|
35
|
+
// The mapping is then built sequentially, adding on byte ranges that are
|
36
|
+
// identical in A and A', byte ranges that are inserted in A', and byte ranges
|
37
|
+
// that are deleted from A. All bytes beyond those specified when building the
|
38
|
+
// map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
|
39
|
+
// end of the map.
|
40
|
+
//
|
41
|
+
// The internal data structure records positions at which bytes are added or
|
42
|
+
// deleted. Using the map is O(1) when increasing the A' or A offset
|
43
|
+
// monotonically, and O(n) when accessing random offsets, where n is the
|
44
|
+
// number of differences.
|
45
|
+
//
|
46
|
+
|
47
|
+
namespace chrome_lang_id {
|
48
|
+
namespace CLD2 {
|
49
|
+
|
50
|
+
class OffsetMap {
|
51
|
+
public:
|
52
|
+
// Constructor, destructor
|
53
|
+
OffsetMap();
|
54
|
+
~OffsetMap();
|
55
|
+
|
56
|
+
// Clear the map
|
57
|
+
void Clear();
|
58
|
+
|
59
|
+
// Add to mapping from A to A', specifying how many next bytes correspond
|
60
|
+
// in A and A'
|
61
|
+
void Copy(int bytes);
|
62
|
+
|
63
|
+
// Add to mapping from A to A', specifying how many next bytes are
|
64
|
+
// inserted in A' while not advancing in A at all
|
65
|
+
void Insert(int bytes);
|
66
|
+
|
67
|
+
// Add to mapping from A to A', specifying how many next bytes are
|
68
|
+
// deleted from A while not advancing in A' at all
|
69
|
+
void Delete(int bytes);
|
70
|
+
|
71
|
+
// [Finish building map,] Re-position to offset 0
|
72
|
+
// This call is optional; MapForward and MapBack finish building the map
|
73
|
+
// if necessary
|
74
|
+
void Reset();
|
75
|
+
|
76
|
+
// Map an offset in A' to the corresponding offset in A
|
77
|
+
int MapBack(int aprimeoffset);
|
78
|
+
|
79
|
+
// Map an offset in A to the corresponding offset in A'
|
80
|
+
int MapForward(int aoffset);
|
81
|
+
|
82
|
+
// h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
|
83
|
+
// from A' to A'' and h is from A to A''.
|
84
|
+
//
|
85
|
+
// Note that g->MapForward(f->MapForward(aoffset)) always equals
|
86
|
+
// h->MapForward(aoffset), while
|
87
|
+
// f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
|
88
|
+
// h->MapBack(aprimeprimeoffset). This happens when deletion in
|
89
|
+
// f and insertion in g are at the same place. For example,
|
90
|
+
//
|
91
|
+
// A 1 2 3 4
|
92
|
+
// ^ | ^ ^
|
93
|
+
// | | / | f
|
94
|
+
// v vv v
|
95
|
+
// A' 1' 2' 3'
|
96
|
+
// ^ ^^ ^
|
97
|
+
// | | \ | g
|
98
|
+
// v | v v
|
99
|
+
// A'' 1'' 2'' 3'' 4''
|
100
|
+
//
|
101
|
+
// results in:
|
102
|
+
//
|
103
|
+
// A 1 2 3 4
|
104
|
+
// ^ ^\ ^ ^
|
105
|
+
// | | \ | | h
|
106
|
+
// v | vv v
|
107
|
+
// A'' 1'' 2'' 3'' 4''
|
108
|
+
//
|
109
|
+
// 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
|
110
|
+
// the latter figure.
|
111
|
+
static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
|
112
|
+
|
113
|
+
// For testing only -- force a mapping
|
114
|
+
void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
|
115
|
+
|
116
|
+
private:
|
117
|
+
enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
|
118
|
+
|
119
|
+
void Flush();
|
120
|
+
void FlushAll();
|
121
|
+
void MaybeFlushAll();
|
122
|
+
void Emit(MapOp op, int len);
|
123
|
+
|
124
|
+
void SetLeft();
|
125
|
+
void SetRight();
|
126
|
+
|
127
|
+
// Back up over previous range, 1..5 bytes
|
128
|
+
// Return subscript at the beginning of that. Pins at 0
|
129
|
+
int Backup(int sub);
|
130
|
+
|
131
|
+
// Parse next range, 1..5 bytes
|
132
|
+
// Return subscript just off the end of that
|
133
|
+
int ParseNext(int sub, MapOp* op, int* length);
|
134
|
+
|
135
|
+
// Parse previous range, 1..5 bytes
|
136
|
+
// Return current subscript
|
137
|
+
int ParsePrevious(int sub, MapOp* op, int* length);
|
138
|
+
|
139
|
+
bool MoveRight(); // Returns true if OK
|
140
|
+
bool MoveLeft(); // Returns true if OK
|
141
|
+
|
142
|
+
// Copies insert operations from source to dest. Returns true if no
|
143
|
+
// other operations are found.
|
144
|
+
static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
|
145
|
+
|
146
|
+
// Copies delete operations from source to dest. Returns true if no other
|
147
|
+
// operations are found.
|
148
|
+
static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
|
149
|
+
|
150
|
+
std::string diffs_;
|
151
|
+
MapOp pending_op_;
|
152
|
+
uint32 pending_length_;
|
153
|
+
|
154
|
+
// Offsets in the ranges below correspond to each other, with A' = A + diff
|
155
|
+
int next_diff_sub_;
|
156
|
+
int current_lo_aoffset_;
|
157
|
+
int current_hi_aoffset_;
|
158
|
+
int current_lo_aprimeoffset_;
|
159
|
+
int current_hi_aprimeoffset_;
|
160
|
+
int current_diff_;
|
161
|
+
int max_aoffset_;
|
162
|
+
int max_aprimeoffset_;
|
163
|
+
};
|
164
|
+
|
165
|
+
} // namespace CLD2
|
166
|
+
} // namespace chrome_lang_id
|
167
|
+
|
168
|
+
#endif // SCRIPT_SPAN_OFFSETMAP_H_
|
@@ -0,0 +1,143 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// These are weird things we need to do to get this compiling on
|
17
|
+
// random systems [subset].
|
18
|
+
|
19
|
+
#ifndef SCRIPT_SPAN_PORT_H_
|
20
|
+
#define SCRIPT_SPAN_PORT_H_
|
21
|
+
|
22
|
+
#include <string.h> // for memcpy()
|
23
|
+
|
24
|
+
#include "integral_types.h"
|
25
|
+
|
26
|
+
namespace chrome_lang_id {
|
27
|
+
namespace CLD2 {
|
28
|
+
|
29
|
+
// Portable handling of unaligned loads, stores, and copies.
|
30
|
+
// On some platforms, like ARM, the copy functions can be more efficient
|
31
|
+
// than a load and a store.
|
32
|
+
|
33
|
+
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
|
34
|
+
|
35
|
+
// x86 and x86-64 can perform unaligned loads/stores directly;
|
36
|
+
// modern PowerPC hardware can also do unaligned integer loads and stores;
|
37
|
+
// but note: the FPU still sends unaligned loads and stores to a trap handler!
|
38
|
+
|
39
|
+
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
40
|
+
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
41
|
+
#define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
|
42
|
+
|
43
|
+
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
44
|
+
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
45
|
+
#define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
|
46
|
+
|
47
|
+
#elif defined(__arm__) && \
|
48
|
+
!defined(__ARM_ARCH_5__) && \
|
49
|
+
!defined(__ARM_ARCH_5T__) && \
|
50
|
+
!defined(__ARM_ARCH_5TE__) && \
|
51
|
+
!defined(__ARM_ARCH_5TEJ__) && \
|
52
|
+
!defined(__ARM_ARCH_6__) && \
|
53
|
+
!defined(__ARM_ARCH_6J__) && \
|
54
|
+
!defined(__ARM_ARCH_6K__) && \
|
55
|
+
!defined(__ARM_ARCH_6Z__) && \
|
56
|
+
!defined(__ARM_ARCH_6ZK__) && \
|
57
|
+
!defined(__ARM_ARCH_6T2__) && \
|
58
|
+
!defined(__ARM_ARCH_7__) && \
|
59
|
+
!defined(__ARM_ARCH_7A__) && \
|
60
|
+
!defined(__ARM_ARCH_7M__) && \
|
61
|
+
!defined(__ARM_ARCH_7R__) && \
|
62
|
+
!defined(__ARM_ARCH_8__) && \
|
63
|
+
!defined(__ARM_ARCH_8A__)
|
64
|
+
|
65
|
+
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
|
66
|
+
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
|
67
|
+
// do an unaligned read and rotate the words around a bit, or do the reads very
|
68
|
+
// slowly (trip through kernel mode). There's no simple #define that says just
|
69
|
+
// “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
|
70
|
+
// sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_UNALIGNED #define,
|
71
|
+
// so in time, maybe we can move on to that.
|
72
|
+
//
|
73
|
+
// Note that even if a chipset supports unaligned access, it might not be
|
74
|
+
// enabled in any given system, e.g.:
|
75
|
+
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
|
76
|
+
// Therefore, it's generally just not safe to allow unaligned access on any ARM
|
77
|
+
// variant.
|
78
|
+
//
|
79
|
+
// This is a mess, but there's not much we can do about it.
|
80
|
+
|
81
|
+
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
|
82
|
+
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
|
83
|
+
|
84
|
+
#define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
|
85
|
+
#define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
|
86
|
+
|
87
|
+
// TODO(sesse): NEON supports unaligned 64-bit loads and stores.
|
88
|
+
// See if that would be more efficient on platforms supporting it,
|
89
|
+
// at least for copies.
|
90
|
+
|
91
|
+
inline uint64 UNALIGNED_LOAD64(const void *p) {
|
92
|
+
uint64 t;
|
93
|
+
memcpy(&t, p, sizeof t);
|
94
|
+
return t;
|
95
|
+
}
|
96
|
+
|
97
|
+
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
98
|
+
memcpy(p, &v, sizeof v);
|
99
|
+
}
|
100
|
+
|
101
|
+
#else
|
102
|
+
|
103
|
+
#define NEED_ALIGNED_LOADS
|
104
|
+
|
105
|
+
// These functions are provided for architectures that don't support
|
106
|
+
// unaligned loads and stores.
|
107
|
+
|
108
|
+
inline uint16 UNALIGNED_LOAD16(const void *p) {
|
109
|
+
uint16 t;
|
110
|
+
memcpy(&t, p, sizeof t);
|
111
|
+
return t;
|
112
|
+
}
|
113
|
+
|
114
|
+
inline uint32 UNALIGNED_LOAD32(const void *p) {
|
115
|
+
uint32 t;
|
116
|
+
memcpy(&t, p, sizeof t);
|
117
|
+
return t;
|
118
|
+
}
|
119
|
+
|
120
|
+
inline uint64 UNALIGNED_LOAD64(const void *p) {
|
121
|
+
uint64 t;
|
122
|
+
memcpy(&t, p, sizeof t);
|
123
|
+
return t;
|
124
|
+
}
|
125
|
+
|
126
|
+
inline void UNALIGNED_STORE16(void *p, uint16 v) {
|
127
|
+
memcpy(p, &v, sizeof v);
|
128
|
+
}
|
129
|
+
|
130
|
+
inline void UNALIGNED_STORE32(void *p, uint32 v) {
|
131
|
+
memcpy(p, &v, sizeof v);
|
132
|
+
}
|
133
|
+
|
134
|
+
inline void UNALIGNED_STORE64(void *p, uint64 v) {
|
135
|
+
memcpy(p, &v, sizeof v);
|
136
|
+
}
|
137
|
+
|
138
|
+
#endif
|
139
|
+
|
140
|
+
} // End namespace CLD2
|
141
|
+
} // End namespace chrome_lang_id
|
142
|
+
|
143
|
+
#endif // SCRIPT_SPAN_PORT_H_
|