cld3 3.4.4 → 3.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -1
  3. data/README.md +4 -7
  4. data/cld3.gemspec +5 -5
  5. data/ext/cld3/Makefile +17 -16
  6. data/ext/cld3/base.o +0 -0
  7. data/ext/cld3/cld_3/protos/feature_extractor.pb.h +100 -0
  8. data/ext/cld3/cld_3/protos/sentence.pb.h +35 -0
  9. data/ext/cld3/cld_3/protos/task_spec.pb.h +106 -0
  10. data/ext/cld3/embedding_feature_extractor.o +0 -0
  11. data/ext/cld3/embedding_network.o +0 -0
  12. data/ext/cld3/extconf.rb +1 -10
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.cc +0 -2
  20. data/ext/cld3/getonescriptspan.o +0 -0
  21. data/ext/cld3/lang_id_nn_params.o +0 -0
  22. data/ext/cld3/language_identifier_features.o +0 -0
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +69 -0
  30. data/ext/cld3/script_span/generated_ulscript.h +142 -0
  31. data/ext/cld3/script_span/getonescriptspan.h +124 -0
  32. data/ext/cld3/script_span/integral_types.h +37 -0
  33. data/ext/cld3/script_span/offsetmap.h +168 -0
  34. data/ext/cld3/script_span/port.h +143 -0
  35. data/ext/cld3/script_span/stringpiece.h +81 -0
  36. data/ext/cld3/script_span/text_processing.h +30 -0
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +486 -0
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +1631 -0
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +758 -0
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +1455 -0
  41. data/ext/cld3/script_span/utf8statetable.h +285 -0
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3.rb +4 -1
  51. metadata +33 -25
  52. data/ext/cld3/feature_extractor.pb.o +0 -0
  53. data/ext/cld3/feature_extractor.proto +0 -50
  54. data/ext/cld3/mkmf.log +0 -37
  55. data/ext/cld3/sentence.pb.o +0 -0
  56. data/ext/cld3/sentence.proto +0 -77
  57. data/ext/cld3/task_spec.pb.o +0 -0
  58. data/ext/cld3/task_spec.proto +0 -98
  59. data/lib/a.rb +0 -24
@@ -0,0 +1,142 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ // generated_ulscript.h
16
+ // Machine generated. Do Not Edit.
17
+ //
18
+ // Declarations for scripts recognized by CLD2
19
+ //
20
+
21
+ #ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
22
+ #define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
23
+
24
+ namespace chrome_lang_id {
25
+ namespace CLD2 {
26
+
27
+ typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
28
+
29
+ typedef struct {const char* s; int i;} CharIntPair;
30
+
31
+ typedef enum {
32
+ ULScript_Common = 0, // Zyyy
33
+ ULScript_Latin = 1, // Latn
34
+ ULScript_Greek = 2, // Grek
35
+ ULScript_Cyrillic = 3, // Cyrl
36
+ ULScript_Armenian = 4, // Armn
37
+ ULScript_Hebrew = 5, // Hebr
38
+ ULScript_Arabic = 6, // Arab
39
+ ULScript_Syriac = 7, // Syrc
40
+ ULScript_Thaana = 8, // Thaa
41
+ ULScript_Devanagari = 9, // Deva
42
+ ULScript_Bengali = 10, // Beng
43
+ ULScript_Gurmukhi = 11, // Guru
44
+ ULScript_Gujarati = 12, // Gujr
45
+ ULScript_Oriya = 13, // Orya
46
+ ULScript_Tamil = 14, // Taml
47
+ ULScript_Telugu = 15, // Telu
48
+ ULScript_Kannada = 16, // Knda
49
+ ULScript_Malayalam = 17, // Mlym
50
+ ULScript_Sinhala = 18, // Sinh
51
+ ULScript_Thai = 19, // Thai
52
+ ULScript_Lao = 20, // Laoo
53
+ ULScript_Tibetan = 21, // Tibt
54
+ ULScript_Myanmar = 22, // Mymr
55
+ ULScript_Georgian = 23, // Geor
56
+ ULScript_Hani = 24, // Hani
57
+ ULScript_Ethiopic = 25, // Ethi
58
+ ULScript_Cherokee = 26, // Cher
59
+ ULScript_Canadian_Aboriginal = 27, // Cans
60
+ ULScript_Ogham = 28, // Ogam
61
+ ULScript_Runic = 29, // Runr
62
+ ULScript_Khmer = 30, // Khmr
63
+ ULScript_Mongolian = 31, // Mong
64
+ ULScript_32 = 32, //
65
+ ULScript_33 = 33, //
66
+ ULScript_Bopomofo = 34, // Bopo
67
+ ULScript_35 = 35, //
68
+ ULScript_Yi = 36, // Yiii
69
+ ULScript_Old_Italic = 37, // Ital
70
+ ULScript_Gothic = 38, // Goth
71
+ ULScript_Deseret = 39, // Dsrt
72
+ ULScript_Inherited = 40, // Zinh
73
+ ULScript_Tagalog = 41, // Tglg
74
+ ULScript_Hanunoo = 42, // Hano
75
+ ULScript_Buhid = 43, // Buhd
76
+ ULScript_Tagbanwa = 44, // Tagb
77
+ ULScript_Limbu = 45, // Limb
78
+ ULScript_Tai_Le = 46, // Tale
79
+ ULScript_Linear_B = 47, // Linb
80
+ ULScript_Ugaritic = 48, // Ugar
81
+ ULScript_Shavian = 49, // Shaw
82
+ ULScript_Osmanya = 50, // Osma
83
+ ULScript_Cypriot = 51, // Cprt
84
+ ULScript_Braille = 52, // Brai
85
+ ULScript_Buginese = 53, // Bugi
86
+ ULScript_Coptic = 54, // Copt
87
+ ULScript_New_Tai_Lue = 55, // Talu
88
+ ULScript_Glagolitic = 56, // Glag
89
+ ULScript_Tifinagh = 57, // Tfng
90
+ ULScript_Syloti_Nagri = 58, // Sylo
91
+ ULScript_Old_Persian = 59, // Xpeo
92
+ ULScript_Kharoshthi = 60, // Khar
93
+ ULScript_Balinese = 61, // Bali
94
+ ULScript_Cuneiform = 62, // Xsux
95
+ ULScript_Phoenician = 63, // Phnx
96
+ ULScript_Phags_Pa = 64, // Phag
97
+ ULScript_Nko = 65, // Nkoo
98
+ ULScript_Sundanese = 66, // Sund
99
+ ULScript_Lepcha = 67, // Lepc
100
+ ULScript_Ol_Chiki = 68, // Olck
101
+ ULScript_Vai = 69, // Vaii
102
+ ULScript_Saurashtra = 70, // Saur
103
+ ULScript_Kayah_Li = 71, // Kali
104
+ ULScript_Rejang = 72, // Rjng
105
+ ULScript_Lycian = 73, // Lyci
106
+ ULScript_Carian = 74, // Cari
107
+ ULScript_Lydian = 75, // Lydi
108
+ ULScript_Cham = 76, // Cham
109
+ ULScript_Tai_Tham = 77, // Lana
110
+ ULScript_Tai_Viet = 78, // Tavt
111
+ ULScript_Avestan = 79, // Avst
112
+ ULScript_Egyptian_Hieroglyphs = 80, // Egyp
113
+ ULScript_Samaritan = 81, // Samr
114
+ ULScript_Lisu = 82, // Lisu
115
+ ULScript_Bamum = 83, // Bamu
116
+ ULScript_Javanese = 84, // Java
117
+ ULScript_Meetei_Mayek = 85, // Mtei
118
+ ULScript_Imperial_Aramaic = 86, // Armi
119
+ ULScript_Old_South_Arabian = 87, // Sarb
120
+ ULScript_Inscriptional_Parthian = 88, // Prti
121
+ ULScript_Inscriptional_Pahlavi = 89, // Phli
122
+ ULScript_Old_Turkic = 90, // Orkh
123
+ ULScript_Kaithi = 91, // Kthi
124
+ ULScript_Batak = 92, // Batk
125
+ ULScript_Brahmi = 93, // Brah
126
+ ULScript_Mandaic = 94, // Mand
127
+ ULScript_Chakma = 95, // Cakm
128
+ ULScript_Meroitic_Cursive = 96, // Merc
129
+ ULScript_Meroitic_Hieroglyphs = 97, // Mero
130
+ ULScript_Miao = 98, // Plrd
131
+ ULScript_Sharada = 99, // Shrd
132
+ ULScript_Sora_Sompeng = 100, // Sora
133
+ ULScript_Takri = 101, // Takr
134
+ NUM_ULSCRIPTS
135
+ } ULScript;
136
+
137
+ #define UNKNOWN_ULSCRIPT ULScript_Common
138
+
139
+ } // namespace CLD2
140
+ } // namespace chrome_lang_id
141
+
142
+ #endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
@@ -0,0 +1,124 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Author: dsites@google.com (Dick Sites)
17
+ //
18
+
19
+
20
+ #ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
21
+ #define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
22
+
23
+ #include "generated_ulscript.h"
24
+ #include "integral_types.h"
25
+ #include "offsetmap.h"
26
+
27
+ namespace chrome_lang_id {
28
+ namespace CLD2 {
29
+
30
+ static const int kMaxScriptBuffer = 40960;
31
+ static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
32
+ static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
+ static const int kWithinScriptTail = 32; // Stop at word space in last
34
+ // N bytes of script buffer
35
+
36
+ struct LangSpan {
37
+ char* text = nullptr; // Pointer to the span, somewhere
38
+ int text_bytes = 0; // Number of bytes of text in the span
39
+ int offset = 0; // Offset of start of span in original input buffer
40
+ ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
+ bool truncated = false; // true if buffer filled up before a
42
+ // different script or EOF was found
43
+ };
44
+
45
+ static inline bool IsContinuationByte(char c) {
46
+ return static_cast<signed char>(c) < -64;
47
+ }
48
+
49
+ // Gets lscript number for letters; always returns
50
+ // 0 (common script) for non-letters
51
+ int GetUTF8LetterScriptNum(const char* src);
52
+
53
+ // Update src pointer to point to next quadgram, +2..+5
54
+ // Looks at src[0..4]
55
+ const char* AdvanceQuad(const char* src);
56
+
57
+ // Utility routine to search alphabetical tables
58
+ int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
59
+
60
+ // Returns the length in bytes of the prefix of src that is all
61
+ // interchange valid UTF-8
62
+ int SpanInterchangeValid(const char* src, int byte_length);
63
+
64
+ class ScriptScanner {
65
+ public:
66
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
67
+ ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
68
+ bool any_text, bool any_script);
69
+ ~ScriptScanner();
70
+
71
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
72
+ bool GetOneScriptSpan(LangSpan* span);
73
+
74
+ // Force Latin and Cyrillic scripts to be lowercase
75
+ void LowerScriptSpan(LangSpan* span);
76
+
77
+ // Copy next run of same-script non-tag letters to buffer [NUL terminated]
78
+ // Force Latin and Cyrillic scripts to be lowercase
79
+ bool GetOneScriptSpanLower(LangSpan* span);
80
+
81
+ // Copy next run of non-tag characters to buffer [NUL terminated]
82
+ // This just removes tags and removes entities
83
+ // Buffer has leading space
84
+ bool GetOneTextSpan(LangSpan* span);
85
+
86
+ // Maps byte offset in most recent GetOneScriptSpan/Lower
87
+ // span->text [0..text_bytes] into an additional byte offset from
88
+ // span->offset, to get back to corresponding text in the original
89
+ // input buffer.
90
+ // text_offset must be the first byte
91
+ // of a UTF-8 character, or just beyond the last character. Normally this
92
+ // routine is called with the first byte of an interesting range and
93
+ // again with the first byte of the following range.
94
+ int MapBack(int text_offset);
95
+
96
+ const char* GetBufferStart() {return start_byte_;}
97
+
98
+ private:
99
+ // Skip over tags and non-letters
100
+ int SkipToFrontOfSpan(const char* src, int len, int* script);
101
+
102
+ const char* start_byte_; // Starting byte of buffer to scan
103
+ const char* next_byte_; // First unscanned byte
104
+ int byte_length_; // Bytes left
105
+
106
+ bool is_plain_text_; // true for plain text, false for HTML
107
+ char* script_buffer_; // Holds text with expanded entities
108
+ char* script_buffer_lower_; // Holds lowercased text
109
+ bool letters_marks_only_; // To distinguish scriptspan of one
110
+ // letters/marks vs. any mixture of text
111
+ bool one_script_only_; // To distinguish scriptspan of one
112
+ // script vs. any mixture of scripts
113
+ int exit_state_; // For tag parser kTagParseTbl_0, based
114
+ // on letters_marks_only_
115
+ public :
116
+ // Expose for debugging
117
+ OffsetMap map2original_; // map from script_buffer_ to buffer
118
+ OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
119
+ };
120
+
121
+ } // namespace CLD2
122
+ } // namespace chrome_lang_id
123
+
124
+ #endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_
@@ -0,0 +1,37 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ #ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
16
+ #define SCRIPT_SPAN_INTEGRAL_TYPES_H_
17
+
18
+ // Cheap version
19
+ namespace chrome_lang_id {
20
+ namespace CLD2 {
21
+
22
+ typedef unsigned char uint8;
23
+ typedef unsigned short uint16;
24
+ typedef unsigned int uint32;
25
+ typedef unsigned long long int uint64;
26
+
27
+ typedef signed char int8;
28
+ typedef signed short int16;
29
+ typedef signed int int32;
30
+ typedef signed long long int int64;
31
+
32
+ typedef int32 char32;
33
+
34
+ } // End namespace CLD2
35
+ } // End namespace chrome_lang_id
36
+
37
+ #endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_
@@ -0,0 +1,168 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // Author: dsites@google.com (Dick Sites)
17
+ //
18
+
19
+ #ifndef SCRIPT_SPAN_OFFSETMAP_H_
20
+ #define SCRIPT_SPAN_OFFSETMAP_H_
21
+
22
+ #include <string> // for string
23
+
24
+ #include "integral_types.h" // for uint32
25
+
26
+ // ***************************** OffsetMap **************************
27
+ //
28
+ // An OffsetMap object is a container for a mapping from offsets in one text
29
+ // buffer A' to offsets in another text buffer A. It is most useful when A' is
30
+ // built from A via substitutions that occasionally do not preserve byte length.
31
+ //
32
+ // A series of operators are used to build the correspondence map, then
33
+ // calls can be made to map an offset in A' to an offset in A, or vice versa.
34
+ // The map starts with offset 0 in A corresponding to offset 0 in A'.
35
+ // The mapping is then built sequentially, adding on byte ranges that are
36
+ // identical in A and A', byte ranges that are inserted in A', and byte ranges
37
+ // that are deleted from A. All bytes beyond those specified when building the
38
+ // map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
39
+ // end of the map.
40
+ //
41
+ // The internal data structure records positions at which bytes are added or
42
+ // deleted. Using the map is O(1) when increasing the A' or A offset
43
+ // monotonically, and O(n) when accessing random offsets, where n is the
44
+ // number of differences.
45
+ //
46
+
47
+ namespace chrome_lang_id {
48
+ namespace CLD2 {
49
+
50
+ class OffsetMap {
51
+ public:
52
+ // Constructor, destructor
53
+ OffsetMap();
54
+ ~OffsetMap();
55
+
56
+ // Clear the map
57
+ void Clear();
58
+
59
+ // Add to mapping from A to A', specifying how many next bytes correspond
60
+ // in A and A'
61
+ void Copy(int bytes);
62
+
63
+ // Add to mapping from A to A', specifying how many next bytes are
64
+ // inserted in A' while not advancing in A at all
65
+ void Insert(int bytes);
66
+
67
+ // Add to mapping from A to A', specifying how many next bytes are
68
+ // deleted from A while not advancing in A' at all
69
+ void Delete(int bytes);
70
+
71
+ // [Finish building map,] Re-position to offset 0
72
+ // This call is optional; MapForward and MapBack finish building the map
73
+ // if necessary
74
+ void Reset();
75
+
76
+ // Map an offset in A' to the corresponding offset in A
77
+ int MapBack(int aprimeoffset);
78
+
79
+ // Map an offset in A to the corresponding offset in A'
80
+ int MapForward(int aoffset);
81
+
82
+ // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
83
+ // from A' to A'' and h is from A to A''.
84
+ //
85
+ // Note that g->MapForward(f->MapForward(aoffset)) always equals
86
+ // h->MapForward(aoffset), while
87
+ // f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
88
+ // h->MapBack(aprimeprimeoffset). This happens when deletion in
89
+ // f and insertion in g are at the same place. For example,
90
+ //
91
+ // A 1 2 3 4
92
+ // ^ | ^ ^
93
+ // | | / | f
94
+ // v vv v
95
+ // A' 1' 2' 3'
96
+ // ^ ^^ ^
97
+ // | | \ | g
98
+ // v | v v
99
+ // A'' 1'' 2'' 3'' 4''
100
+ //
101
+ // results in:
102
+ //
103
+ // A 1 2 3 4
104
+ // ^ ^\ ^ ^
105
+ // | | \ | | h
106
+ // v | vv v
107
+ // A'' 1'' 2'' 3'' 4''
108
+ //
109
+ // 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
110
+ // the latter figure.
111
+ static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
112
+
113
+ // For testing only -- force a mapping
114
+ void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
115
+
116
+ private:
117
+ enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
118
+
119
+ void Flush();
120
+ void FlushAll();
121
+ void MaybeFlushAll();
122
+ void Emit(MapOp op, int len);
123
+
124
+ void SetLeft();
125
+ void SetRight();
126
+
127
+ // Back up over previous range, 1..5 bytes
128
+ // Return subscript at the beginning of that. Pins at 0
129
+ int Backup(int sub);
130
+
131
+ // Parse next range, 1..5 bytes
132
+ // Return subscript just off the end of that
133
+ int ParseNext(int sub, MapOp* op, int* length);
134
+
135
+ // Parse previous range, 1..5 bytes
136
+ // Return current subscript
137
+ int ParsePrevious(int sub, MapOp* op, int* length);
138
+
139
+ bool MoveRight(); // Returns true if OK
140
+ bool MoveLeft(); // Returns true if OK
141
+
142
+ // Copies insert operations from source to dest. Returns true if no
143
+ // other operations are found.
144
+ static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
145
+
146
+ // Copies delete operations from source to dest. Returns true if no other
147
+ // operations are found.
148
+ static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
149
+
150
+ std::string diffs_;
151
+ MapOp pending_op_;
152
+ uint32 pending_length_;
153
+
154
+ // Offsets in the ranges below correspond to each other, with A' = A + diff
155
+ int next_diff_sub_;
156
+ int current_lo_aoffset_;
157
+ int current_hi_aoffset_;
158
+ int current_lo_aprimeoffset_;
159
+ int current_hi_aprimeoffset_;
160
+ int current_diff_;
161
+ int max_aoffset_;
162
+ int max_aprimeoffset_;
163
+ };
164
+
165
+ } // namespace CLD2
166
+ } // namespace chrome_lang_id
167
+
168
+ #endif // SCRIPT_SPAN_OFFSETMAP_H_
@@ -0,0 +1,143 @@
1
+ // Copyright 2013 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+
15
+ //
16
+ // These are weird things we need to do to get this compiling on
17
+ // random systems [subset].
18
+
19
+ #ifndef SCRIPT_SPAN_PORT_H_
20
+ #define SCRIPT_SPAN_PORT_H_
21
+
22
+ #include <string.h> // for memcpy()
23
+
24
+ #include "integral_types.h"
25
+
26
+ namespace chrome_lang_id {
27
+ namespace CLD2 {
28
+
29
+ // Portable handling of unaligned loads, stores, and copies.
30
+ // On some platforms, like ARM, the copy functions can be more efficient
31
+ // than a load and a store.
32
+
33
+ #if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
34
+
35
+ // x86 and x86-64 can perform unaligned loads/stores directly;
36
+ // modern PowerPC hardware can also do unaligned integer loads and stores;
37
+ // but note: the FPU still sends unaligned loads and stores to a trap handler!
38
+
39
+ #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
40
+ #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
41
+ #define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
42
+
43
+ #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
44
+ #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
45
+ #define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
46
+
47
+ #elif defined(__arm__) && \
48
+ !defined(__ARM_ARCH_5__) && \
49
+ !defined(__ARM_ARCH_5T__) && \
50
+ !defined(__ARM_ARCH_5TE__) && \
51
+ !defined(__ARM_ARCH_5TEJ__) && \
52
+ !defined(__ARM_ARCH_6__) && \
53
+ !defined(__ARM_ARCH_6J__) && \
54
+ !defined(__ARM_ARCH_6K__) && \
55
+ !defined(__ARM_ARCH_6Z__) && \
56
+ !defined(__ARM_ARCH_6ZK__) && \
57
+ !defined(__ARM_ARCH_6T2__) && \
58
+ !defined(__ARM_ARCH_7__) && \
59
+ !defined(__ARM_ARCH_7A__) && \
60
+ !defined(__ARM_ARCH_7M__) && \
61
+ !defined(__ARM_ARCH_7R__) && \
62
+ !defined(__ARM_ARCH_8__) && \
63
+ !defined(__ARM_ARCH_8A__)
64
+
65
+ // ARMv7 and newer support native unaligned accesses, but only of 16-bit
66
+ // and 32-bit values (not 64-bit); older versions either raise a fatal signal,
67
+ // do an unaligned read and rotate the words around a bit, or do the reads very
68
+ // slowly (trip through kernel mode). There's no simple #define that says just
69
+ // “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
70
+ // sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_UNALIGNED #define,
71
+ // so in time, maybe we can move on to that.
72
+ //
73
+ // Note that even if a chipset supports unaligned access, it might not be
74
+ // enabled in any given system, e.g.:
75
+ // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
76
+ // Therefore, it's generally just not safe to allow unaligned access on any ARM
77
+ // variant.
78
+ //
79
+ // This is a mess, but there's not much we can do about it.
80
+
81
+ #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
82
+ #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
83
+
84
+ #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
85
+ #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
86
+
87
+ // TODO(sesse): NEON supports unaligned 64-bit loads and stores.
88
+ // See if that would be more efficient on platforms supporting it,
89
+ // at least for copies.
90
+
91
+ inline uint64 UNALIGNED_LOAD64(const void *p) {
92
+ uint64 t;
93
+ memcpy(&t, p, sizeof t);
94
+ return t;
95
+ }
96
+
97
+ inline void UNALIGNED_STORE64(void *p, uint64 v) {
98
+ memcpy(p, &v, sizeof v);
99
+ }
100
+
101
+ #else
102
+
103
+ #define NEED_ALIGNED_LOADS
104
+
105
+ // These functions are provided for architectures that don't support
106
+ // unaligned loads and stores.
107
+
108
+ inline uint16 UNALIGNED_LOAD16(const void *p) {
109
+ uint16 t;
110
+ memcpy(&t, p, sizeof t);
111
+ return t;
112
+ }
113
+
114
+ inline uint32 UNALIGNED_LOAD32(const void *p) {
115
+ uint32 t;
116
+ memcpy(&t, p, sizeof t);
117
+ return t;
118
+ }
119
+
120
+ inline uint64 UNALIGNED_LOAD64(const void *p) {
121
+ uint64 t;
122
+ memcpy(&t, p, sizeof t);
123
+ return t;
124
+ }
125
+
126
+ inline void UNALIGNED_STORE16(void *p, uint16 v) {
127
+ memcpy(p, &v, sizeof v);
128
+ }
129
+
130
+ inline void UNALIGNED_STORE32(void *p, uint32 v) {
131
+ memcpy(p, &v, sizeof v);
132
+ }
133
+
134
+ inline void UNALIGNED_STORE64(void *p, uint64 v) {
135
+ memcpy(p, &v, sizeof v);
136
+ }
137
+
138
+ #endif
139
+
140
+ } // End namespace CLD2
141
+ } // End namespace chrome_lang_id
142
+
143
+ #endif // SCRIPT_SPAN_PORT_H_