cld3 3.5.0 → 3.5.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +0 -8
  3. data/cld3.gemspec +6 -6
  4. data/ext/cld3/extconf.rb +1 -2
  5. data/ext/cld3/nnet_language_identifier_c.cc +162 -70
  6. data/lib/cld3.rb +14 -102
  7. data/sig/cld3.rbs +2 -0
  8. metadata +15 -77
  9. data/ext/cld3/Makefile +0 -268
  10. data/ext/cld3/base.o +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/embedding_network.o +0 -0
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.o +0 -0
  20. data/ext/cld3/lang_id_nn_params.o +0 -0
  21. data/ext/cld3/language_identifier_features.o +0 -0
  22. data/ext/cld3/libcld3.def +0 -8
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +0 -69
  30. data/ext/cld3/script_span/generated_ulscript.h +0 -142
  31. data/ext/cld3/script_span/getonescriptspan.h +0 -124
  32. data/ext/cld3/script_span/integral_types.h +0 -37
  33. data/ext/cld3/script_span/offsetmap.h +0 -168
  34. data/ext/cld3/script_span/port.h +0 -143
  35. data/ext/cld3/script_span/stringpiece.h +0 -81
  36. data/ext/cld3/script_span/text_processing.h +0 -30
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
  41. data/ext/cld3/script_span/utf8statetable.h +0 -285
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3/unstable.rb +0 -58
@@ -1,142 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- // generated_ulscript.h
16
- // Machine generated. Do Not Edit.
17
- //
18
- // Declarations for scripts recognized by CLD2
19
- //
20
-
21
- #ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
22
- #define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
23
-
24
- namespace chrome_lang_id {
25
- namespace CLD2 {
26
-
27
- typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
28
-
29
- typedef struct {const char* s; int i;} CharIntPair;
30
-
31
- typedef enum {
32
- ULScript_Common = 0, // Zyyy
33
- ULScript_Latin = 1, // Latn
34
- ULScript_Greek = 2, // Grek
35
- ULScript_Cyrillic = 3, // Cyrl
36
- ULScript_Armenian = 4, // Armn
37
- ULScript_Hebrew = 5, // Hebr
38
- ULScript_Arabic = 6, // Arab
39
- ULScript_Syriac = 7, // Syrc
40
- ULScript_Thaana = 8, // Thaa
41
- ULScript_Devanagari = 9, // Deva
42
- ULScript_Bengali = 10, // Beng
43
- ULScript_Gurmukhi = 11, // Guru
44
- ULScript_Gujarati = 12, // Gujr
45
- ULScript_Oriya = 13, // Orya
46
- ULScript_Tamil = 14, // Taml
47
- ULScript_Telugu = 15, // Telu
48
- ULScript_Kannada = 16, // Knda
49
- ULScript_Malayalam = 17, // Mlym
50
- ULScript_Sinhala = 18, // Sinh
51
- ULScript_Thai = 19, // Thai
52
- ULScript_Lao = 20, // Laoo
53
- ULScript_Tibetan = 21, // Tibt
54
- ULScript_Myanmar = 22, // Mymr
55
- ULScript_Georgian = 23, // Geor
56
- ULScript_Hani = 24, // Hani
57
- ULScript_Ethiopic = 25, // Ethi
58
- ULScript_Cherokee = 26, // Cher
59
- ULScript_Canadian_Aboriginal = 27, // Cans
60
- ULScript_Ogham = 28, // Ogam
61
- ULScript_Runic = 29, // Runr
62
- ULScript_Khmer = 30, // Khmr
63
- ULScript_Mongolian = 31, // Mong
64
- ULScript_32 = 32, //
65
- ULScript_33 = 33, //
66
- ULScript_Bopomofo = 34, // Bopo
67
- ULScript_35 = 35, //
68
- ULScript_Yi = 36, // Yiii
69
- ULScript_Old_Italic = 37, // Ital
70
- ULScript_Gothic = 38, // Goth
71
- ULScript_Deseret = 39, // Dsrt
72
- ULScript_Inherited = 40, // Zinh
73
- ULScript_Tagalog = 41, // Tglg
74
- ULScript_Hanunoo = 42, // Hano
75
- ULScript_Buhid = 43, // Buhd
76
- ULScript_Tagbanwa = 44, // Tagb
77
- ULScript_Limbu = 45, // Limb
78
- ULScript_Tai_Le = 46, // Tale
79
- ULScript_Linear_B = 47, // Linb
80
- ULScript_Ugaritic = 48, // Ugar
81
- ULScript_Shavian = 49, // Shaw
82
- ULScript_Osmanya = 50, // Osma
83
- ULScript_Cypriot = 51, // Cprt
84
- ULScript_Braille = 52, // Brai
85
- ULScript_Buginese = 53, // Bugi
86
- ULScript_Coptic = 54, // Copt
87
- ULScript_New_Tai_Lue = 55, // Talu
88
- ULScript_Glagolitic = 56, // Glag
89
- ULScript_Tifinagh = 57, // Tfng
90
- ULScript_Syloti_Nagri = 58, // Sylo
91
- ULScript_Old_Persian = 59, // Xpeo
92
- ULScript_Kharoshthi = 60, // Khar
93
- ULScript_Balinese = 61, // Bali
94
- ULScript_Cuneiform = 62, // Xsux
95
- ULScript_Phoenician = 63, // Phnx
96
- ULScript_Phags_Pa = 64, // Phag
97
- ULScript_Nko = 65, // Nkoo
98
- ULScript_Sundanese = 66, // Sund
99
- ULScript_Lepcha = 67, // Lepc
100
- ULScript_Ol_Chiki = 68, // Olck
101
- ULScript_Vai = 69, // Vaii
102
- ULScript_Saurashtra = 70, // Saur
103
- ULScript_Kayah_Li = 71, // Kali
104
- ULScript_Rejang = 72, // Rjng
105
- ULScript_Lycian = 73, // Lyci
106
- ULScript_Carian = 74, // Cari
107
- ULScript_Lydian = 75, // Lydi
108
- ULScript_Cham = 76, // Cham
109
- ULScript_Tai_Tham = 77, // Lana
110
- ULScript_Tai_Viet = 78, // Tavt
111
- ULScript_Avestan = 79, // Avst
112
- ULScript_Egyptian_Hieroglyphs = 80, // Egyp
113
- ULScript_Samaritan = 81, // Samr
114
- ULScript_Lisu = 82, // Lisu
115
- ULScript_Bamum = 83, // Bamu
116
- ULScript_Javanese = 84, // Java
117
- ULScript_Meetei_Mayek = 85, // Mtei
118
- ULScript_Imperial_Aramaic = 86, // Armi
119
- ULScript_Old_South_Arabian = 87, // Sarb
120
- ULScript_Inscriptional_Parthian = 88, // Prti
121
- ULScript_Inscriptional_Pahlavi = 89, // Phli
122
- ULScript_Old_Turkic = 90, // Orkh
123
- ULScript_Kaithi = 91, // Kthi
124
- ULScript_Batak = 92, // Batk
125
- ULScript_Brahmi = 93, // Brah
126
- ULScript_Mandaic = 94, // Mand
127
- ULScript_Chakma = 95, // Cakm
128
- ULScript_Meroitic_Cursive = 96, // Merc
129
- ULScript_Meroitic_Hieroglyphs = 97, // Mero
130
- ULScript_Miao = 98, // Plrd
131
- ULScript_Sharada = 99, // Shrd
132
- ULScript_Sora_Sompeng = 100, // Sora
133
- ULScript_Takri = 101, // Takr
134
- NUM_ULSCRIPTS
135
- } ULScript;
136
-
137
- #define UNKNOWN_ULSCRIPT ULScript_Common
138
-
139
- } // namespace CLD2
140
- } // namespace chrome_lang_id
141
-
142
- #endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
@@ -1,124 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // Author: dsites@google.com (Dick Sites)
17
- //
18
-
19
-
20
- #ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
21
- #define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
22
-
23
- #include "generated_ulscript.h"
24
- #include "integral_types.h"
25
- #include "offsetmap.h"
26
-
27
- namespace chrome_lang_id {
28
- namespace CLD2 {
29
-
30
- static const int kMaxScriptBuffer = 40960;
31
- static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
32
- static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
- static const int kWithinScriptTail = 32; // Stop at word space in last
34
- // N bytes of script buffer
35
-
36
- struct LangSpan {
37
- char* text = nullptr; // Pointer to the span, somewhere
38
- int text_bytes = 0; // Number of bytes of text in the span
39
- int offset = 0; // Offset of start of span in original input buffer
40
- ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
- bool truncated = false; // true if buffer filled up before a
42
- // different script or EOF was found
43
- };
44
-
45
- static inline bool IsContinuationByte(char c) {
46
- return static_cast<signed char>(c) < -64;
47
- }
48
-
49
- // Gets lscript number for letters; always returns
50
- // 0 (common script) for non-letters
51
- int GetUTF8LetterScriptNum(const char* src);
52
-
53
- // Update src pointer to point to next quadgram, +2..+5
54
- // Looks at src[0..4]
55
- const char* AdvanceQuad(const char* src);
56
-
57
- // Utility routine to search alphabetical tables
58
- int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
59
-
60
- // Returns the length in bytes of the prefix of src that is all
61
- // interchange valid UTF-8
62
- int SpanInterchangeValid(const char* src, int byte_length);
63
-
64
- class ScriptScanner {
65
- public:
66
- ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
67
- ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
68
- bool any_text, bool any_script);
69
- ~ScriptScanner();
70
-
71
- // Copy next run of same-script non-tag letters to buffer [NUL terminated]
72
- bool GetOneScriptSpan(LangSpan* span);
73
-
74
- // Force Latin and Cyrillic scripts to be lowercase
75
- void LowerScriptSpan(LangSpan* span);
76
-
77
- // Copy next run of same-script non-tag letters to buffer [NUL terminated]
78
- // Force Latin and Cyrillic scripts to be lowercase
79
- bool GetOneScriptSpanLower(LangSpan* span);
80
-
81
- // Copy next run of non-tag characters to buffer [NUL terminated]
82
- // This just removes tags and removes entities
83
- // Buffer has leading space
84
- bool GetOneTextSpan(LangSpan* span);
85
-
86
- // Maps byte offset in most recent GetOneScriptSpan/Lower
87
- // span->text [0..text_bytes] into an additional byte offset from
88
- // span->offset, to get back to corresponding text in the original
89
- // input buffer.
90
- // text_offset must be the first byte
91
- // of a UTF-8 character, or just beyond the last character. Normally this
92
- // routine is called with the first byte of an interesting range and
93
- // again with the first byte of the following range.
94
- int MapBack(int text_offset);
95
-
96
- const char* GetBufferStart() {return start_byte_;}
97
-
98
- private:
99
- // Skip over tags and non-letters
100
- int SkipToFrontOfSpan(const char* src, int len, int* script);
101
-
102
- const char* start_byte_; // Starting byte of buffer to scan
103
- const char* next_byte_; // First unscanned byte
104
- int byte_length_; // Bytes left
105
-
106
- bool is_plain_text_; // true for text, false for HTML
107
- char* script_buffer_; // Holds text with expanded entities
108
- char* script_buffer_lower_; // Holds lowercased text
109
- bool letters_marks_only_; // To distinguish scriptspan of one
110
- // letters/marks vs. any mixture of text
111
- bool one_script_only_; // To distinguish scriptspan of one
112
- // script vs. any mixture of scripts
113
- int exit_state_; // For tag parser kTagParseTbl_0, based
114
- // on letters_marks_only_
115
- public :
116
- // Expose for debugging
117
- OffsetMap map2original_; // map from script_buffer_ to buffer
118
- OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
119
- };
120
-
121
- } // namespace CLD2
122
- } // namespace chrome_lang_id
123
-
124
- #endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_
@@ -1,37 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- #ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
16
- #define SCRIPT_SPAN_INTEGRAL_TYPES_H_
17
-
18
- // Cheap version
19
- namespace chrome_lang_id {
20
- namespace CLD2 {
21
-
22
- typedef unsigned char uint8;
23
- typedef unsigned short uint16;
24
- typedef unsigned int uint32;
25
- typedef unsigned long long int uint64;
26
-
27
- typedef signed char int8;
28
- typedef signed short int16;
29
- typedef signed int int32;
30
- typedef signed long long int int64;
31
-
32
- typedef int32 char32;
33
-
34
- } // End namespace CLD2
35
- } // End namespace chrome_lang_id
36
-
37
- #endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_
@@ -1,168 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // Author: dsites@google.com (Dick Sites)
17
- //
18
-
19
- #ifndef SCRIPT_SPAN_OFFSETMAP_H_
20
- #define SCRIPT_SPAN_OFFSETMAP_H_
21
-
22
- #include <string> // for string
23
-
24
- #include "integral_types.h" // for uint32
25
-
26
- // ***************************** OffsetMap **************************
27
- //
28
- // An OffsetMap object is a container for a mapping from offsets in one text
29
- // buffer A' to offsets in another text buffer A. It is most useful when A' is
30
- // built from A via substitutions that occasionally do not preserve byte length.
31
- //
32
- // A series of operators are used to build the correspondence map, then
33
- // calls can be made to map an offset in A' to an offset in A, or vice versa.
34
- // The map starts with offset 0 in A corresponding to offset 0 in A'.
35
- // The mapping is then built sequentially, adding on byte ranges that are
36
- // identical in A and A', byte ranges that are inserted in A', and byte ranges
37
- // that are deleted from A. All bytes beyond those specified when building the
38
- // map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
39
- // end of the map.
40
- //
41
- // The internal data structure records positions at which bytes are added or
42
- // deleted. Using the map is O(1) when increasing the A' or A offset
43
- // monotonically, and O(n) when accessing random offsets, where n is the
44
- // number of differences.
45
- //
46
-
47
- namespace chrome_lang_id {
48
- namespace CLD2 {
49
-
50
- class OffsetMap {
51
- public:
52
- // Constructor, destructor
53
- OffsetMap();
54
- ~OffsetMap();
55
-
56
- // Clear the map
57
- void Clear();
58
-
59
- // Add to mapping from A to A', specifying how many next bytes correspond
60
- // in A and A'
61
- void Copy(int bytes);
62
-
63
- // Add to mapping from A to A', specifying how many next bytes are
64
- // inserted in A' while not advancing in A at all
65
- void Insert(int bytes);
66
-
67
- // Add to mapping from A to A', specifying how many next bytes are
68
- // deleted from A while not advancing in A' at all
69
- void Delete(int bytes);
70
-
71
- // [Finish building map,] Re-position to offset 0
72
- // This call is optional; MapForward and MapBack finish building the map
73
- // if necessary
74
- void Reset();
75
-
76
- // Map an offset in A' to the corresponding offset in A
77
- int MapBack(int aprimeoffset);
78
-
79
- // Map an offset in A to the corresponding offset in A'
80
- int MapForward(int aoffset);
81
-
82
- // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
83
- // from A' to A'' and h is from A to A''.
84
- //
85
- // Note that g->MapForward(f->MapForward(aoffset)) always equals
86
- // h->MapForward(aoffset), while
87
- // f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
88
- // h->MapBack(aprimeprimeoffset). This happens when deletion in
89
- // f and insertion in g are at the same place. For example,
90
- //
91
- // A 1 2 3 4
92
- // ^ | ^ ^
93
- // | | / | f
94
- // v vv v
95
- // A' 1' 2' 3'
96
- // ^ ^^ ^
97
- // | | \ | g
98
- // v | v v
99
- // A'' 1'' 2'' 3'' 4''
100
- //
101
- // results in:
102
- //
103
- // A 1 2 3 4
104
- // ^ ^\ ^ ^
105
- // | | \ | | h
106
- // v | vv v
107
- // A'' 1'' 2'' 3'' 4''
108
- //
109
- // 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
110
- // the latter figure.
111
- static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
112
-
113
- // For testing only -- force a mapping
114
- void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
115
-
116
- private:
117
- enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
118
-
119
- void Flush();
120
- void FlushAll();
121
- void MaybeFlushAll();
122
- void Emit(MapOp op, int len);
123
-
124
- void SetLeft();
125
- void SetRight();
126
-
127
- // Back up over previous range, 1..5 bytes
128
- // Return subscript at the beginning of that. Pins at 0
129
- int Backup(int sub);
130
-
131
- // Parse next range, 1..5 bytes
132
- // Return subscript just off the end of that
133
- int ParseNext(int sub, MapOp* op, int* length);
134
-
135
- // Parse previous range, 1..5 bytes
136
- // Return current subscript
137
- int ParsePrevious(int sub, MapOp* op, int* length);
138
-
139
- bool MoveRight(); // Returns true if OK
140
- bool MoveLeft(); // Returns true if OK
141
-
142
- // Copies insert operations from source to dest. Returns true if no
143
- // other operations are found.
144
- static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
145
-
146
- // Copies delete operations from source to dest. Returns true if no other
147
- // operations are found.
148
- static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
149
-
150
- std::string diffs_;
151
- MapOp pending_op_;
152
- uint32 pending_length_;
153
-
154
- // Offsets in the ranges below correspond to each other, with A' = A + diff
155
- int next_diff_sub_;
156
- int current_lo_aoffset_;
157
- int current_hi_aoffset_;
158
- int current_lo_aprimeoffset_;
159
- int current_hi_aprimeoffset_;
160
- int current_diff_;
161
- int max_aoffset_;
162
- int max_aprimeoffset_;
163
- };
164
-
165
- } // namespace CLD2
166
- } // namespace chrome_lang_id
167
-
168
- #endif // SCRIPT_SPAN_OFFSETMAP_H_
@@ -1,143 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // These are weird things we need to do to get this compiling on
17
- // random systems [subset].
18
-
19
- #ifndef SCRIPT_SPAN_PORT_H_
20
- #define SCRIPT_SPAN_PORT_H_
21
-
22
- #include <string.h> // for memcpy()
23
-
24
- #include "integral_types.h"
25
-
26
- namespace chrome_lang_id {
27
- namespace CLD2 {
28
-
29
- // Portable handling of unaligned loads, stores, and copies.
30
- // On some platforms, like ARM, the copy functions can be more efficient
31
- // than a load and a store.
32
-
33
- #if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
34
-
35
- // x86 and x86-64 can perform unaligned loads/stores directly;
36
- // modern PowerPC hardware can also do unaligned integer loads and stores;
37
- // but note: the FPU still sends unaligned loads and stores to a trap handler!
38
-
39
- #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
40
- #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
41
- #define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
42
-
43
- #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
44
- #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
45
- #define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
46
-
47
- #elif defined(__arm__) && \
48
- !defined(__ARM_ARCH_5__) && \
49
- !defined(__ARM_ARCH_5T__) && \
50
- !defined(__ARM_ARCH_5TE__) && \
51
- !defined(__ARM_ARCH_5TEJ__) && \
52
- !defined(__ARM_ARCH_6__) && \
53
- !defined(__ARM_ARCH_6J__) && \
54
- !defined(__ARM_ARCH_6K__) && \
55
- !defined(__ARM_ARCH_6Z__) && \
56
- !defined(__ARM_ARCH_6ZK__) && \
57
- !defined(__ARM_ARCH_6T2__) && \
58
- !defined(__ARM_ARCH_7__) && \
59
- !defined(__ARM_ARCH_7A__) && \
60
- !defined(__ARM_ARCH_7M__) && \
61
- !defined(__ARM_ARCH_7R__) && \
62
- !defined(__ARM_ARCH_8__) && \
63
- !defined(__ARM_ARCH_8A__)
64
-
65
- // ARMv7 and newer support native unaligned accesses, but only of 16-bit
66
- // and 32-bit values (not 64-bit); older versions either raise a fatal signal,
67
- // do an unaligned read and rotate the words around a bit, or do the reads very
68
- // slowly (trip through kernel mode). There's no simple #define that says just
69
- // “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
70
- // sub-architectures. Newer gcc (>= 4.6) sets an __ARM_FEATURE_UNALIGNED #define,
71
- // so in time, maybe we can move on to that.
72
- //
73
- // Note that even if a chipset supports unaligned access, it might not be
74
- // enabled in any given system, e.g.:
75
- // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
76
- // Therefore, it's generally just not safe to allow unaligned access on any ARM
77
- // variant.
78
- //
79
- // This is a mess, but there's not much we can do about it.
80
-
81
- #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
82
- #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
83
-
84
- #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
85
- #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
86
-
87
- // TODO(sesse): NEON supports unaligned 64-bit loads and stores.
88
- // See if that would be more efficient on platforms supporting it,
89
- // at least for copies.
90
-
91
- inline uint64 UNALIGNED_LOAD64(const void *p) {
92
- uint64 t;
93
- memcpy(&t, p, sizeof t);
94
- return t;
95
- }
96
-
97
- inline void UNALIGNED_STORE64(void *p, uint64 v) {
98
- memcpy(p, &v, sizeof v);
99
- }
100
-
101
- #else
102
-
103
- #define NEED_ALIGNED_LOADS
104
-
105
- // These functions are provided for architectures that don't support
106
- // unaligned loads and stores.
107
-
108
- inline uint16 UNALIGNED_LOAD16(const void *p) {
109
- uint16 t;
110
- memcpy(&t, p, sizeof t);
111
- return t;
112
- }
113
-
114
- inline uint32 UNALIGNED_LOAD32(const void *p) {
115
- uint32 t;
116
- memcpy(&t, p, sizeof t);
117
- return t;
118
- }
119
-
120
- inline uint64 UNALIGNED_LOAD64(const void *p) {
121
- uint64 t;
122
- memcpy(&t, p, sizeof t);
123
- return t;
124
- }
125
-
126
- inline void UNALIGNED_STORE16(void *p, uint16 v) {
127
- memcpy(p, &v, sizeof v);
128
- }
129
-
130
- inline void UNALIGNED_STORE32(void *p, uint32 v) {
131
- memcpy(p, &v, sizeof v);
132
- }
133
-
134
- inline void UNALIGNED_STORE64(void *p, uint64 v) {
135
- memcpy(p, &v, sizeof v);
136
- }
137
-
138
- #endif
139
-
140
- } // End namespace CLD2
141
- } // End namespace chrome_lang_id
142
-
143
- #endif // SCRIPT_SPAN_PORT_H_