cld3 3.5.0 → 3.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +0 -8
  3. data/cld3.gemspec +6 -6
  4. data/ext/cld3/extconf.rb +1 -2
  5. data/ext/cld3/nnet_language_identifier_c.cc +163 -70
  6. data/lib/cld3.rb +14 -102
  7. data/sig/cld3.rbs +2 -0
  8. metadata +15 -77
  9. data/ext/cld3/Makefile +0 -268
  10. data/ext/cld3/base.o +0 -0
  11. data/ext/cld3/embedding_feature_extractor.o +0 -0
  12. data/ext/cld3/embedding_network.o +0 -0
  13. data/ext/cld3/feature_extractor.o +0 -0
  14. data/ext/cld3/feature_types.o +0 -0
  15. data/ext/cld3/fixunicodevalue.o +0 -0
  16. data/ext/cld3/fml_parser.o +0 -0
  17. data/ext/cld3/generated_entities.o +0 -0
  18. data/ext/cld3/generated_ulscript.o +0 -0
  19. data/ext/cld3/getonescriptspan.o +0 -0
  20. data/ext/cld3/lang_id_nn_params.o +0 -0
  21. data/ext/cld3/language_identifier_features.o +0 -0
  22. data/ext/cld3/libcld3.def +0 -8
  23. data/ext/cld3/libcld3.so +0 -0
  24. data/ext/cld3/nnet_language_identifier.o +0 -0
  25. data/ext/cld3/nnet_language_identifier_c.o +0 -0
  26. data/ext/cld3/offsetmap.o +0 -0
  27. data/ext/cld3/registry.o +0 -0
  28. data/ext/cld3/relevant_script_feature.o +0 -0
  29. data/ext/cld3/script_span/fixunicodevalue.h +0 -69
  30. data/ext/cld3/script_span/generated_ulscript.h +0 -142
  31. data/ext/cld3/script_span/getonescriptspan.h +0 -124
  32. data/ext/cld3/script_span/integral_types.h +0 -37
  33. data/ext/cld3/script_span/offsetmap.h +0 -168
  34. data/ext/cld3/script_span/port.h +0 -143
  35. data/ext/cld3/script_span/stringpiece.h +0 -81
  36. data/ext/cld3/script_span/text_processing.h +0 -30
  37. data/ext/cld3/script_span/utf8acceptinterchange.h +0 -486
  38. data/ext/cld3/script_span/utf8prop_lettermarkscriptnum.h +0 -1631
  39. data/ext/cld3/script_span/utf8repl_lettermarklower.h +0 -758
  40. data/ext/cld3/script_span/utf8scannot_lettermarkspecial.h +0 -1455
  41. data/ext/cld3/script_span/utf8statetable.h +0 -285
  42. data/ext/cld3/sentence_features.o +0 -0
  43. data/ext/cld3/task_context.o +0 -0
  44. data/ext/cld3/task_context_params.o +0 -0
  45. data/ext/cld3/text_processing.o +0 -0
  46. data/ext/cld3/unicodetext.o +0 -0
  47. data/ext/cld3/utf8statetable.o +0 -0
  48. data/ext/cld3/utils.o +0 -0
  49. data/ext/cld3/workspace.o +0 -0
  50. data/lib/cld3/unstable.rb +0 -58
@@ -1,142 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- // generated_ulscript.h
16
- // Machine generated. Do Not Edit.
17
- //
18
- // Declarations for scripts recognized by CLD2
19
- //
20
-
21
- #ifndef SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
22
- #define SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
23
-
24
- namespace chrome_lang_id {
25
- namespace CLD2 {
26
-
27
- typedef enum {RTypeNone = 0, RTypeOne, RTypeMany, RTypeCJK} ULScriptRType;
28
-
29
- typedef struct {const char* s; int i;} CharIntPair;
30
-
31
- typedef enum {
32
- ULScript_Common = 0, // Zyyy
33
- ULScript_Latin = 1, // Latn
34
- ULScript_Greek = 2, // Grek
35
- ULScript_Cyrillic = 3, // Cyrl
36
- ULScript_Armenian = 4, // Armn
37
- ULScript_Hebrew = 5, // Hebr
38
- ULScript_Arabic = 6, // Arab
39
- ULScript_Syriac = 7, // Syrc
40
- ULScript_Thaana = 8, // Thaa
41
- ULScript_Devanagari = 9, // Deva
42
- ULScript_Bengali = 10, // Beng
43
- ULScript_Gurmukhi = 11, // Guru
44
- ULScript_Gujarati = 12, // Gujr
45
- ULScript_Oriya = 13, // Orya
46
- ULScript_Tamil = 14, // Taml
47
- ULScript_Telugu = 15, // Telu
48
- ULScript_Kannada = 16, // Knda
49
- ULScript_Malayalam = 17, // Mlym
50
- ULScript_Sinhala = 18, // Sinh
51
- ULScript_Thai = 19, // Thai
52
- ULScript_Lao = 20, // Laoo
53
- ULScript_Tibetan = 21, // Tibt
54
- ULScript_Myanmar = 22, // Mymr
55
- ULScript_Georgian = 23, // Geor
56
- ULScript_Hani = 24, // Hani
57
- ULScript_Ethiopic = 25, // Ethi
58
- ULScript_Cherokee = 26, // Cher
59
- ULScript_Canadian_Aboriginal = 27, // Cans
60
- ULScript_Ogham = 28, // Ogam
61
- ULScript_Runic = 29, // Runr
62
- ULScript_Khmer = 30, // Khmr
63
- ULScript_Mongolian = 31, // Mong
64
- ULScript_32 = 32, //
65
- ULScript_33 = 33, //
66
- ULScript_Bopomofo = 34, // Bopo
67
- ULScript_35 = 35, //
68
- ULScript_Yi = 36, // Yiii
69
- ULScript_Old_Italic = 37, // Ital
70
- ULScript_Gothic = 38, // Goth
71
- ULScript_Deseret = 39, // Dsrt
72
- ULScript_Inherited = 40, // Zinh
73
- ULScript_Tagalog = 41, // Tglg
74
- ULScript_Hanunoo = 42, // Hano
75
- ULScript_Buhid = 43, // Buhd
76
- ULScript_Tagbanwa = 44, // Tagb
77
- ULScript_Limbu = 45, // Limb
78
- ULScript_Tai_Le = 46, // Tale
79
- ULScript_Linear_B = 47, // Linb
80
- ULScript_Ugaritic = 48, // Ugar
81
- ULScript_Shavian = 49, // Shaw
82
- ULScript_Osmanya = 50, // Osma
83
- ULScript_Cypriot = 51, // Cprt
84
- ULScript_Braille = 52, // Brai
85
- ULScript_Buginese = 53, // Bugi
86
- ULScript_Coptic = 54, // Copt
87
- ULScript_New_Tai_Lue = 55, // Talu
88
- ULScript_Glagolitic = 56, // Glag
89
- ULScript_Tifinagh = 57, // Tfng
90
- ULScript_Syloti_Nagri = 58, // Sylo
91
- ULScript_Old_Persian = 59, // Xpeo
92
- ULScript_Kharoshthi = 60, // Khar
93
- ULScript_Balinese = 61, // Bali
94
- ULScript_Cuneiform = 62, // Xsux
95
- ULScript_Phoenician = 63, // Phnx
96
- ULScript_Phags_Pa = 64, // Phag
97
- ULScript_Nko = 65, // Nkoo
98
- ULScript_Sundanese = 66, // Sund
99
- ULScript_Lepcha = 67, // Lepc
100
- ULScript_Ol_Chiki = 68, // Olck
101
- ULScript_Vai = 69, // Vaii
102
- ULScript_Saurashtra = 70, // Saur
103
- ULScript_Kayah_Li = 71, // Kali
104
- ULScript_Rejang = 72, // Rjng
105
- ULScript_Lycian = 73, // Lyci
106
- ULScript_Carian = 74, // Cari
107
- ULScript_Lydian = 75, // Lydi
108
- ULScript_Cham = 76, // Cham
109
- ULScript_Tai_Tham = 77, // Lana
110
- ULScript_Tai_Viet = 78, // Tavt
111
- ULScript_Avestan = 79, // Avst
112
- ULScript_Egyptian_Hieroglyphs = 80, // Egyp
113
- ULScript_Samaritan = 81, // Samr
114
- ULScript_Lisu = 82, // Lisu
115
- ULScript_Bamum = 83, // Bamu
116
- ULScript_Javanese = 84, // Java
117
- ULScript_Meetei_Mayek = 85, // Mtei
118
- ULScript_Imperial_Aramaic = 86, // Armi
119
- ULScript_Old_South_Arabian = 87, // Sarb
120
- ULScript_Inscriptional_Parthian = 88, // Prti
121
- ULScript_Inscriptional_Pahlavi = 89, // Phli
122
- ULScript_Old_Turkic = 90, // Orkh
123
- ULScript_Kaithi = 91, // Kthi
124
- ULScript_Batak = 92, // Batk
125
- ULScript_Brahmi = 93, // Brah
126
- ULScript_Mandaic = 94, // Mand
127
- ULScript_Chakma = 95, // Cakm
128
- ULScript_Meroitic_Cursive = 96, // Merc
129
- ULScript_Meroitic_Hieroglyphs = 97, // Mero
130
- ULScript_Miao = 98, // Plrd
131
- ULScript_Sharada = 99, // Shrd
132
- ULScript_Sora_Sompeng = 100, // Sora
133
- ULScript_Takri = 101, // Takr
134
- NUM_ULSCRIPTS
135
- } ULScript;
136
-
137
- #define UNKNOWN_ULSCRIPT ULScript_Common
138
-
139
- } // namespace CLD2
140
- } // namespace chrome_lang_id
141
-
142
- #endif // SCRIPT_SPAN_GENERATED_ULSCRIPT_H_
@@ -1,124 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // Author: dsites@google.com (Dick Sites)
17
- //
18
-
19
-
20
- #ifndef SCRIPT_SPAN_GETONESCRIPTSPAN_H_
21
- #define SCRIPT_SPAN_GETONESCRIPTSPAN_H_
22
-
23
- #include "generated_ulscript.h"
24
- #include "integral_types.h"
25
- #include "offsetmap.h"
26
-
27
- namespace chrome_lang_id {
28
- namespace CLD2 {
29
-
30
- static const int kMaxScriptBuffer = 40960;
31
- static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
32
- static const int kMaxScriptBytes = kMaxScriptBuffer - 32; // Leave some room
33
- static const int kWithinScriptTail = 32; // Stop at word space in last
34
- // N bytes of script buffer
35
-
36
- struct LangSpan {
37
- char* text = nullptr; // Pointer to the span, somewhere
38
- int text_bytes = 0; // Number of bytes of text in the span
39
- int offset = 0; // Offset of start of span in original input buffer
40
- ULScript ulscript = UNKNOWN_ULSCRIPT; // Unicode Letters Script of this span
41
- bool truncated = false; // true if buffer filled up before a
42
- // different script or EOF was found
43
- };
44
-
45
- static inline bool IsContinuationByte(char c) {
46
- return static_cast<signed char>(c) < -64;
47
- }
48
-
49
- // Gets lscript number for letters; always returns
50
- // 0 (common script) for non-letters
51
- int GetUTF8LetterScriptNum(const char* src);
52
-
53
- // Update src pointer to point to next quadgram, +2..+5
54
- // Looks at src[0..4]
55
- const char* AdvanceQuad(const char* src);
56
-
57
- // Utility routine to search alphabetical tables
58
- int BinarySearch(const char* key, int lo, int hi, const CharIntPair* cipair);
59
-
60
- // Returns the length in bytes of the prefix of src that is all
61
- // interchange valid UTF-8
62
- int SpanInterchangeValid(const char* src, int byte_length);
63
-
64
- class ScriptScanner {
65
- public:
66
- ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
67
- ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
68
- bool any_text, bool any_script);
69
- ~ScriptScanner();
70
-
71
- // Copy next run of same-script non-tag letters to buffer [NUL terminated]
72
- bool GetOneScriptSpan(LangSpan* span);
73
-
74
- // Force Latin and Cyrillic scripts to be lowercase
75
- void LowerScriptSpan(LangSpan* span);
76
-
77
- // Copy next run of same-script non-tag letters to buffer [NUL terminated]
78
- // Force Latin and Cyrillic scripts to be lowercase
79
- bool GetOneScriptSpanLower(LangSpan* span);
80
-
81
- // Copy next run of non-tag characters to buffer [NUL terminated]
82
- // This just removes tags and removes entities
83
- // Buffer has leading space
84
- bool GetOneTextSpan(LangSpan* span);
85
-
86
- // Maps byte offset in most recent GetOneScriptSpan/Lower
87
- // span->text [0..text_bytes] into an additional byte offset from
88
- // span->offset, to get back to corresponding text in the original
89
- // input buffer.
90
- // text_offset must be the first byte
91
- // of a UTF-8 character, or just beyond the last character. Normally this
92
- // routine is called with the first byte of an interesting range and
93
- // again with the first byte of the following range.
94
- int MapBack(int text_offset);
95
-
96
- const char* GetBufferStart() {return start_byte_;}
97
-
98
- private:
99
- // Skip over tags and non-letters
100
- int SkipToFrontOfSpan(const char* src, int len, int* script);
101
-
102
- const char* start_byte_; // Starting byte of buffer to scan
103
- const char* next_byte_; // First unscanned byte
104
- int byte_length_; // Bytes left
105
-
106
- bool is_plain_text_; // true for text, false for HTML
107
- char* script_buffer_; // Holds text with expanded entities
108
- char* script_buffer_lower_; // Holds lowercased text
109
- bool letters_marks_only_; // To distinguish scriptspan of one
110
- // letters/marks vs. any mixture of text
111
- bool one_script_only_; // To distinguish scriptspan of one
112
- // script vs. any mixture of scripts
113
- int exit_state_; // For tag parser kTagParseTbl_0, based
114
- // on letters_marks_only_
115
- public :
116
- // Expose for debugging
117
- OffsetMap map2original_; // map from script_buffer_ to buffer
118
- OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
119
- };
120
-
121
- } // namespace CLD2
122
- } // namespace chrome_lang_id
123
-
124
- #endif // SCRIPT_SPAN_GETONESCRIPTSPAN_H_
@@ -1,37 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- #ifndef SCRIPT_SPAN_INTEGRAL_TYPES_H_
16
- #define SCRIPT_SPAN_INTEGRAL_TYPES_H_
17
-
18
- // Cheap version
19
- namespace chrome_lang_id {
20
- namespace CLD2 {
21
-
22
- typedef unsigned char uint8;
23
- typedef unsigned short uint16;
24
- typedef unsigned int uint32;
25
- typedef unsigned long long int uint64;
26
-
27
- typedef signed char int8;
28
- typedef signed short int16;
29
- typedef signed int int32;
30
- typedef signed long long int int64;
31
-
32
- typedef int32 char32;
33
-
34
- } // End namespace CLD2
35
- } // End namespace chrome_lang_id
36
-
37
- #endif // SCRIPT_SPAN_INTEGRAL_TYPES_H_
@@ -1,168 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // Author: dsites@google.com (Dick Sites)
17
- //
18
-
19
- #ifndef SCRIPT_SPAN_OFFSETMAP_H_
20
- #define SCRIPT_SPAN_OFFSETMAP_H_
21
-
22
- #include <string> // for string
23
-
24
- #include "integral_types.h" // for uint32
25
-
26
- // ***************************** OffsetMap **************************
27
- //
28
- // An OffsetMap object is a container for a mapping from offsets in one text
29
- // buffer A' to offsets in another text buffer A. It is most useful when A' is
30
- // built from A via substitutions that occasionally do not preserve byte length.
31
- //
32
- // A series of operators are used to build the correspondence map, then
33
- // calls can be made to map an offset in A' to an offset in A, or vice versa.
34
- // The map starts with offset 0 in A corresponding to offset 0 in A'.
35
- // The mapping is then built sequentially, adding on byte ranges that are
36
- // identical in A and A', byte ranges that are inserted in A', and byte ranges
37
- // that are deleted from A. All bytes beyond those specified when building the
38
- // map are assumed to correspond, i.e. a Copy(infinity) is assumed at the
39
- // end of the map.
40
- //
41
- // The internal data structure records positions at which bytes are added or
42
- // deleted. Using the map is O(1) when increasing the A' or A offset
43
- // monotonically, and O(n) when accessing random offsets, where n is the
44
- // number of differences.
45
- //
46
-
47
- namespace chrome_lang_id {
48
- namespace CLD2 {
49
-
50
- class OffsetMap {
51
- public:
52
- // Constructor, destructor
53
- OffsetMap();
54
- ~OffsetMap();
55
-
56
- // Clear the map
57
- void Clear();
58
-
59
- // Add to mapping from A to A', specifying how many next bytes correspond
60
- // in A and A'
61
- void Copy(int bytes);
62
-
63
- // Add to mapping from A to A', specifying how many next bytes are
64
- // inserted in A' while not advancing in A at all
65
- void Insert(int bytes);
66
-
67
- // Add to mapping from A to A', specifying how many next bytes are
68
- // deleted from A while not advancing in A' at all
69
- void Delete(int bytes);
70
-
71
- // [Finish building map,] Re-position to offset 0
72
- // This call is optional; MapForward and MapBack finish building the map
73
- // if necessary
74
- void Reset();
75
-
76
- // Map an offset in A' to the corresponding offset in A
77
- int MapBack(int aprimeoffset);
78
-
79
- // Map an offset in A to the corresponding offset in A'
80
- int MapForward(int aoffset);
81
-
82
- // h = ComposeOffsetMap(g, f), where f is a map from A to A', g is
83
- // from A' to A'' and h is from A to A''.
84
- //
85
- // Note that g->MapForward(f->MapForward(aoffset)) always equals
85
- // h->MapForward(aoffset), while
86
- // f->MapBack(g->MapBack(aprimeprimeoffset)) doesn't always equal
87
- // h->MapBack(aprimeprimeoffset). This happens when deletion in
89
- // f and insertion in g are at the same place. For example,
90
- //
91
- // A 1 2 3 4
92
- // ^ | ^ ^
93
- // | | / | f
94
- // v vv v
95
- // A' 1' 2' 3'
96
- // ^ ^^ ^
97
- // | | \ | g
98
- // v | v v
99
- // A'' 1'' 2'' 3'' 4''
100
- //
101
- // results in:
102
- //
103
- // A 1 2 3 4
104
- // ^ ^\ ^ ^
105
- // | | \ | | h
106
- // v | vv v
107
- // A'' 1'' 2'' 3'' 4''
108
- //
109
- // 2'' is mapped to 3 in the former figure, while 2'' is mapped to 2 in
110
- // the latter figure.
111
- static void ComposeOffsetMap(OffsetMap* g, OffsetMap* f, OffsetMap* h);
112
-
113
- // For testing only -- force a mapping
114
- void StuffIt(const std::string& diffs, int max_aoffset, int max_aprimeoffset);
115
-
116
- private:
117
- enum MapOp {PREFIX_OP, COPY_OP, INSERT_OP, DELETE_OP};
118
-
119
- void Flush();
120
- void FlushAll();
121
- void MaybeFlushAll();
122
- void Emit(MapOp op, int len);
123
-
124
- void SetLeft();
125
- void SetRight();
126
-
127
- // Back up over previous range, 1..5 bytes
128
- // Return subscript at the beginning of that. Pins at 0
129
- int Backup(int sub);
130
-
131
- // Parse next range, 1..5 bytes
132
- // Return subscript just off the end of that
133
- int ParseNext(int sub, MapOp* op, int* length);
134
-
135
- // Parse previous range, 1..5 bytes
136
- // Return current subscript
137
- int ParsePrevious(int sub, MapOp* op, int* length);
138
-
139
- bool MoveRight(); // Returns true if OK
140
- bool MoveLeft(); // Returns true if OK
141
-
142
- // Copies insert operations from source to dest. Returns true if no
143
- // other operations are found.
144
- static bool CopyInserts(OffsetMap* source, OffsetMap* dest);
145
-
146
- // Copies delete operations from source to dest. Returns true if no other
147
- // operations are found.
148
- static bool CopyDeletes(OffsetMap* source, OffsetMap* dest);
149
-
150
- std::string diffs_;
151
- MapOp pending_op_;
152
- uint32 pending_length_;
153
-
154
- // Offsets in the ranges below correspond to each other, with A' = A + diff
155
- int next_diff_sub_;
156
- int current_lo_aoffset_;
157
- int current_hi_aoffset_;
158
- int current_lo_aprimeoffset_;
159
- int current_hi_aprimeoffset_;
160
- int current_diff_;
161
- int max_aoffset_;
162
- int max_aprimeoffset_;
163
- };
164
-
165
- } // namespace CLD2
166
- } // namespace chrome_lang_id
167
-
168
- #endif // SCRIPT_SPAN_OFFSETMAP_H_
@@ -1,143 +0,0 @@
1
- // Copyright 2013 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
-
15
- //
16
- // These are weird things we need to do to get this compiling on
17
- // random systems [subset].
18
-
19
- #ifndef SCRIPT_SPAN_PORT_H_
20
- #define SCRIPT_SPAN_PORT_H_
21
-
22
- #include <string.h> // for memcpy()
23
-
24
- #include "integral_types.h"
25
-
26
- namespace chrome_lang_id {
27
- namespace CLD2 {
28
-
29
- // Portable handling of unaligned loads, stores, and copies.
30
- // On some platforms, like ARM, the copy functions can be more efficient
31
- // then a load and a store.
32
-
33
- #if defined(ARCH_PIII) || defined(ARCH_ATHLON) || defined(ARCH_K8) || defined(_ARCH_PPC)
34
-
35
- // x86 and x86-64 can perform unaligned loads/stores directly;
36
- // modern PowerPC hardware can also do unaligned integer loads and stores;
37
- // but note: the FPU still sends unaligned loads and stores to a trap handler!
38
-
39
- #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
40
- #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
41
- #define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64 *>(_p))
42
-
43
- #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
44
- #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
45
- #define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64 *>(_p) = (_val))
46
-
47
- #elif defined(__arm__) && \
48
- !defined(__ARM_ARCH_5__) && \
49
- !defined(__ARM_ARCH_5T__) && \
50
- !defined(__ARM_ARCH_5TE__) && \
51
- !defined(__ARM_ARCH_5TEJ__) && \
52
- !defined(__ARM_ARCH_6__) && \
53
- !defined(__ARM_ARCH_6J__) && \
54
- !defined(__ARM_ARCH_6K__) && \
55
- !defined(__ARM_ARCH_6Z__) && \
56
- !defined(__ARM_ARCH_6ZK__) && \
57
- !defined(__ARM_ARCH_6T2__) && \
58
- !defined(__ARM_ARCH_7__) && \
59
- !defined(__ARM_ARCH_7A__) && \
60
- !defined(__ARM_ARCH_7M__) && \
61
- !defined(__ARM_ARCH_7R__) && \
62
- !defined(__ARM_ARCH_8__) && \
63
- !defined(__ARM_ARCH_8A__)
64
-
65
- // ARMv7 and newer support native unaligned accesses, but only of 16-bit
66
- // and 32-bit values (not 64-bit); older versions either raise a fatal signal,
67
- // do an unaligned read and rotate the words around a bit, or do the reads very
68
- // slowly (trip through kernel mode). There's no simple #define that says just
69
- // “ARMv7 or higher”, so we have to filter away all ARMv5 and ARMv6
70
- // sub-architectures. Newer gcc (>= 4.6) set an __ARM_FEATURE_UNALIGNED #define,
71
- // so in time, maybe we can move on to that.
72
- //
73
- // Note that even if a chipset supports unaligned access, it might not be
74
- // enabled in any given system, e.g.:
75
- // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/CIHCGCFD.html
76
- // Therefore, it's generally just not safe to allow unaligned access on any ARM
77
- // variant.
78
- //
79
- // This is a mess, but there's not much we can do about it.
80
-
81
- #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
82
- #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
83
-
84
- #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16 *>(_p) = (_val))
85
- #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32 *>(_p) = (_val))
86
-
87
- // TODO(sesse): NEON supports unaligned 64-bit loads and stores.
88
- // See if that would be more efficient on platforms supporting it,
89
- // at least for copies.
90
-
91
- inline uint64 UNALIGNED_LOAD64(const void *p) {
92
- uint64 t;
93
- memcpy(&t, p, sizeof t);
94
- return t;
95
- }
96
-
97
- inline void UNALIGNED_STORE64(void *p, uint64 v) {
98
- memcpy(p, &v, sizeof v);
99
- }
100
-
101
- #else
102
-
103
- #define NEED_ALIGNED_LOADS
104
-
105
- // These functions are provided for architectures that don't support
106
- // unaligned loads and stores.
107
-
108
- inline uint16 UNALIGNED_LOAD16(const void *p) {
109
- uint16 t;
110
- memcpy(&t, p, sizeof t);
111
- return t;
112
- }
113
-
114
- inline uint32 UNALIGNED_LOAD32(const void *p) {
115
- uint32 t;
116
- memcpy(&t, p, sizeof t);
117
- return t;
118
- }
119
-
120
- inline uint64 UNALIGNED_LOAD64(const void *p) {
121
- uint64 t;
122
- memcpy(&t, p, sizeof t);
123
- return t;
124
- }
125
-
126
- inline void UNALIGNED_STORE16(void *p, uint16 v) {
127
- memcpy(p, &v, sizeof v);
128
- }
129
-
130
- inline void UNALIGNED_STORE32(void *p, uint32 v) {
131
- memcpy(p, &v, sizeof v);
132
- }
133
-
134
- inline void UNALIGNED_STORE64(void *p, uint64 v) {
135
- memcpy(p, &v, sizeof v);
136
- }
137
-
138
- #endif
139
-
140
- } // End namespace CLD2
141
- } // End namespace chrome_lang_id
142
-
143
- #endif // SCRIPT_SPAN_PORT_H_