cld3 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +18 -0
- data/LICENSE +204 -0
- data/LICENSE_CLD3 +203 -0
- data/README.md +22 -0
- data/cld3.gemspec +35 -0
- data/ext/cld3/base.cc +36 -0
- data/ext/cld3/base.h +106 -0
- data/ext/cld3/casts.h +98 -0
- data/ext/cld3/embedding_feature_extractor.cc +51 -0
- data/ext/cld3/embedding_feature_extractor.h +182 -0
- data/ext/cld3/embedding_network.cc +196 -0
- data/ext/cld3/embedding_network.h +186 -0
- data/ext/cld3/embedding_network_params.h +285 -0
- data/ext/cld3/extconf.rb +49 -0
- data/ext/cld3/feature_extractor.cc +137 -0
- data/ext/cld3/feature_extractor.h +633 -0
- data/ext/cld3/feature_extractor.proto +50 -0
- data/ext/cld3/feature_types.cc +72 -0
- data/ext/cld3/feature_types.h +158 -0
- data/ext/cld3/fixunicodevalue.cc +55 -0
- data/ext/cld3/fixunicodevalue.h +69 -0
- data/ext/cld3/float16.h +58 -0
- data/ext/cld3/fml_parser.cc +308 -0
- data/ext/cld3/fml_parser.h +123 -0
- data/ext/cld3/generated_entities.cc +296 -0
- data/ext/cld3/generated_ulscript.cc +678 -0
- data/ext/cld3/generated_ulscript.h +142 -0
- data/ext/cld3/getonescriptspan.cc +1109 -0
- data/ext/cld3/getonescriptspan.h +124 -0
- data/ext/cld3/integral_types.h +37 -0
- data/ext/cld3/lang_id_nn_params.cc +57449 -0
- data/ext/cld3/lang_id_nn_params.h +178 -0
- data/ext/cld3/language_identifier_features.cc +165 -0
- data/ext/cld3/language_identifier_features.h +116 -0
- data/ext/cld3/nnet_language_identifier.cc +380 -0
- data/ext/cld3/nnet_language_identifier.h +175 -0
- data/ext/cld3/nnet_language_identifier_c.cc +72 -0
- data/ext/cld3/offsetmap.cc +478 -0
- data/ext/cld3/offsetmap.h +168 -0
- data/ext/cld3/port.h +143 -0
- data/ext/cld3/registry.cc +28 -0
- data/ext/cld3/registry.h +242 -0
- data/ext/cld3/relevant_script_feature.cc +89 -0
- data/ext/cld3/relevant_script_feature.h +49 -0
- data/ext/cld3/script_detector.h +156 -0
- data/ext/cld3/sentence.proto +77 -0
- data/ext/cld3/sentence_features.cc +29 -0
- data/ext/cld3/sentence_features.h +35 -0
- data/ext/cld3/simple_adder.h +72 -0
- data/ext/cld3/stringpiece.h +81 -0
- data/ext/cld3/task_context.cc +161 -0
- data/ext/cld3/task_context.h +81 -0
- data/ext/cld3/task_context_params.cc +74 -0
- data/ext/cld3/task_context_params.h +54 -0
- data/ext/cld3/task_spec.proto +98 -0
- data/ext/cld3/text_processing.cc +245 -0
- data/ext/cld3/text_processing.h +30 -0
- data/ext/cld3/unicodetext.cc +96 -0
- data/ext/cld3/unicodetext.h +144 -0
- data/ext/cld3/utf8acceptinterchange.h +486 -0
- data/ext/cld3/utf8prop_lettermarkscriptnum.h +1631 -0
- data/ext/cld3/utf8repl_lettermarklower.h +758 -0
- data/ext/cld3/utf8scannot_lettermarkspecial.h +1455 -0
- data/ext/cld3/utf8statetable.cc +1344 -0
- data/ext/cld3/utf8statetable.h +285 -0
- data/ext/cld3/utils.cc +241 -0
- data/ext/cld3/utils.h +144 -0
- data/ext/cld3/workspace.cc +64 -0
- data/ext/cld3/workspace.h +177 -0
- data/lib/cld3.rb +99 -0
- metadata +158 -0
@@ -0,0 +1,72 @@
|
|
1
|
+
/* Copyright 2017 Akihiko Odaki <akihiko.odaki.4i@stu.hosei.ac.jp>
|
2
|
+
All Rights Reserved.
|
3
|
+
|
4
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
you may not use this file except in compliance with the License.
|
6
|
+
You may obtain a copy of the License at
|
7
|
+
|
8
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
Unless required by applicable law or agreed to in writing, software
|
11
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
See the License for the specific language governing permissions and
|
14
|
+
limitations under the License.
|
15
|
+
==============================================================================*/
|
16
|
+
|
17
|
+
#include <cstddef>
|
18
|
+
#include <iostream>
|
19
|
+
#include "nnet_language_identifier.h"
|
20
|
+
|
21
|
+
#if defined _WIN32 || defined __CYGWIN__
|
22
|
+
#define EXPORT __declspec(dllexport)
|
23
|
+
#else
|
24
|
+
#define EXPORT __attribute__ ((visibility ("default")))
|
25
|
+
#endif
|
26
|
+
|
27
|
+
// Subclass of the CLD3 identifier that additionally carries a std::string
// member. The C entry points in this file store the detected language there
// so that the pointer they return refers to storage owned by the instance.
class NNetLanguageIdentifier : public chrome_lang_id::NNetLanguageIdentifier {
 public:
  // Passes both byte limits straight through to the base-class constructor.
  NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes)
      : chrome_lang_id::NNetLanguageIdentifier(min_num_bytes, max_num_bytes) {}

  // Language string of the most recent lookup made through the C API.
  std::string language;
};
|
36
|
+
|
37
|
+
extern "C" {
|
38
|
+
#include <stddef.h>
|
39
|
+
|
40
|
+
// Plain-data result handed across the C boundary by
// NNetLanguageIdentifier_find_language. Field order is part of the binary
// interface and must not change.
struct result {
  // Detected language as a (pointer, length) pair.
  struct {
    const char *data;
    size_t size;
  } language;
  float probability;  // Copied from the identifier's result.
  float proportion;   // Copied from the identifier's result.
  bool is_reliable;   // Copied from the identifier's result.
};
|
49
|
+
|
50
|
+
// Runs language identification on the `size` bytes starting at `data` and
// returns the outcome as a plain C struct.
//
//   pointer     handle obtained from new_NNetLanguageIdentifier
//   data, size  input text; copied into a std::string before detection
//
// The returned language.data points into a string owned by the instance. It
// is overwritten by the next call on the same handle and goes away when the
// handle is passed to delete_NNetLanguageIdentifier.
EXPORT struct result NNetLanguageIdentifier_find_language(void *pointer,
                                                          const char *data,
                                                          size_t size) {
  auto *instance = static_cast<NNetLanguageIdentifier *>(pointer);
  auto found = instance->FindLanguage(std::string(data, size));

  // Park the string on the instance so the pointer handed back stays valid
  // after `found` goes out of scope.
  instance->language = std::move(found.language);

  // Built as a named aggregate instead of a `(struct result){...}` compound
  // literal: compound literals are a C99 feature that C++ compilers accept
  // only as an extension. The scalar fields are copied directly; wrapping
  // them in std::move had no effect.
  struct result out = {
    { instance->language.data(), instance->language.size() },
    found.probability,
    found.proportion,
    found.is_reliable
  };
  return out;
}
|
64
|
+
|
65
|
+
// Destroys the identifier behind `pointer`, which is treated as a
// NNetLanguageIdentifier such as the one new_NNetLanguageIdentifier returns.
EXPORT void delete_NNetLanguageIdentifier(void *pointer) {
  auto *instance = static_cast<NNetLanguageIdentifier *>(pointer);
  delete instance;
}
|
68
|
+
|
69
|
+
// Heap-allocates an identifier configured with the given byte limits and
// returns it as an opaque handle. The handle is released with
// delete_NNetLanguageIdentifier.
EXPORT void *new_NNetLanguageIdentifier(int min_num_bytes, int max_num_bytes) {
  NNetLanguageIdentifier *instance =
      new NNetLanguageIdentifier(min_num_bytes, max_num_bytes);
  return instance;
}
|
72
|
+
}
|
@@ -0,0 +1,478 @@
|
|
1
|
+
// Copyright 2013 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
|
15
|
+
//
|
16
|
+
// Author: dsites@google.com (Dick Sites)
|
17
|
+
//
|
18
|
+
//
|
19
|
+
|
20
|
+
#include "offsetmap.h"
|
21
|
+
|
22
|
+
#include <string.h> // for strcmp
|
23
|
+
#include <algorithm> // for min
|
24
|
+
|
25
|
+
using namespace std;
|
26
|
+
|
27
|
+
namespace chrome_lang_id {
|
28
|
+
namespace CLD2 {
|
29
|
+
|
30
|
+
// Constructor, destructor
|
31
|
+
// A new map starts out in the cleared state.
OffsetMap::OffsetMap() { this->Clear(); }
|
34
|
+
|
35
|
+
// No explicit cleanup is performed here.
OffsetMap::~OffsetMap() {}
|
37
|
+
|
38
|
+
// Clear the map
|
39
|
+
// After:
|
40
|
+
// next_diff_sub_ is 0
|
41
|
+
// Windows are the a and a' ranges covered by diffs_[next_diff_sub_-1]
|
42
|
+
// which is a fake range of width 0 mapping 0=>0
|
43
|
+
void OffsetMap::Clear() {
|
44
|
+
diffs_.clear();
|
45
|
+
pending_op_ = COPY_OP;
|
46
|
+
pending_length_ = 0;
|
47
|
+
next_diff_sub_ = 0;
|
48
|
+
current_lo_aoffset_ = 0;
|
49
|
+
current_hi_aoffset_ = 0;
|
50
|
+
current_lo_aprimeoffset_ = 0;
|
51
|
+
current_hi_aprimeoffset_ = 0;
|
52
|
+
current_diff_ = 0;
|
53
|
+
max_aoffset_ = 0; // Largest seen so far
|
54
|
+
max_aprimeoffset_ = 0; // Largest seen so far
|
55
|
+
}
|
56
|
+
|
57
|
+
// Returns the value held in the top two bits of `c` (0..3).
static inline char OpPart(const char c) {
  const unsigned char bits = static_cast<unsigned char>(c);
  return static_cast<char>((bits >> 6) & 3);
}
|
60
|
+
// Returns the value held in the low six bits of `c` (0..63).
static inline char LenPart(const char c) {
  const int low_six = c & 0x3f;
  return static_cast<char>(low_six);
}
|
63
|
+
|
64
|
+
// Reset to offset 0
|
65
|
+
void OffsetMap::Reset() {
|
66
|
+
MaybeFlushAll();
|
67
|
+
|
68
|
+
next_diff_sub_ = 0;
|
69
|
+
current_lo_aoffset_ = 0;
|
70
|
+
current_hi_aoffset_ = 0;
|
71
|
+
current_lo_aprimeoffset_ = 0;
|
72
|
+
current_hi_aprimeoffset_ = 0;
|
73
|
+
current_diff_ = 0;
|
74
|
+
}
|
75
|
+
|
76
|
+
// Add to mapping from A to A', specifying how many next bytes are
|
77
|
+
// identical in A and A'
|
78
|
+
void OffsetMap::Copy(int bytes) {
|
79
|
+
if (bytes == 0) {return;}
|
80
|
+
max_aoffset_ += bytes; // Largest seen so far
|
81
|
+
max_aprimeoffset_ += bytes; // Largest seen so far
|
82
|
+
if (pending_op_ == COPY_OP) {
|
83
|
+
pending_length_ += bytes;
|
84
|
+
} else {
|
85
|
+
Flush();
|
86
|
+
pending_op_ = COPY_OP;
|
87
|
+
pending_length_ = bytes;
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
// Add to mapping from A to A', specifying how many next bytes are
|
92
|
+
// inserted in A' while not advancing in A at all
|
93
|
+
void OffsetMap::Insert(int bytes){
|
94
|
+
if (bytes == 0) {return;}
|
95
|
+
max_aprimeoffset_ += bytes; // Largest seen so far
|
96
|
+
if (pending_op_ == INSERT_OP) {
|
97
|
+
pending_length_ += bytes;
|
98
|
+
} else if ((bytes == 1) &&
|
99
|
+
(pending_op_ == DELETE_OP) && (pending_length_ == 1)) {
|
100
|
+
// Special-case exactly delete(1) insert(1) +> copy(1);
|
101
|
+
// all others backmap inserts to after deletes
|
102
|
+
pending_op_ = COPY_OP;
|
103
|
+
} else {
|
104
|
+
Flush();
|
105
|
+
pending_op_ = INSERT_OP;
|
106
|
+
pending_length_ = bytes;
|
107
|
+
}
|
108
|
+
}
|
109
|
+
|
110
|
+
// Add to mapping from A to A', specifying how many next bytes are
|
111
|
+
// deleted from A while not advancing in A' at all
|
112
|
+
void OffsetMap::Delete(int bytes){
|
113
|
+
if (bytes == 0) {return;}
|
114
|
+
max_aoffset_ += bytes; // Largest seen so far
|
115
|
+
if (pending_op_ == DELETE_OP) {
|
116
|
+
pending_length_ += bytes;
|
117
|
+
} else if ((bytes == 1) &&
|
118
|
+
(pending_op_ == INSERT_OP) && (pending_length_ == 1)) {
|
119
|
+
// Special-case exactly insert(1) delete(1) => copy(1);
|
120
|
+
// all others backmap deletes to after insertss
|
121
|
+
pending_op_ = COPY_OP;
|
122
|
+
} else {
|
123
|
+
Flush();
|
124
|
+
pending_op_ = DELETE_OP;
|
125
|
+
pending_length_ = bytes;
|
126
|
+
}
|
127
|
+
}
|
128
|
+
|
129
|
+
void OffsetMap::Flush() {
|
130
|
+
if (pending_length_ == 0) {
|
131
|
+
return;
|
132
|
+
}
|
133
|
+
// We may be emitting a copy op just after a copy op because +1 -1 cancelled
|
134
|
+
// inbetween. If the lengths don't need a prefix byte, combine them
|
135
|
+
if ((pending_op_ == COPY_OP) && !diffs_.empty()) {
|
136
|
+
char c = diffs_[diffs_.size() - 1];
|
137
|
+
MapOp prior_op = static_cast<MapOp>(OpPart(c));
|
138
|
+
int prior_len = LenPart(c);
|
139
|
+
if ((prior_op == COPY_OP) && ((prior_len + pending_length_) <= 0x3f)) {
|
140
|
+
diffs_[diffs_.size() - 1] += pending_length_;
|
141
|
+
pending_length_ = 0;
|
142
|
+
return;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
if (pending_length_ > 0x3f) {
|
146
|
+
bool non_zero_emitted = false;
|
147
|
+
for (int shift = 30; shift > 0; shift -= 6) {
|
148
|
+
int prefix = (pending_length_ >> shift) & 0x3f;
|
149
|
+
if ((prefix > 0) || non_zero_emitted) {
|
150
|
+
Emit(PREFIX_OP, prefix);
|
151
|
+
non_zero_emitted = true;
|
152
|
+
}
|
153
|
+
}
|
154
|
+
}
|
155
|
+
Emit(pending_op_, pending_length_ & 0x3f);
|
156
|
+
pending_length_ = 0;
|
157
|
+
}
|
158
|
+
|
159
|
+
|
160
|
+
// Add one more entry to copy one byte off the end, then flush
|
161
|
+
void OffsetMap::FlushAll() {
|
162
|
+
Copy(1);
|
163
|
+
Flush();
|
164
|
+
}
|
165
|
+
|
166
|
+
// Flush all if necessary
|
167
|
+
void OffsetMap::MaybeFlushAll() {
|
168
|
+
if ((0 < pending_length_) || diffs_.empty()) {
|
169
|
+
FlushAll();
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
// Len may be 0, for example as the low piece of length=64
|
174
|
+
void OffsetMap::Emit(MapOp op, int len) {
|
175
|
+
char c = (static_cast<char>(op) << 6) | (len & 0x3f);
|
176
|
+
diffs_.push_back(c);
|
177
|
+
}
|
178
|
+
|
179
|
+
//----------------------------------------------------------------------------//
|
180
|
+
// The guts of the 2013 design //
|
181
|
+
// If there are three ranges a b c in diffs_, we can be in one of five //
|
182
|
+
// states: LEFT of a, in ranges a b c, or RIGHT of c //
|
183
|
+
// In each state, there are windows A[Alo..Ahi), A'[A'lo..A'hi) and diffs_ //
|
184
|
+
// position next_diff_sub_ //
|
185
|
+
// There also are mapping constants max_aoffset_ and max_aprimeoffset_ //
|
186
|
+
// If LEFT, Alo=Ahi=0, A'lo=A'hi=0 and next_diff_sub_=0 //
|
187
|
+
// If RIGHT, Alo=Ahi=max_aoffset_, A'lo=A'hi=max_aprimeoffset_ and //
|
188
|
+
// next_diff_sub_=diffs_.size() //
|
189
|
+
// Otherwise, at least one of A[) and A'[) is non-empty and the first bytes //
|
190
|
+
// correspond to each other. If range i is active, next_diff_sub_ is at //
|
191
|
+
// the first byte of range i+1. Because of the length-prefix operator, //
|
192
|
+
// an individual range item in diffs_ may be multiple bytes //
|
193
|
+
// In all cases aprimeoffset = aoffset + current_diff_ //
|
194
|
+
// i.e. current_diff_ = aprimeoffset - aoffset //
|
195
|
+
// //
|
196
|
+
// In the degenerate case of diffs_.empty(), there are only two states //
|
197
|
+
// LEFT and RIGHT and the mapping is the identity mapping. //
|
198
|
+
// The initial state is LEFT. //
|
199
|
+
// It is an error to move left into LEFT or right into RIGHT, but the code //
|
200
|
+
// below is robust in these cases. //
|
201
|
+
//----------------------------------------------------------------------------//
|
202
|
+
|
203
|
+
// Enter the LEFT state: both windows are empty at offset 0, the offset
// difference is 0 and the cursor is at the start of diffs_.
void OffsetMap::SetLeft() {
  current_lo_aoffset_ = current_hi_aoffset_ = 0;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = 0;
  current_diff_ = 0;
  next_diff_sub_ = 0;
}
|
211
|
+
|
212
|
+
// Enter the RIGHT state: both windows are empty and sit at the largest
// offsets seen, and current_diff_ is the difference between those offsets.
void OffsetMap::SetRight() {
  current_lo_aoffset_ = current_hi_aoffset_ = max_aoffset_;
  current_lo_aprimeoffset_ = current_hi_aprimeoffset_ = max_aprimeoffset_;
  current_diff_ = max_aprimeoffset_ - max_aoffset_;

  // NOTE(review): the design comment in this file states that RIGHT has
  // next_diff_sub_ == diffs_.size(), yet 0 is assigned here. Behaviour is
  // kept as is; confirm which one is intended before changing it.
  next_diff_sub_ = 0;
}
|
220
|
+
|
221
|
+
// Back up over previous range, 1..5 bytes
|
222
|
+
// Return subscript at the beginning of that. Pins at 0
|
223
|
+
int OffsetMap::Backup(int sub) {
|
224
|
+
if (sub <= 0) {return 0;}
|
225
|
+
--sub;
|
226
|
+
while ((0 < sub) &&
|
227
|
+
(static_cast<MapOp>(OpPart(diffs_[sub - 1]) == PREFIX_OP))) {
|
228
|
+
--sub;
|
229
|
+
}
|
230
|
+
return sub;
|
231
|
+
}
|
232
|
+
|
233
|
+
// Parse next range, 1..5 bytes
|
234
|
+
// Return subscript just off the end of that
|
235
|
+
int OffsetMap::ParseNext(int sub, MapOp* op, int* length) {
|
236
|
+
*op = PREFIX_OP;
|
237
|
+
*length = 0;
|
238
|
+
char c;
|
239
|
+
while ((sub < static_cast<int>(diffs_.size())) && (*op == PREFIX_OP)) {
|
240
|
+
c = diffs_[sub++];
|
241
|
+
*op = static_cast<MapOp>(OpPart(c));
|
242
|
+
int len = LenPart(c);
|
243
|
+
*length = (*length << 6) + len;
|
244
|
+
}
|
245
|
+
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
|
246
|
+
// Mal-formed can include a trailing prefix byte with no following op
|
247
|
+
return sub;
|
248
|
+
}
|
249
|
+
|
250
|
+
// Parse previous range, 1..5 bytes
|
251
|
+
// Return current subscript
|
252
|
+
int OffsetMap::ParsePrevious(int sub, MapOp* op, int* length) {
|
253
|
+
sub = Backup(sub);
|
254
|
+
return ParseNext(sub, op, length);
|
255
|
+
}
|
256
|
+
|
257
|
+
// Move active window one range to the right
|
258
|
+
// Return true if move was OK
|
259
|
+
bool OffsetMap::MoveRight() {
|
260
|
+
// If at last range or RIGHT, set to RIGHT, return error
|
261
|
+
if (next_diff_sub_ >= static_cast<int>(diffs_.size())) {
|
262
|
+
SetRight();
|
263
|
+
return false;
|
264
|
+
}
|
265
|
+
// Actually OK to move right
|
266
|
+
MapOp op;
|
267
|
+
int length;
|
268
|
+
bool retval = true;
|
269
|
+
// If mal-formed or in RIGHT, this will return with op = PREFIX_OP
|
270
|
+
next_diff_sub_ = ParseNext(next_diff_sub_, &op, &length);
|
271
|
+
|
272
|
+
current_lo_aoffset_ = current_hi_aoffset_;
|
273
|
+
current_lo_aprimeoffset_ = current_hi_aprimeoffset_;
|
274
|
+
if (op == COPY_OP) {
|
275
|
+
current_hi_aoffset_ = current_lo_aoffset_ + length;
|
276
|
+
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
|
277
|
+
} else if (op == INSERT_OP) {
|
278
|
+
current_hi_aoffset_ = current_lo_aoffset_ + 0;
|
279
|
+
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + length;
|
280
|
+
} else if (op == DELETE_OP) {
|
281
|
+
current_hi_aoffset_ = current_lo_aoffset_ + length;
|
282
|
+
current_hi_aprimeoffset_ = current_lo_aprimeoffset_ + 0;
|
283
|
+
} else {
|
284
|
+
SetRight();
|
285
|
+
retval = false;
|
286
|
+
}
|
287
|
+
current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
|
288
|
+
return retval;
|
289
|
+
}
|
290
|
+
|
291
|
+
// Move active window one range to the left
|
292
|
+
// Return true if move was OK
|
293
|
+
bool OffsetMap::MoveLeft() {
|
294
|
+
// If at first range or LEFT, set to LEFT, return error
|
295
|
+
if (next_diff_sub_ <= 0) {
|
296
|
+
SetLeft();
|
297
|
+
return false;
|
298
|
+
}
|
299
|
+
// Back up over current active window
|
300
|
+
next_diff_sub_ = Backup(next_diff_sub_);
|
301
|
+
if (next_diff_sub_ <= 0) {
|
302
|
+
SetLeft();
|
303
|
+
return false;
|
304
|
+
}
|
305
|
+
// Actually OK to move left
|
306
|
+
MapOp op;
|
307
|
+
int length;
|
308
|
+
|
309
|
+
// TODO(abakalov): 'retval' below is set but not used, which is suspicious.
|
310
|
+
// Did the authors mean to return this variable, analogously to MoveRight()?
|
311
|
+
// bool retval = true;
|
312
|
+
// If mal-formed or in LEFT, this will return with op = PREFIX_OP
|
313
|
+
next_diff_sub_ = ParsePrevious(next_diff_sub_, &op, &length);
|
314
|
+
|
315
|
+
current_hi_aoffset_ = current_lo_aoffset_;
|
316
|
+
current_hi_aprimeoffset_ = current_lo_aprimeoffset_;
|
317
|
+
if (op == COPY_OP) {
|
318
|
+
current_lo_aoffset_ = current_hi_aoffset_ - length;
|
319
|
+
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
|
320
|
+
} else if (op == INSERT_OP) {
|
321
|
+
current_lo_aoffset_ = current_hi_aoffset_ - 0;
|
322
|
+
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - length;
|
323
|
+
} else if (op == DELETE_OP) {
|
324
|
+
current_lo_aoffset_ = current_hi_aoffset_ - length;
|
325
|
+
current_lo_aprimeoffset_ = current_hi_aprimeoffset_ - 0;
|
326
|
+
} else {
|
327
|
+
SetLeft();
|
328
|
+
// retval = false;
|
329
|
+
}
|
330
|
+
current_diff_ = current_lo_aprimeoffset_ - current_lo_aoffset_;
|
331
|
+
return true;
|
332
|
+
}
|
333
|
+
|
334
|
+
// Map an offset in A' to the corresponding offset in A
|
335
|
+
int OffsetMap::MapBack(int aprimeoffset){
|
336
|
+
MaybeFlushAll();
|
337
|
+
if (aprimeoffset < 0) {return 0;}
|
338
|
+
if (max_aprimeoffset_ <= aprimeoffset) {
|
339
|
+
return (aprimeoffset - max_aprimeoffset_) + max_aoffset_;
|
340
|
+
}
|
341
|
+
|
342
|
+
// If current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_,
|
343
|
+
// use current mapping, else move window left/right
|
344
|
+
bool ok = true;
|
345
|
+
while (ok && (aprimeoffset < current_lo_aprimeoffset_)) {
|
346
|
+
ok = MoveLeft();
|
347
|
+
}
|
348
|
+
while (ok && (current_hi_aprimeoffset_ <= aprimeoffset)) {
|
349
|
+
ok = MoveRight();
|
350
|
+
}
|
351
|
+
// So now current_lo_aprimeoffset_ <= aprimeoffset < current_hi_aprimeoffset_
|
352
|
+
|
353
|
+
int aoffset = aprimeoffset - current_diff_;
|
354
|
+
if (aoffset >= current_hi_aoffset_) {
|
355
|
+
// A' is in an insert region, all bytes of which backmap to A=hi_aoffset_
|
356
|
+
aoffset = current_hi_aoffset_;
|
357
|
+
}
|
358
|
+
return aoffset;
|
359
|
+
}
|
360
|
+
|
361
|
+
// Map an offset in A to the corresponding offset in A'
|
362
|
+
int OffsetMap::MapForward(int aoffset){
|
363
|
+
MaybeFlushAll();
|
364
|
+
if (aoffset < 0) {return 0;}
|
365
|
+
if (max_aoffset_ <= aoffset) {
|
366
|
+
return (aoffset - max_aoffset_) + max_aprimeoffset_;
|
367
|
+
}
|
368
|
+
|
369
|
+
// If current_lo_aoffset_ <= aoffset < current_hi_aoffset_,
|
370
|
+
// use current mapping, else move window left/right
|
371
|
+
bool ok = true;
|
372
|
+
while (ok && (aoffset < current_lo_aoffset_)) {
|
373
|
+
ok = MoveLeft();
|
374
|
+
}
|
375
|
+
while (ok && (current_hi_aoffset_ <= aoffset)) {
|
376
|
+
ok = MoveRight();
|
377
|
+
}
|
378
|
+
|
379
|
+
int aprimeoffset = aoffset + current_diff_;
|
380
|
+
if (aprimeoffset >= current_hi_aprimeoffset_) {
|
381
|
+
// A is in a delete region, all bytes of which map to A'=hi_aprimeoffset_
|
382
|
+
aprimeoffset = current_hi_aprimeoffset_;
|
383
|
+
}
|
384
|
+
return aprimeoffset;
|
385
|
+
}
|
386
|
+
|
387
|
+
|
388
|
+
// static
|
389
|
+
bool OffsetMap::CopyInserts(OffsetMap* source, OffsetMap* dest) {
|
390
|
+
bool ok = true;
|
391
|
+
while (ok && (source->next_diff_sub_ !=
|
392
|
+
static_cast<int>(source->diffs_.size()))) {
|
393
|
+
ok = source->MoveRight();
|
394
|
+
if (source->current_lo_aoffset_ != source->current_hi_aoffset_) {
|
395
|
+
return false;
|
396
|
+
}
|
397
|
+
dest->Insert(
|
398
|
+
source->current_hi_aprimeoffset_ - source->current_lo_aprimeoffset_);
|
399
|
+
}
|
400
|
+
return true;
|
401
|
+
}
|
402
|
+
|
403
|
+
// static
|
404
|
+
bool OffsetMap::CopyDeletes(OffsetMap* source, OffsetMap* dest) {
|
405
|
+
bool ok = true;
|
406
|
+
while (ok && (source->next_diff_sub_ !=
|
407
|
+
static_cast<int>(source->diffs_.size()))) {
|
408
|
+
ok = source->MoveRight();
|
409
|
+
if (source->current_lo_aprimeoffset_ != source->current_hi_aprimeoffset_) {
|
410
|
+
return false;
|
411
|
+
}
|
412
|
+
dest->Delete(source->current_hi_aoffset_ - source->current_lo_aoffset_);
|
413
|
+
}
|
414
|
+
return true;
|
415
|
+
}
|
416
|
+
|
417
|
+
// static
|
418
|
+
void OffsetMap::ComposeOffsetMap(
|
419
|
+
OffsetMap* g, OffsetMap* f, OffsetMap* h) {
|
420
|
+
h->Clear();
|
421
|
+
f->Reset();
|
422
|
+
g->Reset();
|
423
|
+
|
424
|
+
int lo = 0;
|
425
|
+
for (;;) {
|
426
|
+
// Consume delete operations in f. This moves A without moving
|
427
|
+
// A' and A''.
|
428
|
+
if (lo >= g->current_hi_aoffset_ && CopyInserts(g, h)) {
|
429
|
+
if (lo >= f->current_hi_aprimeoffset_ && CopyDeletes(f, h)) {
|
430
|
+
// fprintf(stderr,
|
431
|
+
// "ComposeOffsetMap ERROR, f is longer than g.<br>\n");
|
432
|
+
}
|
433
|
+
|
434
|
+
// FlushAll(), called by Reset(), MapForward() or MapBack(), has
|
435
|
+
// added an extra COPY_OP to f and g, so this function has
|
436
|
+
// composed an extra COPY_OP in h from those. To avoid
|
437
|
+
// FlushAll() adds one more extra COPY_OP to h later, dispatch
|
438
|
+
// Flush() right now.
|
439
|
+
h->Flush();
|
440
|
+
return;
|
441
|
+
}
|
442
|
+
|
443
|
+
// Consume insert operations in g. This moves A'' without moving A
|
444
|
+
// and A'.
|
445
|
+
if (lo >= f->current_hi_aprimeoffset_) {
|
446
|
+
if (!CopyDeletes(f, h)) {
|
447
|
+
// fprintf(stderr,
|
448
|
+
// "ComposeOffsetMap ERROR, g is longer than f.<br>\n");
|
449
|
+
}
|
450
|
+
}
|
451
|
+
|
452
|
+
// Compose one operation which moves A' from lo to hi.
|
453
|
+
int hi = min(f->current_hi_aprimeoffset_, g->current_hi_aoffset_);
|
454
|
+
if (f->current_lo_aoffset_ != f->current_hi_aoffset_ &&
|
455
|
+
g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
|
456
|
+
h->Copy(hi - lo);
|
457
|
+
} else if (f->current_lo_aoffset_ != f->current_hi_aoffset_) {
|
458
|
+
h->Delete(hi - lo);
|
459
|
+
} else if (g->current_lo_aprimeoffset_ != g->current_hi_aprimeoffset_) {
|
460
|
+
h->Insert(hi - lo);
|
461
|
+
}
|
462
|
+
|
463
|
+
lo = hi;
|
464
|
+
}
|
465
|
+
}
|
466
|
+
|
467
|
+
// For testing only -- force a mapping
|
468
|
+
void OffsetMap::StuffIt(const std::string& diffs,
|
469
|
+
int max_aoffset, int max_aprimeoffset) {
|
470
|
+
Clear();
|
471
|
+
diffs_ = diffs;
|
472
|
+
max_aoffset_ = max_aoffset;
|
473
|
+
max_aprimeoffset_ = max_aprimeoffset;
|
474
|
+
}
|
475
|
+
|
476
|
+
|
477
|
+
} // namespace CLD2
|
478
|
+
} // namespace chrome_lang_id
|