chipper 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/util/random.h
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// Modified from Google perftools's tcmalloc_unittest.cc.
|
6
|
+
|
7
|
+
#ifndef RE2_UTIL_RANDOM_H__
|
8
|
+
#define RE2_UTIL_RANDOM_H__
|
9
|
+
|
10
|
+
#include "util/util.h"
|
11
|
+
|
12
|
+
namespace re2 {
|
13
|
+
|
14
|
+
// ACM minimal standard random number generator. (re-entrant.)
|
15
|
+
class ACMRandom {
|
16
|
+
public:
|
17
|
+
ACMRandom(int32 seed) : seed_(seed) {}
|
18
|
+
int32 Next();
|
19
|
+
int32 Uniform(int32);
|
20
|
+
|
21
|
+
void Reset(int32 seed) { seed_ = seed; }
|
22
|
+
|
23
|
+
private:
|
24
|
+
int32 seed_;
|
25
|
+
};
|
26
|
+
|
27
|
+
} // namespace re2
|
28
|
+
|
29
|
+
#endif // RE2_UTIL_RANDOM_H__
|
@@ -0,0 +1,451 @@
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// DESCRIPTION
|
6
|
+
//
|
7
|
+
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
8
|
+
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
|
9
|
+
// fast iteration through the elements in the array and fast clearing
|
10
|
+
// of the array. The array has a concept of certain elements being
|
11
|
+
// uninitialized (having no value).
|
12
|
+
//
|
13
|
+
// Insertion and deletion are constant time operations.
|
14
|
+
//
|
15
|
+
// Allocating the array is a constant time operation
|
16
|
+
// when memory allocation is a constant time operation.
|
17
|
+
//
|
18
|
+
// Clearing the array is a constant time operation (unusual!).
|
19
|
+
//
|
20
|
+
// Iterating through the array is an O(n) operation, where n
|
21
|
+
// is the number of items in the array (not O(m)).
|
22
|
+
//
|
23
|
+
// The array iterator visits entries in the order they were first
|
24
|
+
// inserted into the array. It is safe to add items to the array while
|
25
|
+
// using an iterator: the iterator will visit indices added to the array
|
26
|
+
// during the iteration, but will not re-visit indices whose values
|
27
|
+
// change after visiting. Thus SparseArray can be a convenient
|
28
|
+
// implementation of a work queue.
|
29
|
+
//
|
30
|
+
// The SparseArray implementation is NOT thread-safe. It is up to the
|
31
|
+
// caller to make sure only one thread is accessing the array. (Typically
|
32
|
+
// these arrays are temporary values and used in situations where speed is
|
33
|
+
// important.)
|
34
|
+
//
|
35
|
+
// The SparseArray interface does not present all the usual STL bells and
|
36
|
+
// whistles.
|
37
|
+
//
|
38
|
+
// Implemented with reference to Briggs & Torczon, An Efficient
|
39
|
+
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
40
|
+
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
41
|
+
//
|
42
|
+
// Briggs & Torczon popularized this technique, but it had been known
|
43
|
+
// long before their paper. They point out that Aho, Hopcroft, and
|
44
|
+
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
|
45
|
+
// 1986 Programming Pearls both hint at the technique in exercises to the
|
46
|
+
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
|
47
|
+
// exercise 8).
|
48
|
+
//
|
49
|
+
// Briggs & Torczon describe a sparse set implementation. I have
|
50
|
+
// trivially generalized it to create a sparse array (actually the original
|
51
|
+
// target of the AHU and Bentley exercises).
|
52
|
+
|
53
|
+
// IMPLEMENTATION
|
54
|
+
//
|
55
|
+
// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
|
56
|
+
// size max_size_. At any point, the number of elements in the sparse array is
|
57
|
+
// size_.
|
58
|
+
//
|
59
|
+
// The vector dense_ contains the size_ elements in the sparse array (with
|
60
|
+
// their indices),
|
61
|
+
// in the order that the elements were first inserted. This array is dense:
|
62
|
+
// the size_ pairs are dense_[0] through dense_[size_-1].
|
63
|
+
//
|
64
|
+
// The array sparse_to_dense_ maps from indices in [0,m) to indices in
|
65
|
+
// [0,size_).
|
66
|
+
// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
|
67
|
+
// For indices not present in the array, sparse_to_dense_ can contain
|
68
|
+
// any value at all, perhaps outside the range [0, size_) but perhaps not.
|
69
|
+
//
|
70
|
+
// The lax requirement on sparse_to_dense_ values makes clearing
|
71
|
+
// the array very easy: set size_ to 0. Lookups are slightly more
|
72
|
+
// complicated. An index i has a value in the array if and only if:
|
73
|
+
// sparse_to_dense_[i] is in [0, size_) AND
|
74
|
+
// dense_[sparse_to_dense_[i]].index_ == i.
|
75
|
+
// If both these properties hold, only then it is safe to refer to
|
76
|
+
// dense_[sparse_to_dense_[i]].value_
|
77
|
+
// as the value associated with index i.
|
78
|
+
//
|
79
|
+
// To insert a new entry, set sparse_to_dense_[i] to size_,
|
80
|
+
// initialize dense_[size_], and then increment size_.
|
81
|
+
//
|
82
|
+
// Deletion of specific values from the array is implemented by
|
83
|
+
// swapping dense_[size_-1] and the dense_ being deleted and then
|
84
|
+
// updating the appropriate sparse_to_dense_ entries.
|
85
|
+
//
|
86
|
+
// To make the sparse array as efficient as possible for non-primitive types,
|
87
|
+
// elements may or may not be destroyed when they are deleted from the sparse
|
88
|
+
// array through a call to erase(), erase_existing() or resize(). They
|
89
|
+
// immediately become inaccessible, but they are only guaranteed to be
|
90
|
+
// destroyed when the SparseArray destructor is called.
|
91
|
+
|
92
|
+
#ifndef RE2_UTIL_SPARSE_ARRAY_H__
|
93
|
+
#define RE2_UTIL_SPARSE_ARRAY_H__
|
94
|
+
|
95
|
+
#include "util/util.h"
|
96
|
+
|
97
|
+
namespace re2 {
|
98
|
+
|
99
|
+
template<typename Value>
|
100
|
+
class SparseArray {
|
101
|
+
public:
|
102
|
+
SparseArray();
|
103
|
+
SparseArray(int max_size);
|
104
|
+
~SparseArray();
|
105
|
+
|
106
|
+
// IndexValue pairs: exposed in SparseArray::iterator.
|
107
|
+
class IndexValue;
|
108
|
+
|
109
|
+
typedef IndexValue value_type;
|
110
|
+
typedef typename vector<IndexValue>::iterator iterator;
|
111
|
+
typedef typename vector<IndexValue>::const_iterator const_iterator;
|
112
|
+
|
113
|
+
inline const IndexValue& iv(int i) const;
|
114
|
+
|
115
|
+
// Return the number of entries in the array.
|
116
|
+
int size() const {
|
117
|
+
return size_;
|
118
|
+
}
|
119
|
+
|
120
|
+
// Iterate over the array.
|
121
|
+
iterator begin() {
|
122
|
+
return dense_.begin();
|
123
|
+
}
|
124
|
+
iterator end() {
|
125
|
+
return dense_.begin() + size_;
|
126
|
+
}
|
127
|
+
|
128
|
+
const_iterator begin() const {
|
129
|
+
return dense_.begin();
|
130
|
+
}
|
131
|
+
const_iterator end() const {
|
132
|
+
return dense_.begin() + size_;
|
133
|
+
}
|
134
|
+
|
135
|
+
// Change the maximum size of the array.
|
136
|
+
// Invalidates all iterators.
|
137
|
+
void resize(int max_size);
|
138
|
+
|
139
|
+
// Return the maximum size of the array.
|
140
|
+
// Indices can be in the range [0, max_size).
|
141
|
+
int max_size() const {
|
142
|
+
return max_size_;
|
143
|
+
}
|
144
|
+
|
145
|
+
// Clear the array.
|
146
|
+
void clear() {
|
147
|
+
size_ = 0;
|
148
|
+
}
|
149
|
+
|
150
|
+
// Check whether index i is in the array.
|
151
|
+
inline bool has_index(int i) const;
|
152
|
+
|
153
|
+
// Comparison function for sorting.
|
154
|
+
// Can sort the sparse array so that future iterations
|
155
|
+
// will visit indices in increasing order using
|
156
|
+
// sort(arr.begin(), arr.end(), arr.less);
|
157
|
+
static bool less(const IndexValue& a, const IndexValue& b);
|
158
|
+
|
159
|
+
public:
|
160
|
+
// Set the value at index i to v.
|
161
|
+
inline iterator set(int i, Value v);
|
162
|
+
|
163
|
+
pair<iterator, bool> insert(const value_type& new_value);
|
164
|
+
|
165
|
+
// Returns the value at index i
|
166
|
+
// or defaultv if index i is not initialized in the array.
|
167
|
+
inline Value get(int i, Value defaultv) const;
|
168
|
+
|
169
|
+
iterator find(int i);
|
170
|
+
|
171
|
+
const_iterator find(int i) const;
|
172
|
+
|
173
|
+
// Change the value at index i to v.
|
174
|
+
// Fast but unsafe: only use if has_index(i) is true.
|
175
|
+
inline iterator set_existing(int i, Value v);
|
176
|
+
|
177
|
+
// Set the value at the new index i to v.
|
178
|
+
// Fast but unsafe: only use if has_index(i) is false.
|
179
|
+
inline iterator set_new(int i, Value v);
|
180
|
+
|
181
|
+
// Get the value at index i from the array..
|
182
|
+
// Fast but unsafe: only use if has_index(i) is true.
|
183
|
+
inline Value get_existing(int i) const;
|
184
|
+
|
185
|
+
// Erasing items from the array during iteration is in general
|
186
|
+
// NOT safe. There is one special case, which is that the current
|
187
|
+
// index-value pair can be erased as long as the iterator is then
|
188
|
+
// checked for being at the end before being incremented.
|
189
|
+
// For example:
|
190
|
+
//
|
191
|
+
// for (i = m.begin(); i != m.end(); ++i) {
|
192
|
+
// if (ShouldErase(i->index(), i->value())) {
|
193
|
+
// m.erase(i->index());
|
194
|
+
// --i;
|
195
|
+
// }
|
196
|
+
// }
|
197
|
+
//
|
198
|
+
// Except in the specific case just described, elements must
|
199
|
+
// not be erased from the array (including clearing the array)
|
200
|
+
// while iterators are walking over the array. Otherwise,
|
201
|
+
// the iterators could walk past the end of the array.
|
202
|
+
|
203
|
+
// Erases the element at index i from the array.
|
204
|
+
inline void erase(int i);
|
205
|
+
|
206
|
+
// Erases the element at index i from the array.
|
207
|
+
// Fast but unsafe: only use if has_index(i) is true.
|
208
|
+
inline void erase_existing(int i);
|
209
|
+
|
210
|
+
private:
|
211
|
+
// Add the index i to the array.
|
212
|
+
// Only use if has_index(i) is known to be false.
|
213
|
+
// Since it doesn't set the value associated with i,
|
214
|
+
// this function is private, only intended as a helper
|
215
|
+
// for other methods.
|
216
|
+
inline void create_index(int i);
|
217
|
+
|
218
|
+
// In debug mode, verify that some invariant properties of the class
|
219
|
+
// are being maintained. This is called at the end of the constructor
|
220
|
+
// and at the beginning and end of all public non-const member functions.
|
221
|
+
inline void DebugCheckInvariants() const;
|
222
|
+
|
223
|
+
int size_;
|
224
|
+
int max_size_;
|
225
|
+
int* sparse_to_dense_;
|
226
|
+
vector<IndexValue> dense_;
|
227
|
+
|
228
|
+
DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
|
229
|
+
};
|
230
|
+
|
231
|
+
template<typename Value>
|
232
|
+
SparseArray<Value>::SparseArray()
|
233
|
+
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {}
|
234
|
+
|
235
|
+
// IndexValue pairs: exposed in SparseArray::iterator.
|
236
|
+
template<typename Value>
|
237
|
+
class SparseArray<Value>::IndexValue {
|
238
|
+
friend class SparseArray;
|
239
|
+
public:
|
240
|
+
typedef int first_type;
|
241
|
+
typedef Value second_type;
|
242
|
+
|
243
|
+
IndexValue() {}
|
244
|
+
IndexValue(int index, const Value& value) : second(value), index_(index) {}
|
245
|
+
|
246
|
+
int index() const { return index_; }
|
247
|
+
Value value() const { return second; }
|
248
|
+
|
249
|
+
// Provide the data in the 'second' member so that the utilities
|
250
|
+
// in map-util work.
|
251
|
+
Value second;
|
252
|
+
|
253
|
+
private:
|
254
|
+
int index_;
|
255
|
+
};
|
256
|
+
|
257
|
+
template<typename Value>
|
258
|
+
const typename SparseArray<Value>::IndexValue&
|
259
|
+
SparseArray<Value>::iv(int i) const {
|
260
|
+
DCHECK_GE(i, 0);
|
261
|
+
DCHECK_LT(i, size_);
|
262
|
+
return dense_[i];
|
263
|
+
}
|
264
|
+
|
265
|
+
// Change the maximum size of the array.
|
266
|
+
// Invalidates all iterators.
|
267
|
+
template<typename Value>
|
268
|
+
void SparseArray<Value>::resize(int new_max_size) {
|
269
|
+
DebugCheckInvariants();
|
270
|
+
if (new_max_size > max_size_) {
|
271
|
+
int* a = new int[new_max_size];
|
272
|
+
if (sparse_to_dense_) {
|
273
|
+
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
274
|
+
// Don't need to zero the memory but appease Valgrind.
|
275
|
+
if (RunningOnValgrind()) {
|
276
|
+
for (int i = max_size_; i < new_max_size; i++)
|
277
|
+
a[i] = 0xababababU;
|
278
|
+
}
|
279
|
+
delete[] sparse_to_dense_;
|
280
|
+
}
|
281
|
+
sparse_to_dense_ = a;
|
282
|
+
|
283
|
+
dense_.resize(new_max_size);
|
284
|
+
}
|
285
|
+
max_size_ = new_max_size;
|
286
|
+
if (size_ > max_size_)
|
287
|
+
size_ = max_size_;
|
288
|
+
DebugCheckInvariants();
|
289
|
+
}
|
290
|
+
|
291
|
+
// Check whether index i is in the array.
|
292
|
+
template<typename Value>
|
293
|
+
bool SparseArray<Value>::has_index(int i) const {
|
294
|
+
DCHECK_GE(i, 0);
|
295
|
+
DCHECK_LT(i, max_size_);
|
296
|
+
if (static_cast<uint>(i) >= max_size_) {
|
297
|
+
return false;
|
298
|
+
}
|
299
|
+
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
300
|
+
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
301
|
+
dense_[sparse_to_dense_[i]].index_ == i;
|
302
|
+
}
|
303
|
+
|
304
|
+
// Set the value at index i to v.
|
305
|
+
template<typename Value>
|
306
|
+
typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
|
307
|
+
DebugCheckInvariants();
|
308
|
+
if (static_cast<uint>(i) >= max_size_) {
|
309
|
+
// Semantically, end() would be better here, but we already know
|
310
|
+
// the user did something stupid, so begin() insulates them from
|
311
|
+
// dereferencing an invalid pointer.
|
312
|
+
return begin();
|
313
|
+
}
|
314
|
+
if (!has_index(i))
|
315
|
+
create_index(i);
|
316
|
+
return set_existing(i, v);
|
317
|
+
}
|
318
|
+
|
319
|
+
template<typename Value>
|
320
|
+
pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
|
321
|
+
const value_type& new_value) {
|
322
|
+
DebugCheckInvariants();
|
323
|
+
pair<typename SparseArray<Value>::iterator, bool> p;
|
324
|
+
if (has_index(new_value.index_)) {
|
325
|
+
p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
|
326
|
+
} else {
|
327
|
+
p = make_pair(set_new(new_value.index_, new_value.second), true);
|
328
|
+
}
|
329
|
+
DebugCheckInvariants();
|
330
|
+
return p;
|
331
|
+
}
|
332
|
+
|
333
|
+
template<typename Value>
|
334
|
+
Value SparseArray<Value>::get(int i, Value defaultv) const {
|
335
|
+
if (!has_index(i))
|
336
|
+
return defaultv;
|
337
|
+
return get_existing(i);
|
338
|
+
}
|
339
|
+
|
340
|
+
template<typename Value>
|
341
|
+
typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
|
342
|
+
if (has_index(i))
|
343
|
+
return dense_.begin() + sparse_to_dense_[i];
|
344
|
+
return end();
|
345
|
+
}
|
346
|
+
|
347
|
+
template<typename Value>
|
348
|
+
typename SparseArray<Value>::const_iterator
|
349
|
+
SparseArray<Value>::find(int i) const {
|
350
|
+
if (has_index(i)) {
|
351
|
+
return dense_.begin() + sparse_to_dense_[i];
|
352
|
+
}
|
353
|
+
return end();
|
354
|
+
}
|
355
|
+
|
356
|
+
template<typename Value>
|
357
|
+
typename SparseArray<Value>::iterator
|
358
|
+
SparseArray<Value>::set_existing(int i, Value v) {
|
359
|
+
DebugCheckInvariants();
|
360
|
+
DCHECK(has_index(i));
|
361
|
+
dense_[sparse_to_dense_[i]].second = v;
|
362
|
+
DebugCheckInvariants();
|
363
|
+
return dense_.begin() + sparse_to_dense_[i];
|
364
|
+
}
|
365
|
+
|
366
|
+
template<typename Value>
|
367
|
+
typename SparseArray<Value>::iterator
|
368
|
+
SparseArray<Value>::set_new(int i, Value v) {
|
369
|
+
DebugCheckInvariants();
|
370
|
+
if (static_cast<uint>(i) >= max_size_) {
|
371
|
+
// Semantically, end() would be better here, but we already know
|
372
|
+
// the user did something stupid, so begin() insulates them from
|
373
|
+
// dereferencing an invalid pointer.
|
374
|
+
return begin();
|
375
|
+
}
|
376
|
+
DCHECK(!has_index(i));
|
377
|
+
create_index(i);
|
378
|
+
return set_existing(i, v);
|
379
|
+
}
|
380
|
+
|
381
|
+
template<typename Value>
|
382
|
+
Value SparseArray<Value>::get_existing(int i) const {
|
383
|
+
DCHECK(has_index(i));
|
384
|
+
return dense_[sparse_to_dense_[i]].second;
|
385
|
+
}
|
386
|
+
|
387
|
+
template<typename Value>
|
388
|
+
void SparseArray<Value>::erase(int i) {
|
389
|
+
DebugCheckInvariants();
|
390
|
+
if (has_index(i))
|
391
|
+
erase_existing(i);
|
392
|
+
DebugCheckInvariants();
|
393
|
+
}
|
394
|
+
|
395
|
+
template<typename Value>
|
396
|
+
void SparseArray<Value>::erase_existing(int i) {
|
397
|
+
DebugCheckInvariants();
|
398
|
+
DCHECK(has_index(i));
|
399
|
+
int di = sparse_to_dense_[i];
|
400
|
+
if (di < size_ - 1) {
|
401
|
+
dense_[di] = dense_[size_ - 1];
|
402
|
+
sparse_to_dense_[dense_[di].index_] = di;
|
403
|
+
}
|
404
|
+
size_--;
|
405
|
+
DebugCheckInvariants();
|
406
|
+
}
|
407
|
+
|
408
|
+
template<typename Value>
|
409
|
+
void SparseArray<Value>::create_index(int i) {
|
410
|
+
DCHECK(!has_index(i));
|
411
|
+
DCHECK_LT(size_, max_size_);
|
412
|
+
sparse_to_dense_[i] = size_;
|
413
|
+
dense_[size_].index_ = i;
|
414
|
+
size_++;
|
415
|
+
}
|
416
|
+
|
417
|
+
// Creates an empty array that accepts indices in [0, max_size).
template<typename Value>
SparseArray<Value>::SparseArray(int max_size)
    : size_(0), max_size_(max_size), sparse_to_dense_(NULL), dense_() {
  // Size the vector before allocating the raw array.  The destructor does
  // not run when a constructor throws, so a new[] result obtained before a
  // throwing dense_.resize() would be leaked; in this order dense_ cleans
  // up after itself if the new[] below throws.
  dense_.resize(max_size);
  sparse_to_dense_ = new int[max_size];
  // Don't need to zero the new memory, but appease Valgrind.
  if (RunningOnValgrind()) {
    for (int i = 0; i < max_size; i++) {
      sparse_to_dense_[i] = 0xababababU;
      dense_[i].index_ = 0xababababU;
    }
  }
  DebugCheckInvariants();
}
|
431
|
+
|
432
|
+
// Releases the index array allocated with new[]; dense_ is a vector
// member and is destroyed automatically.
template<typename Value>
SparseArray<Value>::~SparseArray() {
  DebugCheckInvariants();
  delete[] sparse_to_dense_;
}
|
436
|
+
|
437
|
+
// Checks, via DCHECK, that 0 <= size_ <= max_size_ and that index storage
// has been allocated whenever the array holds at least one entry.
template<typename Value>
void SparseArray<Value>::DebugCheckInvariants() const {
  DCHECK_LE(0, size_);
  DCHECK_LE(size_, max_size_);
  DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
}
|
442
|
+
|
443
|
+
// Comparison function for sorting.
|
444
|
+
template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
|
445
|
+
const IndexValue& b) {
|
446
|
+
return a.index_ < b.index_;
|
447
|
+
}
|
448
|
+
|
449
|
+
} // namespace re2
|
450
|
+
|
451
|
+
#endif // RE2_UTIL_SPARSE_ARRAY_H__
|
@@ -0,0 +1,177 @@
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
2
|
+
// Use of this source code is governed by a BSD-style
|
3
|
+
// license that can be found in the LICENSE file.
|
4
|
+
|
5
|
+
// DESCRIPTION
|
6
|
+
//
|
7
|
+
// SparseSet(m) is a set of integers in [0, m).
|
8
|
+
// It requires sizeof(int)*m memory, but it provides
|
9
|
+
// fast iteration through the elements in the set and fast clearing
|
10
|
+
// of the set.
|
11
|
+
//
|
12
|
+
// Insertion and deletion are constant time operations.
|
13
|
+
//
|
14
|
+
// Allocating the set is a constant time operation
|
15
|
+
// when memory allocation is a constant time operation.
|
16
|
+
//
|
17
|
+
// Clearing the set is a constant time operation (unusual!).
|
18
|
+
//
|
19
|
+
// Iterating through the set is an O(n) operation, where n
|
20
|
+
// is the number of items in the set (not O(m)).
|
21
|
+
//
|
22
|
+
// The set iterator visits entries in the order they were first
|
23
|
+
// inserted into the set.  It is safe to add items to the set while
|
24
|
+
// using an iterator: the iterator will visit indices added to the set
|
25
|
+
// during the iteration, but will not re-visit indices whose values
|
26
|
+
// change after visiting. Thus SparseSet can be a convenient
|
27
|
+
// implementation of a work queue.
|
28
|
+
//
|
29
|
+
// The SparseSet implementation is NOT thread-safe. It is up to the
|
30
|
+
// caller to make sure only one thread is accessing the set. (Typically
|
31
|
+
// these sets are temporary values and used in situations where speed is
|
32
|
+
// important.)
|
33
|
+
//
|
34
|
+
// The SparseSet interface does not present all the usual STL bells and
|
35
|
+
// whistles.
|
36
|
+
//
|
37
|
+
// Implemented with reference to Briggs & Torczon, An Efficient
|
38
|
+
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
39
|
+
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
40
|
+
//
|
41
|
+
// For a generalization to sparse array, see sparse_array.h.
|
42
|
+
|
43
|
+
// IMPLEMENTATION
|
44
|
+
//
|
45
|
+
// See sparse_array.h for implementation details
|
46
|
+
|
47
|
+
#ifndef RE2_UTIL_SPARSE_SET_H__
|
48
|
+
#define RE2_UTIL_SPARSE_SET_H__
|
49
|
+
|
50
|
+
#include "util/util.h"
|
51
|
+
|
52
|
+
namespace re2 {
|
53
|
+
|
54
|
+
class SparseSet {
|
55
|
+
public:
|
56
|
+
SparseSet()
|
57
|
+
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {}
|
58
|
+
|
59
|
+
SparseSet(int max_size) {
|
60
|
+
max_size_ = max_size;
|
61
|
+
sparse_to_dense_ = new int[max_size];
|
62
|
+
dense_ = new int[max_size];
|
63
|
+
// Don't need to zero the memory, but do so anyway
|
64
|
+
// to appease Valgrind.
|
65
|
+
if (RunningOnValgrind()) {
|
66
|
+
for (int i = 0; i < max_size; i++) {
|
67
|
+
dense_[i] = 0xababababU;
|
68
|
+
sparse_to_dense_[i] = 0xababababU;
|
69
|
+
}
|
70
|
+
}
|
71
|
+
size_ = 0;
|
72
|
+
}
|
73
|
+
|
74
|
+
~SparseSet() {
|
75
|
+
delete[] sparse_to_dense_;
|
76
|
+
delete[] dense_;
|
77
|
+
}
|
78
|
+
|
79
|
+
typedef int* iterator;
|
80
|
+
typedef const int* const_iterator;
|
81
|
+
|
82
|
+
int size() const { return size_; }
|
83
|
+
iterator begin() { return dense_; }
|
84
|
+
iterator end() { return dense_ + size_; }
|
85
|
+
const_iterator begin() const { return dense_; }
|
86
|
+
const_iterator end() const { return dense_ + size_; }
|
87
|
+
|
88
|
+
// Change the maximum size of the array.
|
89
|
+
// Invalidates all iterators.
|
90
|
+
void resize(int new_max_size) {
|
91
|
+
if (size_ > new_max_size)
|
92
|
+
size_ = new_max_size;
|
93
|
+
if (new_max_size > max_size_) {
|
94
|
+
int* a = new int[new_max_size];
|
95
|
+
if (sparse_to_dense_) {
|
96
|
+
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
97
|
+
if (RunningOnValgrind()) {
|
98
|
+
for (int i = max_size_; i < new_max_size; i++)
|
99
|
+
a[i] = 0xababababU;
|
100
|
+
}
|
101
|
+
delete[] sparse_to_dense_;
|
102
|
+
}
|
103
|
+
sparse_to_dense_ = a;
|
104
|
+
|
105
|
+
a = new int[new_max_size];
|
106
|
+
if (dense_) {
|
107
|
+
memmove(a, dense_, size_*sizeof a[0]);
|
108
|
+
if (RunningOnValgrind()) {
|
109
|
+
for (int i = size_; i < new_max_size; i++)
|
110
|
+
a[i] = 0xababababU;
|
111
|
+
}
|
112
|
+
delete[] dense_;
|
113
|
+
}
|
114
|
+
dense_ = a;
|
115
|
+
}
|
116
|
+
max_size_ = new_max_size;
|
117
|
+
}
|
118
|
+
|
119
|
+
// Return the maximum size of the array.
|
120
|
+
// Indices can be in the range [0, max_size).
|
121
|
+
int max_size() const { return max_size_; }
|
122
|
+
|
123
|
+
// Clear the array.
|
124
|
+
void clear() { size_ = 0; }
|
125
|
+
|
126
|
+
// Check whether i is in the array.
|
127
|
+
bool contains(int i) const {
|
128
|
+
DCHECK_GE(i, 0);
|
129
|
+
DCHECK_LT(i, max_size_);
|
130
|
+
if (static_cast<uint>(i) >= max_size_) {
|
131
|
+
return false;
|
132
|
+
}
|
133
|
+
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
134
|
+
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
135
|
+
dense_[sparse_to_dense_[i]] == i;
|
136
|
+
}
|
137
|
+
|
138
|
+
// Adds i to the set.
|
139
|
+
void insert(int i) {
|
140
|
+
if (!contains(i))
|
141
|
+
insert_new(i);
|
142
|
+
}
|
143
|
+
|
144
|
+
// Set the value at the new index i to v.
|
145
|
+
// Fast but unsafe: only use if contains(i) is false.
|
146
|
+
void insert_new(int i) {
|
147
|
+
if (static_cast<uint>(i) >= max_size_) {
|
148
|
+
// Semantically, end() would be better here, but we already know
|
149
|
+
// the user did something stupid, so begin() insulates them from
|
150
|
+
// dereferencing an invalid pointer.
|
151
|
+
return;
|
152
|
+
}
|
153
|
+
DCHECK(!contains(i));
|
154
|
+
DCHECK_LT(size_, max_size_);
|
155
|
+
sparse_to_dense_[i] = size_;
|
156
|
+
dense_[size_] = i;
|
157
|
+
size_++;
|
158
|
+
}
|
159
|
+
|
160
|
+
// Comparison function for sorting.
|
161
|
+
// Can sort the sparse array so that future iterations
|
162
|
+
// will visit indices in increasing order using
|
163
|
+
// sort(arr.begin(), arr.end(), arr.less);
|
164
|
+
static bool less(int a, int b) { return a < b; }
|
165
|
+
|
166
|
+
private:
|
167
|
+
int size_;
|
168
|
+
int max_size_;
|
169
|
+
int* sparse_to_dense_;
|
170
|
+
int* dense_;
|
171
|
+
|
172
|
+
DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
|
173
|
+
};
|
174
|
+
|
175
|
+
} // namespace re2
|
176
|
+
|
177
|
+
#endif // RE2_UTIL_SPARSE_SET_H__
|