chipper 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +51 -0
- data/ext/extconf.rb +58 -0
- data/ext/libstemmer_c/Makefile +10 -0
- data/ext/libstemmer_c/examples/stemwords.c +209 -0
- data/ext/libstemmer_c/include/libstemmer.h +79 -0
- data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
- data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
- data/ext/libstemmer_c/libstemmer/modules.h +190 -0
- data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
- data/ext/libstemmer_c/mkinc.mak +82 -0
- data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
- data/ext/libstemmer_c/runtime/api.c +66 -0
- data/ext/libstemmer_c/runtime/api.h +26 -0
- data/ext/libstemmer_c/runtime/header.h +58 -0
- data/ext/libstemmer_c/runtime/utilities.c +478 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
- data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
- data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
- data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
- data/ext/re2/bitstate.cc +378 -0
- data/ext/re2/compile.cc +1138 -0
- data/ext/re2/dfa.cc +2086 -0
- data/ext/re2/filtered_re2.cc +100 -0
- data/ext/re2/filtered_re2.h +99 -0
- data/ext/re2/hash.cc +231 -0
- data/ext/re2/mimics_pcre.cc +185 -0
- data/ext/re2/nfa.cc +709 -0
- data/ext/re2/onepass.cc +614 -0
- data/ext/re2/parse.cc +2202 -0
- data/ext/re2/perl_groups.cc +119 -0
- data/ext/re2/prefilter.cc +671 -0
- data/ext/re2/prefilter.h +105 -0
- data/ext/re2/prefilter_tree.cc +398 -0
- data/ext/re2/prefilter_tree.h +130 -0
- data/ext/re2/prog.cc +341 -0
- data/ext/re2/prog.h +376 -0
- data/ext/re2/re2.cc +1180 -0
- data/ext/re2/re2.h +837 -0
- data/ext/re2/regexp.cc +920 -0
- data/ext/re2/regexp.h +632 -0
- data/ext/re2/rune.cc +258 -0
- data/ext/re2/set.cc +113 -0
- data/ext/re2/set.h +55 -0
- data/ext/re2/simplify.cc +393 -0
- data/ext/re2/stringpiece.cc +87 -0
- data/ext/re2/stringpiece.h +182 -0
- data/ext/re2/tostring.cc +341 -0
- data/ext/re2/unicode_casefold.cc +469 -0
- data/ext/re2/unicode_casefold.h +75 -0
- data/ext/re2/unicode_groups.cc +4851 -0
- data/ext/re2/unicode_groups.h +64 -0
- data/ext/re2/valgrind.cc +24 -0
- data/ext/re2/variadic_function.h +346 -0
- data/ext/re2/walker-inl.h +244 -0
- data/ext/src/chipper.cc +626 -0
- data/ext/src/version.h +1 -0
- data/ext/stemmer.rb +40 -0
- data/ext/util/arena.h +103 -0
- data/ext/util/atomicops.h +79 -0
- data/ext/util/benchmark.h +41 -0
- data/ext/util/flags.h +27 -0
- data/ext/util/logging.h +78 -0
- data/ext/util/mutex.h +190 -0
- data/ext/util/pcre.h +679 -0
- data/ext/util/random.h +29 -0
- data/ext/util/sparse_array.h +451 -0
- data/ext/util/sparse_set.h +177 -0
- data/ext/util/test.h +57 -0
- data/ext/util/thread.h +26 -0
- data/ext/util/utf.h +43 -0
- data/ext/util/util.h +127 -0
- data/ext/util/valgrind.h +4517 -0
- data/test/helper.rb +5 -0
- data/test/test_entities.rb +57 -0
- data/test/test_tokens.rb +118 -0
- metadata +199 -0
data/ext/util/random.h
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// Modified from Google perftools's tcmalloc_unittest.cc.
|
|
6
|
+
|
|
7
|
+
#ifndef RE2_UTIL_RANDOM_H__
|
|
8
|
+
#define RE2_UTIL_RANDOM_H__
|
|
9
|
+
|
|
10
|
+
#include "util/util.h"
|
|
11
|
+
|
|
12
|
+
namespace re2 {
|
|
13
|
+
|
|
14
|
+
// ACM minimal standard random number generator. (re-entrant.)
|
|
15
|
+
class ACMRandom {
|
|
16
|
+
public:
|
|
17
|
+
ACMRandom(int32 seed) : seed_(seed) {}
|
|
18
|
+
int32 Next();
|
|
19
|
+
int32 Uniform(int32);
|
|
20
|
+
|
|
21
|
+
void Reset(int32 seed) { seed_ = seed; }
|
|
22
|
+
|
|
23
|
+
private:
|
|
24
|
+
int32 seed_;
|
|
25
|
+
};
|
|
26
|
+
|
|
27
|
+
} // namespace re2
|
|
28
|
+
|
|
29
|
+
#endif // RE2_UTIL_RANDOM_H__
|
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// DESCRIPTION
|
|
6
|
+
//
|
|
7
|
+
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
|
8
|
+
// It requires about (sizeof(T)+2*sizeof(int))*m memory, but it provides
|
|
9
|
+
// fast iteration through the elements in the array and fast clearing
|
|
10
|
+
// of the array. The array has a concept of certain elements being
|
|
11
|
+
// uninitialized (having no value).
|
|
12
|
+
//
|
|
13
|
+
// Insertion and deletion are constant time operations.
|
|
14
|
+
//
|
|
15
|
+
// Allocating the array is a constant time operation
|
|
16
|
+
// when memory allocation is a constant time operation.
|
|
17
|
+
//
|
|
18
|
+
// Clearing the array is a constant time operation (unusual!).
|
|
19
|
+
//
|
|
20
|
+
// Iterating through the array is an O(n) operation, where n
|
|
21
|
+
// is the number of items in the array (not O(m)).
|
|
22
|
+
//
|
|
23
|
+
// The array iterator visits entries in the order they were first
|
|
24
|
+
// inserted into the array. It is safe to add items to the array while
|
|
25
|
+
// using an iterator: the iterator will visit indices added to the array
|
|
26
|
+
// during the iteration, but will not re-visit indices whose values
|
|
27
|
+
// change after visiting. Thus SparseArray can be a convenient
|
|
28
|
+
// implementation of a work queue.
|
|
29
|
+
//
|
|
30
|
+
// The SparseArray implementation is NOT thread-safe. It is up to the
|
|
31
|
+
// caller to make sure only one thread is accessing the array. (Typically
|
|
32
|
+
// these arrays are temporary values and used in situations where speed is
|
|
33
|
+
// important.)
|
|
34
|
+
//
|
|
35
|
+
// The SparseArray interface does not present all the usual STL bells and
|
|
36
|
+
// whistles.
|
|
37
|
+
//
|
|
38
|
+
// Implemented with reference to Briggs & Torczon, An Efficient
|
|
39
|
+
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
|
40
|
+
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
|
41
|
+
//
|
|
42
|
+
// Briggs & Torczon popularized this technique, but it had been known
|
|
43
|
+
// long before their paper. They point out that Aho, Hopcroft, and
|
|
44
|
+
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
|
|
45
|
+
// 1986 Programming Pearls both hint at the technique in exercises to the
|
|
46
|
+
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
|
|
47
|
+
// exercise 8).
|
|
48
|
+
//
|
|
49
|
+
// Briggs & Torczon describe a sparse set implementation. I have
|
|
50
|
+
// trivially generalized it to create a sparse array (actually the original
|
|
51
|
+
// target of the AHU and Bentley exercises).
|
|
52
|
+
|
|
53
|
+
// IMPLEMENTATION
|
|
54
|
+
//
|
|
55
|
+
// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
|
|
56
|
+
// size max_size_. At any point, the number of elements in the sparse array is
|
|
57
|
+
// size_.
|
|
58
|
+
//
|
|
59
|
+
// The vector dense_ contains the size_ elements in the sparse array (with
|
|
60
|
+
// their indices),
|
|
61
|
+
// in the order that the elements were first inserted. This array is dense:
|
|
62
|
+
// the size_ pairs are dense_[0] through dense_[size_-1].
|
|
63
|
+
//
|
|
64
|
+
// The array sparse_to_dense_ maps from indices in [0,m) to indices in
|
|
65
|
+
// [0,size_).
|
|
66
|
+
// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
|
|
67
|
+
// For indices not present in the array, sparse_to_dense_ can contain
|
|
68
|
+
// any value at all, perhaps outside the range [0, size_) but perhaps not.
|
|
69
|
+
//
|
|
70
|
+
// The lax requirement on sparse_to_dense_ values makes clearing
|
|
71
|
+
// the array very easy: set size_ to 0. Lookups are slightly more
|
|
72
|
+
// complicated. An index i has a value in the array if and only if:
|
|
73
|
+
// sparse_to_dense_[i] is in [0, size_) AND
|
|
74
|
+
// dense_[sparse_to_dense_[i]].index_ == i.
|
|
75
|
+
// If both these properties hold, only then it is safe to refer to
|
|
76
|
+
// dense_[sparse_to_dense_[i]].value_
|
|
77
|
+
// as the value associated with index i.
|
|
78
|
+
//
|
|
79
|
+
// To insert a new entry, set sparse_to_dense_[i] to size_,
|
|
80
|
+
// initialize dense_[size_], and then increment size_.
|
|
81
|
+
//
|
|
82
|
+
// Deletion of specific values from the array is implemented by
|
|
83
|
+
// swapping dense_[size_-1] and the dense_ being deleted and then
|
|
84
|
+
// updating the appropriate sparse_to_dense_ entries.
|
|
85
|
+
//
|
|
86
|
+
// To make the sparse array as efficient as possible for non-primitive types,
|
|
87
|
+
// elements may or may not be destroyed when they are deleted from the sparse
|
|
88
|
+
// array through a call to erase(), erase_existing() or resize(). They
|
|
89
|
+
// immediately become inaccessible, but they are only guaranteed to be
|
|
90
|
+
// destroyed when the SparseArray destructor is called.
|
|
91
|
+
|
|
92
|
+
#ifndef RE2_UTIL_SPARSE_ARRAY_H__
|
|
93
|
+
#define RE2_UTIL_SPARSE_ARRAY_H__
|
|
94
|
+
|
|
95
|
+
#include "util/util.h"
|
|
96
|
+
|
|
97
|
+
namespace re2 {
|
|
98
|
+
|
|
99
|
+
template<typename Value>
|
|
100
|
+
class SparseArray {
|
|
101
|
+
public:
|
|
102
|
+
SparseArray();
|
|
103
|
+
SparseArray(int max_size);
|
|
104
|
+
~SparseArray();
|
|
105
|
+
|
|
106
|
+
// IndexValue pairs: exposed in SparseArray::iterator.
|
|
107
|
+
class IndexValue;
|
|
108
|
+
|
|
109
|
+
typedef IndexValue value_type;
|
|
110
|
+
typedef typename vector<IndexValue>::iterator iterator;
|
|
111
|
+
typedef typename vector<IndexValue>::const_iterator const_iterator;
|
|
112
|
+
|
|
113
|
+
inline const IndexValue& iv(int i) const;
|
|
114
|
+
|
|
115
|
+
// Return the number of entries in the array.
|
|
116
|
+
int size() const {
|
|
117
|
+
return size_;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Iterate over the array.
|
|
121
|
+
iterator begin() {
|
|
122
|
+
return dense_.begin();
|
|
123
|
+
}
|
|
124
|
+
iterator end() {
|
|
125
|
+
return dense_.begin() + size_;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
const_iterator begin() const {
|
|
129
|
+
return dense_.begin();
|
|
130
|
+
}
|
|
131
|
+
const_iterator end() const {
|
|
132
|
+
return dense_.begin() + size_;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// Change the maximum size of the array.
|
|
136
|
+
// Invalidates all iterators.
|
|
137
|
+
void resize(int max_size);
|
|
138
|
+
|
|
139
|
+
// Return the maximum size of the array.
|
|
140
|
+
// Indices can be in the range [0, max_size).
|
|
141
|
+
int max_size() const {
|
|
142
|
+
return max_size_;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
// Clear the array.
|
|
146
|
+
void clear() {
|
|
147
|
+
size_ = 0;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// Check whether index i is in the array.
|
|
151
|
+
inline bool has_index(int i) const;
|
|
152
|
+
|
|
153
|
+
// Comparison function for sorting.
|
|
154
|
+
// Can sort the sparse array so that future iterations
|
|
155
|
+
// will visit indices in increasing order using
|
|
156
|
+
// sort(arr.begin(), arr.end(), arr.less);
|
|
157
|
+
static bool less(const IndexValue& a, const IndexValue& b);
|
|
158
|
+
|
|
159
|
+
public:
|
|
160
|
+
// Set the value at index i to v.
|
|
161
|
+
inline iterator set(int i, Value v);
|
|
162
|
+
|
|
163
|
+
pair<iterator, bool> insert(const value_type& new_value);
|
|
164
|
+
|
|
165
|
+
// Returns the value at index i
|
|
166
|
+
// or defaultv if index i is not initialized in the array.
|
|
167
|
+
inline Value get(int i, Value defaultv) const;
|
|
168
|
+
|
|
169
|
+
iterator find(int i);
|
|
170
|
+
|
|
171
|
+
const_iterator find(int i) const;
|
|
172
|
+
|
|
173
|
+
// Change the value at index i to v.
|
|
174
|
+
// Fast but unsafe: only use if has_index(i) is true.
|
|
175
|
+
inline iterator set_existing(int i, Value v);
|
|
176
|
+
|
|
177
|
+
// Set the value at the new index i to v.
|
|
178
|
+
// Fast but unsafe: only use if has_index(i) is false.
|
|
179
|
+
inline iterator set_new(int i, Value v);
|
|
180
|
+
|
|
181
|
+
// Get the value at index i from the array..
|
|
182
|
+
// Fast but unsafe: only use if has_index(i) is true.
|
|
183
|
+
inline Value get_existing(int i) const;
|
|
184
|
+
|
|
185
|
+
// Erasing items from the array during iteration is in general
|
|
186
|
+
// NOT safe. There is one special case, which is that the current
|
|
187
|
+
// index-value pair can be erased as long as the iterator is then
|
|
188
|
+
// checked for being at the end before being incremented.
|
|
189
|
+
// For example:
|
|
190
|
+
//
|
|
191
|
+
// for (i = m.begin(); i != m.end(); ++i) {
|
|
192
|
+
// if (ShouldErase(i->index(), i->value())) {
|
|
193
|
+
// m.erase(i->index());
|
|
194
|
+
// --i;
|
|
195
|
+
// }
|
|
196
|
+
// }
|
|
197
|
+
//
|
|
198
|
+
// Except in the specific case just described, elements must
|
|
199
|
+
// not be erased from the array (including clearing the array)
|
|
200
|
+
// while iterators are walking over the array. Otherwise,
|
|
201
|
+
// the iterators could walk past the end of the array.
|
|
202
|
+
|
|
203
|
+
// Erases the element at index i from the array.
|
|
204
|
+
inline void erase(int i);
|
|
205
|
+
|
|
206
|
+
// Erases the element at index i from the array.
|
|
207
|
+
// Fast but unsafe: only use if has_index(i) is true.
|
|
208
|
+
inline void erase_existing(int i);
|
|
209
|
+
|
|
210
|
+
private:
|
|
211
|
+
// Add the index i to the array.
|
|
212
|
+
// Only use if has_index(i) is known to be false.
|
|
213
|
+
// Since it doesn't set the value associated with i,
|
|
214
|
+
// this function is private, only intended as a helper
|
|
215
|
+
// for other methods.
|
|
216
|
+
inline void create_index(int i);
|
|
217
|
+
|
|
218
|
+
// In debug mode, verify that some invariant properties of the class
|
|
219
|
+
// are being maintained. This is called at the end of the constructor
|
|
220
|
+
// and at the beginning and end of all public non-const member functions.
|
|
221
|
+
inline void DebugCheckInvariants() const;
|
|
222
|
+
|
|
223
|
+
int size_;
|
|
224
|
+
int max_size_;
|
|
225
|
+
int* sparse_to_dense_;
|
|
226
|
+
vector<IndexValue> dense_;
|
|
227
|
+
|
|
228
|
+
DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
|
|
229
|
+
};
|
|
230
|
+
|
|
231
|
+
template<typename Value>
|
|
232
|
+
SparseArray<Value>::SparseArray()
|
|
233
|
+
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {}
|
|
234
|
+
|
|
235
|
+
// IndexValue pairs: exposed in SparseArray::iterator.
|
|
236
|
+
template<typename Value>
|
|
237
|
+
class SparseArray<Value>::IndexValue {
|
|
238
|
+
friend class SparseArray;
|
|
239
|
+
public:
|
|
240
|
+
typedef int first_type;
|
|
241
|
+
typedef Value second_type;
|
|
242
|
+
|
|
243
|
+
IndexValue() {}
|
|
244
|
+
IndexValue(int index, const Value& value) : second(value), index_(index) {}
|
|
245
|
+
|
|
246
|
+
int index() const { return index_; }
|
|
247
|
+
Value value() const { return second; }
|
|
248
|
+
|
|
249
|
+
// Provide the data in the 'second' member so that the utilities
|
|
250
|
+
// in map-util work.
|
|
251
|
+
Value second;
|
|
252
|
+
|
|
253
|
+
private:
|
|
254
|
+
int index_;
|
|
255
|
+
};
|
|
256
|
+
|
|
257
|
+
template<typename Value>
|
|
258
|
+
const typename SparseArray<Value>::IndexValue&
|
|
259
|
+
SparseArray<Value>::iv(int i) const {
|
|
260
|
+
DCHECK_GE(i, 0);
|
|
261
|
+
DCHECK_LT(i, size_);
|
|
262
|
+
return dense_[i];
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
// Change the maximum size of the array.
|
|
266
|
+
// Invalidates all iterators.
|
|
267
|
+
template<typename Value>
|
|
268
|
+
void SparseArray<Value>::resize(int new_max_size) {
|
|
269
|
+
DebugCheckInvariants();
|
|
270
|
+
if (new_max_size > max_size_) {
|
|
271
|
+
int* a = new int[new_max_size];
|
|
272
|
+
if (sparse_to_dense_) {
|
|
273
|
+
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
|
274
|
+
// Don't need to zero the memory but appease Valgrind.
|
|
275
|
+
if (RunningOnValgrind()) {
|
|
276
|
+
for (int i = max_size_; i < new_max_size; i++)
|
|
277
|
+
a[i] = 0xababababU;
|
|
278
|
+
}
|
|
279
|
+
delete[] sparse_to_dense_;
|
|
280
|
+
}
|
|
281
|
+
sparse_to_dense_ = a;
|
|
282
|
+
|
|
283
|
+
dense_.resize(new_max_size);
|
|
284
|
+
}
|
|
285
|
+
max_size_ = new_max_size;
|
|
286
|
+
if (size_ > max_size_)
|
|
287
|
+
size_ = max_size_;
|
|
288
|
+
DebugCheckInvariants();
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
// Check whether index i is in the array.
|
|
292
|
+
template<typename Value>
|
|
293
|
+
bool SparseArray<Value>::has_index(int i) const {
|
|
294
|
+
DCHECK_GE(i, 0);
|
|
295
|
+
DCHECK_LT(i, max_size_);
|
|
296
|
+
if (static_cast<uint>(i) >= max_size_) {
|
|
297
|
+
return false;
|
|
298
|
+
}
|
|
299
|
+
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
|
300
|
+
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
|
301
|
+
dense_[sparse_to_dense_[i]].index_ == i;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
// Set the value at index i to v.
|
|
305
|
+
template<typename Value>
|
|
306
|
+
typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
|
|
307
|
+
DebugCheckInvariants();
|
|
308
|
+
if (static_cast<uint>(i) >= max_size_) {
|
|
309
|
+
// Semantically, end() would be better here, but we already know
|
|
310
|
+
// the user did something stupid, so begin() insulates them from
|
|
311
|
+
// dereferencing an invalid pointer.
|
|
312
|
+
return begin();
|
|
313
|
+
}
|
|
314
|
+
if (!has_index(i))
|
|
315
|
+
create_index(i);
|
|
316
|
+
return set_existing(i, v);
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
template<typename Value>
|
|
320
|
+
pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
|
|
321
|
+
const value_type& new_value) {
|
|
322
|
+
DebugCheckInvariants();
|
|
323
|
+
pair<typename SparseArray<Value>::iterator, bool> p;
|
|
324
|
+
if (has_index(new_value.index_)) {
|
|
325
|
+
p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
|
|
326
|
+
} else {
|
|
327
|
+
p = make_pair(set_new(new_value.index_, new_value.second), true);
|
|
328
|
+
}
|
|
329
|
+
DebugCheckInvariants();
|
|
330
|
+
return p;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
template<typename Value>
|
|
334
|
+
Value SparseArray<Value>::get(int i, Value defaultv) const {
|
|
335
|
+
if (!has_index(i))
|
|
336
|
+
return defaultv;
|
|
337
|
+
return get_existing(i);
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
template<typename Value>
|
|
341
|
+
typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
|
|
342
|
+
if (has_index(i))
|
|
343
|
+
return dense_.begin() + sparse_to_dense_[i];
|
|
344
|
+
return end();
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
template<typename Value>
|
|
348
|
+
typename SparseArray<Value>::const_iterator
|
|
349
|
+
SparseArray<Value>::find(int i) const {
|
|
350
|
+
if (has_index(i)) {
|
|
351
|
+
return dense_.begin() + sparse_to_dense_[i];
|
|
352
|
+
}
|
|
353
|
+
return end();
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
template<typename Value>
|
|
357
|
+
typename SparseArray<Value>::iterator
|
|
358
|
+
SparseArray<Value>::set_existing(int i, Value v) {
|
|
359
|
+
DebugCheckInvariants();
|
|
360
|
+
DCHECK(has_index(i));
|
|
361
|
+
dense_[sparse_to_dense_[i]].second = v;
|
|
362
|
+
DebugCheckInvariants();
|
|
363
|
+
return dense_.begin() + sparse_to_dense_[i];
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
template<typename Value>
|
|
367
|
+
typename SparseArray<Value>::iterator
|
|
368
|
+
SparseArray<Value>::set_new(int i, Value v) {
|
|
369
|
+
DebugCheckInvariants();
|
|
370
|
+
if (static_cast<uint>(i) >= max_size_) {
|
|
371
|
+
// Semantically, end() would be better here, but we already know
|
|
372
|
+
// the user did something stupid, so begin() insulates them from
|
|
373
|
+
// dereferencing an invalid pointer.
|
|
374
|
+
return begin();
|
|
375
|
+
}
|
|
376
|
+
DCHECK(!has_index(i));
|
|
377
|
+
create_index(i);
|
|
378
|
+
return set_existing(i, v);
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
template<typename Value>
|
|
382
|
+
Value SparseArray<Value>::get_existing(int i) const {
|
|
383
|
+
DCHECK(has_index(i));
|
|
384
|
+
return dense_[sparse_to_dense_[i]].second;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
template<typename Value>
|
|
388
|
+
void SparseArray<Value>::erase(int i) {
|
|
389
|
+
DebugCheckInvariants();
|
|
390
|
+
if (has_index(i))
|
|
391
|
+
erase_existing(i);
|
|
392
|
+
DebugCheckInvariants();
|
|
393
|
+
}
|
|
394
|
+
|
|
395
|
+
template<typename Value>
|
|
396
|
+
void SparseArray<Value>::erase_existing(int i) {
|
|
397
|
+
DebugCheckInvariants();
|
|
398
|
+
DCHECK(has_index(i));
|
|
399
|
+
int di = sparse_to_dense_[i];
|
|
400
|
+
if (di < size_ - 1) {
|
|
401
|
+
dense_[di] = dense_[size_ - 1];
|
|
402
|
+
sparse_to_dense_[dense_[di].index_] = di;
|
|
403
|
+
}
|
|
404
|
+
size_--;
|
|
405
|
+
DebugCheckInvariants();
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
template<typename Value>
|
|
409
|
+
void SparseArray<Value>::create_index(int i) {
|
|
410
|
+
DCHECK(!has_index(i));
|
|
411
|
+
DCHECK_LT(size_, max_size_);
|
|
412
|
+
sparse_to_dense_[i] = size_;
|
|
413
|
+
dense_[size_].index_ = i;
|
|
414
|
+
size_++;
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
// Builds an empty array that accepts indices in [0, max_size).
template<typename Value> SparseArray<Value>::SparseArray(int max_size)
    : size_(0), max_size_(max_size), sparse_to_dense_(NULL), dense_() {
  // Size the vector before allocating the raw array.  If the raw
  // allocation throws, dense_ is still released by its own destructor;
  // in the opposite order a throwing dense_.resize() would leak the raw
  // array, because ~SparseArray does not run for an object whose
  // constructor did not finish.
  dense_.resize(max_size);
  sparse_to_dense_ = new int[max_size];
  // Don't need to zero the new memory, but appease Valgrind.
  if (RunningOnValgrind()) {
    for (int i = 0; i < max_size; i++) {
      sparse_to_dense_[i] = 0xababababU;
      dense_[i].index_ = 0xababababU;
    }
  }
  DebugCheckInvariants();
}
|
|
431
|
+
|
|
432
|
+
// Checks the invariants one last time, then frees the lookup table.
// dense_ is released by its own destructor.
template<typename Value>
SparseArray<Value>::~SparseArray() {
  DebugCheckInvariants();
  delete[] sparse_to_dense_;
}
|
|
436
|
+
|
|
437
|
+
// Asserts the class invariants via DCHECK: 0 <= size_ <= max_size_, and
// a non-empty array must have a lookup table.
template<typename Value>
void SparseArray<Value>::DebugCheckInvariants() const {
  DCHECK_LE(0, size_);
  DCHECK_LE(size_, max_size_);
  DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
}
|
|
442
|
+
|
|
443
|
+
// Comparison function for sorting.
|
|
444
|
+
template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
|
|
445
|
+
const IndexValue& b) {
|
|
446
|
+
return a.index_ < b.index_;
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
} // namespace re2
|
|
450
|
+
|
|
451
|
+
#endif // RE2_UTIL_SPARSE_ARRAY_H__
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
2
|
+
// Use of this source code is governed by a BSD-style
|
|
3
|
+
// license that can be found in the LICENSE file.
|
|
4
|
+
|
|
5
|
+
// DESCRIPTION
|
|
6
|
+
//
|
|
7
|
+
// SparseSet(m) is a set of integers in [0, m).
|
|
8
|
+
// It requires 2*sizeof(int)*m memory, but it provides
|
|
9
|
+
// fast iteration through the elements in the set and fast clearing
|
|
10
|
+
// of the set.
|
|
11
|
+
//
|
|
12
|
+
// Insertion and deletion are constant time operations.
|
|
13
|
+
//
|
|
14
|
+
// Allocating the set is a constant time operation
|
|
15
|
+
// when memory allocation is a constant time operation.
|
|
16
|
+
//
|
|
17
|
+
// Clearing the set is a constant time operation (unusual!).
|
|
18
|
+
//
|
|
19
|
+
// Iterating through the set is an O(n) operation, where n
|
|
20
|
+
// is the number of items in the set (not O(m)).
|
|
21
|
+
//
|
|
22
|
+
// The set iterator visits entries in the order they were first
|
|
23
|
+
// inserted into the set. It is safe to add items to the set while
|
|
24
|
+
// using an iterator: the iterator will visit indices added to the set
|
|
25
|
+
// during the iteration, but will not re-visit indices whose values
|
|
26
|
+
// change after visiting. Thus SparseSet can be a convenient
|
|
27
|
+
// implementation of a work queue.
|
|
28
|
+
//
|
|
29
|
+
// The SparseSet implementation is NOT thread-safe. It is up to the
|
|
30
|
+
// caller to make sure only one thread is accessing the set. (Typically
|
|
31
|
+
// these sets are temporary values and used in situations where speed is
|
|
32
|
+
// important.)
|
|
33
|
+
//
|
|
34
|
+
// The SparseSet interface does not present all the usual STL bells and
|
|
35
|
+
// whistles.
|
|
36
|
+
//
|
|
37
|
+
// Implemented with reference to Briggs & Torczon, An Efficient
|
|
38
|
+
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
|
39
|
+
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
|
40
|
+
//
|
|
41
|
+
// For a generalization to sparse array, see sparse_array.h.
|
|
42
|
+
|
|
43
|
+
// IMPLEMENTATION
|
|
44
|
+
//
|
|
45
|
+
// See sparse_array.h for implementation details
|
|
46
|
+
|
|
47
|
+
#ifndef RE2_UTIL_SPARSE_SET_H__
|
|
48
|
+
#define RE2_UTIL_SPARSE_SET_H__
|
|
49
|
+
|
|
50
|
+
#include "util/util.h"
|
|
51
|
+
|
|
52
|
+
namespace re2 {
|
|
53
|
+
|
|
54
|
+
class SparseSet {
|
|
55
|
+
public:
|
|
56
|
+
SparseSet()
|
|
57
|
+
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {}
|
|
58
|
+
|
|
59
|
+
SparseSet(int max_size) {
|
|
60
|
+
max_size_ = max_size;
|
|
61
|
+
sparse_to_dense_ = new int[max_size];
|
|
62
|
+
dense_ = new int[max_size];
|
|
63
|
+
// Don't need to zero the memory, but do so anyway
|
|
64
|
+
// to appease Valgrind.
|
|
65
|
+
if (RunningOnValgrind()) {
|
|
66
|
+
for (int i = 0; i < max_size; i++) {
|
|
67
|
+
dense_[i] = 0xababababU;
|
|
68
|
+
sparse_to_dense_[i] = 0xababababU;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
size_ = 0;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
~SparseSet() {
|
|
75
|
+
delete[] sparse_to_dense_;
|
|
76
|
+
delete[] dense_;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
typedef int* iterator;
|
|
80
|
+
typedef const int* const_iterator;
|
|
81
|
+
|
|
82
|
+
int size() const { return size_; }
|
|
83
|
+
iterator begin() { return dense_; }
|
|
84
|
+
iterator end() { return dense_ + size_; }
|
|
85
|
+
const_iterator begin() const { return dense_; }
|
|
86
|
+
const_iterator end() const { return dense_ + size_; }
|
|
87
|
+
|
|
88
|
+
// Change the maximum size of the array.
|
|
89
|
+
// Invalidates all iterators.
|
|
90
|
+
void resize(int new_max_size) {
|
|
91
|
+
if (size_ > new_max_size)
|
|
92
|
+
size_ = new_max_size;
|
|
93
|
+
if (new_max_size > max_size_) {
|
|
94
|
+
int* a = new int[new_max_size];
|
|
95
|
+
if (sparse_to_dense_) {
|
|
96
|
+
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
|
97
|
+
if (RunningOnValgrind()) {
|
|
98
|
+
for (int i = max_size_; i < new_max_size; i++)
|
|
99
|
+
a[i] = 0xababababU;
|
|
100
|
+
}
|
|
101
|
+
delete[] sparse_to_dense_;
|
|
102
|
+
}
|
|
103
|
+
sparse_to_dense_ = a;
|
|
104
|
+
|
|
105
|
+
a = new int[new_max_size];
|
|
106
|
+
if (dense_) {
|
|
107
|
+
memmove(a, dense_, size_*sizeof a[0]);
|
|
108
|
+
if (RunningOnValgrind()) {
|
|
109
|
+
for (int i = size_; i < new_max_size; i++)
|
|
110
|
+
a[i] = 0xababababU;
|
|
111
|
+
}
|
|
112
|
+
delete[] dense_;
|
|
113
|
+
}
|
|
114
|
+
dense_ = a;
|
|
115
|
+
}
|
|
116
|
+
max_size_ = new_max_size;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Return the maximum size of the array.
|
|
120
|
+
// Indices can be in the range [0, max_size).
|
|
121
|
+
int max_size() const { return max_size_; }
|
|
122
|
+
|
|
123
|
+
// Clear the array.
|
|
124
|
+
void clear() { size_ = 0; }
|
|
125
|
+
|
|
126
|
+
// Check whether i is in the array.
|
|
127
|
+
bool contains(int i) const {
|
|
128
|
+
DCHECK_GE(i, 0);
|
|
129
|
+
DCHECK_LT(i, max_size_);
|
|
130
|
+
if (static_cast<uint>(i) >= max_size_) {
|
|
131
|
+
return false;
|
|
132
|
+
}
|
|
133
|
+
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
|
134
|
+
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
|
135
|
+
dense_[sparse_to_dense_[i]] == i;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
// Adds i to the set.
|
|
139
|
+
void insert(int i) {
|
|
140
|
+
if (!contains(i))
|
|
141
|
+
insert_new(i);
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Set the value at the new index i to v.
|
|
145
|
+
// Fast but unsafe: only use if contains(i) is false.
|
|
146
|
+
void insert_new(int i) {
|
|
147
|
+
if (static_cast<uint>(i) >= max_size_) {
|
|
148
|
+
// Semantically, end() would be better here, but we already know
|
|
149
|
+
// the user did something stupid, so begin() insulates them from
|
|
150
|
+
// dereferencing an invalid pointer.
|
|
151
|
+
return;
|
|
152
|
+
}
|
|
153
|
+
DCHECK(!contains(i));
|
|
154
|
+
DCHECK_LT(size_, max_size_);
|
|
155
|
+
sparse_to_dense_[i] = size_;
|
|
156
|
+
dense_[size_] = i;
|
|
157
|
+
size_++;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
// Comparison function for sorting.
|
|
161
|
+
// Can sort the sparse array so that future iterations
|
|
162
|
+
// will visit indices in increasing order using
|
|
163
|
+
// sort(arr.begin(), arr.end(), arr.less);
|
|
164
|
+
static bool less(int a, int b) { return a < b; }
|
|
165
|
+
|
|
166
|
+
private:
|
|
167
|
+
int size_;
|
|
168
|
+
int max_size_;
|
|
169
|
+
int* sparse_to_dense_;
|
|
170
|
+
int* dense_;
|
|
171
|
+
|
|
172
|
+
DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
|
|
173
|
+
};
|
|
174
|
+
|
|
175
|
+
} // namespace re2
|
|
176
|
+
|
|
177
|
+
#endif // RE2_UTIL_SPARSE_SET_H__
|