chipper 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/util/random.h ADDED
@@ -0,0 +1,29 @@
1
+ // Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Modified from Google perftools's tcmalloc_unittest.cc.
6
+
7
+ #ifndef RE2_UTIL_RANDOM_H__
8
+ #define RE2_UTIL_RANDOM_H__
9
+
10
+ #include "util/util.h"
11
+
12
+ namespace re2 {
13
+
14
+ // ACM minimal standard random number generator. (re-entrant.)
15
+ class ACMRandom {
16
+ public:
17
+ ACMRandom(int32 seed) : seed_(seed) {}
18
+ int32 Next();
19
+ int32 Uniform(int32);
20
+
21
+ void Reset(int32 seed) { seed_ = seed; }
22
+
23
+ private:
24
+ int32 seed_;
25
+ };
26
+
27
+ } // namespace re2
28
+
29
+ #endif // RE2_UTIL_RANDOM_H__
@@ -0,0 +1,451 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // DESCRIPTION
6
+ //
7
+ // SparseArray<T>(m) is a map from integers in [0, m) to T values.
8
+ // It requires (sizeof(T)+sizeof(int))*m memory, but it provides
9
+ // fast iteration through the elements in the array and fast clearing
10
+ // of the array. The array has a concept of certain elements being
11
+ // uninitialized (having no value).
12
+ //
13
+ // Insertion and deletion are constant time operations.
14
+ //
15
+ // Allocating the array is a constant time operation
16
+ // when memory allocation is a constant time operation.
17
+ //
18
+ // Clearing the array is a constant time operation (unusual!).
19
+ //
20
+ // Iterating through the array is an O(n) operation, where n
21
+ // is the number of items in the array (not O(m)).
22
+ //
23
+ // The array iterator visits entries in the order they were first
24
+ // inserted into the array. It is safe to add items to the array while
25
+ // using an iterator: the iterator will visit indices added to the array
26
+ // during the iteration, but will not re-visit indices whose values
27
+ // change after visiting. Thus SparseArray can be a convenient
28
+ // implementation of a work queue.
29
+ //
30
+ // The SparseArray implementation is NOT thread-safe. It is up to the
31
+ // caller to make sure only one thread is accessing the array. (Typically
32
+ // these arrays are temporary values and used in situations where speed is
33
+ // important.)
34
+ //
35
+ // The SparseArray interface does not present all the usual STL bells and
36
+ // whistles.
37
+ //
38
+ // Implemented with reference to Briggs & Torczon, An Efficient
39
+ // Representation for Sparse Sets, ACM Letters on Programming Languages
40
+ // and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
41
+ //
42
+ // Briggs & Torczon popularized this technique, but it had been known
43
+ // long before their paper. They point out that Aho, Hopcroft, and
44
+ // Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
45
+ // 1986 Programming Pearls both hint at the technique in exercises to the
46
+ // reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
47
+ // exercise 8).
48
+ //
49
+ // Briggs & Torczon describe a sparse set implementation. I have
50
+ // trivially generalized it to create a sparse array (actually the original
51
+ // target of the AHU and Bentley exercises).
52
+
53
+ // IMPLEMENTATION
54
+ //
55
+ // SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
56
+ // size max_size_. At any point, the number of elements in the sparse array is
57
+ // size_.
58
+ //
59
+ // The vector dense_ contains the size_ elements in the sparse array (with
60
+ // their indices),
61
+ // in the order that the elements were first inserted. This array is dense:
62
+ // the size_ pairs are dense_[0] through dense_[size_-1].
63
+ //
64
+ // The array sparse_to_dense_ maps from indices in [0,m) to indices in
65
+ // [0,size_).
66
+ // For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
67
+ // For indices not present in the array, sparse_to_dense_ can contain
68
+ // any value at all, perhaps outside the range [0, size_) but perhaps not.
69
+ //
70
+ // The lax requirement on sparse_to_dense_ values makes clearing
71
+ // the array very easy: set size_ to 0. Lookups are slightly more
72
+ // complicated. An index i has a value in the array if and only if:
73
+ // sparse_to_dense_[i] is in [0, size_) AND
74
+ // dense_[sparse_to_dense_[i]].index_ == i.
75
+ // If both these properties hold, only then it is safe to refer to
76
+ // dense_[sparse_to_dense_[i]].value_
77
+ // as the value associated with index i.
78
+ //
79
+ // To insert a new entry, set sparse_to_dense_[i] to size_,
80
+ // initialize dense_[size_], and then increment size_.
81
+ //
82
+ // Deletion of specific values from the array is implemented by
83
+ // swapping dense_[size_-1] and the dense_ being deleted and then
84
+ // updating the appropriate sparse_to_dense_ entries.
85
+ //
86
+ // To make the sparse array as efficient as possible for non-primitive types,
87
+ // elements may or may not be destroyed when they are deleted from the sparse
88
+ // array through a call to erase(), erase_existing() or resize(). They
89
+ // immediately become inaccessible, but they are only guaranteed to be
90
+ // destroyed when the SparseArray destructor is called.
91
+
92
+ #ifndef RE2_UTIL_SPARSE_ARRAY_H__
93
+ #define RE2_UTIL_SPARSE_ARRAY_H__
94
+
95
+ #include "util/util.h"
96
+
97
+ namespace re2 {
98
+
99
+ template<typename Value>
100
+ class SparseArray {
101
+ public:
102
+ SparseArray();
103
+ SparseArray(int max_size);
104
+ ~SparseArray();
105
+
106
+ // IndexValue pairs: exposed in SparseArray::iterator.
107
+ class IndexValue;
108
+
109
+ typedef IndexValue value_type;
110
+ typedef typename vector<IndexValue>::iterator iterator;
111
+ typedef typename vector<IndexValue>::const_iterator const_iterator;
112
+
113
+ inline const IndexValue& iv(int i) const;
114
+
115
+ // Return the number of entries in the array.
116
+ int size() const {
117
+ return size_;
118
+ }
119
+
120
+ // Iterate over the array.
121
+ iterator begin() {
122
+ return dense_.begin();
123
+ }
124
+ iterator end() {
125
+ return dense_.begin() + size_;
126
+ }
127
+
128
+ const_iterator begin() const {
129
+ return dense_.begin();
130
+ }
131
+ const_iterator end() const {
132
+ return dense_.begin() + size_;
133
+ }
134
+
135
+ // Change the maximum size of the array.
136
+ // Invalidates all iterators.
137
+ void resize(int max_size);
138
+
139
+ // Return the maximum size of the array.
140
+ // Indices can be in the range [0, max_size).
141
+ int max_size() const {
142
+ return max_size_;
143
+ }
144
+
145
+ // Clear the array.
146
+ void clear() {
147
+ size_ = 0;
148
+ }
149
+
150
+ // Check whether index i is in the array.
151
+ inline bool has_index(int i) const;
152
+
153
+ // Comparison function for sorting.
154
+ // Can sort the sparse array so that future iterations
155
+ // will visit indices in increasing order using
156
+ // sort(arr.begin(), arr.end(), arr.less);
157
+ static bool less(const IndexValue& a, const IndexValue& b);
158
+
159
+ public:
160
+ // Set the value at index i to v.
161
+ inline iterator set(int i, Value v);
162
+
163
+ pair<iterator, bool> insert(const value_type& new_value);
164
+
165
+ // Returns the value at index i
166
+ // or defaultv if index i is not initialized in the array.
167
+ inline Value get(int i, Value defaultv) const;
168
+
169
+ iterator find(int i);
170
+
171
+ const_iterator find(int i) const;
172
+
173
+ // Change the value at index i to v.
174
+ // Fast but unsafe: only use if has_index(i) is true.
175
+ inline iterator set_existing(int i, Value v);
176
+
177
+ // Set the value at the new index i to v.
178
+ // Fast but unsafe: only use if has_index(i) is false.
179
+ inline iterator set_new(int i, Value v);
180
+
181
+ // Get the value at index i from the array..
182
+ // Fast but unsafe: only use if has_index(i) is true.
183
+ inline Value get_existing(int i) const;
184
+
185
+ // Erasing items from the array during iteration is in general
186
+ // NOT safe. There is one special case, which is that the current
187
+ // index-value pair can be erased as long as the iterator is then
188
+ // checked for being at the end before being incremented.
189
+ // For example:
190
+ //
191
+ // for (i = m.begin(); i != m.end(); ++i) {
192
+ // if (ShouldErase(i->index(), i->value())) {
193
+ // m.erase(i->index());
194
+ // --i;
195
+ // }
196
+ // }
197
+ //
198
+ // Except in the specific case just described, elements must
199
+ // not be erased from the array (including clearing the array)
200
+ // while iterators are walking over the array. Otherwise,
201
+ // the iterators could walk past the end of the array.
202
+
203
+ // Erases the element at index i from the array.
204
+ inline void erase(int i);
205
+
206
+ // Erases the element at index i from the array.
207
+ // Fast but unsafe: only use if has_index(i) is true.
208
+ inline void erase_existing(int i);
209
+
210
+ private:
211
+ // Add the index i to the array.
212
+ // Only use if has_index(i) is known to be false.
213
+ // Since it doesn't set the value associated with i,
214
+ // this function is private, only intended as a helper
215
+ // for other methods.
216
+ inline void create_index(int i);
217
+
218
+ // In debug mode, verify that some invariant properties of the class
219
+ // are being maintained. This is called at the end of the constructor
220
+ // and at the beginning and end of all public non-const member functions.
221
+ inline void DebugCheckInvariants() const;
222
+
223
+ int size_;
224
+ int max_size_;
225
+ int* sparse_to_dense_;
226
+ vector<IndexValue> dense_;
227
+
228
+ DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
229
+ };
230
+
231
+ template<typename Value>
232
+ SparseArray<Value>::SparseArray()
233
+ : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {}
234
+
235
+ // IndexValue pairs: exposed in SparseArray::iterator.
236
+ template<typename Value>
237
+ class SparseArray<Value>::IndexValue {
238
+ friend class SparseArray;
239
+ public:
240
+ typedef int first_type;
241
+ typedef Value second_type;
242
+
243
+ IndexValue() {}
244
+ IndexValue(int index, const Value& value) : second(value), index_(index) {}
245
+
246
+ int index() const { return index_; }
247
+ Value value() const { return second; }
248
+
249
+ // Provide the data in the 'second' member so that the utilities
250
+ // in map-util work.
251
+ Value second;
252
+
253
+ private:
254
+ int index_;
255
+ };
256
+
257
+ template<typename Value>
258
+ const typename SparseArray<Value>::IndexValue&
259
+ SparseArray<Value>::iv(int i) const {
260
+ DCHECK_GE(i, 0);
261
+ DCHECK_LT(i, size_);
262
+ return dense_[i];
263
+ }
264
+
265
+ // Change the maximum size of the array.
266
+ // Invalidates all iterators.
267
+ template<typename Value>
268
+ void SparseArray<Value>::resize(int new_max_size) {
269
+ DebugCheckInvariants();
270
+ if (new_max_size > max_size_) {
271
+ int* a = new int[new_max_size];
272
+ if (sparse_to_dense_) {
273
+ memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
274
+ // Don't need to zero the memory but appease Valgrind.
275
+ if (RunningOnValgrind()) {
276
+ for (int i = max_size_; i < new_max_size; i++)
277
+ a[i] = 0xababababU;
278
+ }
279
+ delete[] sparse_to_dense_;
280
+ }
281
+ sparse_to_dense_ = a;
282
+
283
+ dense_.resize(new_max_size);
284
+ }
285
+ max_size_ = new_max_size;
286
+ if (size_ > max_size_)
287
+ size_ = max_size_;
288
+ DebugCheckInvariants();
289
+ }
290
+
291
+ // Check whether index i is in the array.
292
+ template<typename Value>
293
+ bool SparseArray<Value>::has_index(int i) const {
294
+ DCHECK_GE(i, 0);
295
+ DCHECK_LT(i, max_size_);
296
+ if (static_cast<uint>(i) >= max_size_) {
297
+ return false;
298
+ }
299
+ // Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
300
+ return (uint)sparse_to_dense_[i] < (uint)size_ &&
301
+ dense_[sparse_to_dense_[i]].index_ == i;
302
+ }
303
+
304
+ // Set the value at index i to v.
305
+ template<typename Value>
306
+ typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
307
+ DebugCheckInvariants();
308
+ if (static_cast<uint>(i) >= max_size_) {
309
+ // Semantically, end() would be better here, but we already know
310
+ // the user did something stupid, so begin() insulates them from
311
+ // dereferencing an invalid pointer.
312
+ return begin();
313
+ }
314
+ if (!has_index(i))
315
+ create_index(i);
316
+ return set_existing(i, v);
317
+ }
318
+
319
+ template<typename Value>
320
+ pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
321
+ const value_type& new_value) {
322
+ DebugCheckInvariants();
323
+ pair<typename SparseArray<Value>::iterator, bool> p;
324
+ if (has_index(new_value.index_)) {
325
+ p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
326
+ } else {
327
+ p = make_pair(set_new(new_value.index_, new_value.second), true);
328
+ }
329
+ DebugCheckInvariants();
330
+ return p;
331
+ }
332
+
333
+ template<typename Value>
334
+ Value SparseArray<Value>::get(int i, Value defaultv) const {
335
+ if (!has_index(i))
336
+ return defaultv;
337
+ return get_existing(i);
338
+ }
339
+
340
+ template<typename Value>
341
+ typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
342
+ if (has_index(i))
343
+ return dense_.begin() + sparse_to_dense_[i];
344
+ return end();
345
+ }
346
+
347
+ template<typename Value>
348
+ typename SparseArray<Value>::const_iterator
349
+ SparseArray<Value>::find(int i) const {
350
+ if (has_index(i)) {
351
+ return dense_.begin() + sparse_to_dense_[i];
352
+ }
353
+ return end();
354
+ }
355
+
356
+ template<typename Value>
357
+ typename SparseArray<Value>::iterator
358
+ SparseArray<Value>::set_existing(int i, Value v) {
359
+ DebugCheckInvariants();
360
+ DCHECK(has_index(i));
361
+ dense_[sparse_to_dense_[i]].second = v;
362
+ DebugCheckInvariants();
363
+ return dense_.begin() + sparse_to_dense_[i];
364
+ }
365
+
366
+ template<typename Value>
367
+ typename SparseArray<Value>::iterator
368
+ SparseArray<Value>::set_new(int i, Value v) {
369
+ DebugCheckInvariants();
370
+ if (static_cast<uint>(i) >= max_size_) {
371
+ // Semantically, end() would be better here, but we already know
372
+ // the user did something stupid, so begin() insulates them from
373
+ // dereferencing an invalid pointer.
374
+ return begin();
375
+ }
376
+ DCHECK(!has_index(i));
377
+ create_index(i);
378
+ return set_existing(i, v);
379
+ }
380
+
381
+ template<typename Value>
382
+ Value SparseArray<Value>::get_existing(int i) const {
383
+ DCHECK(has_index(i));
384
+ return dense_[sparse_to_dense_[i]].second;
385
+ }
386
+
387
+ template<typename Value>
388
+ void SparseArray<Value>::erase(int i) {
389
+ DebugCheckInvariants();
390
+ if (has_index(i))
391
+ erase_existing(i);
392
+ DebugCheckInvariants();
393
+ }
394
+
395
+ template<typename Value>
396
+ void SparseArray<Value>::erase_existing(int i) {
397
+ DebugCheckInvariants();
398
+ DCHECK(has_index(i));
399
+ int di = sparse_to_dense_[i];
400
+ if (di < size_ - 1) {
401
+ dense_[di] = dense_[size_ - 1];
402
+ sparse_to_dense_[dense_[di].index_] = di;
403
+ }
404
+ size_--;
405
+ DebugCheckInvariants();
406
+ }
407
+
408
+ template<typename Value>
409
+ void SparseArray<Value>::create_index(int i) {
410
+ DCHECK(!has_index(i));
411
+ DCHECK_LT(size_, max_size_);
412
+ sparse_to_dense_[i] = size_;
413
+ dense_[size_].index_ = i;
414
+ size_++;
415
+ }
416
+
417
+ template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
418
+ max_size_ = max_size;
419
+ sparse_to_dense_ = new int[max_size];
420
+ dense_.resize(max_size);
421
+ // Don't need to zero the new memory, but appease Valgrind.
422
+ if (RunningOnValgrind()) {
423
+ for (int i = 0; i < max_size; i++) {
424
+ sparse_to_dense_[i] = 0xababababU;
425
+ dense_[i].index_ = 0xababababU;
426
+ }
427
+ }
428
+ size_ = 0;
429
+ DebugCheckInvariants();
430
+ }
431
+
432
+ template<typename Value> SparseArray<Value>::~SparseArray() {
433
+ DebugCheckInvariants();
434
+ delete[] sparse_to_dense_;
435
+ }
436
+
437
+ template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
438
+ DCHECK_LE(0, size_);
439
+ DCHECK_LE(size_, max_size_);
440
+ DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
441
+ }
442
+
443
+ // Comparison function for sorting.
444
+ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
445
+ const IndexValue& b) {
446
+ return a.index_ < b.index_;
447
+ }
448
+
449
+ } // namespace re2
450
+
451
+ #endif // RE2_UTIL_SPARSE_ARRAY_H__
@@ -0,0 +1,177 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // DESCRIPTION
6
+ //
7
+ // SparseSet(m) is a set of integers in [0, m).
8
+ // It requires sizeof(int)*m memory, but it provides
9
+ // fast iteration through the elements in the set and fast clearing
10
+ // of the set.
11
+ //
12
+ // Insertion and deletion are constant time operations.
13
+ //
14
+ // Allocating the set is a constant time operation
15
+ // when memory allocation is a constant time operation.
16
+ //
17
+ // Clearing the set is a constant time operation (unusual!).
18
+ //
19
+ // Iterating through the set is an O(n) operation, where n
20
+ // is the number of items in the set (not O(m)).
21
+ //
22
+ // The set iterator visits entries in the order they were first
23
+ // inserted into the set. It is safe to add items to the set while
24
+ // using an iterator: the iterator will visit indices added to the set
25
+ // during the iteration, but will not re-visit indices whose values
26
+ // change after visiting. Thus SparseSet can be a convenient
27
+ // implementation of a work queue.
28
+ //
29
+ // The SparseSet implementation is NOT thread-safe. It is up to the
30
+ // caller to make sure only one thread is accessing the set. (Typically
31
+ // these sets are temporary values and used in situations where speed is
32
+ // important.)
33
+ //
34
+ // The SparseSet interface does not present all the usual STL bells and
35
+ // whistles.
36
+ //
37
+ // Implemented with reference to Briggs & Torczon, An Efficient
38
+ // Representation for Sparse Sets, ACM Letters on Programming Languages
39
+ // and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
40
+ //
41
+ // For a generalization to sparse array, see sparse_array.h.
42
+
43
+ // IMPLEMENTATION
44
+ //
45
+ // See sparse_array.h for implementation details
46
+
47
+ #ifndef RE2_UTIL_SPARSE_SET_H__
48
+ #define RE2_UTIL_SPARSE_SET_H__
49
+
50
+ #include "util/util.h"
51
+
52
+ namespace re2 {
53
+
54
+ class SparseSet {
55
+ public:
56
+ SparseSet()
57
+ : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {}
58
+
59
+ SparseSet(int max_size) {
60
+ max_size_ = max_size;
61
+ sparse_to_dense_ = new int[max_size];
62
+ dense_ = new int[max_size];
63
+ // Don't need to zero the memory, but do so anyway
64
+ // to appease Valgrind.
65
+ if (RunningOnValgrind()) {
66
+ for (int i = 0; i < max_size; i++) {
67
+ dense_[i] = 0xababababU;
68
+ sparse_to_dense_[i] = 0xababababU;
69
+ }
70
+ }
71
+ size_ = 0;
72
+ }
73
+
74
+ ~SparseSet() {
75
+ delete[] sparse_to_dense_;
76
+ delete[] dense_;
77
+ }
78
+
79
+ typedef int* iterator;
80
+ typedef const int* const_iterator;
81
+
82
+ int size() const { return size_; }
83
+ iterator begin() { return dense_; }
84
+ iterator end() { return dense_ + size_; }
85
+ const_iterator begin() const { return dense_; }
86
+ const_iterator end() const { return dense_ + size_; }
87
+
88
+ // Change the maximum size of the array.
89
+ // Invalidates all iterators.
90
+ void resize(int new_max_size) {
91
+ if (size_ > new_max_size)
92
+ size_ = new_max_size;
93
+ if (new_max_size > max_size_) {
94
+ int* a = new int[new_max_size];
95
+ if (sparse_to_dense_) {
96
+ memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
97
+ if (RunningOnValgrind()) {
98
+ for (int i = max_size_; i < new_max_size; i++)
99
+ a[i] = 0xababababU;
100
+ }
101
+ delete[] sparse_to_dense_;
102
+ }
103
+ sparse_to_dense_ = a;
104
+
105
+ a = new int[new_max_size];
106
+ if (dense_) {
107
+ memmove(a, dense_, size_*sizeof a[0]);
108
+ if (RunningOnValgrind()) {
109
+ for (int i = size_; i < new_max_size; i++)
110
+ a[i] = 0xababababU;
111
+ }
112
+ delete[] dense_;
113
+ }
114
+ dense_ = a;
115
+ }
116
+ max_size_ = new_max_size;
117
+ }
118
+
119
+ // Return the maximum size of the array.
120
+ // Indices can be in the range [0, max_size).
121
+ int max_size() const { return max_size_; }
122
+
123
+ // Clear the array.
124
+ void clear() { size_ = 0; }
125
+
126
+ // Check whether i is in the array.
127
+ bool contains(int i) const {
128
+ DCHECK_GE(i, 0);
129
+ DCHECK_LT(i, max_size_);
130
+ if (static_cast<uint>(i) >= max_size_) {
131
+ return false;
132
+ }
133
+ // Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
134
+ return (uint)sparse_to_dense_[i] < (uint)size_ &&
135
+ dense_[sparse_to_dense_[i]] == i;
136
+ }
137
+
138
+ // Adds i to the set.
139
+ void insert(int i) {
140
+ if (!contains(i))
141
+ insert_new(i);
142
+ }
143
+
144
+ // Set the value at the new index i to v.
145
+ // Fast but unsafe: only use if contains(i) is false.
146
+ void insert_new(int i) {
147
+ if (static_cast<uint>(i) >= max_size_) {
148
+ // Semantically, end() would be better here, but we already know
149
+ // the user did something stupid, so begin() insulates them from
150
+ // dereferencing an invalid pointer.
151
+ return;
152
+ }
153
+ DCHECK(!contains(i));
154
+ DCHECK_LT(size_, max_size_);
155
+ sparse_to_dense_[i] = size_;
156
+ dense_[size_] = i;
157
+ size_++;
158
+ }
159
+
160
+ // Comparison function for sorting.
161
+ // Can sort the sparse array so that future iterations
162
+ // will visit indices in increasing order using
163
+ // sort(arr.begin(), arr.end(), arr.less);
164
+ static bool less(int a, int b) { return a < b; }
165
+
166
+ private:
167
+ int size_;
168
+ int max_size_;
169
+ int* sparse_to_dense_;
170
+ int* dense_;
171
+
172
+ DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
173
+ };
174
+
175
+ } // namespace re2
176
+
177
+ #endif // RE2_UTIL_SPARSE_SET_H__