chipper 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. data/README.rdoc +51 -0
  2. data/ext/extconf.rb +58 -0
  3. data/ext/libstemmer_c/Makefile +10 -0
  4. data/ext/libstemmer_c/examples/stemwords.c +209 -0
  5. data/ext/libstemmer_c/include/libstemmer.h +79 -0
  6. data/ext/libstemmer_c/libstemmer/libstemmer.c +95 -0
  7. data/ext/libstemmer_c/libstemmer/libstemmer_utf8.c +95 -0
  8. data/ext/libstemmer_c/libstemmer/modules.h +190 -0
  9. data/ext/libstemmer_c/libstemmer/modules_utf8.h +121 -0
  10. data/ext/libstemmer_c/mkinc.mak +82 -0
  11. data/ext/libstemmer_c/mkinc_utf8.mak +52 -0
  12. data/ext/libstemmer_c/runtime/api.c +66 -0
  13. data/ext/libstemmer_c/runtime/api.h +26 -0
  14. data/ext/libstemmer_c/runtime/header.h +58 -0
  15. data/ext/libstemmer_c/runtime/utilities.c +478 -0
  16. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.c +337 -0
  17. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_danish.h +16 -0
  18. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.c +624 -0
  19. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_dutch.h +16 -0
  20. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.c +1117 -0
  21. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_english.h +16 -0
  22. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.c +762 -0
  23. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_finnish.h +16 -0
  24. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.c +1246 -0
  25. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_french.h +16 -0
  26. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.c +521 -0
  27. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_german.h +16 -0
  28. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.c +1230 -0
  29. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_hungarian.h +16 -0
  30. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.c +1065 -0
  31. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_italian.h +16 -0
  32. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.c +297 -0
  33. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_norwegian.h +16 -0
  34. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.c +749 -0
  35. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_porter.h +16 -0
  36. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.c +1017 -0
  37. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_portuguese.h +16 -0
  38. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.c +1093 -0
  39. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_spanish.h +16 -0
  40. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.c +307 -0
  41. data/ext/libstemmer_c/src_c/stem_ISO_8859_1_swedish.h +16 -0
  42. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.c +998 -0
  43. data/ext/libstemmer_c/src_c/stem_ISO_8859_2_romanian.h +16 -0
  44. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.c +700 -0
  45. data/ext/libstemmer_c/src_c/stem_KOI8_R_russian.h +16 -0
  46. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.c +339 -0
  47. data/ext/libstemmer_c/src_c/stem_UTF_8_danish.h +16 -0
  48. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.c +634 -0
  49. data/ext/libstemmer_c/src_c/stem_UTF_8_dutch.h +16 -0
  50. data/ext/libstemmer_c/src_c/stem_UTF_8_english.c +1125 -0
  51. data/ext/libstemmer_c/src_c/stem_UTF_8_english.h +16 -0
  52. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.c +768 -0
  53. data/ext/libstemmer_c/src_c/stem_UTF_8_finnish.h +16 -0
  54. data/ext/libstemmer_c/src_c/stem_UTF_8_french.c +1256 -0
  55. data/ext/libstemmer_c/src_c/stem_UTF_8_french.h +16 -0
  56. data/ext/libstemmer_c/src_c/stem_UTF_8_german.c +527 -0
  57. data/ext/libstemmer_c/src_c/stem_UTF_8_german.h +16 -0
  58. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.c +1234 -0
  59. data/ext/libstemmer_c/src_c/stem_UTF_8_hungarian.h +16 -0
  60. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.c +1073 -0
  61. data/ext/libstemmer_c/src_c/stem_UTF_8_italian.h +16 -0
  62. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.c +299 -0
  63. data/ext/libstemmer_c/src_c/stem_UTF_8_norwegian.h +16 -0
  64. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.c +755 -0
  65. data/ext/libstemmer_c/src_c/stem_UTF_8_porter.h +16 -0
  66. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.c +1023 -0
  67. data/ext/libstemmer_c/src_c/stem_UTF_8_portuguese.h +16 -0
  68. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.c +1004 -0
  69. data/ext/libstemmer_c/src_c/stem_UTF_8_romanian.h +16 -0
  70. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.c +694 -0
  71. data/ext/libstemmer_c/src_c/stem_UTF_8_russian.h +16 -0
  72. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.c +1097 -0
  73. data/ext/libstemmer_c/src_c/stem_UTF_8_spanish.h +16 -0
  74. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.c +309 -0
  75. data/ext/libstemmer_c/src_c/stem_UTF_8_swedish.h +16 -0
  76. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.c +2205 -0
  77. data/ext/libstemmer_c/src_c/stem_UTF_8_turkish.h +16 -0
  78. data/ext/re2/bitstate.cc +378 -0
  79. data/ext/re2/compile.cc +1138 -0
  80. data/ext/re2/dfa.cc +2086 -0
  81. data/ext/re2/filtered_re2.cc +100 -0
  82. data/ext/re2/filtered_re2.h +99 -0
  83. data/ext/re2/hash.cc +231 -0
  84. data/ext/re2/mimics_pcre.cc +185 -0
  85. data/ext/re2/nfa.cc +709 -0
  86. data/ext/re2/onepass.cc +614 -0
  87. data/ext/re2/parse.cc +2202 -0
  88. data/ext/re2/perl_groups.cc +119 -0
  89. data/ext/re2/prefilter.cc +671 -0
  90. data/ext/re2/prefilter.h +105 -0
  91. data/ext/re2/prefilter_tree.cc +398 -0
  92. data/ext/re2/prefilter_tree.h +130 -0
  93. data/ext/re2/prog.cc +341 -0
  94. data/ext/re2/prog.h +376 -0
  95. data/ext/re2/re2.cc +1180 -0
  96. data/ext/re2/re2.h +837 -0
  97. data/ext/re2/regexp.cc +920 -0
  98. data/ext/re2/regexp.h +632 -0
  99. data/ext/re2/rune.cc +258 -0
  100. data/ext/re2/set.cc +113 -0
  101. data/ext/re2/set.h +55 -0
  102. data/ext/re2/simplify.cc +393 -0
  103. data/ext/re2/stringpiece.cc +87 -0
  104. data/ext/re2/stringpiece.h +182 -0
  105. data/ext/re2/tostring.cc +341 -0
  106. data/ext/re2/unicode_casefold.cc +469 -0
  107. data/ext/re2/unicode_casefold.h +75 -0
  108. data/ext/re2/unicode_groups.cc +4851 -0
  109. data/ext/re2/unicode_groups.h +64 -0
  110. data/ext/re2/valgrind.cc +24 -0
  111. data/ext/re2/variadic_function.h +346 -0
  112. data/ext/re2/walker-inl.h +244 -0
  113. data/ext/src/chipper.cc +626 -0
  114. data/ext/src/version.h +1 -0
  115. data/ext/stemmer.rb +40 -0
  116. data/ext/util/arena.h +103 -0
  117. data/ext/util/atomicops.h +79 -0
  118. data/ext/util/benchmark.h +41 -0
  119. data/ext/util/flags.h +27 -0
  120. data/ext/util/logging.h +78 -0
  121. data/ext/util/mutex.h +190 -0
  122. data/ext/util/pcre.h +679 -0
  123. data/ext/util/random.h +29 -0
  124. data/ext/util/sparse_array.h +451 -0
  125. data/ext/util/sparse_set.h +177 -0
  126. data/ext/util/test.h +57 -0
  127. data/ext/util/thread.h +26 -0
  128. data/ext/util/utf.h +43 -0
  129. data/ext/util/util.h +127 -0
  130. data/ext/util/valgrind.h +4517 -0
  131. data/test/helper.rb +5 -0
  132. data/test/test_entities.rb +57 -0
  133. data/test/test_tokens.rb +118 -0
  134. metadata +199 -0
data/ext/util/random.h ADDED
@@ -0,0 +1,29 @@
1
+ // Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // Modified from Google perftools's tcmalloc_unittest.cc.
6
+
7
+ #ifndef RE2_UTIL_RANDOM_H__
8
+ #define RE2_UTIL_RANDOM_H__
9
+
10
+ #include "util/util.h"
11
+
12
+ namespace re2 {
13
+
14
+ // ACM minimal standard random number generator. (re-entrant.)
15
+ class ACMRandom {
16
+ public:
17
+ ACMRandom(int32 seed) : seed_(seed) {}
18
+ int32 Next();
19
+ int32 Uniform(int32);
20
+
21
+ void Reset(int32 seed) { seed_ = seed; }
22
+
23
+ private:
24
+ int32 seed_;
25
+ };
26
+
27
+ } // namespace re2
28
+
29
+ #endif // RE2_UTIL_RANDOM_H__
@@ -0,0 +1,451 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // DESCRIPTION
6
+ //
7
+ // SparseArray<T>(m) is a map from integers in [0, m) to T values.
8
+ // It requires (sizeof(T)+sizeof(int))*m memory, but it provides
9
+ // fast iteration through the elements in the array and fast clearing
10
+ // of the array. The array has a concept of certain elements being
11
+ // uninitialized (having no value).
12
+ //
13
+ // Insertion and deletion are constant time operations.
14
+ //
15
+ // Allocating the array is a constant time operation
16
+ // when memory allocation is a constant time operation.
17
+ //
18
+ // Clearing the array is a constant time operation (unusual!).
19
+ //
20
+ // Iterating through the array is an O(n) operation, where n
21
+ // is the number of items in the array (not O(m)).
22
+ //
23
+ // The array iterator visits entries in the order they were first
24
+ // inserted into the array. It is safe to add items to the array while
25
+ // using an iterator: the iterator will visit indices added to the array
26
+ // during the iteration, but will not re-visit indices whose values
27
+ // change after visiting. Thus SparseArray can be a convenient
28
+ // implementation of a work queue.
29
+ //
30
+ // The SparseArray implementation is NOT thread-safe. It is up to the
31
+ // caller to make sure only one thread is accessing the array. (Typically
32
+ // these arrays are temporary values and used in situations where speed is
33
+ // important.)
34
+ //
35
+ // The SparseArray interface does not present all the usual STL bells and
36
+ // whistles.
37
+ //
38
+ // Implemented with reference to Briggs & Torczon, An Efficient
39
+ // Representation for Sparse Sets, ACM Letters on Programming Languages
40
+ // and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
41
+ //
42
+ // Briggs & Torczon popularized this technique, but it had been known
43
+ // long before their paper. They point out that Aho, Hopcroft, and
44
+ // Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
45
+ // 1986 Programming Pearls both hint at the technique in exercises to the
46
+ // reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
47
+ // exercise 8).
48
+ //
49
+ // Briggs & Torczon describe a sparse set implementation. I have
50
+ // trivially generalized it to create a sparse array (actually the original
51
+ // target of the AHU and Bentley exercises).
52
+
53
+ // IMPLEMENTATION
54
+ //
55
+ // SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
56
+ // size max_size_. At any point, the number of elements in the sparse array is
57
+ // size_.
58
+ //
59
+ // The vector dense_ contains the size_ elements in the sparse array (with
60
+ // their indices),
61
+ // in the order that the elements were first inserted. This array is dense:
62
+ // the size_ pairs are dense_[0] through dense_[size_-1].
63
+ //
64
+ // The array sparse_to_dense_ maps from indices in [0,m) to indices in
65
+ // [0,size_).
66
+ // For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
67
+ // For indices not present in the array, sparse_to_dense_ can contain
68
+ // any value at all, perhaps outside the range [0, size_) but perhaps not.
69
+ //
70
+ // The lax requirement on sparse_to_dense_ values makes clearing
71
+ // the array very easy: set size_ to 0. Lookups are slightly more
72
+ // complicated. An index i has a value in the array if and only if:
73
+ // sparse_to_dense_[i] is in [0, size_) AND
74
+ // dense_[sparse_to_dense_[i]].index_ == i.
75
+ // If both these properties hold, only then it is safe to refer to
76
+ // dense_[sparse_to_dense_[i]].value_
77
+ // as the value associated with index i.
78
+ //
79
+ // To insert a new entry, set sparse_to_dense_[i] to size_,
80
+ // initialize dense_[size_], and then increment size_.
81
+ //
82
+ // Deletion of specific values from the array is implemented by
83
+ // copying dense_[size_-1] over the dense_ entry being deleted and then
84
+ // updating the appropriate sparse_to_dense_ entries.
85
+ //
86
+ // To make the sparse array as efficient as possible for non-primitive types,
87
+ // elements may or may not be destroyed when they are deleted from the sparse
88
+ // array through a call to erase(), erase_existing() or resize(). They
89
+ // immediately become inaccessible, but they are only guaranteed to be
90
+ // destroyed when the SparseArray destructor is called.
91
+
92
+ #ifndef RE2_UTIL_SPARSE_ARRAY_H__
93
+ #define RE2_UTIL_SPARSE_ARRAY_H__
94
+
95
+ #include "util/util.h"
96
+
97
+ namespace re2 {
98
+
99
+ template<typename Value>
100
+ class SparseArray {
101
+ public:
102
+ SparseArray();
103
+ SparseArray(int max_size);
104
+ ~SparseArray();
105
+
106
+ // IndexValue pairs: exposed in SparseArray::iterator.
107
+ class IndexValue;
108
+
109
+ typedef IndexValue value_type;
110
+ typedef typename vector<IndexValue>::iterator iterator;
111
+ typedef typename vector<IndexValue>::const_iterator const_iterator;
112
+
113
+ inline const IndexValue& iv(int i) const;
114
+
115
+ // Return the number of entries in the array.
116
+ int size() const {
117
+ return size_;
118
+ }
119
+
120
+ // Iterate over the array.
121
+ iterator begin() {
122
+ return dense_.begin();
123
+ }
124
+ iterator end() {
125
+ return dense_.begin() + size_;
126
+ }
127
+
128
+ const_iterator begin() const {
129
+ return dense_.begin();
130
+ }
131
+ const_iterator end() const {
132
+ return dense_.begin() + size_;
133
+ }
134
+
135
+ // Change the maximum size of the array.
136
+ // Invalidates all iterators.
137
+ void resize(int max_size);
138
+
139
+ // Return the maximum size of the array.
140
+ // Indices can be in the range [0, max_size).
141
+ int max_size() const {
142
+ return max_size_;
143
+ }
144
+
145
+ // Clear the array.
146
+ void clear() {
147
+ size_ = 0;
148
+ }
149
+
150
+ // Check whether index i is in the array.
151
+ inline bool has_index(int i) const;
152
+
153
+ // Comparison function for sorting.
154
+ // Can sort the sparse array so that future iterations
155
+ // will visit indices in increasing order using
156
+ // sort(arr.begin(), arr.end(), arr.less);
157
+ static bool less(const IndexValue& a, const IndexValue& b);
158
+
159
+ public:
160
+ // Set the value at index i to v.
161
+ inline iterator set(int i, Value v);
162
+
163
+ pair<iterator, bool> insert(const value_type& new_value);
164
+
165
+ // Returns the value at index i
166
+ // or defaultv if index i is not initialized in the array.
167
+ inline Value get(int i, Value defaultv) const;
168
+
169
+ iterator find(int i);
170
+
171
+ const_iterator find(int i) const;
172
+
173
+ // Change the value at index i to v.
174
+ // Fast but unsafe: only use if has_index(i) is true.
175
+ inline iterator set_existing(int i, Value v);
176
+
177
+ // Set the value at the new index i to v.
178
+ // Fast but unsafe: only use if has_index(i) is false.
179
+ inline iterator set_new(int i, Value v);
180
+
181
+ // Get the value at index i from the array..
182
+ // Fast but unsafe: only use if has_index(i) is true.
183
+ inline Value get_existing(int i) const;
184
+
185
+ // Erasing items from the array during iteration is in general
186
+ // NOT safe. There is one special case, which is that the current
187
+ // index-value pair can be erased as long as the iterator is then
188
+ // checked for being at the end before being incremented.
189
+ // For example:
190
+ //
191
+ // for (i = m.begin(); i != m.end(); ++i) {
192
+ // if (ShouldErase(i->index(), i->value())) {
193
+ // m.erase(i->index());
194
+ // --i;
195
+ // }
196
+ // }
197
+ //
198
+ // Except in the specific case just described, elements must
199
+ // not be erased from the array (including clearing the array)
200
+ // while iterators are walking over the array. Otherwise,
201
+ // the iterators could walk past the end of the array.
202
+
203
+ // Erases the element at index i from the array.
204
+ inline void erase(int i);
205
+
206
+ // Erases the element at index i from the array.
207
+ // Fast but unsafe: only use if has_index(i) is true.
208
+ inline void erase_existing(int i);
209
+
210
+ private:
211
+ // Add the index i to the array.
212
+ // Only use if has_index(i) is known to be false.
213
+ // Since it doesn't set the value associated with i,
214
+ // this function is private, only intended as a helper
215
+ // for other methods.
216
+ inline void create_index(int i);
217
+
218
+ // In debug mode, verify that some invariant properties of the class
219
+ // are being maintained. This is called at the end of the constructor
220
+ // and at the beginning and end of all public non-const member functions.
221
+ inline void DebugCheckInvariants() const;
222
+
223
+ int size_;
224
+ int max_size_;
225
+ int* sparse_to_dense_;
226
+ vector<IndexValue> dense_;
227
+
228
+ DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
229
+ };
230
+
231
+ template<typename Value>
232
+ SparseArray<Value>::SparseArray()
233
+ : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_() {}
234
+
235
+ // IndexValue pairs: exposed in SparseArray::iterator.
236
+ template<typename Value>
237
+ class SparseArray<Value>::IndexValue {
238
+ friend class SparseArray;
239
+ public:
240
+ typedef int first_type;
241
+ typedef Value second_type;
242
+
243
+ IndexValue() {}
244
+ IndexValue(int index, const Value& value) : second(value), index_(index) {}
245
+
246
+ int index() const { return index_; }
247
+ Value value() const { return second; }
248
+
249
+ // Provide the data in the 'second' member so that the utilities
250
+ // in map-util work.
251
+ Value second;
252
+
253
+ private:
254
+ int index_;
255
+ };
256
+
257
+ template<typename Value>
258
+ const typename SparseArray<Value>::IndexValue&
259
+ SparseArray<Value>::iv(int i) const {
260
+ DCHECK_GE(i, 0);
261
+ DCHECK_LT(i, size_);
262
+ return dense_[i];
263
+ }
264
+
265
+ // Change the maximum size of the array.
266
+ // Invalidates all iterators.
267
+ template<typename Value>
268
+ void SparseArray<Value>::resize(int new_max_size) {
269
+ DebugCheckInvariants();
270
+ if (new_max_size > max_size_) {
271
+ int* a = new int[new_max_size];
272
+ if (sparse_to_dense_) {
273
+ memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
274
+ // Don't need to zero the memory but appease Valgrind.
275
+ if (RunningOnValgrind()) {
276
+ for (int i = max_size_; i < new_max_size; i++)
277
+ a[i] = 0xababababU;
278
+ }
279
+ delete[] sparse_to_dense_;
280
+ }
281
+ sparse_to_dense_ = a;
282
+
283
+ dense_.resize(new_max_size);
284
+ }
285
+ max_size_ = new_max_size;
286
+ if (size_ > max_size_)
287
+ size_ = max_size_;
288
+ DebugCheckInvariants();
289
+ }
290
+
291
+ // Check whether index i is in the array.
292
+ template<typename Value>
293
+ bool SparseArray<Value>::has_index(int i) const {
294
+ DCHECK_GE(i, 0);
295
+ DCHECK_LT(i, max_size_);
296
+ if (static_cast<uint>(i) >= max_size_) {
297
+ return false;
298
+ }
299
+ // Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
300
+ return (uint)sparse_to_dense_[i] < (uint)size_ &&
301
+ dense_[sparse_to_dense_[i]].index_ == i;
302
+ }
303
+
304
+ // Set the value at index i to v.
305
+ template<typename Value>
306
+ typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
307
+ DebugCheckInvariants();
308
+ if (static_cast<uint>(i) >= max_size_) {
309
+ // Semantically, end() would be better here, but we already know
310
+ // the user did something stupid, so begin() insulates them from
311
+ // dereferencing an invalid pointer.
312
+ return begin();
313
+ }
314
+ if (!has_index(i))
315
+ create_index(i);
316
+ return set_existing(i, v);
317
+ }
318
+
319
+ template<typename Value>
320
+ pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
321
+ const value_type& new_value) {
322
+ DebugCheckInvariants();
323
+ pair<typename SparseArray<Value>::iterator, bool> p;
324
+ if (has_index(new_value.index_)) {
325
+ p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
326
+ } else {
327
+ p = make_pair(set_new(new_value.index_, new_value.second), true);
328
+ }
329
+ DebugCheckInvariants();
330
+ return p;
331
+ }
332
+
333
+ template<typename Value>
334
+ Value SparseArray<Value>::get(int i, Value defaultv) const {
335
+ if (!has_index(i))
336
+ return defaultv;
337
+ return get_existing(i);
338
+ }
339
+
340
+ template<typename Value>
341
+ typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
342
+ if (has_index(i))
343
+ return dense_.begin() + sparse_to_dense_[i];
344
+ return end();
345
+ }
346
+
347
+ template<typename Value>
348
+ typename SparseArray<Value>::const_iterator
349
+ SparseArray<Value>::find(int i) const {
350
+ if (has_index(i)) {
351
+ return dense_.begin() + sparse_to_dense_[i];
352
+ }
353
+ return end();
354
+ }
355
+
356
+ template<typename Value>
357
+ typename SparseArray<Value>::iterator
358
+ SparseArray<Value>::set_existing(int i, Value v) {
359
+ DebugCheckInvariants();
360
+ DCHECK(has_index(i));
361
+ dense_[sparse_to_dense_[i]].second = v;
362
+ DebugCheckInvariants();
363
+ return dense_.begin() + sparse_to_dense_[i];
364
+ }
365
+
366
+ template<typename Value>
367
+ typename SparseArray<Value>::iterator
368
+ SparseArray<Value>::set_new(int i, Value v) {
369
+ DebugCheckInvariants();
370
+ if (static_cast<uint>(i) >= max_size_) {
371
+ // Semantically, end() would be better here, but we already know
372
+ // the user did something stupid, so begin() insulates them from
373
+ // dereferencing an invalid pointer.
374
+ return begin();
375
+ }
376
+ DCHECK(!has_index(i));
377
+ create_index(i);
378
+ return set_existing(i, v);
379
+ }
380
+
381
+ template<typename Value>
382
+ Value SparseArray<Value>::get_existing(int i) const {
383
+ DCHECK(has_index(i));
384
+ return dense_[sparse_to_dense_[i]].second;
385
+ }
386
+
387
+ template<typename Value>
388
+ void SparseArray<Value>::erase(int i) {
389
+ DebugCheckInvariants();
390
+ if (has_index(i))
391
+ erase_existing(i);
392
+ DebugCheckInvariants();
393
+ }
394
+
395
+ template<typename Value>
396
+ void SparseArray<Value>::erase_existing(int i) {
397
+ DebugCheckInvariants();
398
+ DCHECK(has_index(i));
399
+ int di = sparse_to_dense_[i];
400
+ if (di < size_ - 1) {
401
+ dense_[di] = dense_[size_ - 1];
402
+ sparse_to_dense_[dense_[di].index_] = di;
403
+ }
404
+ size_--;
405
+ DebugCheckInvariants();
406
+ }
407
+
408
+ template<typename Value>
409
+ void SparseArray<Value>::create_index(int i) {
410
+ DCHECK(!has_index(i));
411
+ DCHECK_LT(size_, max_size_);
412
+ sparse_to_dense_[i] = size_;
413
+ dense_[size_].index_ = i;
414
+ size_++;
415
+ }
416
+
417
+ template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
418
+ max_size_ = max_size;
419
+ sparse_to_dense_ = new int[max_size];
420
+ dense_.resize(max_size);
421
+ // Don't need to zero the new memory, but appease Valgrind.
422
+ if (RunningOnValgrind()) {
423
+ for (int i = 0; i < max_size; i++) {
424
+ sparse_to_dense_[i] = 0xababababU;
425
+ dense_[i].index_ = 0xababababU;
426
+ }
427
+ }
428
+ size_ = 0;
429
+ DebugCheckInvariants();
430
+ }
431
+
432
+ template<typename Value> SparseArray<Value>::~SparseArray() {
433
+ DebugCheckInvariants();
434
+ delete[] sparse_to_dense_;
435
+ }
436
+
437
+ template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
438
+ DCHECK_LE(0, size_);
439
+ DCHECK_LE(size_, max_size_);
440
+ DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
441
+ }
442
+
443
+ // Comparison function for sorting.
444
+ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
445
+ const IndexValue& b) {
446
+ return a.index_ < b.index_;
447
+ }
448
+
449
+ } // namespace re2
450
+
451
+ #endif // RE2_UTIL_SPARSE_ARRAY_H__
@@ -0,0 +1,177 @@
1
+ // Copyright 2006 The RE2 Authors. All Rights Reserved.
2
+ // Use of this source code is governed by a BSD-style
3
+ // license that can be found in the LICENSE file.
4
+
5
+ // DESCRIPTION
6
+ //
7
+ // SparseSet(m) is a set of integers in [0, m).
8
+ // It requires sizeof(int)*m memory, but it provides
9
+ // fast iteration through the elements in the set and fast clearing
10
+ // of the set.
11
+ //
12
+ // Insertion and deletion are constant time operations.
13
+ //
14
+ // Allocating the set is a constant time operation
15
+ // when memory allocation is a constant time operation.
16
+ //
17
+ // Clearing the set is a constant time operation (unusual!).
18
+ //
19
+ // Iterating through the set is an O(n) operation, where n
20
+ // is the number of items in the set (not O(m)).
21
+ //
22
+ // The set iterator visits entries in the order they were first
23
+ // inserted into the array. It is safe to add items to the set while
24
+ // using an iterator: the iterator will visit indices added to the set
25
+ // during the iteration, but will not re-visit indices whose values
26
+ // change after visiting. Thus SparseSet can be a convenient
27
+ // implementation of a work queue.
28
+ //
29
+ // The SparseSet implementation is NOT thread-safe. It is up to the
30
+ // caller to make sure only one thread is accessing the set. (Typically
31
+ // these sets are temporary values and used in situations where speed is
32
+ // important.)
33
+ //
34
+ // The SparseSet interface does not present all the usual STL bells and
35
+ // whistles.
36
+ //
37
+ // Implemented with reference to Briggs & Torczon, An Efficient
38
+ // Representation for Sparse Sets, ACM Letters on Programming Languages
39
+ // and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
40
+ //
41
+ // For a generalization to sparse array, see sparse_array.h.
42
+
43
+ // IMPLEMENTATION
44
+ //
45
+ // See sparse_array.h for implementation details
46
+
47
+ #ifndef RE2_UTIL_SPARSE_SET_H__
48
+ #define RE2_UTIL_SPARSE_SET_H__
49
+
50
+ #include "util/util.h"
51
+
52
+ namespace re2 {
53
+
54
+ class SparseSet {
55
+ public:
56
+ SparseSet()
57
+ : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL) {}
58
+
59
+ SparseSet(int max_size) {
60
+ max_size_ = max_size;
61
+ sparse_to_dense_ = new int[max_size];
62
+ dense_ = new int[max_size];
63
+ // Don't need to zero the memory, but do so anyway
64
+ // to appease Valgrind.
65
+ if (RunningOnValgrind()) {
66
+ for (int i = 0; i < max_size; i++) {
67
+ dense_[i] = 0xababababU;
68
+ sparse_to_dense_[i] = 0xababababU;
69
+ }
70
+ }
71
+ size_ = 0;
72
+ }
73
+
74
+ ~SparseSet() {
75
+ delete[] sparse_to_dense_;
76
+ delete[] dense_;
77
+ }
78
+
79
+ typedef int* iterator;
80
+ typedef const int* const_iterator;
81
+
82
+ int size() const { return size_; }
83
+ iterator begin() { return dense_; }
84
+ iterator end() { return dense_ + size_; }
85
+ const_iterator begin() const { return dense_; }
86
+ const_iterator end() const { return dense_ + size_; }
87
+
88
+ // Change the maximum size of the array.
89
+ // Invalidates all iterators.
90
+ void resize(int new_max_size) {
91
+ if (size_ > new_max_size)
92
+ size_ = new_max_size;
93
+ if (new_max_size > max_size_) {
94
+ int* a = new int[new_max_size];
95
+ if (sparse_to_dense_) {
96
+ memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
97
+ if (RunningOnValgrind()) {
98
+ for (int i = max_size_; i < new_max_size; i++)
99
+ a[i] = 0xababababU;
100
+ }
101
+ delete[] sparse_to_dense_;
102
+ }
103
+ sparse_to_dense_ = a;
104
+
105
+ a = new int[new_max_size];
106
+ if (dense_) {
107
+ memmove(a, dense_, size_*sizeof a[0]);
108
+ if (RunningOnValgrind()) {
109
+ for (int i = size_; i < new_max_size; i++)
110
+ a[i] = 0xababababU;
111
+ }
112
+ delete[] dense_;
113
+ }
114
+ dense_ = a;
115
+ }
116
+ max_size_ = new_max_size;
117
+ }
118
+
119
+ // Return the maximum size of the array.
120
+ // Indices can be in the range [0, max_size).
121
+ int max_size() const { return max_size_; }
122
+
123
+ // Clear the array.
124
+ void clear() { size_ = 0; }
125
+
126
+ // Check whether i is in the set.
127
+ bool contains(int i) const {
128
+ DCHECK_GE(i, 0);
129
+ DCHECK_LT(i, max_size_);
130
+ if (static_cast<uint>(i) >= max_size_) {
131
+ return false;
132
+ }
133
+ // Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
134
+ return (uint)sparse_to_dense_[i] < (uint)size_ &&
135
+ dense_[sparse_to_dense_[i]] == i;
136
+ }
137
+
138
+ // Adds i to the set.
139
+ void insert(int i) {
140
+ if (!contains(i))
141
+ insert_new(i);
142
+ }
143
+
144
+ // Adds the new index i to the set.
145
+ // Fast but unsafe: only use if contains(i) is false.
146
+ void insert_new(int i) {
147
+ if (static_cast<uint>(i) >= max_size_) {
148
+ // The index is out of range, so the caller did something wrong.
149
+ // There is no iterator to return from this function, so simply
150
+ // ignore the request instead of writing outside the arrays.
151
+ return;
152
+ }
153
+ DCHECK(!contains(i));
154
+ DCHECK_LT(size_, max_size_);
155
+ sparse_to_dense_[i] = size_;
156
+ dense_[size_] = i;
157
+ size_++;
158
+ }
159
+
160
+ // Comparison function for sorting.
161
+ // Can sort the sparse array so that future iterations
162
+ // will visit indices in increasing order using
163
+ // sort(arr.begin(), arr.end(), arr.less);
164
+ static bool less(int a, int b) { return a < b; }
165
+
166
+ private:
167
+ int size_;
168
+ int max_size_;
169
+ int* sparse_to_dense_;
170
+ int* dense_;
171
+
172
+ DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
173
+ };
174
+
175
+ } // namespace re2
176
+
177
+ #endif // RE2_UTIL_SPARSE_SET_H__