build-graph 0.1.0 → 0.3.0

@@ -0,0 +1 @@
+ Benchmark.cpp.o: Benchmark.cpp Benchmark.h
@@ -0,0 +1,65 @@
+ //
+ // Benchmark.h
+ // DictionarySort
+ //
+ // Created by Samuel Williams on 2/11/11.
+ // Copyright, 2014, by Samuel G. D. Williams. <http://www.codeotaku.com>
+ //
+
+ #pragma once
+
+ #include <ctime>
+
+ // A timer class for quickly checking the wall-clock performance of code.
+ namespace Benchmark
+ {
+     typedef double TimeT;
+
+     class WallTime {
+     protected:
+         mutable TimeT _last, _total;
+
+     public:
+         WallTime ();
+
+         void reset ();
+         TimeT total () const;
+     };
+
+     class ProcessorTime {
+     protected:
+         mutable std::clock_t _last, _total;
+
+     public:
+         ProcessorTime();
+
+         void reset ();
+         TimeT total () const;
+     };
+
+     class Timer {
+     protected:
+         WallTime _wall_time;
+         ProcessorTime _processor_time;
+
+     public:
+         Timer();
+
+         const WallTime & wall_time() const { return _wall_time; }
+         const ProcessorTime & processor_time() const { return _processor_time; }
+
+         void reset ();
+
+         struct Sample {
+             TimeT wall_time_total;
+             TimeT processor_time_total;
+
+             TimeT approximate_processor_usage() const {
+                 return processor_time_total / wall_time_total;
+             }
+         };
+
+         Sample sample() const;
+     };
+
+ }
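
The definitions for these declarations live in the accompanying Benchmark.cpp (represented by the dependency stub above, but not shown in this diff). As a rough sketch of how the interface declared above is intended to be used, assuming the example is linked against that Benchmark.cpp; the workload and variable names here are illustrative and not part of the gem:

    #include <iostream>
    #include "Benchmark.h"

    int main() {
        Benchmark::Timer timer;

        // Illustrative workload; substitute the code being measured.
        volatile double accumulator = 0.0;
        for (int i = 0; i < 10000000; i += 1)
            accumulator += i * 0.5;

        // sample() reports wall-clock and processor time, presumably the totals
        // since construction or the last reset().
        Benchmark::Timer::Sample sample = timer.sample();

        std::cerr << "Wall time: " << sample.wall_time_total << std::endl;
        std::cerr << "Processor time: " << sample.processor_time_total << std::endl;
        std::cerr << "Approximate processor usage: " << sample.approximate_processor_usage() << std::endl;

        return 0;
    }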
@@ -0,0 +1,273 @@
+ //
+ // DictionarySort.h
+ // DictionarySort
+ //
+ // Created by Samuel Williams on 2/11/11.
+ // Copyright, 2014, by Samuel G. D. Williams. <http://www.codeotaku.com>
+ //
+
+ #pragma once
+
+ #include <cmath>
+ #include <vector>
+ #include <map>
+ #include <cstdint> // std::uint64_t
+ #include <algorithm> // std::sort
+ #include <iostream> // std::cerr
+
+ #include "ParallelMergeSort.h"
+
+ template <typename AnyT>
+ struct pointer_less_than
+ {
+     bool operator()(const AnyT a, const AnyT b) {
+         return *a < *b;
+     }
+ };
+
+ namespace DictionarySort
+ {
+     // Use std::sort
+     //const int SORT_MODE = -1;
+     // Use ParallelMergeSort with 2^n threads
+     const int SORT_MODE = 3; // = n
+
+     typedef std::uint64_t IndexT;
+
+     template <typename CharT, typename MapT>
+     class Dictionary {
+     public:
+         typedef std::vector<CharT> WordT;
+         typedef std::vector<WordT> WordsT;
+         typedef std::vector<IndexT> OrderT;
+
+         static const int ORDERED_LT = -1;
+         static const int ORDERED_EQ = 0;
+         static const int ORDERED_GT = 1;
+
+         // Compare two order vectors to determine the relative nature of lhs compared to rhs.
+         // We assume that lhs and rhs have at least one element each.
+         // ORDERED_LT => (lhs < rhs)
+         // ORDERED_EQ => (lhs == rhs)
+         // ORDERED_GT => (lhs > rhs)
+         static int compare(const OrderT & lhs, const OrderT & rhs)
+         {
+             std::size_t offset = 0;
+
+             while (offset < lhs.size() && offset < rhs.size()) {
+                 if (lhs[offset] < rhs[offset])
+                     return ORDERED_LT;
+                 else if (lhs[offset] > rhs[offset])
+                     return ORDERED_GT;
+
+                 offset += 1;
+             }
+
+             if (lhs.size() == rhs.size())
+                 return ORDERED_EQ;
+
+             // lhs was longer,
+             if (offset < lhs.size())
+                 return ORDERED_GT;
+
+             return ORDERED_LT;
+         }
+
+         static int compare(Dictionary * dictionary, const WordT & lhs, const WordT & rhs) {
+             std::size_t offset = 0;
+
+             while (offset < lhs.size() && offset < rhs.size()) {
+                 IndexT left_order = dictionary->_characterOrder[lhs[offset]];
+                 IndexT right_order = dictionary->_characterOrder[rhs[offset]];
+
+                 if (left_order < right_order)
+                     return ORDERED_LT;
+                 else if (left_order > right_order)
+                     return ORDERED_GT;
+
+                 offset += 1;
+             }
+
+             if (lhs.size() == rhs.size())
+                 return ORDERED_EQ;
+
+             if (offset < lhs.size())
+                 return ORDERED_GT;
+
+             return ORDERED_LT;
+         }
+
+     private:
+         WordT _alphabet;
+
+         MapT _characterOrder;
+         //int _characterOrder[256];
+         //std::map<CharT, IndexT> _characterOrder;
+
+         IndexT width;
+         IndexT characters_per_segment;
+
+         // This is a lightweight wrapper over WordT which caches its OrderT, an integer representation of position based on the given dictionary.
+         struct OrderedWord {
+             WordT word;
+
+             // We can generate this as part of the sorting process. Because the sort is parallel, generation of word order (which is relatively expensive) is distributed across multiple processors.
+             mutable OrderT order;
+
+             // The first call to this function must be guaranteed to come from a single thread.
+             // After that, it can be called from multiple threads at the same time.
+             // The parallel merge sort algorithm guarantees this.
+             const OrderT & fetch_order(Dictionary * dictionary) const {
+                 if (order.size() == 0 && word.size() > 0)
+                     order = dictionary->sum(word);
+
+                 return order;
+             }
+         };
+
+         struct CompareWordsAscending {
+             Dictionary * dictionary;
+
+             CompareWordsAscending (Dictionary * _dictionary)
+                 : dictionary(_dictionary)
+             {
+             }
+
+             bool operator()(const WordT * a, const WordT * b) const {
+                 return compare(dictionary, *a, *b) == ORDERED_LT; // Dereference: compare() takes word references.
+             }
+
+             bool operator()(const OrderedWord * a, const OrderedWord * b) const {
+                 return compare(a->fetch_order(dictionary), b->fetch_order(dictionary)) == ORDERED_LT;
+             }
+         };
+
+         struct UnorderedWord {
+             WordT word;
+             Dictionary * dictionary;
+
+             UnorderedWord(const WordT & _word, Dictionary * _dictionary)
+                 : word(_word), dictionary(_dictionary) {
+
+             }
+
+             bool operator<(const UnorderedWord & other) const {
+                 return compare(dictionary, word, other.word) == ORDERED_LT; // Compare the underlying words.
+             }
+         };
+
+     public:
+         Dictionary(WordT alphabet)
+             : _alphabet(alphabet)
+         {
+             IndexT index = 1;
+
+             // Build up the character order map
+             for (typename WordT::iterator i = _alphabet.begin(); i != _alphabet.end(); ++i) {
+                 _characterOrder[*i] = index;
+                 index += 1;
+             }
+
+             width = std::ceil(std::log(alphabet.size()) / std::log(2));
+
+             // Naturally floor the result by integer division/truncation.
+             characters_per_segment = (sizeof(IndexT) * 8) / width;
+         }
+
+         OrderT sum(const WordT & word) {
+             OrderT order;
+             std::size_t index = 0;
+
+             while (index < word.size()) {
+                 IndexT count = characters_per_segment;
+                 IndexT sum = 0;
+
+                 while (index < word.size()) {
+                     count -= 1;
+
+                     sum <<= width;
+                     sum += _characterOrder[word[index]];
+
+                     index += 1;
+
+                     if (count == 0)
+                         break;
+                 }
+
+                 // Shift along any remaining count, since we are ordering using the left most significant character.
+                 sum <<= (count * width);
+                 order.push_back(sum);
+             }
+
+             return order;
+         }
+
+         // The words will be sorted in-place.
+         template <typename ToSortT>
+         void sort (ToSortT & words, int mode = 2)
+         {
+             CompareWordsAscending comparator(this);
+
+             Benchmark::Timer sort_timer;
+
+             if (mode == -1) {
+                 // Sort the words using built-in sorting algorithm, for comparison:
+                 std::sort(words.begin(), words.end(), comparator);
+             } else {
+                 ParallelMergeSort::sort(words, comparator, std::size_t(mode));
+             }
+
+             auto sample = sort_timer.sample();
+
+             std::cerr << "--- Completed Dictionary Sort ---" << std::endl;
+             std::cerr << " * Dictionary sort time: " << sample.wall_time_total << std::endl;
+             std::cerr << " * Processor sort time: " << sample.processor_time_total << std::endl;
+             std::cerr << " * Approximate processor usage: " << sample.approximate_processor_usage() << std::endl;
+         }
+
+         // This function can be slow due to the large amount of memory required for large datasets.
+         uint64_t sort(const WordsT & input, WordsT & output)
+         {
+             typedef std::vector<OrderedWord*> OrderedWordsT;
+
+             // Allocate all words in one go:
+             OrderedWord * allocation = new OrderedWord[input.size()];
+
+             // Copy pointers to intermediate list which will be used for sorting:
+             OrderedWordsT words(input.size());
+
+             // Calculate order vector for each word in preparation for sort.
+             for (std::size_t i = 0; i < input.size(); i += 1) {
+                 words[i] = &allocation[i];
+
+                 words[i]->word = input[i];
+
+                 // We can force generation of the order cache, but performance may be reduced by about 10%.
+                 //words[i]->fetch_order(this);
+             }
+
+             // Change the mode from -1 for std::sort, to 0..n for ParallelMergeSort where 2^n is the number of threads to use.
+             sort(words, SORT_MODE);
+
+             // Prepare container for sorted output:
+             output.reserve(input.size());
+             output.resize(0);
+
+             uint64_t checksum = 1, offset = 1;
+             // Copy sorted words to output vector.
+             for (typename OrderedWordsT::iterator i = words.begin(); i != words.end(); ++i) {
+                 output.push_back((*i)->word);
+
+                 // Compute a very simple checksum for verifying sorted order.
+                 const OrderT & order = (*i)->fetch_order(this);
+                 for (typename OrderT::const_iterator j = order.begin(); j != order.end(); ++j) {
+                     checksum ^= *j + (offset++ % checksum);
+                 }
+             }
+
+             delete[] allocation;
+
+             return checksum;
+         }
+     };
+ }
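
A minimal sketch of driving the Dictionary template above, built as C++11 with threads enabled (e.g. g++ -std=c++11 -pthread). The choice of char for CharT, std::map for MapT, the alphabet, and the sample words are all illustrative assumptions; the gem's own test harness (not shown in this diff) supplies its own types and data:

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>
    #include "DictionarySort.h"

    int main() {
        typedef DictionarySort::Dictionary<char, std::map<char, DictionarySort::IndexT> > DictionaryT;

        // The alphabet defines the collation order of the characters.
        DictionaryT::WordT alphabet;
        for (char c = 'a'; c <= 'z'; c += 1)
            alphabet.push_back(c);

        DictionaryT dictionary(alphabet);

        // Convert some sample strings into WordT (std::vector<char>) values.
        std::string samples[] = {"banana", "apple", "cherry"};
        DictionaryT::WordsT input, output;
        for (std::size_t i = 0; i < 3; i += 1)
            input.push_back(DictionaryT::WordT(samples[i].begin(), samples[i].end()));

        // sort() fills output with the words in dictionary order and returns a simple checksum.
        auto checksum = dictionary.sort(input, output);
        std::cerr << "Checksum: " << checksum << std::endl;

        return 0;
    }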
@@ -0,0 +1,281 @@
+ //
+ // ParallelMergeSort.h
+ // DictionarySort
+ //
+ // Created by Samuel Williams on 2/11/11.
+ // Copyright, 2014, by Samuel G. D. Williams. <http://www.codeotaku.com>
+ //
+
+ #pragma once
+
+ #include <thread>
+ #include <algorithm> // std::copy
+ #include <utility> // std::swap
+ #include <cstddef> // std::size_t
+ #include "Benchmark.h"
+
+ // A parallel merge sort algorithm template implemented using C++11 threads.
+ namespace ParallelMergeSort
+ {
+     /*
+     # Parallel Merge Algorithm
+
+     This parallel merge algorithm uses two threads and requires no synchronisation (i.e. it is lock-free).
+
+     Given two sorted sequences i and j, such that |i| == |j| or |i| == |j|-1, we can merge these together by taking the |i| smallest items and |j| biggest items independently. The final sorted list q, which consists of all items from i and j in order, has a basic property such that the lower |i| items in q and the upper |j| items in q are mutually exclusive. Therefore, we can select each half of the list independently:
+
+     ij = [1, 3, 5, 2, 4, 6]
+     i = [1, 3, 5]
+     j = [2, 4, 6]
+
+     q = [1, 2, 3, 4, 5, 6]
+
+     In this case, we can see that q[0,3] can be formed by merging the 3 smallest items, and q[3,6] can be formed by merging the 3 largest items. Because these are mutually exclusive, this process can be done on two threads.
+
+     Other merging algorithms exist, but may require locking. Another approach worth exploring would be to form, in parallel, n heaps, where all items in heap[k] < heap[k+1]. If the heaps can be constructed in sorted order, the destination array will naturally contain the final sorted list.
+     */
+
+     // This implementation assumes that if there are |i| items on the left side, there must be at least |i| items on the right side.
+     template <typename ArrayT, typename ComparatorT>
+     struct ParallelLeftMerge {
+         ArrayT & source, & destination;
+         const ComparatorT & comparator;
+         std::size_t lower_bound, middle_bound;
+
+         void operator()() {
+             std::size_t left = lower_bound;
+             std::size_t right = middle_bound;
+             std::size_t offset = lower_bound;
+
+             while (offset < middle_bound) {
+                 if (comparator(source[left], source[right])) {
+                     destination[offset++] = source[left++];
+                 } else {
+                     destination[offset++] = source[right++];
+                 }
+             }
+         }
+     };
+
+     // This implementation assumes that if there are |j| items on the right side, there are at least |j| - 1 items on the left side.
+     template <typename ArrayT, typename ComparatorT>
+     struct ParallelRightMerge {
+         ArrayT & source, & destination;
+         const ComparatorT & comparator;
+         std::size_t lower_bound, middle_bound, upper_bound;
+
+         void operator()() {
+             std::size_t left = middle_bound-1;
+             std::size_t right = upper_bound-1;
+             std::size_t offset = upper_bound-1;
+
+             while (offset >= middle_bound) {
+                 if (comparator(source[left], source[right])) {
+                     destination[offset--] = source[right--];
+                 } else {
+                     destination[offset--] = source[left--];
+                     if (left == lower_bound) {
+                         // There are no more items on left hand side - in this case, there is at most one more item on right side to copy.
+                         if (offset >= middle_bound) {
+                             destination[offset] = source[right];
+                         }
+
+                         break;
+                     }
+                 }
+             }
+         }
+     };
+
+     // Merge two sorted sub-sequences sequentially (from left to right).
+     // Is it possible to merge without copying from source to destination, and what are the performance implications?
+     template <typename ArrayT, typename ComparatorT>
+     void merge (ArrayT & source, ArrayT & destination, const ComparatorT & comparator, std::size_t lower_bound, std::size_t middle_bound, std::size_t upper_bound) {
+         std::size_t left = lower_bound;
+         std::size_t right = middle_bound;
+         std::size_t offset = lower_bound;
+
+         // We merge both sub-sequences, defined as [lower_bound, middle_bound] and [middle_bound, upper_bound].
+         while (true) {
+             if (comparator(source[left], source[right])) {
+                 destination[offset++] = source[left++];
+
+                 // If we have adjusted left, we may have exhausted left side:
+                 if (left == middle_bound) {
+                     // We have no more elements in lower half.
+                     std::copy(source.begin() + right, source.begin() + upper_bound, destination.begin() + offset);
+                     break;
+                 }
+             } else {
+                 destination[offset++] = source[right++];
+
+                 // As above, we may have exhausted right side:
+                 if (right == upper_bound) {
+                     // We have no more elements in upper half.
+                     std::copy(source.begin() + left, source.begin() + middle_bound, destination.begin() + offset);
+                     break;
+                 }
+             }
+         }
+     }
+
+     template <typename ArrayT, typename ComparatorT>
+     void partition(ArrayT & array, ArrayT & temporary, const ComparatorT & comparator, std::size_t lower_bound, std::size_t upper_bound, std::size_t threaded);
+
+     // This functor is used for parallelizing the top level partition function.
+     template <typename ArrayT, typename ComparatorT>
+     struct ParallelPartition {
+         ArrayT & array, & temporary;
+         const ComparatorT & comparator;
+         std::size_t lower_bound, upper_bound, threaded;
+
+         void operator()() {
+             partition(array, temporary, comparator, lower_bound, upper_bound, threaded);
+         }
+     };
+
+     /** Recursive Partition Algorithm.
+
+     This algorithm uses 2n memory (a second buffer of the same size as the input) to reduce the amount of copying that occurs. It does this by using a parity such that at each point in the partition tree we provide a source and a destination. Given the functions P (partition) and M (merge), we have the following scheme:
+
+     P(A=source, B=destination) sorts source into destination. A=[...] means that we are considering only a subset of A. Subscripts are not given, but should be intuitive given the definition of merge sort. (x) on the left gives the order of each step as performed sequentially.
+
+          == [ PARTITION ] ==                 == [ MERGE ] ==
+
+     (1)  P(A=[1,3,4,2], B=[1,3,2,4])         (14) M(A=[1,3,2,4], B): B = [1,2,3,4]
+          |
+     (2)  |---P(B=[1,3], A=[1,3])             (7)  M(B=[1,3], A): A=[1,3]
+          |   |
+     (3)  |   |---P(A=[1], B=[1])             (4)  M(A=[1], B): B=[1]
+     (5)  |   \---P(A=[3], B=[3])             (6)  M(A=[3], B): B=[3]
+          |
+     (8)  \---P(B=[4,2], A=[4,2])             (13) M(B=[4,2], A): A = [2,4]
+              |
+     (9)      |---P(A=[4],B=[4])              (10) M(A=[4], B): B=[4]
+     (11)     \---P(A=[2],B=[2])              (12) M(A=[2], B): B=[2]
+
+     During merge, we fold back up, and alternate between A and B for the current storage. This avoids the need to dynamically allocate memory during sort and avoids unnecessary copies.
+
+     */
+
+     // Sequential partition algorithm. Provide an array, and an upper and lower bound to sort.
+     template <typename ArrayT, typename ComparatorT>
+     void partition(ArrayT & source, ArrayT & destination, const ComparatorT & comparator, const std::size_t & lower_bound, const std::size_t & upper_bound) {
+         std::size_t count = upper_bound - lower_bound;
+
+         // In the case where count == 1, we are at the very bottom of the tree and both source and destination will be the same.
+         // The same applies when count == 2, but we might need to swap the items around if they are not in the right order.
+         if (count == 2) {
+             if (!comparator(destination[lower_bound], destination[lower_bound+1])) {
+                 std::swap(destination[lower_bound], destination[lower_bound+1]);
+             }
+         // After this point, where count > 2, source and destination are different.
+         } else if (count > 2) {
+             std::size_t middle_bound = (lower_bound + upper_bound) / 2;
+
+             // While it is possible to simply call partition, we try to avoid recursion by folding up the bottom two cases:
+             // (count == 1), do nothing
+             // (count == 2), swap if order is not correct
+             // (count > 2), partition
+             // After profiling, I found that the benefit of unrolling (count == 2) was minimal - there was about a 2-3% improvement.
+
+             std::size_t lower_count = middle_bound - lower_bound;
+             if (lower_count > 1)
+                 partition(destination, source, comparator, lower_bound, middle_bound);
+
+             std::size_t upper_count = upper_bound - middle_bound;
+             if (upper_count > 1)
+                 partition(destination, source, comparator, middle_bound, upper_bound);
+
+             merge(source, destination, comparator, lower_bound, middle_bound, upper_bound);
+         }
+     }
+
+     /** Parallel Partition Algorithm
+
+     This parallel partition algorithm, which controls the downward descent of the merge sort, is designed for large datasets. Because merge sort follows a binary tree structure, the work is essentially split between two threads at each node in the tree. Firstly, we recursively call partition on two separate threads. Once this is done, we have two ascending sequences, and we merge these together, again using two threads: one for the left sequence and one for the right sequence.
+
+     Because higher level threads will be waiting on lower level threads, for best performance the value of threaded should be chosen so that 2^threaded is equal to the number of processors.
+
+     */
+
+     // Use this to control whether parallel partition is used.
+     // For large data sets (> 500_000 items), you will see an improvement of ~50% per thread.
+     const bool PARALLEL_PARTITION = true;
+
+     // Use this to control whether parallel merge is used.
+     // For large data sets (> 1_000_000 items), you will see an improvement of about 15%.
+     const bool PARALLEL_MERGE = true;
+
+     // If you make this number too small, e.g. <= 2, you may cause synchronisation issues, because you will force parallelisation
+     // for base cases which actually need to be sequential to ensure that the comparison cache is generated correctly.
+     const std::size_t PARALLEL_MERGE_MINIMUM_COUNT = 128;
+
+     // Provide an array, and an upper and lower bound, along with the number of threads to use.
+     template <typename ArrayT, typename ComparatorT>
+     void partition(ArrayT & source, ArrayT & destination, const ComparatorT & comparator, std::size_t lower_bound, std::size_t upper_bound, std::size_t threaded) {
+         std::size_t count = upper_bound - lower_bound;
+
+         if (count > 1) {
+             std::size_t middle_bound = (lower_bound + upper_bound) / 2;
+
+             //Benchmark::WallTime tp;
+             if (PARALLEL_PARTITION && threaded > 0) {
+                 // We could check whether there is any work to do before creating threads, but we assume
+                 // that threads will only be created high up in the tree by default, so there *should*
+                 // be a significant amount of work available per thread.
+                 ParallelPartition<ArrayT, ComparatorT>
+                     lower_partition = {destination, source, comparator, lower_bound, middle_bound, threaded - 1},
+                     upper_partition = {destination, source, comparator, middle_bound, upper_bound, threaded - 1};
+
+                 std::thread
+                     lower_thread(lower_partition),
+                     upper_thread(upper_partition);
+
+                 upper_thread.join();
+                 lower_thread.join();
+             } else {
+                 // We have hit the bottom of our thread limit - could you use std::sort here for improved performance?
+                 partition(destination, source, comparator, lower_bound, middle_bound);
+                 partition(destination, source, comparator, middle_bound, upper_bound);
+             }
+             //std::cerr << "Partition Time: " << tp.total() << " [" << lower_bound << " -> " << upper_bound << " : " << threaded << " ]" << std::endl;
+
+             //Benchmark::WallTime tm;
+             if (PARALLEL_MERGE && threaded > 0 && count > PARALLEL_MERGE_MINIMUM_COUNT) {
+                 // By the time we get here, we are sure that both left and right partitions have been merged, i.e. we have two ordered sequences [lower_bound, middle_bound) and [middle_bound, upper_bound). Now, we need to join them together:
+                 ParallelLeftMerge<ArrayT, ComparatorT> left_merge = {source, destination, comparator, lower_bound, middle_bound};
+                 ParallelRightMerge<ArrayT, ComparatorT> right_merge = {source, destination, comparator, lower_bound, middle_bound, upper_bound};
+
+                 std::thread
+                     left_thread(left_merge),
+                     right_thread(right_merge);
+
+                 left_thread.join();
+                 right_thread.join();
+             } else {
+                 // We have hit the bottom of our thread limit, or the merge minimum count.
+                 merge(source, destination, comparator, lower_bound, middle_bound, upper_bound);
+             }
+             //std::cerr << "Merge Time: " << tm.total() << " [" << lower_bound << " -> " << upper_bound << " : " << threaded << " ]" << std::endl;
+         }
+     }
+
+     /** Parallel Merge Sort, main entry point.
+
+     Given an array of items and a comparator functor, use at most 2^threaded threads to sort the items.
+
+     */
+     template <typename ArrayT, typename ComparatorT>
+     void sort(ArrayT & array, const ComparatorT & comparator, std::size_t threaded = 2) {
+         // Is all this swapping around really necessary?
+         ArrayT temporary(array.begin(), array.end());
+
+         //Benchmark::WallTime ts;
+         if (threaded == 0)
+             partition(temporary, array, comparator, 0, array.size());
+         else
+             partition(temporary, array, comparator, 0, array.size(), threaded);
+         //std::cerr << "Total sort time: " << ts.total() << std::endl;
+     }
+ }
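
ParallelMergeSort.h is header-only and independent of the Dictionary code, so the sort template can also be driven directly. A minimal sketch, built as C++11 with threads enabled (e.g. g++ -std=c++11 -pthread), assuming a std::vector<int> and a trivial comparator, both chosen here for illustration; DictionarySort.h above uses it with pointer-based comparators instead:

    #include <iostream>
    #include <vector>
    #include "ParallelMergeSort.h"

    // A trivial comparator; the sort template only requires operator()(a, b) returning whether a comes before b.
    struct CompareAscending {
        bool operator()(int a, int b) const {
            return a < b;
        }
    };

    int main() {
        // Arbitrary test data: a permutation of 0..999.
        std::vector<int> items;
        for (int i = 0; i < 1000; i += 1)
            items.push_back((i * 37) % 1000);

        CompareAscending comparator;

        // threaded = 2 allows up to 2^2 = 4 threads during partitioning.
        ParallelMergeSort::sort(items, comparator, 2);

        std::cerr << "First: " << items.front() << ", last: " << items.back() << std::endl;

        return 0;
    }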