crmf 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +12 -0
  3. data/crmf.gemspec +105 -3
  4. data/ext/crlibm-1.0beta5/AUTHORS +2 -0
  5. data/ext/crlibm-1.0beta5/CMakeLists.txt +154 -0
  6. data/ext/crlibm-1.0beta5/COPYING +340 -0
  7. data/ext/crlibm-1.0beta5/COPYING.LIB +504 -0
  8. data/ext/crlibm-1.0beta5/ChangeLog +125 -0
  9. data/ext/crlibm-1.0beta5/Makefile.am +134 -0
  10. data/ext/crlibm-1.0beta5/NEWS +0 -0
  11. data/ext/crlibm-1.0beta5/README +31 -0
  12. data/ext/crlibm-1.0beta5/README.DEV +23 -0
  13. data/ext/crlibm-1.0beta5/README.md +5 -0
  14. data/ext/crlibm-1.0beta5/TODO +66 -0
  15. data/ext/crlibm-1.0beta5/VERSION +1 -0
  16. data/ext/crlibm-1.0beta5/acos-td.c +1195 -0
  17. data/ext/crlibm-1.0beta5/acos-td.h +629 -0
  18. data/ext/crlibm-1.0beta5/asin-td.c +1297 -0
  19. data/ext/crlibm-1.0beta5/asin-td.h +620 -0
  20. data/ext/crlibm-1.0beta5/asincos.c +4488 -0
  21. data/ext/crlibm-1.0beta5/asincos.h +575 -0
  22. data/ext/crlibm-1.0beta5/atan-itanium.c +846 -0
  23. data/ext/crlibm-1.0beta5/atan-pentium.c +280 -0
  24. data/ext/crlibm-1.0beta5/atan-pentium.h +343 -0
  25. data/ext/crlibm-1.0beta5/atan_accurate.c +341 -0
  26. data/ext/crlibm-1.0beta5/atan_accurate.h +198 -0
  27. data/ext/crlibm-1.0beta5/atan_fast.c +506 -0
  28. data/ext/crlibm-1.0beta5/atan_fast.h +680 -0
  29. data/ext/crlibm-1.0beta5/configure.ac +419 -0
  30. data/ext/crlibm-1.0beta5/crlibm.h +204 -0
  31. data/ext/crlibm-1.0beta5/crlibm.spec +42 -0
  32. data/ext/crlibm-1.0beta5/crlibm_private.c +397 -0
  33. data/ext/crlibm-1.0beta5/crlibm_private.h +1048 -0
  34. data/ext/crlibm-1.0beta5/csh_fast.c +721 -0
  35. data/ext/crlibm-1.0beta5/csh_fast.h +771 -0
  36. data/ext/crlibm-1.0beta5/double-extended.h +496 -0
  37. data/ext/crlibm-1.0beta5/exp-itanium.c +723 -0
  38. data/ext/crlibm-1.0beta5/exp-td-standalone.c +87 -0
  39. data/ext/crlibm-1.0beta5/exp-td.c +1363 -0
  40. data/ext/crlibm-1.0beta5/exp-td.h +685 -0
  41. data/ext/crlibm-1.0beta5/exp_build_coeffs/exp_fast_table.c +125 -0
  42. data/ext/crlibm-1.0beta5/expm1-standalone.c +119 -0
  43. data/ext/crlibm-1.0beta5/expm1.c +2515 -0
  44. data/ext/crlibm-1.0beta5/expm1.h +715 -0
  45. data/ext/crlibm-1.0beta5/interval.h +238 -0
  46. data/ext/crlibm-1.0beta5/log-de.c +480 -0
  47. data/ext/crlibm-1.0beta5/log-de.h +747 -0
  48. data/ext/crlibm-1.0beta5/log-de2.c +280 -0
  49. data/ext/crlibm-1.0beta5/log-de2.h +2352 -0
  50. data/ext/crlibm-1.0beta5/log-td.c +1158 -0
  51. data/ext/crlibm-1.0beta5/log-td.h +819 -0
  52. data/ext/crlibm-1.0beta5/log.c +2244 -0
  53. data/ext/crlibm-1.0beta5/log.h +1592 -0
  54. data/ext/crlibm-1.0beta5/log10-td.c +906 -0
  55. data/ext/crlibm-1.0beta5/log10-td.h +823 -0
  56. data/ext/crlibm-1.0beta5/log1p.c +1295 -0
  57. data/ext/crlibm-1.0beta5/log2-td.c +1521 -0
  58. data/ext/crlibm-1.0beta5/log2-td.h +821 -0
  59. data/ext/crlibm-1.0beta5/log2_accurate.c +330 -0
  60. data/ext/crlibm-1.0beta5/log2_accurate.h +261 -0
  61. data/ext/crlibm-1.0beta5/log_accurate.c +133 -0
  62. data/ext/crlibm-1.0beta5/log_accurate.h +261 -0
  63. data/ext/crlibm-1.0beta5/log_fast.c +360 -0
  64. data/ext/crlibm-1.0beta5/log_fast.h +440 -0
  65. data/ext/crlibm-1.0beta5/pow.c +1396 -0
  66. data/ext/crlibm-1.0beta5/pow.h +3101 -0
  67. data/ext/crlibm-1.0beta5/prepare +20 -0
  68. data/ext/crlibm-1.0beta5/rem_pio2_accurate.c +219 -0
  69. data/ext/crlibm-1.0beta5/rem_pio2_accurate.h +53 -0
  70. data/ext/crlibm-1.0beta5/scs_lib/AUTHORS +3 -0
  71. data/ext/crlibm-1.0beta5/scs_lib/COPYING +504 -0
  72. data/ext/crlibm-1.0beta5/scs_lib/ChangeLog +16 -0
  73. data/ext/crlibm-1.0beta5/scs_lib/Doxyfile.dev +939 -0
  74. data/ext/crlibm-1.0beta5/scs_lib/Doxyfile.user +939 -0
  75. data/ext/crlibm-1.0beta5/scs_lib/INSTALL +215 -0
  76. data/ext/crlibm-1.0beta5/scs_lib/Makefile.am +17 -0
  77. data/ext/crlibm-1.0beta5/scs_lib/NEWS +0 -0
  78. data/ext/crlibm-1.0beta5/scs_lib/README +9 -0
  79. data/ext/crlibm-1.0beta5/scs_lib/README.DEV +38 -0
  80. data/ext/crlibm-1.0beta5/scs_lib/TODO +4 -0
  81. data/ext/crlibm-1.0beta5/scs_lib/VERSION +1 -0
  82. data/ext/crlibm-1.0beta5/scs_lib/addition_scs.c +623 -0
  83. data/ext/crlibm-1.0beta5/scs_lib/division_scs.c +110 -0
  84. data/ext/crlibm-1.0beta5/scs_lib/double2scs.c +174 -0
  85. data/ext/crlibm-1.0beta5/scs_lib/main.dox +104 -0
  86. data/ext/crlibm-1.0beta5/scs_lib/multiplication_scs.c +339 -0
  87. data/ext/crlibm-1.0beta5/scs_lib/poly_fct.c +112 -0
  88. data/ext/crlibm-1.0beta5/scs_lib/print_scs.c +73 -0
  89. data/ext/crlibm-1.0beta5/scs_lib/rand_scs.c +63 -0
  90. data/ext/crlibm-1.0beta5/scs_lib/scs.h +353 -0
  91. data/ext/crlibm-1.0beta5/scs_lib/scs2double.c +411 -0
  92. data/ext/crlibm-1.0beta5/scs_lib/scs2mpf.c +58 -0
  93. data/ext/crlibm-1.0beta5/scs_lib/scs2mpfr.c +61 -0
  94. data/ext/crlibm-1.0beta5/scs_lib/scs_private.c +23 -0
  95. data/ext/crlibm-1.0beta5/scs_lib/scs_private.h +133 -0
  96. data/ext/crlibm-1.0beta5/scs_lib/wrapper_scs.h +486 -0
  97. data/ext/crlibm-1.0beta5/scs_lib/zero_scs.c +52 -0
  98. data/ext/crlibm-1.0beta5/trigo_accurate.c +501 -0
  99. data/ext/crlibm-1.0beta5/trigo_accurate.h +331 -0
  100. data/ext/crlibm-1.0beta5/trigo_fast.c +1243 -0
  101. data/ext/crlibm-1.0beta5/trigo_fast.h +639 -0
  102. data/ext/crlibm-1.0beta5/trigpi.c +1169 -0
  103. data/ext/crlibm-1.0beta5/trigpi.h +556 -0
  104. data/ext/crlibm-1.0beta5/triple-double.c +57 -0
  105. data/ext/crlibm-1.0beta5/triple-double.h +1380 -0
  106. data/ext/crmf/crmf.c +117 -20
  107. data/ext/crmf/extconf.rb +12 -8
  108. data/lib/crmf/version.rb +1 -1
  109. data/tests/perf.rb +100 -219
  110. metadata +108 -10
  111. data/ext/crlibm-1.0beta4.tar.gz +0 -0
@@ -0,0 +1,1295 @@
1
+ /*
2
+ * Correctly rounded log1p(x) = log(1 + x)
3
+ *
4
+ * Author : Christoph Lauter (ENS Lyon)
5
+ *
6
+ * This file is part of the crlibm library developed by the Arenaire
7
+ * project at Ecole Normale Superieure de Lyon
8
+ *
9
+ * This program is free software; you can redistribute it and/or modify
10
+ * it under the terms of the GNU Lesser General Public License as published by
11
+ * the Free Software Foundation; either version 2 of the License, or
12
+ * (at your option) any later version.
13
+ *
14
+ * This program is distributed in the hope that it will be useful,
15
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
+ * GNU General Public License for more details.
18
+ *
19
+ * You should have received a copy of the GNU Lesser General Public License
20
+ * along with this program; if not, write to the Free Software
21
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22
+ */
23
+
24
+
25
+ #include <stdio.h>
26
+ #include <stdlib.h>
27
+ #include "crlibm.h"
28
+ #include "crlibm_private.h"
29
+ #include "triple-double.h"
30
+ #include "log-td.h"
31
+
32
+ #define AVOID_FMA 0 /* NOTE(review): the guards in this file test !defined(AVOID_FMA), so defining it (even as 0) compiles out every FMA branch */
33
+
34
+
35
+
36
+ void log1p_td_accurate(double *logh, double *logm, double *logl, double ed, int index,
37
+ double zh, double zm, double zl, double logih, double logim) {
38
+ double highPoly, t1h, t1l, t2h, t2l, t3h, t3l, t4h, t4l, t5h, t5l, t6h, t6l, t7h, t7l, t8h, t8l, t9h, t9l, t10h, t10l, t11h, t11l;
39
+ double t12h, t12l, t13h, t13l, t14h, t14l, zSquareh, zSquarem, zSquarel, zCubeh, zCubem, zCubel, higherPolyMultZh, higherPolyMultZm;
40
+ double higherPolyMultZl, zSquareHalfh, zSquareHalfm, zSquareHalfl, polyWithSquareh, polyWithSquarem, polyWithSquarel;
41
+ double polyh, polym, polyl, logil, logyh, logym, logyl, loghover, logmover, loglover, log2edhover, log2edmover, log2edlover;
42
+ double log2edh, log2edm, log2edl;
43
+
44
+
45
+ #if EVAL_PERF
46
+ crlibm_second_step_taken++;
47
+ #endif
48
+
49
+
50
+ /* Accurate phase:
51
+
52
+ Argument reduction is already done.
53
+ We must return logh, logm and logl representing the intermediate result in 118 bits precision.
54
+
55
+ We use a 14 degree polynomial, computing the first 3 (the first is 0) coefficients in triple double,
56
+ calculating the next 7 coefficients in double double arithmetics and the last in double.
57
+
58
+ */
59
+
60
+ /* Start of the horner scheme */
61
+
62
+ #if defined(PROCESSOR_HAS_FMA) && !defined(AVOID_FMA)
63
+ highPoly = FMA(FMA(FMA(FMA(accPolyC14,zh,accPolyC13),zh,accPolyC12),zh,accPolyC11),zh,accPolyC10);
64
+ #else
65
+ highPoly = accPolyC10 + zh * (accPolyC11 + zh * (accPolyC12 + zh * (accPolyC13 + zh * accPolyC14)));
66
+ #endif
67
+
68
+ /* We want to write
69
+
70
+ accPolyC3 + zh * (accPoly4 + zh * (accPoly5 + zh * (accPoly6 + zh * (accPoly7 + zh * (accPoly8 + zh * (accPoly9 + zh * highPoly))))));
71
+ ( t14 t13 t12 t11 t10 t9 t8 t7 t6 t5 t4 t3 t2 t1 )
72
+
73
+ with all additions and multiplications in double double arithmetics
74
+ but we will produce intermediate results labelled t1h/t1l thru t14h/t14l
75
+ */
76
+
77
+ Mul12(&t1h, &t1l, zh, highPoly);
78
+ Add22(&t2h, &t2l, accPolyC9h, accPolyC9l, t1h, t1l);
79
+ Mul22(&t3h, &t3l, zh, zm, t2h, t2l);
80
+ Add22(&t4h, &t4l, accPolyC8h, accPolyC8l, t3h, t3l);
81
+ Mul22(&t5h, &t5l, zh, zm, t4h, t4l);
82
+ Add22(&t6h, &t6l, accPolyC7h, accPolyC7l, t5h, t5l);
83
+ Mul22(&t7h, &t7l, zh, zm, t6h, t6l);
84
+ Add22(&t8h, &t8l, accPolyC6h, accPolyC6l, t7h, t7l);
85
+ Mul22(&t9h, &t9l, zh, zm, t8h, t8l);
86
+ Add22(&t10h, &t10l, accPolyC5h, accPolyC5l, t9h, t9l);
87
+ Mul22(&t11h, &t11l, zh, zm, t10h, t10l);
88
+ Add22(&t12h, &t12l, accPolyC4h, accPolyC4l, t11h, t11l);
89
+ Mul22(&t13h, &t13l, zh, zm, t12h, t12l);
90
+ Add22(&t14h, &t14l, accPolyC3h, accPolyC3l, t13h, t13l);
91
+
92
+ /* We must now prepare (zh + zm)^2 and (zh + zm)^3 as triple doubles */
93
+
94
+ Mul33(&zSquareh, &zSquarem, &zSquarel, zh, zm, zl, zh, zm, zl);
95
+ Mul33(&zCubeh, &zCubem, &zCubel, zh, zm, zl, zSquareh, zSquarem, zSquarel);
96
+
97
+ /* We can now multiplicate the middle and higher polynomial by z^3 */
98
+
99
+ Mul233(&higherPolyMultZh, &higherPolyMultZm, &higherPolyMultZl, t14h, t14l, zCubeh, zCubem, zCubel);
100
+
101
+ /* Multiply now z^2 by -1/2 (exact op) and add to middle and higher polynomial */
102
+
103
+ zSquareHalfh = zSquareh * -0.5;
104
+ zSquareHalfm = zSquarem * -0.5;
105
+ zSquareHalfl = zSquarel * -0.5;
106
+
107
+ Add33(&polyWithSquareh, &polyWithSquarem, &polyWithSquarel,
108
+ zSquareHalfh, zSquareHalfm, zSquareHalfl,
109
+ higherPolyMultZh, higherPolyMultZm, higherPolyMultZl);
110
+
111
+ /* Add now zh and zm to obtain the polynomial evaluation result */
112
+
113
+ Add33(&polyh, &polym, &polyl, zh, zm, zl, polyWithSquareh, polyWithSquarem, polyWithSquarel);
114
+
115
+ /* Reconstruct now log(y) = log(1 + z) - log(ri) by adding logih, logim, logil
116
+ logil has not been read to the time, do this first
117
+ */
118
+
119
+ logil = argredtable[index].logil;
120
+
121
+ Add33(&logyh, &logym, &logyl, logih, logim, logil, polyh, polym, polyl);
122
+
123
+ /* Multiply log2 with E, i.e. log2h, log2m, log2l by ed
124
+ ed is always less than 2^(12) and log2h and log2m are stored with at least 12 trailing zeros
125
+ So multiplying naively is correct (up to 134 bits at least)
126
+
127
+ The final result is thus obtained by adding log2 * E to log(y)
128
+ */
129
+
130
+ log2edhover = log2h * ed;
131
+ log2edmover = log2m * ed;
132
+ log2edlover = log2l * ed;
133
+
134
+ /* It may be necessary to renormalize the tabulated value (multiplied by ed) before adding
135
+ the to the log(y)-result
136
+
137
+ If needed, uncomment the following Renormalize3-Statement and comment out the copies
138
+ following it.
139
+ */
140
+
141
+ /* Renormalize3(&log2edh, &log2edm, &log2edl, log2edhover, log2edmover, log2edlover); */
142
+
143
+ log2edh = log2edhover;
144
+ log2edm = log2edmover;
145
+ log2edl = log2edlover;
146
+
147
+ Add33(&loghover, &logmover, &loglover, log2edh, log2edm, log2edl, logyh, logym, logyl);
148
+
149
+ /* Since we can not guarantee in each addition and multiplication procedure that
150
+ the results are not overlapping, we must renormalize the result before handing
151
+ it over to the final rounding
152
+ */
153
+
154
+ Renormalize3(logh,logm,logl,loghover,logmover,loglover);
155
+
156
+ }
157
+
158
+
159
+
160
+ /*************************************************************
161
+ *************************************************************
162
+ * ROUNDED TO NEAREST *
163
+ *************************************************************
164
+ *************************************************************/
165
+ double log1p_rn(double x){
166
+ db_number xdb, shdb, scaledb;
167
+ double yh, yl, ed, ri, logih, logim, yhrih, yhril, ylri, t1, t2, t3, t4, t5, t6, zh, zm, zl;
168
+ double polyHorner, zhSquareh, zhSquarel, polyUpper, zhSquareHalfh, zhSquareHalfl;
169
+ double t1h, t1l, t2h, t2l, ph, pl, log2edh, log2edl, logTabPolyh, logTabPolyl, logh, logm, logl, roundcst;
170
+ double sh, sl;
171
+ int E, index;
172
+
173
+
174
+ xdb.d=x;
175
+
176
+ /* Filter cases */
177
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3c900000) {
178
+ /* We are less than 2^(-54) and return simply an adjusted x
179
+ This captures also the algebraic case x = 0
180
+ */
181
+ return x;
182
+ }
183
+
184
+ if (((xdb.i[HI] & 0x80000000) != 0) && ((xdb.i[HI] & 0x7fffffff) >= 0x3ff00000)) {
185
+ /* We are less or equal than -1 (-inf and NaN, too),
186
+ we return -inf for -1 and NaN otherwise
187
+ */
188
+ if (x == -1.0) return x/0.0;
189
+
190
+
191
+ return (x-x)/0.0;
192
+ }
193
+
194
+ if ((xdb.i[HI] & 0x7ff00000) == 0x7ff00000) {
195
+ /* We are +inf or NaN
196
+ If +inf, we return +inf (x+x)
197
+ If NaN, we return NaN (x+x)
198
+ */
199
+ return x+x;
200
+ }
201
+
202
+ /* Test if |x| < 2^(-8)
203
+
204
+ If yes, short-circuit the range reduction
205
+
206
+ */
207
+
208
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3f700000) {
209
+ /* Use the polynomial p(zh + zl) approximating log(1+zh+zl) directly
210
+ Set E and index to values that read 0.0 in the accurate phase.
211
+ */
212
+ logih = 0.0;
213
+ logim = 0.0;
214
+ index = 0;
215
+ ed = 0.0;
216
+ index = 0;
217
+ zh = x;
218
+ zm = 0.0;
219
+ zl = 0.0;
220
+ } else {
221
+ /* If we are here, |x| >= 2^(-8) and we must perform range reduction */
222
+
223
+ /* Compute first exactly
224
+
225
+ sh + sl = 1 + x
226
+
227
+ x can move over 1, so use a conditional Add12
228
+ */
229
+
230
+ Add12Cond(sh,sl,1.0,x);
231
+
232
+ /* Transform higher order double to integer */
233
+
234
+ shdb.d = sh;
235
+
236
+ /* Extract exponent and mantissa
237
+ Do range reduction,
238
+ yielding to E holding the exponent and
239
+ y the mantissa between sqrt(2)/2 and sqrt(2)
240
+ */
241
+ E = 0;
242
+ E += (shdb.i[HI]>>20)-1023; /* extract the exponent */
243
+ index = (shdb.i[HI] & 0x000fffff);
244
+ shdb.i[HI] = index | 0x3ff00000; /* do exponent = 0 */
245
+ index = (index + (1<<(20-L-1))) >> (20-L);
246
+
247
+ /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */
248
+ if (index >= MAXINDEX){ /* corresponds to xdb>sqrt(2)*/
249
+ shdb.i[HI] -= 0x00100000;
250
+ E++;
251
+ }
252
+
253
+
254
+ /* Transform shdb to yh */
255
+ yh = shdb.d;
256
+
257
+
258
+ /* Compute the index to the table */
259
+ index = index & INDEXMASK;
260
+
261
+ /* Cast integer E into double ed for multiplication later */
262
+ ed = (double) E;
263
+
264
+ /*
265
+ Read tables:
266
+ Read one float for ri
267
+ Read the first two doubles for -log(r_i) (out of three)
268
+
269
+ Organization of the table:
270
+
271
+ one struct entry per index, the struct entry containing
272
+ r, logih, logim and logil in this order
273
+ */
274
+
275
+
276
+ ri = argredtable[index].ri;
277
+ /*
278
+ Actually we don't need the logarithm entries now
279
+ Move the following two lines to the eventual reconstruction
280
+ As long as we don't have any if in the following code, we can overlap
281
+ memory access with calculations
282
+ */
283
+ logih = argredtable[index].logih;
284
+ logim = argredtable[index].logim;
285
+
286
+ /* Test if we have a simple range reduction or a complicated one
287
+
288
+ Simple range reduction for x < 0: x + 1 is exact, sl = 0 exactly
289
+ Simple range reduction for x > 2^(125) (sh > 2^(125)): x + 1 is not exact but its error less than 2^(-125)
290
+
291
+ Complicated range reduction: other cases
292
+
293
+ */
294
+
295
+
296
+ if ((sl == 0.0) || (E > 125)) {
297
+ /* Simple range reduction */
298
+
299
+ Mul12(&yhrih, &yhril, yh, ri);
300
+ t1 = yhrih - 1.0;
301
+ Add12Cond(zh, zm, t1, yhril);
302
+ zl = 0.0;
303
+
304
+ } else {
305
+ /* Complicated range reduction; E <= 125 */
306
+
307
+
308
+ /* Scale sl accordingly to sh, from which the exponent was extracted
309
+
310
+ We form first 2^(-E) and multiply sl with this value; this gives yl.
311
+ */
312
+
313
+ scaledb.i[HI] = (-E + 1023) << 20;
314
+ scaledb.i[LO] = 0;
315
+
316
+ yl = sl * scaledb.d;
317
+
318
+
319
+ /* Do complicated range reduction:
320
+
321
+ zh + zm + zl = (yh + yl) * ri - 1.0
322
+
323
+
324
+ We use zh + zm in the quick phase and zh + zm + zl in the accurate phase
325
+
326
+ The multiplication yl * ri is exact because yl contains at most 9 bits and
327
+ ri contains at most 24 bits.
328
+
329
+ The substraction yhrih - 1.0 is exact as per Sterbenz' lemma.
330
+
331
+ */
332
+
333
+ Mul12(&yhrih,&yhril,yh,ri);
334
+ ylri = yl * ri;
335
+
336
+ t1 = yhrih - 1.0;
337
+
338
+ /* The unnormalized triple-double t1 + yhril + ylri is equal to (yh + yl) * ri - 1.0
339
+ As t1 can move over yhril and yhri can move over ylri, we normalize first these
340
+ values pairwise with Add12Conds. Then we renormalize the pairs by a
341
+ "inverted" (A.E.) Renormalize3.
342
+ */
343
+
344
+ Add12Cond(t2,t3,yhril,ylri);
345
+ Add12Cond(t4,t5,t1,t2);
346
+
347
+ Add12Cond(t6,zl,t3,t5);
348
+ Add12Cond(zh,zm,t4,t6);
349
+
350
+ }
351
+ }
352
+
353
+
354
+ /*
355
+ Polynomial evaluation
356
+
357
+ Use a 7 degree polynomial
358
+ Evaluate the higher 5 terms in double precision (-7 * 3 = -21) using Horner's scheme
359
+ Evaluate the lower 3 terms (the last is 0) in double double precision accounting also for zm
360
+ using an ad hoc method
361
+
362
+ */
363
+
364
+
365
+
366
+ #if defined(PROCESSOR_HAS_FMA) && !defined(AVOID_FMA)
367
+ polyHorner = FMA(FMA(FMA(FMA(c7,zh,c6),zh,c5),zh,c4),zh,c3);
368
+ #else
369
+ polyHorner = c3 + zh * (c4 + zh * (c5 + zh * (c6 + zh * c7)));
370
+ #endif
371
+
372
+ Mul12(&zhSquareh, &zhSquarel, zh, zh);
373
+ polyUpper = polyHorner * (zh * zhSquareh);
374
+ zhSquareHalfh = zhSquareh * -0.5;
375
+ zhSquareHalfl = zhSquarel * -0.5;
376
+ Add12(t1h, t1l, polyUpper, -1 * (zh * zm));
377
+ Add22(&t2h, &t2l, zh, zm, zhSquareHalfh, zhSquareHalfl);
378
+ Add22(&ph, &pl, t2h, t2l, t1h, t1l);
379
+
380
+ /* Reconstruction
381
+
382
+ Read logih and logim in the tables (already done)
383
+
384
+ Compute log(x) = E * log(2) + log(1+z) - log(ri)
385
+ i.e. log(x) = ed * (log2h + log2m) + (ph + pl) + (logih + logim) + delta
386
+
387
+ Carry out everything in double double precision
388
+
389
+ */
390
+
391
+ /*
392
+ We store log2 as log2h + log2m + log2l where log2h and log2m have 12 trailing zeros
393
+ Multiplication of ed (double E) and log2h is thus correct
394
+ The overall accuracy of log2h + log2m + log2l is 53 * 3 - 24 = 135 which
395
+ is enough for the accurate phase
396
+ The accuracy suffices also for the quick phase: 53 * 2 - 24 = 82
397
+ Nevertheless the storage with trailing zeros implies an overlap of the tabulated
398
+ triple double values. We have to take it into account for the accurate phase
399
+ basic procedures for addition and multiplication
400
+ The condition on the next Add12 is verified as log2m is smaller than log2h
401
+ and both are scaled by ed
402
+ */
403
+
404
+ Add12(log2edh, log2edl, log2h * ed, log2m * ed);
405
+
406
+ /* Add logih and logim to ph and pl
407
+
408
+ We must use conditioned Add22 as logih can move over ph
409
+ */
410
+
411
+ Add22Cond(&logTabPolyh, &logTabPolyl, logih, logim, ph, pl);
412
+
413
+ /* Add log2edh + log2edl to logTabPolyh + logTabPolyl */
414
+
415
+ Add22Cond(&logh, &logm, log2edh, log2edl, logTabPolyh, logTabPolyl);
416
+
417
+ /* Rounding test and eventual return or call to the accurate function */
418
+
419
+ if(E==0)
420
+ roundcst = ROUNDCST1;
421
+ else
422
+ roundcst = ROUNDCST2;
423
+
424
+
425
+ if(logh == (logh + (logm * roundcst)))
426
+ return logh;
427
+ else
428
+ {
429
+
430
+ #if DEBUG
431
+ printf("Going for Accurate Phase for x=%1.50e\n",x);
432
+ #endif
433
+
434
+ log1p_td_accurate(&logh, &logm, &logl, ed, index, zh, zm, zl, logih, logim);
435
+
436
+ ReturnRoundToNearest3(logh, logm, logl);
437
+
438
+ } /* Accurate phase launched */
439
+ }
440
+
441
+
442
+
443
+
444
+ double log1p_ru(double x) {
445
+ db_number xdb, shdb, scaledb;
446
+ double yh, yl, ed, ri, logih, logim, yhrih, yhril, ylri, t1, t2, t3, t4, t5, t6, zh, zm, zl;
447
+ double polyHorner, zhSquareh, zhSquarel, polyUpper, zhSquareHalfh, zhSquareHalfl;
448
+ double t1h, t1l, t2h, t2l, ph, pl, log2edh, log2edl, logTabPolyh, logTabPolyl, logh, logm, logl, roundcst;
449
+ double sh, sl;
450
+ int E, index;
451
+
452
+
453
+ xdb.d=x;
454
+
455
+ /* Filter cases */
456
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3c900000) {
457
+ /* We are less than 2^(-54) and return simply an adjusted x
458
+
459
+ If x = 0, the result is algebraic and equal to 0.
460
+
461
+ The series for log(1 + x) = x - 1/2 * x^2 + ... is alternated
462
+ and converges in this interval.
463
+ The truncation rest -1/2 * x^2 + 1/3 * x^3 - ... is
464
+ always negative, so log(1 + x) is always less than x but less than
465
+ 1 ulp of x away.
466
+ We round up, so we return x.
467
+
468
+ */
469
+ return x;
470
+ }
471
+
472
+ if (((xdb.i[HI] & 0x80000000) != 0) && ((xdb.i[HI] & 0x7fffffff) >= 0x3ff00000)) {
473
+ /* We are less or equal than -1 (-inf and NaN, too),
474
+ we return -inf for -1 and NaN otherwise
475
+ */
476
+ if (x == -1.0) return x/0.0;
477
+
478
+
479
+ return (x-x)/0.0;
480
+ }
481
+
482
+ if ((xdb.i[HI] & 0x7ff00000) == 0x7ff00000) {
483
+ /* We are +inf or NaN
484
+ If +inf, we return +inf (x+x)
485
+ If NaN, we return NaN (x+x)
486
+ */
487
+ return x+x;
488
+ }
489
+
490
+ /* Test if |x| < 2^(-8)
491
+
492
+ If yes, short-circuit the range reduction
493
+
494
+ */
495
+
496
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3f700000) {
497
+ /* Use the polynomial p(zh + zl) approximating log(1+zh+zl) directly
498
+ Set E and index to values that read 0.0 in the accurate phase.
499
+ */
500
+ logih = 0.0;
501
+ logim = 0.0;
502
+ index = 0;
503
+ ed = 0.0;
504
+ index = 0;
505
+ zh = x;
506
+ zm = 0.0;
507
+ zl = 0.0;
508
+ } else {
509
+ /* If we are here, |x| >= 2^(-8) and we must perform range reduction */
510
+
511
+ /* Compute first exactly
512
+
513
+ sh + sl = 1 + x
514
+
515
+ x can move over 1, so use a conditional Add12
516
+ */
517
+
518
+ Add12Cond(sh,sl,1.0,x);
519
+
520
+ /* Transform higher order double to integer */
521
+
522
+ shdb.d = sh;
523
+
524
+ /* Extract exponent and mantissa
525
+ Do range reduction,
526
+ yielding to E holding the exponent and
527
+ y the mantissa between sqrt(2)/2 and sqrt(2)
528
+ */
529
+ E = 0;
530
+ E += (shdb.i[HI]>>20)-1023; /* extract the exponent */
531
+ index = (shdb.i[HI] & 0x000fffff);
532
+ shdb.i[HI] = index | 0x3ff00000; /* do exponent = 0 */
533
+ index = (index + (1<<(20-L-1))) >> (20-L);
534
+
535
+ /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */
536
+ if (index >= MAXINDEX){ /* corresponds to xdb>sqrt(2)*/
537
+ shdb.i[HI] -= 0x00100000;
538
+ E++;
539
+ }
540
+
541
+
542
+ /* Transform shdb to yh */
543
+ yh = shdb.d;
544
+
545
+
546
+ /* Compute the index to the table */
547
+ index = index & INDEXMASK;
548
+
549
+ /* Cast integer E into double ed for multiplication later */
550
+ ed = (double) E;
551
+
552
+ /*
553
+ Read tables:
554
+ Read one float for ri
555
+ Read the first two doubles for -log(r_i) (out of three)
556
+
557
+ Organization of the table:
558
+
559
+ one struct entry per index, the struct entry containing
560
+ r, logih, logim and logil in this order
561
+ */
562
+
563
+
564
+ ri = argredtable[index].ri;
565
+ /*
566
+ Actually we don't need the logarithm entries now
567
+ Move the following two lines to the eventual reconstruction
568
+ As long as we don't have any if in the following code, we can overlap
569
+ memory access with calculations
570
+ */
571
+ logih = argredtable[index].logih;
572
+ logim = argredtable[index].logim;
573
+
574
+ /* Test if we have a simple range reduction or a complicated one
575
+
576
+ Simple range reduction for x < 0: x + 1 is exact, sl = 0 exactly
577
+ Simple range reduction for x > 2^(125) (sh > 2^(125)): x + 1 is not exact but its error less than 2^(-125)
578
+
579
+ Complicated range reduction: other cases
580
+
581
+ */
582
+
583
+
584
+ if ((sl == 0.0) || (E > 125)) {
585
+ /* Simple range reduction */
586
+
587
+ Mul12(&yhrih, &yhril, yh, ri);
588
+ t1 = yhrih - 1.0;
589
+ Add12Cond(zh, zm, t1, yhril);
590
+ zl = 0.0;
591
+
592
+ } else {
593
+ /* Complicated range reduction; E <= 125 */
594
+
595
+
596
+ /* Scale sl accordingly to sh, from which the exponent was extracted
597
+
598
+ We form first 2^(-E) and multiply sl with this value; this gives yl.
599
+ */
600
+
601
+ scaledb.i[HI] = (-E + 1023) << 20;
602
+ scaledb.i[LO] = 0;
603
+
604
+ yl = sl * scaledb.d;
605
+
606
+
607
+ /* Do complicated range reduction:
608
+
609
+ zh + zm + zl = (yh + yl) * ri - 1.0
610
+
611
+
612
+ We use zh + zm in the quick phase and zh + zm + zl in the accurate phase
613
+
614
+ The multiplication yl * ri is exact because yl contains at most 9 bits and
615
+ ri contains at most 24 bits.
616
+
617
+ The substraction yhrih - 1.0 is exact as per Sterbenz' lemma.
618
+
619
+ */
620
+
621
+ Mul12(&yhrih,&yhril,yh,ri);
622
+ ylri = yl * ri;
623
+
624
+ t1 = yhrih - 1.0;
625
+
626
+ /* The unnormalized triple-double t1 + yhril + ylri is equal to (yh + yl) * ri - 1.0
627
+ As t1 can move over yhril and yhri can move over ylri, we normalize first these
628
+ values pairwise with Add12Conds. Then we renormalize the pairs by a
629
+ "inverted" (A.E.) Renormalize3.
630
+ */
631
+
632
+ Add12Cond(t2,t3,yhril,ylri);
633
+ Add12Cond(t4,t5,t1,t2);
634
+
635
+ Add12Cond(t6,zl,t3,t5);
636
+ Add12Cond(zh,zm,t4,t6);
637
+
638
+ }
639
+ }
640
+
641
+
642
+ /*
643
+ Polynomial evaluation
644
+
645
+ Use a 7 degree polynomial
646
+ Evaluate the higher 5 terms in double precision (-7 * 3 = -21) using Horner's scheme
647
+ Evaluate the lower 3 terms (the last is 0) in double double precision accounting also for zm
648
+ using an ad hoc method
649
+
650
+ */
651
+
652
+
653
+
654
+ #if defined(PROCESSOR_HAS_FMA) && !defined(AVOID_FMA)
655
+ polyHorner = FMA(FMA(FMA(FMA(c7,zh,c6),zh,c5),zh,c4),zh,c3);
656
+ #else
657
+ polyHorner = c3 + zh * (c4 + zh * (c5 + zh * (c6 + zh * c7)));
658
+ #endif
659
+
660
+ Mul12(&zhSquareh, &zhSquarel, zh, zh);
661
+ polyUpper = polyHorner * (zh * zhSquareh);
662
+ zhSquareHalfh = zhSquareh * -0.5;
663
+ zhSquareHalfl = zhSquarel * -0.5;
664
+ Add12(t1h, t1l, polyUpper, -1 * (zh * zm));
665
+ Add22(&t2h, &t2l, zh, zm, zhSquareHalfh, zhSquareHalfl);
666
+ Add22(&ph, &pl, t2h, t2l, t1h, t1l);
667
+
668
+ /* Reconstruction
669
+
670
+ Read logih and logim in the tables (already done)
671
+
672
+ Compute log(x) = E * log(2) + log(1+z) - log(ri)
673
+ i.e. log(x) = ed * (log2h + log2m) + (ph + pl) + (logih + logim) + delta
674
+
675
+ Carry out everything in double double precision
676
+
677
+ */
678
+
679
+ /*
680
+ We store log2 as log2h + log2m + log2l where log2h and log2m have 12 trailing zeros
681
+ Multiplication of ed (double E) and log2h is thus correct
682
+ The overall accuracy of log2h + log2m + log2l is 53 * 3 - 24 = 135 which
683
+ is enough for the accurate phase
684
+ The accuracy suffices also for the quick phase: 53 * 2 - 24 = 82
685
+ Nevertheless the storage with trailing zeros implies an overlap of the tabulated
686
+ triple double values. We have to take it into account for the accurate phase
687
+ basic procedures for addition and multiplication
688
+ The condition on the next Add12 is verified as log2m is smaller than log2h
689
+ and both are scaled by ed
690
+ */
691
+
692
+ Add12(log2edh, log2edl, log2h * ed, log2m * ed);
693
+
694
+ /* Add logih and logim to ph and pl
695
+
696
+ We must use conditioned Add22 as logih can move over ph
697
+ */
698
+
699
+ Add22Cond(&logTabPolyh, &logTabPolyl, logih, logim, ph, pl);
700
+
701
+ /* Add log2edh + log2edl to logTabPolyh + logTabPolyl */
702
+
703
+ Add22Cond(&logh, &logm, log2edh, log2edl, logTabPolyh, logTabPolyl);
704
+
705
+ /* Rounding test and eventual return or call to the accurate function */
706
+
707
+ if(E==0)
708
+ roundcst = RDROUNDCST1;
709
+ else
710
+ roundcst = RDROUNDCST2;
711
+
712
+ TEST_AND_RETURN_RU(logh, logm, roundcst);
713
+
714
+ #if DEBUG
715
+ printf("Going for Accurate Phase for x=%1.50e\n",x);
716
+ #endif
717
+
718
+ log1p_td_accurate(&logh, &logm, &logl, ed, index, zh, zm, zl, logih, logim);
719
+
720
+ ReturnRoundUpwards3(logh, logm, logl);
721
+ }
722
+
723
+ double log1p_rd(double x) {
724
+ db_number xdb, shdb, scaledb;
725
+ double yh, yl, ed, ri, logih, logim, yhrih, yhril, ylri, t1, t2, t3, t4, t5, t6, zh, zm, zl;
726
+ double polyHorner, zhSquareh, zhSquarel, polyUpper, zhSquareHalfh, zhSquareHalfl;
727
+ double t1h, t1l, t2h, t2l, ph, pl, log2edh, log2edl, logTabPolyh, logTabPolyl, logh, logm, logl, roundcst;
728
+ double sh, sl;
729
+ int E, index;
730
+
731
+
732
+ xdb.d=x;
733
+
734
+ /* Filter cases */
735
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3c900000) {
736
+ /* We are less than 2^(-54) and return simply an adjusted x
737
+
738
+ If x = 0, the result is algebraic and equal to 0.
739
+
740
+ The series for log(1 + x) = x - 1/2 * x^2 + ... is alternated
741
+ and converges in this interval.
742
+ The truncation rest -1/2 * x^2 + 1/3 * x^3 - ... is
743
+ always negative, so log(1 + x) is always less than x but less than
744
+ 1 ulp of x away.
745
+ We round down, so we return x - 1ulp;
746
+
747
+ */
748
+
749
+ if (x == 0.0) return x;
750
+
751
+ if (x > 0) {
752
+ xdb.l--;
753
+ } else {
754
+ xdb.l++;
755
+ }
756
+ return xdb.d;
757
+ }
758
+
759
+ if (((xdb.i[HI] & 0x80000000) != 0) && ((xdb.i[HI] & 0x7fffffff) >= 0x3ff00000)) {
760
+ /* We are less or equal than -1 (-inf and NaN, too),
761
+ we return -inf for -1 and NaN otherwise
762
+ */
763
+ if (x == -1.0) return x/0.0;
764
+
765
+
766
+ return (x-x)/0.0;
767
+ }
768
+
769
+ if ((xdb.i[HI] & 0x7ff00000) == 0x7ff00000) {
770
+ /* We are +inf or NaN
771
+ If +inf, we return +inf (x+x)
772
+ If NaN, we return NaN (x+x)
773
+ */
774
+ return x+x;
775
+ }
776
+
777
+ /* Test if |x| < 2^(-8)
778
+
779
+ If yes, short-circuit the range reduction
780
+
781
+ */
782
+
783
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3f700000) {
784
+ /* Use the polynomial p(zh + zl) approximating log(1+zh+zl) directly
785
+ Set E and index to values that read 0.0 in the accurate phase.
786
+ */
787
+ logih = 0.0;
788
+ logim = 0.0;
789
+ index = 0;
790
+ ed = 0.0;
791
+ index = 0;
792
+ zh = x;
793
+ zm = 0.0;
794
+ zl = 0.0;
795
+ } else {
796
+ /* If we are here, |x| >= 2^(-8) and we must perform range reduction */
797
+
798
+ /* Compute first exactly
799
+
800
+ sh + sl = 1 + x
801
+
802
+ x can move over 1, so use a conditional Add12
803
+ */
804
+
805
+ Add12Cond(sh,sl,1.0,x);
806
+
807
+ /* Transform higher order double to integer */
808
+
809
+ shdb.d = sh;
810
+
811
+ /* Extract exponent and mantissa
812
+ Do range reduction,
813
+ yielding to E holding the exponent and
814
+ y the mantissa between sqrt(2)/2 and sqrt(2)
815
+ */
816
+ E = 0;
817
+ E += (shdb.i[HI]>>20)-1023; /* extract the exponent */
818
+ index = (shdb.i[HI] & 0x000fffff);
819
+ shdb.i[HI] = index | 0x3ff00000; /* do exponent = 0 */
820
+ index = (index + (1<<(20-L-1))) >> (20-L);
821
+
822
+ /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */
823
+ if (index >= MAXINDEX){ /* corresponds to xdb>sqrt(2)*/
824
+ shdb.i[HI] -= 0x00100000;
825
+ E++;
826
+ }
827
+
828
+
829
+ /* Transform shdb to yh */
830
+ yh = shdb.d;
831
+
832
+
833
+ /* Compute the index to the table */
834
+ index = index & INDEXMASK;
835
+
836
+ /* Cast integer E into double ed for multiplication later */
837
+ ed = (double) E;
838
+
839
+ /*
840
+ Read tables:
841
+ Read one float for ri
842
+ Read the first two doubles for -log(r_i) (out of three)
843
+
844
+ Organization of the table:
845
+
846
+ one struct entry per index, the struct entry containing
847
+ r, logih, logim and logil in this order
848
+ */
849
+
850
+
851
+ ri = argredtable[index].ri;
852
+ /*
853
+ Actually we don't need the logarithm entries now
854
+ Move the following two lines to the eventual reconstruction
855
+ As long as we don't have any if in the following code, we can overlap
856
+ memory access with calculations
857
+ */
858
+ logih = argredtable[index].logih;
859
+ logim = argredtable[index].logim;
860
+
861
+ /* Test if we have a simple range reduction or a complicated one
862
+
863
+ Simple range reduction for x < 0: x + 1 is exact, sl = 0 exactly
864
+ Simple range reduction for x > 2^(125) (sh > 2^(125)): x + 1 is not exact but its error less than 2^(-125)
865
+
866
+ Complicated range reduction: other cases
867
+
868
+ */
869
+
870
+
871
+ if ((sl == 0.0) || (E > 125)) {
872
+ /* Simple range reduction */
873
+
874
+ Mul12(&yhrih, &yhril, yh, ri);
875
+ t1 = yhrih - 1.0;
876
+ Add12Cond(zh, zm, t1, yhril);
877
+ zl = 0.0;
878
+
879
+ } else {
880
+ /* Complicated range reduction; E <= 125 */
881
+
882
+
883
+ /* Scale sl accordingly to sh, from which the exponent was extracted
884
+
885
+ We form first 2^(-E) and multiply sl with this value; this gives yl.
886
+ */
887
+
888
+ scaledb.i[HI] = (-E + 1023) << 20;
889
+ scaledb.i[LO] = 0;
890
+
891
+ yl = sl * scaledb.d;
892
+
893
+
894
+ /* Do complicated range reduction:
895
+
896
+ zh + zm + zl = (yh + yl) * ri - 1.0
897
+
898
+
899
+ We use zh + zm in the quick phase and zh + zm + zl in the accurate phase
900
+
901
+ The multiplication yl * ri is exact because yl contains at most 9 bits and
902
+ ri contains at most 24 bits.
903
+
904
+ The substraction yhrih - 1.0 is exact as per Sterbenz' lemma.
905
+
906
+ */
907
+
908
+ Mul12(&yhrih,&yhril,yh,ri);
909
+ ylri = yl * ri;
910
+
911
+ t1 = yhrih - 1.0;
912
+
913
+ /* The unnormalized triple-double t1 + yhril + ylri is equal to (yh + yl) * ri - 1.0
914
+ As t1 can move over yhril and yhri can move over ylri, we normalize first these
915
+ values pairwise with Add12Conds. Then we renormalize the pairs by a
916
+ "inverted" (A.E.) Renormalize3.
917
+ */
918
+
919
+ Add12Cond(t2,t3,yhril,ylri);
920
+ Add12Cond(t4,t5,t1,t2);
921
+
922
+ Add12Cond(t6,zl,t3,t5);
923
+ Add12Cond(zh,zm,t4,t6);
924
+
925
+ }
926
+ }
927
+
928
+
929
+ /*
930
+ Polynomial evaluation
931
+
932
+ Use a 7 degree polynomial
933
+ Evaluate the higher 5 terms in double precision (-7 * 3 = -21) using Horner's scheme
934
+ Evaluate the lower 3 terms (the last is 0) in double double precision accounting also for zm
935
+ using an ad hoc method
936
+
937
+ */
938
+
939
+
940
+
941
+ #if defined(PROCESSOR_HAS_FMA) && !defined(AVOID_FMA)
942
+ polyHorner = FMA(FMA(FMA(FMA(c7,zh,c6),zh,c5),zh,c4),zh,c3);
943
+ #else
944
+ polyHorner = c3 + zh * (c4 + zh * (c5 + zh * (c6 + zh * c7)));
945
+ #endif
946
+
947
+ Mul12(&zhSquareh, &zhSquarel, zh, zh);
948
+ polyUpper = polyHorner * (zh * zhSquareh);
949
+ zhSquareHalfh = zhSquareh * -0.5;
950
+ zhSquareHalfl = zhSquarel * -0.5;
951
+ Add12(t1h, t1l, polyUpper, -1 * (zh * zm));
952
+ Add22(&t2h, &t2l, zh, zm, zhSquareHalfh, zhSquareHalfl);
953
+ Add22(&ph, &pl, t2h, t2l, t1h, t1l);
954
+
955
+ /* Reconstruction
956
+
957
+ Read logih and logim in the tables (already done)
958
+
959
+ Compute log(x) = E * log(2) + log(1+z) - log(ri)
960
+ i.e. log(x) = ed * (log2h + log2m) + (ph + pl) + (logih + logim) + delta
961
+
962
+ Carry out everything in double double precision
963
+
964
+ */
965
+
966
+ /*
967
+ We store log2 as log2h + log2m + log2l where log2h and log2m have 12 trailing zeros
968
+ Multiplication of ed (double E) and log2h is thus correct
969
+ The overall accuracy of log2h + log2m + log2l is 53 * 3 - 24 = 135 which
970
+ is enough for the accurate phase
971
+ The accuracy suffices also for the quick phase: 53 * 2 - 24 = 82
972
+ Nevertheless the storage with trailing zeros implies an overlap of the tabulated
973
+ triple double values. We have to take it into account for the accurate phase
974
+ basic procedures for addition and multiplication
975
+ The condition on the next Add12 is verified as log2m is smaller than log2h
976
+ and both are scaled by ed
977
+ */
978
+
979
+ Add12(log2edh, log2edl, log2h * ed, log2m * ed);
980
+
981
+ /* Add logih and logim to ph and pl
982
+
983
+ We must use conditioned Add22 as logih can move over ph
984
+ */
985
+
986
+ Add22Cond(&logTabPolyh, &logTabPolyl, logih, logim, ph, pl);
987
+
988
+ /* Add log2edh + log2edl to logTabPolyh + logTabPolyl */
989
+
990
+ Add22Cond(&logh, &logm, log2edh, log2edl, logTabPolyh, logTabPolyl);
991
+
992
+ /* Rounding test and eventual return or call to the accurate function */
993
+
994
+ if(E==0)
995
+ roundcst = RDROUNDCST1;
996
+ else
997
+ roundcst = RDROUNDCST2;
998
+
999
+ TEST_AND_RETURN_RD(logh, logm, roundcst);
1000
+
1001
+ #if DEBUG
1002
+ printf("Going for Accurate Phase for x=%1.50e\n",x);
1003
+ #endif
1004
+
1005
+ log1p_td_accurate(&logh, &logm, &logl, ed, index, zh, zm, zl, logih, logim);
1006
+
1007
+ ReturnRoundDownwards3(logh, logm, logl);
1008
+ }
1009
+
1010
+ double log1p_rz(double x) {
1011
+ db_number xdb, shdb, scaledb;
1012
+ double yh, yl, ed, ri, logih, logim, yhrih, yhril, ylri, t1, t2, t3, t4, t5, t6, zh, zm, zl;
1013
+ double polyHorner, zhSquareh, zhSquarel, polyUpper, zhSquareHalfh, zhSquareHalfl;
1014
+ double t1h, t1l, t2h, t2l, ph, pl, log2edh, log2edl, logTabPolyh, logTabPolyl, logh, logm, logl, roundcst;
1015
+ double sh, sl;
1016
+ int E, index;
1017
+
1018
+
1019
+ xdb.d=x;
1020
+
1021
+ /* Filter cases */
1022
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3c900000) {
1023
+ /* We are less than 2^(-54) and return simply an adjusted x
1024
+
1025
+ If x = 0, the result is algebraic and equal to 0.
1026
+
1027
+ The series for log(1 + x) = x - 1/2 * x^2 + ... is alternated
1028
+ and converges in this interval.
1029
+ The truncation rest -1/2 * x^2 + 1/3 * x^3 - ... is
1030
+ always negative, so log(1 + x) is always less than x but less than
1031
+ 1 ulp of x away.
1032
+ For x < 0, we have log(1 + x) < 0, so we round up and return x;
1033
+ For x > 0, we round down and return x - 1ulp
1034
+
1035
+ */
1036
+ if (x > 0) {
1037
+ xdb.l--;
1038
+ return xdb.d;
1039
+ }
1040
+
1041
+ /* Algebraic case x == 0.0 and round up */
1042
+
1043
+ return x;
1044
+ }
1045
+
1046
+ if (((xdb.i[HI] & 0x80000000) != 0) && ((xdb.i[HI] & 0x7fffffff) >= 0x3ff00000)) {
1047
+ /* We are less or equal than -1 (-inf and NaN, too),
1048
+ we return -inf for -1 and NaN otherwise
1049
+ */
1050
+ if (x == -1.0) return x/0.0;
1051
+
1052
+
1053
+ return (x-x)/0.0;
1054
+ }
1055
+
1056
+ if ((xdb.i[HI] & 0x7ff00000) == 0x7ff00000) {
1057
+ /* We are +inf or NaN
1058
+ If +inf, we return +inf (x+x)
1059
+ If NaN, we return NaN (x+x)
1060
+ */
1061
+ return x+x;
1062
+ }
1063
+
1064
+ /* Test if |x| < 2^(-8)
1065
+
1066
+ If yes, short-circuit the range reduction
1067
+
1068
+ */
1069
+
1070
+ if ((xdb.i[HI] & 0x7fffffff) < 0x3f700000) {
1071
+ /* Use the polynomial p(zh + zl) approximating log(1+zh+zl) directly
1072
+ Set E and index to values that read 0.0 in the accurate phase.
1073
+ */
1074
+ logih = 0.0;
1075
+ logim = 0.0;
1076
+ index = 0;
1077
+ ed = 0.0;
1078
+ index = 0;
1079
+ zh = x;
1080
+ zm = 0.0;
1081
+ zl = 0.0;
1082
+ } else {
1083
+ /* If we are here, |x| >= 2^(-8) and we must perform range reduction */
1084
+
1085
+ /* Compute first exactly
1086
+
1087
+ sh + sl = 1 + x
1088
+
1089
+ x can move over 1, so use a conditional Add12
1090
+ */
1091
+
1092
+ Add12Cond(sh,sl,1.0,x);
1093
+
1094
+ /* Transform higher order double to integer */
1095
+
1096
+ shdb.d = sh;
1097
+
1098
+ /* Extract exponent and mantissa
1099
+ Do range reduction,
1100
+ yielding to E holding the exponent and
1101
+ y the mantissa between sqrt(2)/2 and sqrt(2)
1102
+ */
1103
+ E = 0;
1104
+ E += (shdb.i[HI]>>20)-1023; /* extract the exponent */
1105
+ index = (shdb.i[HI] & 0x000fffff);
1106
+ shdb.i[HI] = index | 0x3ff00000; /* do exponent = 0 */
1107
+ index = (index + (1<<(20-L-1))) >> (20-L);
1108
+
1109
+ /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */
1110
+ if (index >= MAXINDEX){ /* corresponds to xdb>sqrt(2)*/
1111
+ shdb.i[HI] -= 0x00100000;
1112
+ E++;
1113
+ }
1114
+
1115
+
1116
+ /* Transform shdb to yh */
1117
+ yh = shdb.d;
1118
+
1119
+
1120
+ /* Compute the index to the table */
1121
+ index = index & INDEXMASK;
1122
+
1123
+ /* Cast integer E into double ed for multiplication later */
1124
+ ed = (double) E;
1125
+
1126
+ /*
1127
+ Read tables:
1128
+ Read one float for ri
1129
+ Read the first two doubles for -log(r_i) (out of three)
1130
+
1131
+ Organization of the table:
1132
+
1133
+ one struct entry per index, the struct entry containing
1134
+ r, logih, logim and logil in this order
1135
+ */
1136
+
1137
+
1138
+ ri = argredtable[index].ri;
1139
+ /*
1140
+ Actually we don't need the logarithm entries now
1141
+ Move the following two lines to the eventual reconstruction
1142
+ As long as we don't have any if in the following code, we can overlap
1143
+ memory access with calculations
1144
+ */
1145
+ logih = argredtable[index].logih;
1146
+ logim = argredtable[index].logim;
1147
+
1148
+ /* Test if we have a simple range reduction or a complicated one
1149
+
1150
+ Simple range reduction for x < 0: x + 1 is exact, sl = 0 exactly
1151
+ Simple range reduction for x > 2^(125) (sh > 2^(125)): x + 1 is not exact but its error less than 2^(-125)
1152
+
1153
+ Complicated range reduction: other cases
1154
+
1155
+ */
1156
+
1157
+
1158
+ if ((sl == 0.0) || (E > 125)) {
1159
+ /* Simple range reduction */
1160
+
1161
+ Mul12(&yhrih, &yhril, yh, ri);
1162
+ t1 = yhrih - 1.0;
1163
+ Add12Cond(zh, zm, t1, yhril);
1164
+ zl = 0.0;
1165
+
1166
+ } else {
1167
+ /* Complicated range reduction; E <= 125 */
1168
+
1169
+
1170
+ /* Scale sl accordingly to sh, from which the exponent was extracted
1171
+
1172
+ We form first 2^(-E) and multiply sl with this value; this gives yl.
1173
+ */
1174
+
1175
+ scaledb.i[HI] = (-E + 1023) << 20;
1176
+ scaledb.i[LO] = 0;
1177
+
1178
+ yl = sl * scaledb.d;
1179
+
1180
+
1181
+ /* Do complicated range reduction:
1182
+
1183
+ zh + zm + zl = (yh + yl) * ri - 1.0
1184
+
1185
+
1186
+ We use zh + zm in the quick phase and zh + zm + zl in the accurate phase
1187
+
1188
+ The multiplication yl * ri is exact because yl contains at most 9 bits and
1189
+ ri contains at most 24 bits.
1190
+
1191
+ The substraction yhrih - 1.0 is exact as per Sterbenz' lemma.
1192
+
1193
+ */
1194
+
1195
+ Mul12(&yhrih,&yhril,yh,ri);
1196
+ ylri = yl * ri;
1197
+
1198
+ t1 = yhrih - 1.0;
1199
+
1200
+ /* The unnormalized triple-double t1 + yhril + ylri is equal to (yh + yl) * ri - 1.0
1201
+ As t1 can move over yhril and yhri can move over ylri, we normalize first these
1202
+ values pairwise with Add12Conds. Then we renormalize the pairs by a
1203
+ "inverted" (A.E.) Renormalize3.
1204
+ */
1205
+
1206
+ Add12Cond(t2,t3,yhril,ylri);
1207
+ Add12Cond(t4,t5,t1,t2);
1208
+
1209
+ Add12Cond(t6,zl,t3,t5);
1210
+ Add12Cond(zh,zm,t4,t6);
1211
+
1212
+ }
1213
+ }
1214
+
1215
+
1216
+ /*
1217
+ Polynomial evaluation
1218
+
1219
+ Use a 7 degree polynomial
1220
+ Evaluate the higher 5 terms in double precision (-7 * 3 = -21) using Horner's scheme
1221
+ Evaluate the lower 3 terms (the last is 0) in double double precision accounting also for zm
1222
+ using an ad hoc method
1223
+
1224
+ */
1225
+
1226
+
1227
+
1228
+ #if defined(PROCESSOR_HAS_FMA) && !defined(AVOID_FMA)
1229
+ polyHorner = FMA(FMA(FMA(FMA(c7,zh,c6),zh,c5),zh,c4),zh,c3);
1230
+ #else
1231
+ polyHorner = c3 + zh * (c4 + zh * (c5 + zh * (c6 + zh * c7)));
1232
+ #endif
1233
+
1234
+ Mul12(&zhSquareh, &zhSquarel, zh, zh);
1235
+ polyUpper = polyHorner * (zh * zhSquareh);
1236
+ zhSquareHalfh = zhSquareh * -0.5;
1237
+ zhSquareHalfl = zhSquarel * -0.5;
1238
+ Add12(t1h, t1l, polyUpper, -1 * (zh * zm));
1239
+ Add22(&t2h, &t2l, zh, zm, zhSquareHalfh, zhSquareHalfl);
1240
+ Add22(&ph, &pl, t2h, t2l, t1h, t1l);
1241
+
1242
+ /* Reconstruction
1243
+
1244
+ Read logih and logim in the tables (already done)
1245
+
1246
+ Compute log(x) = E * log(2) + log(1+z) - log(ri)
1247
+ i.e. log(x) = ed * (log2h + log2m) + (ph + pl) + (logih + logim) + delta
1248
+
1249
+ Carry out everything in double double precision
1250
+
1251
+ */
1252
+
1253
+ /*
1254
+ We store log2 as log2h + log2m + log2l where log2h and log2m have 12 trailing zeros
1255
+ Multiplication of ed (double E) and log2h is thus correct
1256
+ The overall accuracy of log2h + log2m + log2l is 53 * 3 - 24 = 135 which
1257
+ is enough for the accurate phase
1258
+ The accuracy suffices also for the quick phase: 53 * 2 - 24 = 82
1259
+ Nevertheless the storage with trailing zeros implies an overlap of the tabulated
1260
+ triple double values. We have to take it into account for the accurate phase
1261
+ basic procedures for addition and multiplication
1262
+ The condition on the next Add12 is verified as log2m is smaller than log2h
1263
+ and both are scaled by ed
1264
+ */
1265
+
1266
+ Add12(log2edh, log2edl, log2h * ed, log2m * ed);
1267
+
1268
+ /* Add logih and logim to ph and pl
1269
+
1270
+ We must use conditioned Add22 as logih can move over ph
1271
+ */
1272
+
1273
+ Add22Cond(&logTabPolyh, &logTabPolyl, logih, logim, ph, pl);
1274
+
1275
+ /* Add log2edh + log2edl to logTabPolyh + logTabPolyl */
1276
+
1277
+ Add22Cond(&logh, &logm, log2edh, log2edl, logTabPolyh, logTabPolyl);
1278
+
1279
+ /* Rounding test and eventual return or call to the accurate function */
1280
+
1281
+ if(E==0)
1282
+ roundcst = RDROUNDCST1;
1283
+ else
1284
+ roundcst = RDROUNDCST2;
1285
+
1286
+ TEST_AND_RETURN_RZ(logh, logm, roundcst);
1287
+
1288
+ #if DEBUG
1289
+ printf("Going for Accurate Phase for x=%1.50e\n",x);
1290
+ #endif
1291
+
1292
+ log1p_td_accurate(&logh, &logm, &logl, ed, index, zh, zm, zl, logih, logim);
1293
+
1294
+ ReturnRoundTowardsZero3(logh, logm, logl);
1295
+ }