crmf 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. checksums.yaml +4 -4
  2. data/README.md +12 -0
  3. data/crmf.gemspec +105 -3
  4. data/ext/crlibm-1.0beta5/AUTHORS +2 -0
  5. data/ext/crlibm-1.0beta5/CMakeLists.txt +154 -0
  6. data/ext/crlibm-1.0beta5/COPYING +340 -0
  7. data/ext/crlibm-1.0beta5/COPYING.LIB +504 -0
  8. data/ext/crlibm-1.0beta5/ChangeLog +125 -0
  9. data/ext/crlibm-1.0beta5/Makefile.am +134 -0
  10. data/ext/crlibm-1.0beta5/NEWS +0 -0
  11. data/ext/crlibm-1.0beta5/README +31 -0
  12. data/ext/crlibm-1.0beta5/README.DEV +23 -0
  13. data/ext/crlibm-1.0beta5/README.md +5 -0
  14. data/ext/crlibm-1.0beta5/TODO +66 -0
  15. data/ext/crlibm-1.0beta5/VERSION +1 -0
  16. data/ext/crlibm-1.0beta5/acos-td.c +1195 -0
  17. data/ext/crlibm-1.0beta5/acos-td.h +629 -0
  18. data/ext/crlibm-1.0beta5/asin-td.c +1297 -0
  19. data/ext/crlibm-1.0beta5/asin-td.h +620 -0
  20. data/ext/crlibm-1.0beta5/asincos.c +4488 -0
  21. data/ext/crlibm-1.0beta5/asincos.h +575 -0
  22. data/ext/crlibm-1.0beta5/atan-itanium.c +846 -0
  23. data/ext/crlibm-1.0beta5/atan-pentium.c +280 -0
  24. data/ext/crlibm-1.0beta5/atan-pentium.h +343 -0
  25. data/ext/crlibm-1.0beta5/atan_accurate.c +341 -0
  26. data/ext/crlibm-1.0beta5/atan_accurate.h +198 -0
  27. data/ext/crlibm-1.0beta5/atan_fast.c +506 -0
  28. data/ext/crlibm-1.0beta5/atan_fast.h +680 -0
  29. data/ext/crlibm-1.0beta5/configure.ac +419 -0
  30. data/ext/crlibm-1.0beta5/crlibm.h +204 -0
  31. data/ext/crlibm-1.0beta5/crlibm.spec +42 -0
  32. data/ext/crlibm-1.0beta5/crlibm_private.c +397 -0
  33. data/ext/crlibm-1.0beta5/crlibm_private.h +1048 -0
  34. data/ext/crlibm-1.0beta5/csh_fast.c +721 -0
  35. data/ext/crlibm-1.0beta5/csh_fast.h +771 -0
  36. data/ext/crlibm-1.0beta5/double-extended.h +496 -0
  37. data/ext/crlibm-1.0beta5/exp-itanium.c +723 -0
  38. data/ext/crlibm-1.0beta5/exp-td-standalone.c +87 -0
  39. data/ext/crlibm-1.0beta5/exp-td.c +1363 -0
  40. data/ext/crlibm-1.0beta5/exp-td.h +685 -0
  41. data/ext/crlibm-1.0beta5/exp_build_coeffs/exp_fast_table.c +125 -0
  42. data/ext/crlibm-1.0beta5/expm1-standalone.c +119 -0
  43. data/ext/crlibm-1.0beta5/expm1.c +2515 -0
  44. data/ext/crlibm-1.0beta5/expm1.h +715 -0
  45. data/ext/crlibm-1.0beta5/interval.h +238 -0
  46. data/ext/crlibm-1.0beta5/log-de.c +480 -0
  47. data/ext/crlibm-1.0beta5/log-de.h +747 -0
  48. data/ext/crlibm-1.0beta5/log-de2.c +280 -0
  49. data/ext/crlibm-1.0beta5/log-de2.h +2352 -0
  50. data/ext/crlibm-1.0beta5/log-td.c +1158 -0
  51. data/ext/crlibm-1.0beta5/log-td.h +819 -0
  52. data/ext/crlibm-1.0beta5/log.c +2244 -0
  53. data/ext/crlibm-1.0beta5/log.h +1592 -0
  54. data/ext/crlibm-1.0beta5/log10-td.c +906 -0
  55. data/ext/crlibm-1.0beta5/log10-td.h +823 -0
  56. data/ext/crlibm-1.0beta5/log1p.c +1295 -0
  57. data/ext/crlibm-1.0beta5/log2-td.c +1521 -0
  58. data/ext/crlibm-1.0beta5/log2-td.h +821 -0
  59. data/ext/crlibm-1.0beta5/log2_accurate.c +330 -0
  60. data/ext/crlibm-1.0beta5/log2_accurate.h +261 -0
  61. data/ext/crlibm-1.0beta5/log_accurate.c +133 -0
  62. data/ext/crlibm-1.0beta5/log_accurate.h +261 -0
  63. data/ext/crlibm-1.0beta5/log_fast.c +360 -0
  64. data/ext/crlibm-1.0beta5/log_fast.h +440 -0
  65. data/ext/crlibm-1.0beta5/pow.c +1396 -0
  66. data/ext/crlibm-1.0beta5/pow.h +3101 -0
  67. data/ext/crlibm-1.0beta5/prepare +20 -0
  68. data/ext/crlibm-1.0beta5/rem_pio2_accurate.c +219 -0
  69. data/ext/crlibm-1.0beta5/rem_pio2_accurate.h +53 -0
  70. data/ext/crlibm-1.0beta5/scs_lib/AUTHORS +3 -0
  71. data/ext/crlibm-1.0beta5/scs_lib/COPYING +504 -0
  72. data/ext/crlibm-1.0beta5/scs_lib/ChangeLog +16 -0
  73. data/ext/crlibm-1.0beta5/scs_lib/Doxyfile.dev +939 -0
  74. data/ext/crlibm-1.0beta5/scs_lib/Doxyfile.user +939 -0
  75. data/ext/crlibm-1.0beta5/scs_lib/INSTALL +215 -0
  76. data/ext/crlibm-1.0beta5/scs_lib/Makefile.am +17 -0
  77. data/ext/crlibm-1.0beta5/scs_lib/NEWS +0 -0
  78. data/ext/crlibm-1.0beta5/scs_lib/README +9 -0
  79. data/ext/crlibm-1.0beta5/scs_lib/README.DEV +38 -0
  80. data/ext/crlibm-1.0beta5/scs_lib/TODO +4 -0
  81. data/ext/crlibm-1.0beta5/scs_lib/VERSION +1 -0
  82. data/ext/crlibm-1.0beta5/scs_lib/addition_scs.c +623 -0
  83. data/ext/crlibm-1.0beta5/scs_lib/division_scs.c +110 -0
  84. data/ext/crlibm-1.0beta5/scs_lib/double2scs.c +174 -0
  85. data/ext/crlibm-1.0beta5/scs_lib/main.dox +104 -0
  86. data/ext/crlibm-1.0beta5/scs_lib/multiplication_scs.c +339 -0
  87. data/ext/crlibm-1.0beta5/scs_lib/poly_fct.c +112 -0
  88. data/ext/crlibm-1.0beta5/scs_lib/print_scs.c +73 -0
  89. data/ext/crlibm-1.0beta5/scs_lib/rand_scs.c +63 -0
  90. data/ext/crlibm-1.0beta5/scs_lib/scs.h +353 -0
  91. data/ext/crlibm-1.0beta5/scs_lib/scs2double.c +411 -0
  92. data/ext/crlibm-1.0beta5/scs_lib/scs2mpf.c +58 -0
  93. data/ext/crlibm-1.0beta5/scs_lib/scs2mpfr.c +61 -0
  94. data/ext/crlibm-1.0beta5/scs_lib/scs_private.c +23 -0
  95. data/ext/crlibm-1.0beta5/scs_lib/scs_private.h +133 -0
  96. data/ext/crlibm-1.0beta5/scs_lib/wrapper_scs.h +486 -0
  97. data/ext/crlibm-1.0beta5/scs_lib/zero_scs.c +52 -0
  98. data/ext/crlibm-1.0beta5/trigo_accurate.c +501 -0
  99. data/ext/crlibm-1.0beta5/trigo_accurate.h +331 -0
  100. data/ext/crlibm-1.0beta5/trigo_fast.c +1243 -0
  101. data/ext/crlibm-1.0beta5/trigo_fast.h +639 -0
  102. data/ext/crlibm-1.0beta5/trigpi.c +1169 -0
  103. data/ext/crlibm-1.0beta5/trigpi.h +556 -0
  104. data/ext/crlibm-1.0beta5/triple-double.c +57 -0
  105. data/ext/crlibm-1.0beta5/triple-double.h +1380 -0
  106. data/ext/crmf/crmf.c +117 -20
  107. data/ext/crmf/extconf.rb +12 -8
  108. data/lib/crmf/version.rb +1 -1
  109. data/tests/perf.rb +100 -219
  110. metadata +108 -10
  111. data/ext/crlibm-1.0beta4.tar.gz +0 -0
data/ext/crlibm-1.0beta5/crlibm_private.h
@@ -0,0 +1,1048 @@
1
+ /*
2
+ * crlibm_private.h
3
+ *
4
+ * This file contains useful tools and data for the crlibm functions.
5
+ *
6
+ */
7
+
8
+ #ifndef CRLIBM_PRIVATE_H
9
+ #define CRLIBM_PRIVATE_H 1
10
+
11
+ #include "scs_lib/scs.h"
12
+ #include "scs_lib/scs_private.h"
13
+
14
+ #ifdef HAVE_CONFIG_H
15
+ #include "crlibm_config.h"
16
+ #endif
17
+ /* otherwise CMake is used, and defines all the useful variables using the -D switch */
18
+
19
+ #ifdef HAVE_INTTYPES_H
20
+ #include <inttypes.h>
21
+ #endif
22
+
23
+
24
+
25
+ #if (defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64))
26
+ # ifdef CRLIBM_HAS_FPU_CONTROL
27
+ # include <fpu_control.h>
28
+ # ifndef _FPU_SETCW
29
+ # define _FPU_SETCW(cw) __asm__ ("fldcw %0" : : "m" (*&cw))
30
+ # endif
31
+ # ifndef _FPU_GETCW
32
+ # define _FPU_GETCW(cw) __asm__ ("fnstcw %0" : "=m" (*&cw))
33
+ # endif
34
+ # endif
35
+ #endif
36
+
37
+ /* 64-bit arithmetic may be standardised, but people still do what they want */
38
+ #ifdef HAVE_INTTYPES_H
39
+ #define ULL(bits) 0x##bits##uLL
40
+ #elif defined(_WIN32)
41
+ /* Windows garbage there */
42
+ typedef long long int int64_t;
43
+ typedef unsigned long long int uint64_t;
44
+ #define ULL(bits) 0x##bits##i64
45
+ /* Default, hoping it works, hopefully less and less relevant */
46
+ #else
47
+ typedef long long int int64_t;
48
+ typedef unsigned long long int uint64_t;
49
+ #define ULL(bits) 0x##bits##uLL
50
+ #endif
51
+
52
+ #ifndef SCS_DEF_INT64
53
+ #define SCS_DEF_INT64
54
+ #ifdef CRLIBM_TYPEOS_HPUX
55
+ #ifndef __LP64__ /* To solve the problem with 64-bit integers on HPPA */
56
+ typedef long long int64_t;
57
+ typedef unsigned long long uint64_t;
58
+ #define ULL(bits) 0x##bits##uLL
59
+ #endif
60
+ #endif
61
+ #endif
62
+
63
+
64
+
65
+
66
+ /* The Add22 and Add22 functions, as well as double-double
67
+ multiplications of the Dekker family may be either defined as
68
+ functions, or as #defines. Which one is better depends on the
69
+ processor/compiler/OS. As #define has to be used with more care (not
70
+ type-safe), the two following variables should be set to 1 in the
71
+ development/debugging phase, until no type warning remains.
72
+
73
+ */
74
+
75
+ #define ADD22_AS_FUNCTIONS 0
76
+ #define DEKKER_AS_FUNCTIONS 0
77
+ #define SQRT_AS_FUNCTIONS 0
78
+
79
+ /* The conditional version of the Add12 can be implemented either
80
+ using 3 floating-point additions, an absolute value test and
81
+ a branch, or using 6 floating-point additions but no branch.
82
+ The Add22 sequence is similar.
83
+ The branchless versions might be faster on some systems.
84
+
85
+ The function versions of Add12Cond and Add22Cond are not
86
+ implemented in branchless versions.
87
+ */
88
+
89
+ #define AVOID_BRANCHES 1
90
+
91
+
92
+ /* Setting the following variable adds variables and code for
93
+ monitoring performance.
94
+ Note that sometimes only round-to-nearest is instrumented. */
95
+ #define EVAL_PERF 1
96
+
97
+
98
+ #if EVAL_PERF==1
99
+ /* counter of calls to the second step (accurate step) */
100
+ extern int crlibm_second_step_taken;
101
+ #endif
102
+
103
+
104
+
105
+ /* The prototypes of the second steps */
106
+ /* extern void exp_SC(scs_ptr res_scs, double x);*/
107
+
108
+
109
+
110
+
111
+
112
+ /*
113
+ * i = d rounded to nearest.
114
+ * The constant added is 2^52 + 2^51.
115
+ */
116
+ #define DOUBLE2INT(_i, _d) \
117
+ {db_number _t; \
118
+ _t.d = (_d+6755399441055744.0); \
119
+ _i = _t.i[LO];}
120
+
121
+
122
+ /* Same idea but beware: works only for |_i| < 2^51 -1 */
123
+ #define DOUBLE2LONGINT(_i, _d) \
124
+ { \
125
+ db_number _t; \
126
+ _t.d = (_d+6755399441055744.0); \
127
+ if (_d >= 0) /* sign extend */ \
128
+ _i = _t.l & ULL(0007FFFFFFFFFFFF); \
129
+ else \
130
+ _i = (_t.l & ULL(0007FFFFFFFFFFFF)) | (ULL(FFF8000000000000)); \
131
+ }
132
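
The DOUBLE2INT/DOUBLE2LONGINT macros above rely on a classic trick: adding 2^52 + 2^51 to a double of moderate magnitude forces the rounded integer part into the low bits of the mantissa. A minimal standalone sketch of the same idea (illustration only, not part of the package; it uses a plain uint64_t instead of the library's db_number union and assumes IEEE-754 doubles in round-to-nearest):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch of the 2^52 + 2^51 trick behind DOUBLE2INT (hypothetical helper,
   not crlibm code).  Valid for |d| < 2^31 under round-to-nearest. */
static int32_t double2int_demo(double d)
{
    double t = d + 6755399441055744.0;   /* 2^52 + 2^51 */
    uint64_t bits;
    memcpy(&bits, &t, sizeof bits);      /* reinterpret the double's bits */
    return (int32_t)(uint32_t)bits;      /* low 32 bits hold round(d) */
}

int main(void)
{
    /* round-to-nearest-even: prints 2 -2 4 */
    printf("%d %d %d\n", double2int_demo(2.5), double2int_demo(-2.5),
           double2int_demo(3.7));
    return 0;
}
```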
+
133
+
134
+
135
+
136
+
137
+ /* Macros for the rounding tests in directed modes */
138
+ /* After Evgeny Gvozdev pointed out a bug in the rounding procedures, I
139
+ decided to centralize them here.
140
+
141
+ Note that these tests launch the accurate phase when yl=0, in
142
+ particular in the exceptional cases when the image of a double is a
143
+ double. See the chapter about the log for an example.
144
+
145
+ None of this works for denormals, of course.
146
+ */
147
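
The rounding-test macros below depend on one further property: for a finite, nonzero double, incrementing or decrementing its 64-bit integer representation moves to the adjacent representable double (the "Beware: fails for zero" comments refer to this). A standalone sketch of that bit trick, checked against nextafter() (illustration only, not part of the package):

```c
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper: next double toward +infinity for finite nonzero x,
   done by stepping the integer representation as the macros below do. */
static double next_up_demo(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);
    if (x > 0.0) bits++;        /* larger magnitude, same sign */
    else         bits--;        /* smaller magnitude, still negative */
    memcpy(&x, &bits, sizeof x);
    return x;
}

int main(void)
{
    printf("%d %d\n",
           next_up_demo(1.0)  == nextafter(1.0,  INFINITY),
           next_up_demo(-1.0) == nextafter(-1.0, INFINITY));  /* 1 1 */
    return 0;
}
```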
+
148
+
149
+ #define TEST_AND_RETURN_RU(__yh__, __yl__, __eps__) \
150
+ { \
151
+ db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \
152
+ __yhdb__.d = __yh__; __yldb__.d = __yl__; \
153
+ yh_neg = (__yhdb__.i[HI] & 0x80000000); \
154
+ yl_neg = (__yldb__.i[HI] & 0x80000000); \
155
+ __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
156
+ __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
157
+ u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \
158
+ if(__yldb__.d > __eps__ * u53.d){ \
159
+ if(!yl_neg) { /* The case yl==0 is filtered by the above test*/ \
160
+ /* return next up */ \
161
+ __yhdb__.d = __yh__; \
162
+ if(yh_neg) __yhdb__.l--; else __yhdb__.l++; /* Beware: fails for zero */ \
163
+ return __yhdb__.d ; \
164
+ } \
165
+ else return __yh__; \
166
+ } \
167
+ }
168
+
169
+
170
+ #define TEST_AND_RETURN_RD(__yh__, __yl__, __eps__) \
171
+ { \
172
+ db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \
173
+ __yhdb__.d = __yh__; __yldb__.d = __yl__; \
174
+ yh_neg = (__yhdb__.i[HI] & 0x80000000); \
175
+ yl_neg = (__yldb__.i[HI] & 0x80000000); \
176
+ __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
177
+ __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
178
+ u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \
179
+ if(__yldb__.d > __eps__ * u53.d){ \
180
+ if(yl_neg) { /* The case yl==0 is filtered by the above test*/ \
181
+ /* return next down */ \
182
+ __yhdb__.d = __yh__; \
183
+ if(yh_neg) __yhdb__.l++; else __yhdb__.l--; /* Beware: fails for zero */ \
184
+ return __yhdb__.d ; \
185
+ } \
186
+ else return __yh__; \
187
+ } \
188
+ }
189
+
190
+
191
+
192
+ #define TEST_AND_RETURN_RZ(__yh__, __yl__, __eps__) \
193
+ { \
194
+ db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \
195
+ __yhdb__.d = __yh__; __yldb__.d = __yl__; \
196
+ yh_neg = (__yhdb__.i[HI] & 0x80000000); \
197
+ yl_neg = (__yldb__.i[HI] & 0x80000000); \
198
+ __yhdb__.l = __yhdb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\
199
+ __yldb__.l = __yldb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\
200
+ u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \
201
+ if(__yldb__.d > __eps__ * u53.d){ \
202
+ if(yl_neg!=yh_neg) { \
203
+ __yhdb__.d = __yh__; \
204
+ __yhdb__.l--; /* Beware: fails for zero */ \
205
+ return __yhdb__.d ; \
206
+ } \
207
+ else return __yh__; \
208
+ } \
209
+ }
210
+
211
+
212
+
213
+ #define TEST_AND_COPY_RU(__cond__, __res__, __yh__, __yl__, __eps__) \
214
+ { \
215
+ db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \
216
+ __yhdb__.d = __yh__; __yldb__.d = __yl__; \
217
+ yh_neg = (__yhdb__.i[HI] & 0x80000000); \
218
+ yl_neg = (__yldb__.i[HI] & 0x80000000); \
219
+ __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
220
+ __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
221
+ u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \
222
+ __cond__ = 0; \
223
+ if(__yldb__.d > __eps__ * u53.d){ \
224
+ __cond__ = 1; \
225
+ if(!yl_neg) { /* The case yl==0 is filtered by the above test*/ \
226
+ /* return next up */ \
227
+ __yhdb__.d = __yh__; \
228
+ if(yh_neg) __yhdb__.l--; else __yhdb__.l++; /* Beware: fails for zero */ \
229
+ __res__ = __yhdb__.d ; \
230
+ } \
231
+ else { \
232
+ __res__ = __yh__; \
233
+ } \
234
+ } \
235
+ }
236
+
237
+ #define TEST_AND_COPY_RD(__cond__, __res__, __yh__, __yl__, __eps__) \
238
+ { \
239
+ db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \
240
+ __yhdb__.d = __yh__; __yldb__.d = __yl__; \
241
+ yh_neg = (__yhdb__.i[HI] & 0x80000000); \
242
+ yl_neg = (__yldb__.i[HI] & 0x80000000); \
243
+ __yhdb__.l = __yhdb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
244
+ __yldb__.l = __yldb__.l & 0x7fffffffffffffffLL; /* compute the absolute value*/ \
245
+ u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \
246
+ __cond__ = 0; \
247
+ if(__yldb__.d > __eps__ * u53.d){ \
248
+ __cond__ = 1; \
249
+ if(yl_neg) { /* The case yl==0 is filtered by the above test*/ \
250
+ /* return next down */ \
251
+ __yhdb__.d = __yh__; \
252
+ if(yh_neg) __yhdb__.l++; else __yhdb__.l--; /* Beware: fails for zero */ \
253
+ __res__ = __yhdb__.d ; \
254
+ } \
255
+ else { \
256
+ __res__ = __yh__; \
257
+ } \
258
+ } \
259
+ }
260
+
261
+
262
+ #define TEST_AND_COPY_RZ(__cond__, __res__, __yh__, __yl__, __eps__) \
263
+ { \
264
+ db_number __yhdb__, __yldb__, u53; int yh_neg, yl_neg; \
265
+ __yhdb__.d = __yh__; __yldb__.d = __yl__; \
266
+ yh_neg = (__yhdb__.i[HI] & 0x80000000); \
267
+ yl_neg = (__yldb__.i[HI] & 0x80000000); \
268
+ __yhdb__.l = __yhdb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\
269
+ __yldb__.l = __yldb__.l & ULL(7fffffffffffffff); /* compute the absolute value*/\
270
+ u53.l = (__yhdb__.l & ULL(7ff0000000000000)) + ULL(0010000000000000); \
271
+ __cond__ = 0; \
272
+ if(__yldb__.d > __eps__ * u53.d){ \
273
+ if(yl_neg!=yh_neg) { \
274
+ __yhdb__.d = __yh__; \
275
+ __yhdb__.l--; /* Beware: fails for zero */ \
276
+ __res__ = __yhdb__.d ; \
277
+ __cond__ = 1; \
278
+ } \
279
+ else { \
280
+ __res__ = __yh__; \
281
+ __cond__ = 1; } \
281
+ } \
282
+ }
284
+
285
+
286
+
287
+ /* If the processor has an FMA, use it! */
288
+
289
+ /* All this probably works only with gcc.
290
+ See Markstein's book for the case of HP's compiler */
291
+
292
+ #if defined(CRLIBM_TYPECPU_POWERPC) && defined(__GNUC__)
293
+ #define PROCESSOR_HAS_FMA 1
294
+ #define FMA(a,b,c) /* r = a*b + c*/ \
295
+ ({ \
296
+ double _a, _b,_c,_r; \
297
+ _a=a; _b=b;_c=c; \
298
+ __asm__ ("fmadd %0, %1, %2, %3\n ;;\n" \
299
+ : "=f"(_r) \
300
+ : "f"(_a), "f"(_b), "f"(_c) \
301
+ ); \
302
+ _r; \
303
+ })
304
+
305
+
306
+ #define FMS(a,b,c) /* r = a*b - c*/ \
307
+ ({ \
308
+ double _a, _b,_c,_r; \
309
+ _a=a; _b=b;_c=c; \
310
+ __asm__ ("fmsub %0, %1, %2, %3\n ;;\n" \
311
+ : "=f"(_r) \
312
+ : "f"(_a), "f"(_b), "f"(_c) \
313
+ ); \
314
+ _r; \
315
+ })
316
+
317
+ #endif /* defined(CRLIBM_TYPECPU_POWERPC) && defined(__GCC__) */
318
+
319
+
320
+
321
+
322
+ /* On the Itanium 1 with gcc 3.2 we lose 10 cycles when using the FMA!
323
+ It probably breaks the scheduling algorithms somehow...
324
+ To be tested again with newer gcc versions.
325
+ */
326
+
327
+ #if defined(CRLIBM_TYPECPU_ITANIUM) && defined(__GNUC__) && !defined(__INTEL_COMPILER) && 0
328
+ #define PROCESSOR_HAS_FMA 1
329
+ #define FMA(a,b,c) /* r = a*b + c*/ \
330
+ ({ \
331
+ double _a, _b,_c,_r; \
332
+ _a=a; _b=b;_c=c; \
333
+ __asm__ ("fma %0 = %1, %2, %3\n ;;\n" \
334
+ : "=f"(_r) \
335
+ : "f"(_a), "f"(_b), "f"(_c) \
336
+ ); \
337
+ _r; \
338
+ })
339
+
340
+
341
+ #define FMS(a,b,c) /* r = a*b - c*/ \
342
+ ({ \
343
+ double _a, _b, _c, _r; \
344
+ _a=a; _b=b;_c=c; \
345
+ __asm__ ("fms %0 = %1, %2, %3\n ;;\n" \
346
+ : "=f"(_r) \
347
+ : "f"(_a), "f"(_b), "f"(_c) \
348
+ ); \
349
+ _r; \
350
+ })
351
+ #endif /* defined(CRLIBM_TYPECPU_ITANIUM) && defined(__GCC__) && !defined(__INTEL_COMPILER) */
352
+
353
+
354
+
355
+
356
+ #if defined(CRLIBM_TYPECPU_ITANIUM) && defined(__INTEL_COMPILER)
357
+ #define PROCESSOR_HAS_FMA 1
358
+ #if 0 /* Commented out because it shouldn't be there: there should be
359
+ a standard #include doing all this, but as of April 2005
360
+ it doesn't exist (according to Intel). Leave
361
+ it as documentation, though, until it is replaced by the proper #include.
362
+ */
363
+ /* Table 1-17: legal floating-point precision completers (.pc) */
364
+ typedef enum {
365
+ _PC_S = 1 /* single .s */
366
+ ,_PC_D = 2 /* double .d */
367
+ ,_PC_NONE = 3 /* dynamic */
368
+ } _Asm_pc;
369
+
370
+ /* Table 1-22: legal getf/setf floating-point register access completers */
371
+ typedef enum {
372
+ _FR_S = 1 /* single form .s */
373
+ ,_FR_D = 2 /* double form .d */
374
+ ,_FR_EXP = 3 /* exponent form .exp */
375
+ ,_FR_SIG = 4 /* significand form .sig */
376
+ } _Asm_fr_access;
377
+
378
+ /* Table 1-24: legal floating-point FPSR status field completers (.sf) */
379
+ typedef enum {
380
+ _SF0 = 0 /* FPSR status field 0 .s0 */
381
+ ,_SF1 = 1 /* FPSR status field 1 .s1 */
382
+ ,_SF2 = 2 /* FPSR status field 2 .s2 */
383
+ ,_SF3 = 3 /* FPSR status field 3 .s3 */
384
+ } _Asm_sf;
385
+ #endif
386
+
387
+ #define FMA(a,b,c) /* r = a*b + c*/ \
388
+ _Asm_fma( 2/*_PC_D*/, a, b, c, 0/*_SF0*/ );
389
+
390
+
391
+ #define FMS(a,b,c) /* r = a*b - c*/ \
392
+ _Asm_fms( 2/*_PC_D*/, a, b, c, 0/*_SF0*/);
393
+
394
+ #endif /*defined(CRLIBM_TYPECPU_ITANIUM) && defined(__INTEL_COMPILER)*/
395
+
396
+
397
+
398
+
399
+
400
+
401
+
402
+
403
+ #ifdef WORDS_BIGENDIAN
404
+ #define DB_ONE {{0x3ff00000, 0x00000000}}
405
+ #else
406
+ #define DB_ONE {{0x00000000 ,0x3ff00000}}
407
+ #endif
408
+
409
+
410
+
411
+
412
+
413
+
414
+ extern const scs scs_zer, scs_half, scs_one, scs_two, scs_sixinv;
415
+
416
+
417
+ #define SCS_ZERO (scs_ptr)(&scs_zer)
418
+ #define SCS_HALF (scs_ptr)(&scs_half)
419
+ #define SCS_ONE (scs_ptr)(&scs_one)
420
+ #define SCS_TWO (scs_ptr)(&scs_two)
421
+ #define SCS_SIXINV (scs_ptr)(&scs_sixinv)
422
+
423
+
424
+
425
+
426
+ #if defined(__GNUC__)
427
+ #define ABS(x) (__builtin_fabs((x)))
428
+ #else
429
+ #define ABS(x) (((x)>0) ? (x) : (-(x)))
430
+ #endif
431
+
432
+
433
+
434
+
435
+ /*
436
+ * In the following, when an operator is preceded by a '@' it means that we
437
+ * are considering the IEEE-compliant machine operator, otherwise it
438
+ * is the mathematical operator.
439
+ *
440
+ */
441
+
442
+
443
+ /*
444
+ * computes s and r such that s + r = a + b, with s = a @+ b exactly
445
+ */
446
+ #if AVOID_BRANCHES
447
+ #define Add12Cond(s, r, a, b) \
448
+ { \
449
+ double _u1, _u2, _u3, _u4; \
450
+ double _a=a, _b=b; \
451
+ \
452
+ s = _a + _b; \
453
+ _u1 = s - _a; \
454
+ _u2 = s - _u1; \
455
+ _u3 = _b - _u1; \
456
+ _u4 = _a - _u2; \
457
+ r = _u4 + _u3; \
458
+ }
459
+
460
+ #else
461
+ #define Add12Cond(s, r, a, b) \
462
+ {double _z, _a=a, _b=b; \
463
+ s = _a + _b; \
464
+ if (ABS(a) > ABS(b)){ \
465
+ _z = s - _a; \
466
+ r = _b - _z; \
467
+ }else { \
468
+ _z = s - _b; \
469
+ r = _a - _z;}}
470
+ #endif
471
+
472
+ /*
473
+ * computes s and r such that s + r = a + b, with s = a @+ b exactly
474
+ * under the condition a >= b
475
+ */
476
+ #define Add12(s, r, a, b) \
477
+ {double _z, _a=a, _b=b; \
478
+ s = _a + _b; \
479
+ _z = s - _a; \
480
+ r = _b - _z; }
481
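
Add12 above is the classical Fast2Sum error-free transformation: when |a| >= |b|, s is the rounded sum and r is exactly the rounding error, so s + r = a + b. A tiny standalone check (illustration only, not part of the package):

```c
#include <stdio.h>

int main(void)
{
    /* Fast2Sum, the sequence used by Add12: valid because |a| >= |b| */
    double a = 1.0, b = 0x1p-60;      /* b is far below one ulp of a */
    double s = a + b;                 /* rounded sum (here exactly 1.0) */
    double z = s - a;
    double r = b - z;                 /* exact rounding error of the sum */
    printf("s = %a, r = %a\n", s, r); /* s = 0x1p+0, r = 0x1p-60 */
    return 0;
}
```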
+
482
+
483
+ /*
484
+ * computes r1, r2, r3 such that r1 + r2 + r3 = a + b + c exactly
485
+ */
486
+ #define Fast3Sum(r1, r2, r3, a, b, c) \
487
+ {double u, v, w; \
488
+ Fast2Sum(u, v, b, c); \
489
+ Fast2Sum(r1, w, a, u); \
490
+ Fast2Sum(r2, r3, w, v); }
491
+
492
+
493
+
494
+
495
+
496
+
497
+
498
+ /*
499
+ * Functions to compute double-double addition: zh+zl = xh+xl + yh+yl,
500
+ * knowing that xh>yh.
501
+ * The relative error is smaller than 2^-103.
502
+ */
503
+
504
+
505
+ #if ADD22_AS_FUNCTIONS
506
+ extern void Add22(double *zh, double *zl, double xh, double xl, double yh, double yl);
507
+ extern void Add22Cond(double *zh, double *zl, double xh, double xl, double yh, double yl);
508
+
509
+ #else /* ADD22_AS_FUNCTIONS */
510
+
511
+ #if AVOID_BRANCHES
512
+ #define Add22Cond(zh,zl,xh,xl,yh,yl) \
513
+ do { \
514
+ double _v1, _v2, _v3, _v4; \
515
+ \
516
+ Add12Cond(_v1, _v2, (xh), (yh)); \
517
+ _v3 = (xl) + (yl); \
518
+ _v4 = _v2 + _v3; \
519
+ Add12((*(zh)),(*(zl)),_v1,_v4); \
520
+ } while (2+2==5)
521
+ #else
522
+ #define Add22Cond(zh,zl,xh,xl,yh,yl) \
523
+ do { \
524
+ double _r,_s; \
525
+ _r = (xh)+(yh); \
526
+ _s = ((ABS(xh)) > (ABS(yh)))? ((xh)-_r+(yh)+(yl)+(xl)) : ((yh)-_r+(xh)+(xl)+(yl)); \
527
+ *zh = _r+_s; \
528
+ *zl = (_r - (*zh)) + _s; \
529
+ } while(2+2==5)
530
+ #endif
531
+
532
+
533
+ #define Add22(zh,zl,xh,xl,yh,yl) \
534
+ do { \
535
+ double _r,_s; \
536
+ _r = (xh)+(yh); \
537
+ _s = ((((xh)-_r) +(yh)) + (yl)) + (xl); \
538
+ *zh = _r+_s; \
539
+ *zl = (_r - (*zh)) + _s; \
540
+ } while(0)
541
+
542
+ #endif /* ADD22_AS_FUNCTIONS */
543
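
For readers unfamiliar with double-double arithmetic, here is the same Add22 sequence written as a plain function together with a small usage example (illustration only, not part of the package; the pi splitting below is approximate and only meant to show the high/low representation):

```c
#include <stdio.h>

/* The Add22 sequence as a function: (zh, zl) = (xh, xl) + (yh, yl),
   assuming |xh| >= |yh| as the macro above does. */
static void add22_demo(double *zh, double *zl,
                       double xh, double xl, double yh, double yl)
{
    double r = xh + yh;
    double s = (((xh - r) + yh) + yl) + xl;   /* collect the low-order parts */
    *zh = r + s;
    *zl = (r - *zh) + s;
}

int main(void)
{
    /* pi as a double-double: high part is the nearest double,
       low part is (approximately) the remainder */
    double pih = 3.141592653589793116, pil = 1.224646799147353207e-16;
    double zh, zl;
    add22_demo(&zh, &zl, pih, pil, pih, pil);   /* 2*pi in double-double */
    printf("%.17g + %.17g\n", zh, zl);
    return 0;
}
```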
+
544
+
545
+
546
+ #ifdef PROCESSOR_HAS_FMA
547
+ /* One of the nice things with the fused multiply-and-add is that it
548
+ greatly simplifies the double-double multiplications: */
549
+ #define Mul12(rh,rl,u,v) \
550
+ { \
551
+ *rh = u*v; \
552
+ *rl = FMS(u,v, *rh); \
553
+ }
554
+
555
+ #define Mul22(pzh,pzl, xh,xl, yh,yl) \
556
+ { \
557
+ double ph, pl; \
558
+ ph = xh*yh; \
559
+ pl = FMS(xh, yh, ph); \
560
+ pl = FMA(xh,yl, pl); \
561
+ pl = FMA(xl,yh,pl); \
562
+ *pzh = ph+pl; \
563
+ *pzl = ph - (*pzh); \
564
+ *pzl += pl; \
565
+ }
566
+
567
+
568
+ /* besides we don't care anymore about overflows in the mult */
569
+ #define Mul12Cond Mul12
570
+ #define Mul22cond Mul22
571
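
On hardware without the crlibm FMA/FMS macros, the same exact-product idea can be reproduced portably with the C99 fma() function; a minimal sketch (illustration only, not the package's code, which uses inline assembly or compiler intrinsics):

```c
#include <math.h>
#include <stdio.h>

/* Hypothetical helper mirroring the FMA-based Mul12 above:
   rh is the rounded product, rl its exact rounding error. */
static void mul12_fma_demo(double *rh, double *rl, double u, double v)
{
    *rh = u * v;
    *rl = fma(u, v, -*rh);   /* exact: u*v - rh fits in one double */
}

int main(void)
{
    double rh, rl;
    mul12_fma_demo(&rh, &rl, 1.0 + 0x1p-30, 1.0 + 0x1p-30);
    printf("%a + %a\n", rh, rl);   /* rh + rl = (1 + 2^-30)^2 exactly */
    return 0;
}
```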
+
572
+
573
+ #else /* ! PROCESSOR_HAS_FMA */
574
+
575
+
576
+ #if DEKKER_AS_FUNCTIONS
577
+ extern void Mul12(double *rh, double *rl, double u, double v);
578
+ extern void Mul12Cond(double *rh, double *rl, double a, double b);
579
+ extern void Mul22(double *zh, double *zl, double xh, double xl, double yh, double yl);
580
+ #else /* if DEKKER_AS_FUNCTIONS */
581
+ /*
582
+ * computes rh and rl such that rh + rl = a * b with rh = a @* b exactly
583
+ * under the conditions a < 2^970 and b < 2^970
584
+ */
585
+ #if 1
586
+ #define Mul12(rh,rl,u,v) \
587
+ { \
588
+ const double c = 134217729.; /* 2^27 +1 */ \
589
+ double up, u1, u2, vp, v1, v2; \
590
+ double _u=u, _v=v; \
591
+ up = _u*c; vp = _v*c; \
592
+ u1 = (_u-up)+up; v1 = (_v-vp)+vp; \
593
+ u2 = _u-u1; v2 = _v-v1; \
594
+ \
595
+ *rh = _u*_v; \
596
+ *rl = (((u1*v1-*rh)+(u1*v2))+(u2*v1))+(u2*v2);\
597
+ }
598
+ #else
599
+ /* This works but is much slower. Problem:
600
+ SSE2 instructions are two-address, and the intrinsics are three-address */
601
+ #include<emmintrin.h>
602
+ #define Mul12(rh,rl,u,v) \
603
+ { \
604
+ const double c = 134217729.; /* 2^27 +1 */ \
605
+ __m128d _u_v = _mm_set_pd (u,v); \
606
+ __m128d c2=_mm_set1_pd(c); \
607
+ c2 = _mm_mul_pd(c2, _u_v); \
608
+ __m128d u1v1 = _mm_sub_pd(_u_v, c2); \
609
+ u1v1 = _mm_add_pd(u1v1, c2); \
610
+ __m128d u2v2 = _mm_sub_pd(_u_v, u1v1); \
611
+ __m128d _v_u = _mm_shuffle_pd(_u_v, _u_v, _MM_SHUFFLE2 (0,1)); \
612
+ __m128d rhrh = _mm_mul_pd(_v_u, _u_v); \
613
+ _mm_store_sd (rh, rhrh); \
614
+ __m128d v2u2 = _mm_shuffle_pd(u2v2, u2v2, _MM_SHUFFLE2 (0,1)); \
615
+ __m128d u1v2u2v1 = _mm_mul_pd(u1v1, v2u2); \
616
+ __m128d u2v1u1v2 = _mm_shuffle_pd(u1v2u2v1, u1v2u2v1, _MM_SHUFFLE2 (0,1)); \
617
+ __m128d uvmed = _mm_add_pd(u1v2u2v1, u2v1u1v2); \
618
+ __m128d u1u2 = _mm_shuffle_pd(u1v1, u2v2, _MM_SHUFFLE2 (1,1)); \
619
+ __m128d v1v2 = _mm_shuffle_pd(u1v1, u2v2, _MM_SHUFFLE2 (0,0)); \
620
+ __m128d u1v1u2v2 = _mm_mul_pd(u1u2, v1v2); \
621
+ __m128d tmp = _mm_sub_pd(u1v1u2v2, rhrh); \
622
+ tmp = _mm_add_pd(tmp, uvmed); \
623
+ __m128d u2v2u2v2 = _mm_mul_pd(u2v2, v2u2); \
624
+ tmp = _mm_add_pd(tmp, u2v2u2v2); \
625
+ _mm_store_sd (rl, tmp); \
626
+ }
627
+ #endif
628
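
The branch selected here is Dekker's algorithm: the constant 2^27 + 1 splits each operand into two halves whose pairwise products are exact, which recovers the rounding error of u*v without an FMA. A standalone check against fma() (illustration only, not part of the package; assumes plain IEEE-754 double arithmetic, e.g. SSE2 rather than x87 extended precision):

```c
#include <math.h>
#include <stdio.h>

/* Dekker's exact product, as in the non-FMA Mul12 above. */
static void mul12_dekker_demo(double *rh, double *rl, double u, double v)
{
    const double c = 134217729.0;                    /* 2^27 + 1 */
    double up = u * c, vp = v * c;
    double u1 = (u - up) + up, v1 = (v - vp) + vp;   /* high halves */
    double u2 = u - u1,        v2 = v - v1;          /* low halves */
    *rh = u * v;
    *rl = (((u1 * v1 - *rh) + u1 * v2) + u2 * v1) + u2 * v2;
}

int main(void)
{
    double u = 1.0 / 3.0, v = 3.0, rh, rl;
    mul12_dekker_demo(&rh, &rl, u, v);
    printf("%d\n", rl == fma(u, v, -rh));   /* 1: matches the exact error */
    return 0;
}
```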
+
629
+ /*
630
+ double _u =u, _v=v; \
631
+ __m128d _u_v = _mm_set_pd(_u, _v); \
632
+ */ \
633
+ /*
634
+ * Computes rh and rl such that rh + rl = a * b and rh = a @* b exactly
635
+ */
636
+ #define Mul12Cond(rh, rl, a, b) \
637
+ {\
638
+ const double two_em53 = 1.1102230246251565404e-16; /* 0x3CA00000, 0x00000000 */\
639
+ const double two_e53 = 9007199254740992.; /* 0x43400000, 0x00000000 */\
640
+ double u, v; \
641
+ db_number _a, _b; \
642
+ _a.d = a; _b.d = b; \
643
+ if (_a.i[HI]>0x7C900000) u = _a.d*two_em53; \
644
+ else u = _a.d; \
645
+ if (_b.i[HI]>0x7C900000) v = _b.d*two_em53; \
646
+ else v = _b.d; \
647
+ \
648
+ Mul12(rh, rl, u, v); \
649
+ \
650
+ if (_a.i[HI]>0x7C900000) {*rh *= two_e53; *rl *= two_e53;} \
651
+ if (_b.i[HI]>0x7C900000) {*rh *= two_e53; *rl *= two_e53;} \
652
+ }
653
+
654
+
655
+
656
+ /*
657
+ * computes double-double multiplication: zh+zl = (xh+xl) * (yh+yl)
658
+ * relative error is smaller than 2^-102
659
+ */
660
+
661
+
662
+
663
+ #define Mul22(zh,zl,xh,xl,yh,yl) \
664
+ { \
665
+ double mh, ml; \
666
+ \
667
+ const double c = 134217729.; \
668
+ double up, u1, u2, vp, v1, v2; \
669
+ \
670
+ up = (xh)*c; vp = (yh)*c; \
671
+ u1 = ((xh)-up)+up; v1 = ((yh)-vp)+vp; \
672
+ u2 = (xh)-u1; v2 = (yh)-v1; \
673
+ \
674
+ mh = (xh)*(yh); \
675
+ ml = (((u1*v1-mh)+(u1*v2))+(u2*v1))+(u2*v2); \
676
+ \
677
+ ml += (xh)*(yl) + (xl)*(yh); \
678
+ *zh = mh+ml; \
679
+ *zl = mh - (*zh) + ml; \
680
+ }
681
+
682
+
683
+
684
+ #endif /* DEKKER_AS_FUNCTIONS */
685
+
686
+ #endif /* PROCESSOR_HAS_FMA */
687
+
688
+ /* Additional double-double operators */
689
+
690
+ /* Eps Mul122 <= 2^-102 */
691
+ #define Mul122(resh,resl,a,bh,bl) \
692
+ { \
693
+ double _t1, _t2, _t3, _t4; \
694
+ \
695
+ Mul12(&_t1,&_t2,(a),(bh)); \
696
+ _t3 = (a) * (bl); \
697
+ _t4 = _t2 + _t3; \
698
+ Add12((*(resh)),(*(resl)),_t1,_t4); \
699
+ }
700
+
701
+ /* Eps MulAdd212 <= 2^-100 for |a * (bh + bl)| <= 1/4 * |ch + cl| */
702
+ #define MulAdd212(resh,resl,ch,cl,a,bh,bl) \
703
+ { \
704
+ double _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8; \
705
+ \
706
+ Mul12(&_t1,&_t2,(a),(bh)); \
707
+ Add12(_t3,_t4,(ch),_t1); \
708
+ _t5 = (bl) * (a); \
709
+ _t6 = (cl) + _t2; \
710
+ _t7 = _t5 + _t6; \
711
+ _t8 = _t7 + _t4; \
712
+ Add12((*(resh)),(*(resl)),_t3,_t8); \
713
+ }
714
+
715
+ /* Eps MulAdd22 <= 2^-100
716
+ for |(ah + al) * (bh + bl)| <= 1/4 * |ch + cl|
717
+ */
718
+ #define MulAdd22(resh,resl,ch,cl,ah,al,bh,bl) \
719
+ { \
720
+ double _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8; \
721
+ double _t9, _t10; \
722
+ \
723
+ Mul12(&_t1,&_t2,(ah),(bh)); \
724
+ Add12(_t3,_t4,(ch),_t1); \
725
+ _t5 = (ah) * (bl); \
726
+ _t6 = (al) * (bh); \
727
+ _t7 = _t2 + (cl); \
728
+ _t8 = _t4 + _t7; \
729
+ _t9 = _t5 + _t6; \
730
+ _t10 = _t8 + _t9; \
731
+ Add12((*(resh)),(*(resl)),_t3,_t10); \
732
+ }
733
+
734
+ #define Add122(resh,resl,a,bh,bl) \
735
+ { \
736
+ double _t1, _t2, _t3; \
737
+ \
738
+ Add12(_t1,_t2,(a),(bh)); \
739
+ _t3 = _t2 + (bl); \
740
+ Add12((*(resh)),(*(resl)),_t1,_t3); \
741
+ }
742
+
743
+ #define Add122Cond(resh,resl,a,bh,bl) \
744
+ { \
745
+ double _t1, _t2, _t3; \
746
+ \
747
+ Add12Cond(_t1,_t2,(a),(bh)); \
748
+ _t3 = _t2 + (bl); \
749
+ Add12((*(resh)),(*(resl)),_t1,_t3); \
750
+ }
751
+
752
+
753
+ #define Add212(resh,resl,ah,al,b) \
754
+ { \
755
+ double _t1, _t2, _t3; \
756
+ \
757
+ Add12(_t1,_t2,(ah),b); \
758
+ _t3 = _t2 + (al); \
759
+ Add12((*(resh)),(*(resl)),_t1,_t3); \
760
+ }
761
+
762
+
763
+ /* In the following, the one-line computation of _cl was split so that
764
+ icc (8.1) would compile it properly. It's an icc bug. */
765
+
766
+ #if DEKKER_AS_FUNCTIONS
767
+ extern void Div22(double *z, double *zz, double x, double xx, double y, double yy);
768
+ #else
769
+ #define Div22(pzh,pzl,xh,xl,yh,yl) { \
770
+ double _ch,_cl,_uh,_ul; \
771
+ _ch=(xh)/(yh); Mul12(&_uh,&_ul,_ch,(yh)); \
772
+ _cl=((xh)-_uh); \
773
+ _cl -= _ul; \
774
+ _cl += (xl); \
775
+ _cl -= _ch*(yl); \
776
+ _cl /= (yh); \
777
+ *pzh=_ch+_cl; *pzl=(_ch-(*pzh))+_cl; \
778
+ }
779
+ #endif /* DEKKER_AS_FUNCTIONS */
780
+
781
+
782
+
783
+ /*
784
+ Coefficients for 1/sqrt(m) with 1/2 < m < 2
785
+ The corresponding relative polynomial approximation error is less than
786
+ eps < 2^(-8.3127) (cf. Maple file)
787
+ The Itanium instruction frsqrta is slightly more accurate; it can
788
+ therefore easily replace the polynomial evaluation.
789
+ */
790
+
791
+ #define SQRTPOLYC0 2.50385236695888790947606139525305479764938354492188e+00
792
+ #define SQRTPOLYC1 -3.29763389114324168005509818613063544034957885742188e+00
793
+ #define SQRTPOLYC2 2.75726076139124520736345402838196605443954467773438e+00
794
+ #define SQRTPOLYC3 -1.15233725777933848632983426796272397041320800781250e+00
795
+ #define SQRTPOLYC4 1.86900066679800969104974228685023263096809387207031e-01
796
+ #define SQRTTWO52 4.50359962737049600000000000000000000000000000000000e+15
797
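
The sqrt12 macros below follow the scheme described here: the degree-4 polynomial gives roughly 8 correct bits of 1/sqrt(m), and each Newton step r <- r*(3 - m*r^2)/2 roughly doubles that accuracy before the double-double refinement takes over. A standalone sketch of the double-precision part (illustration only, not part of the package; it reuses the SQRTPOLYC* values above):

```c
#include <math.h>
#include <stdio.h>

int main(void)
{
    double m = 1.7;   /* any m in [1/2, 2] */

    /* initial approximation to 1/sqrt(m): ~8 correct bits */
    double r0 =  2.50385236695888790947606139525305479764938354492188e+00
        + m * (-3.29763389114324168005509818613063544034957885742188e+00
        + m * ( 2.75726076139124520736345402838196605443954467773438e+00
        + m * (-1.15233725777933848632983426796272397041320800781250e+00
        + m *   1.86900066679800969104974228685023263096809387207031e-01)));

    /* two Newton steps, each roughly doubling the number of correct bits */
    double r1 = 0.5 * r0 * (3.0 - m * (r0 * r0));   /* ~16 bits */
    double r2 = 0.5 * r1 * (3.0 - m * (r1 * r1));   /* ~31 bits */

    printf("relative error: %g\n", fabs(r2 * sqrt(m) - 1.0));
    return 0;
}
```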
+
798
+ #if SQRT_AS_FUNCTIONS
799
+ extern void sqrt12(double *resh, double *resl, double x);
800
+ #else
801
+
802
+ /* Concerning special case handling see crlibm_private.h */
803
+ #define sqrt12(resh, resl, x) { \
804
+ db_number _xdb; \
805
+ int _E; \
806
+ double _m, _r0, _r1, _r2, _r3h, _r3l, _r4h, _r4l, _srtmh, _srtml; \
807
+ double _r2PHr2h, _r2PHr2l, _r2Sqh, _r2Sql; \
808
+ double _mMr2h, _mMr2l, _mMr2Ch, _mMr2Cl; \
809
+ double _MHmMr2Ch, _MHmMr2Cl; \
810
+ double _r3Sqh, _r3Sql, _mMr3Sqh, _mMr3Sql; \
811
+ double _half; \
812
+ \
813
+ /* Special case x = 0 */ \
814
+ if ((x) == 0.0) { \
815
+ (*(resh)) = (x); \
816
+ (*(resl)) = 0.0; \
817
+ } else { \
818
+ \
819
+ _E = 0; \
820
+ \
821
+ /* Convert to integer format */ \
822
+ _xdb.d = (x); \
823
+ \
824
+ /* Handle subnormal case */ \
825
+ if (_xdb.i[HI] < 0x00100000) { \
826
+ _E = -52; \
827
+ _xdb.d *= ((db_number) ((double) SQRTTWO52)).d; \
828
+ /* make x a normal number */ \
829
+ } \
830
+ \
831
+ /* Extract exponent E and mantissa m */ \
832
+ _E += (_xdb.i[HI]>>20)-1023; \
833
+ _xdb.i[HI] = (_xdb.i[HI] & 0x000fffff) | 0x3ff00000; \
834
+ _m = _xdb.d; \
835
+ \
836
+ _half = 0.5; \
837
+ /* Make exponent even */ \
838
+ if (_E & 0x00000001) { \
839
+ _E++; \
840
+ _m *= _half; /* Suppose now 1/2 <= m <= 2 */ \
841
+ } \
842
+ \
843
+ /* Construct sqrt(2^E) = 2^(E/2) */ \
844
+ _xdb.i[HI] = (_E/2 + 1023) << 20; \
845
+ _xdb.i[LO] = 0; \
846
+ \
847
+ /* Compute initial approximation to r = 1/sqrt(m) */ \
848
+ \
849
+ _r0 = SQRTPOLYC0 + \
850
+ _m * (SQRTPOLYC1 + _m * (SQRTPOLYC2 + _m * (SQRTPOLYC3 + _m * SQRTPOLYC4))); \
851
+ \
852
+ /* Iterate two times on double precision */ \
853
+ \
854
+ _r1 = _half * _r0 * (3.0 - _m * (_r0 * _r0)); \
855
+ _r2 = _half * _r1 * (3.0 - _m * (_r1 * _r1)); \
856
+ \
857
+ /* Iterate two times on double-double precision */ \
858
+ \
859
+ Mul12(&_r2Sqh, &_r2Sql, _r2, _r2); \
860
+ Add12(_r2PHr2h, _r2PHr2l, _r2, (_half * _r2)); \
861
+ Mul12(&_mMr2h, &_mMr2l, _m, _r2); \
862
+ Mul22(&_mMr2Ch, &_mMr2Cl, _mMr2h, _mMr2l, _r2Sqh, _r2Sql); \
863
+ \
864
+ _MHmMr2Ch = -_half * _mMr2Ch; \
865
+ _MHmMr2Cl = -_half * _mMr2Cl; \
866
+ \
867
+ Add22(&_r3h, &_r3l, _r2PHr2h, _r2PHr2l, _MHmMr2Ch, _MHmMr2Cl); \
868
+ \
869
+ Mul22(&_r3Sqh, &_r3Sql, _r3h, _r3l, _r3h, _r3l); \
870
+ Mul22(&_mMr3Sqh, &_mMr3Sql, _m, 0.0, _r3Sqh, _r3Sql); \
871
+ /* To prove: mMr3Sqh = 1.0 in each case */ \
872
+ \
873
+ Mul22(&_r4h, &_r4l, _r3h, _r3l, 1.0, (-_half * _mMr3Sql)); \
874
+ \
875
+ /* Multiply obtained reciprocal square root by m */ \
876
+ \
877
+ Mul22(&_srtmh,&_srtml,_m,0.0,_r4h,_r4l); \
878
+ \
879
+ /* Multiply componentwise by sqrt(2^E) */ \
880
+ /* which is an integer power of 2 that may not produce a subnormal */ \
881
+ \
882
+ (*(resh)) = _xdb.d * _srtmh; \
883
+ (*(resl)) = _xdb.d * _srtml; \
884
+ \
885
+ } /* End: special case 0 */ \
886
+ }
887
+
888
+
889
+ #define sqrt12_64(resh, resl, x) { \
890
+ db_number _xdb; \
891
+ int _E; \
892
+ double _m, _r0, _r1, _r2, _r3h, _r3l, _r4h, _r4l, _srtmh, _srtml; \
893
+ double _r2PHr2h, _r2PHr2l, _r2Sqh, _r2Sql; \
894
+ double _mMr2h, _mMr2l, _mMr2Ch, _mMr2Cl; \
895
+ double _MHmMr2Ch, _MHmMr2Cl; \
896
+ double _r3Sqh, _r3Sql, _mMr3Sqh, _mMr3Sql; \
897
+ double _half; \
898
+ \
899
+ /* Special case x = 0 */ \
900
+ if ((x) == 0.0) { \
901
+ (*(resh)) = (x); \
902
+ (*(resl)) = 0.0; \
903
+ } else { \
904
+ \
905
+ _E = 0; \
906
+ \
907
+ /* Convert to integer format */ \
908
+ _xdb.d = (x); \
909
+ \
910
+ /* Handle subnormal case */ \
911
+ if (_xdb.i[HI] < 0x00100000) { \
912
+ _E = -52; \
913
+ _xdb.d *= ((db_number) ((double) SQRTTWO52)).d; \
914
+ /* make x a normal number */ \
915
+ } \
916
+ \
917
+ /* Extract exponent E and mantissa m */ \
918
+ _E += (_xdb.i[HI]>>20)-1023; \
919
+ _xdb.i[HI] = (_xdb.i[HI] & 0x000fffff) | 0x3ff00000; \
920
+ _m = _xdb.d; \
921
+ \
922
+ _half = 0.5; \
923
+ /* Make exponent even */ \
924
+ if (_E & 0x00000001) { \
925
+ _E++; \
926
+ _m *= _half; /* Suppose now 1/2 <= m <= 2 */ \
927
+ } \
928
+ \
929
+ /* Construct sqrt(2^E) = 2^(E/2) */ \
930
+ _xdb.i[HI] = (_E/2 + 1023) << 20; \
931
+ _xdb.i[LO] = 0; \
932
+ \
933
+ /* Compute initial approximation to r = 1/sqrt(m) */ \
934
+ \
935
+ _r0 = SQRTPOLYC0 + \
936
+ _m * (SQRTPOLYC1 + _m * (SQRTPOLYC2 + _m * (SQRTPOLYC3 + _m * SQRTPOLYC4))); \
937
+ \
938
+ /* Iterate two times on double precision */ \
939
+ \
940
+ _r1 = _half * _r0 * (3.0 - _m * (_r0 * _r0)); \
941
+ _r2 = _half * _r1 * (3.0 - _m * (_r1 * _r1)); \
942
+ \
943
+ /* Iterate once on double-double precision */ \
944
+ \
945
+ Mul12(&_r2Sqh, &_r2Sql, _r2, _r2); \
946
+ Add12(_r2PHr2h, _r2PHr2l, _r2, (_half * _r2)); \
947
+ Mul12(&_mMr2h, &_mMr2l, _m, _r2); \
948
+ Mul22(&_mMr2Ch, &_mMr2Cl, _mMr2h, _mMr2l, _r2Sqh, _r2Sql); \
949
+ \
950
+ _MHmMr2Ch = -_half * _mMr2Ch; \
951
+ _MHmMr2Cl = -_half * _mMr2Cl; \
952
+ \
953
+ Add22(&_r3h, &_r3l, _r2PHr2h, _r2PHr2l, _MHmMr2Ch, _MHmMr2Cl); \
954
+ \
955
+ /* Multiply obtained reciprocal square root by m */ \
956
+ \
957
+ Mul22(&_srtmh,&_srtml,_m,0.0,_r3h,_r3l); \
958
+ \
959
+ /* Multiply componentwise by sqrt(2^E) */ \
960
+ /* which is an integer power of 2 that may not produce a subnormal */ \
961
+ \
962
+ (*(resh)) = _xdb.d * _srtmh; \
963
+ (*(resl)) = _xdb.d * _srtml; \
964
+ \
965
+ } /* End: special case 0 */ \
966
+ }
967
+
968
+ /*
969
+ sqrt12_64_unfiltered = sqrt(x) * (1 + eps) where abs(eps) <= 2^(-64)
970
+
971
+ if x is neither subnormal nor 0
972
+
973
+ */
974
+ #define sqrt12_64_unfiltered(resh, resl, x) { \
975
+ db_number _xdb; \
976
+ int _E; \
977
+ double _m, _r0, _r1, _r2, _r3h, _r3l, _srtmh, _srtml; \
978
+ double _r2PHr2h, _r2PHr2l, _r2Sqh, _r2Sql; \
979
+ double _mMr2h, _mMr2l, _mMr2Ch, _mMr2Cl; \
980
+ double _MHmMr2Ch, _MHmMr2Cl; \
981
+ double _half; \
982
+ \
983
+ \
984
+ \
985
+ /* Convert to integer format */ \
986
+ _xdb.d = (x); \
987
+ \
988
+ \
989
+ /* Extract exponent E and mantissa m */ \
990
+ _E = (_xdb.i[HI]>>20)-1023; \
991
+ _xdb.i[HI] = (_xdb.i[HI] & 0x000fffff) | 0x3ff00000; \
992
+ _m = _xdb.d; \
993
+ \
994
+ _half = 0.5; \
995
+ /* Make exponent even */ \
996
+ if (_E & 0x00000001) { \
997
+ _E++; \
998
+ _m *= _half; /* Suppose now 1/2 <= m <= 2 */ \
999
+ } \
1000
+ \
1001
+ /* Construct sqrt(2^E) = 2^(E/2) */ \
1002
+ _xdb.i[HI] = (_E/2 + 1023) << 20; \
1003
+ _xdb.i[LO] = 0; \
1004
+ \
1005
+ /* Compute initial approximation to r = 1/sqrt(m) */ \
1006
+ \
1007
+ _r0 = SQRTPOLYC0 + \
1008
+ _m * (SQRTPOLYC1 + _m * (SQRTPOLYC2 + _m * (SQRTPOLYC3 + _m * SQRTPOLYC4))); \
1009
+ \
1010
+ /* Iterate two times on double precision */ \
1011
+ \
1012
+ _r1 = _half * _r0 * (3.0 - _m * (_r0 * _r0)); \
1013
+ _r2 = _half * _r1 * (3.0 - _m * (_r1 * _r1)); \
1014
+ \
1015
+ /* Iterate once on double-double precision */ \
1016
+ \
1017
+ Mul12(&_r2Sqh, &_r2Sql, _r2, _r2); \
1018
+ Add12(_r2PHr2h, _r2PHr2l, _r2, (_half * _r2)); \
1019
+ Mul12(&_mMr2h, &_mMr2l, _m, _r2); \
1020
+ Mul22(&_mMr2Ch, &_mMr2Cl, _mMr2h, _mMr2l, _r2Sqh, _r2Sql); \
1021
+ \
1022
+ _MHmMr2Ch = -_half * _mMr2Ch; \
1023
+ _MHmMr2Cl = -_half * _mMr2Cl; \
1024
+ \
1025
+ Add22(&_r3h, &_r3l, _r2PHr2h, _r2PHr2l, _MHmMr2Ch, _MHmMr2Cl); \
1026
+ \
1027
+ /* Multiply obtained reciprocal square root by m */ \
1028
+ \
1029
+ Mul122(&_srtmh,&_srtml,_m,_r3h,_r3l); \
1030
+ \
1031
+ /* Multiply componentwise by sqrt(2^E) */ \
1032
+ /* which is an integer power of 2 that may not produce a subnormal */ \
1033
+ \
1034
+ (*(resh)) = _xdb.d * _srtmh; \
1035
+ (*(resl)) = _xdb.d * _srtml; \
1036
+ \
1037
+ }
1038
+
1039
+
1040
+
1041
+ #endif /*SQRT_AS_FUNCTIONS*/
1042
+
1043
+ /* Declaration of the debug function */
1044
+
1045
+ void printHexa(char* s, double x);
1046
+
1047
+
1048
+ #endif /*CRLIBM_PRIVATE_H*/