smarter_csv 1.17.3 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -1
- data/CONTRIBUTORS.md +2 -1
- data/README.md +7 -2
- data/docs/data_transformations.md +33 -0
- data/docs/migrating_from_csv.md +18 -0
- data/docs/options.md +2 -1
- data/docs/upgrade_wizard.html +14 -10
- data/ext/smarter_csv/smarter_csv.c +204 -32
- data/ext/smarter_csv/vendor/LICENSE-fast_float-MIT +27 -0
- data/ext/smarter_csv/vendor/eisel_lemire.h +117 -0
- data/ext/smarter_csv/vendor/eisel_lemire.md +29 -0
- data/ext/smarter_csv/vendor/eisel_lemire_powers.h +663 -0
- data/lib/smarter_csv/hash_transformations.rb +51 -2
- data/lib/smarter_csv/reader.rb +18 -6
- data/lib/smarter_csv/reader_options.rb +24 -0
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +1 -0
- data/smarter_csv.gemspec +3 -0
- metadata +22 -4
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2021 The fast_float authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any
|
|
6
|
+
person obtaining a copy of this software and associated
|
|
7
|
+
documentation files (the "Software"), to deal in the
|
|
8
|
+
Software without restriction, including without
|
|
9
|
+
limitation the rights to use, copy, modify, merge,
|
|
10
|
+
publish, distribute, sublicense, and/or sell copies of
|
|
11
|
+
the Software, and to permit persons to whom the Software
|
|
12
|
+
is furnished to do so, subject to the following
|
|
13
|
+
conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice
|
|
16
|
+
shall be included in all copies or substantial portions
|
|
17
|
+
of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
|
20
|
+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
|
21
|
+
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
22
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
23
|
+
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
24
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
25
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
|
26
|
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
27
|
+
DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/* Eisel-Lemire decimal->double, ported from fast_float:
|
|
2
|
+
* include/fast_float/decimal_to_binary.h, the compute_float<binary_format<double>>
|
|
3
|
+
* + compute_product_approximation routines.
|
|
4
|
+
*
|
|
5
|
+
* Algorithm authors: Michael Eisel (original approach) and Daniel Lemire
|
|
6
|
+
* (formalization, proof, and the fast_float implementation) — hence "Eisel-Lemire".
|
|
7
|
+
*
|
|
8
|
+
* Copyright (c) 2021 The fast_float authors. Tri-licensed Apache-2.0 / MIT / BSL-1.0;
|
|
9
|
+
* used here under MIT — see LICENSE-fast_float-MIT in this directory.
|
|
10
|
+
*
|
|
11
|
+
* This is the "without fallback" variant (Noble Mushtak & Daniel Lemire, "Fast
|
|
12
|
+
* Number Parsing Without Fallback"): for ANY nonzero mantissa w that fits exactly
|
|
13
|
+
* in a uint64 (i.e. <= 19 significant digits, not truncated) and decimal exponent
|
|
14
|
+
* q, it returns the correctly-rounded binary64 with no slow-path needed.
|
|
15
|
+
*
|
|
16
|
+
* smarter_json uses it as THE decimal->double path for mantissas up to 18 digits
|
|
17
|
+
* (everything wider / overflowed / with an extreme exponent goes to the strtod
|
|
18
|
+
* round-to-odd fallback). It is correctly-rounded across that whole range, with no
|
|
19
|
+
* round-to-even tie loss, and is fast on the common short-mantissa case.
|
|
20
|
+
* Verified bit-for-bit vs JSON.parse. See eisel_lemire.md for provenance. */
|
|
21
|
+
#ifndef FJ_EISEL_LEMIRE_H
|
|
22
|
+
#define FJ_EISEL_LEMIRE_H
|
|
23
|
+
|
|
24
|
+
#include <stdint.h>
|
|
25
|
+
#include <string.h>
|
|
26
|
+
#include "eisel_lemire_powers.h"
|
|
27
|
+
|
|
28
|
+
/* binary_format<double> constants from fast_float. */
|
|
29
|
+
#define FJ_EL_MANTISSA_BITS 52
|
|
30
|
+
#define FJ_EL_MIN_EXPONENT (-1023)
|
|
31
|
+
#define FJ_EL_INFINITE_POWER 0x7FF
|
|
32
|
+
#define FJ_EL_SMALLEST_POW10 (-342)
|
|
33
|
+
#define FJ_EL_LARGEST_POW10 308
|
|
34
|
+
#define FJ_EL_MIN_RTE (-4) /* min_exponent_round_to_even */
|
|
35
|
+
#define FJ_EL_MAX_RTE 23 /* max_exponent_round_to_even */
|
|
36
|
+
|
|
37
|
+
/* (((152170 + 65536) * q) >> 16) + 63 == floor(log2(5^q)) + q + 63, i.e. floor(log2(10^q)) + 63; see paper. */
|
|
38
|
+
static inline int32_t fj_el_power(int32_t q) {
|
|
39
|
+
return (((152170 + 65536) * q) >> 16) + 63; /* 152170/65536 ~= log2(5); for q < 0 this relies on >> of a negative int being an arithmetic shift (implementation-defined in C) */
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
static inline void fj_el_mul128(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) { /* full 64x64 -> 128-bit product of a and b: upper 64 bits to *hi, lower 64 bits to *lo */
|
|
43
|
+
#if defined(__SIZEOF_INT128__) /* compiler offers a 128-bit integer type: multiply directly */
|
|
44
|
+
__uint128_t p = (__uint128_t)a * (__uint128_t)b;
|
|
45
|
+
*lo = (uint64_t)p;
|
|
46
|
+
*hi = (uint64_t)(p >> 64);
|
|
47
|
+
#else /* portable fallback: schoolbook multiplication on 32-bit halves */
|
|
48
|
+
uint64_t a0 = (uint32_t)a, a1 = a >> 32, b0 = (uint32_t)b, b1 = b >> 32; /* a = a1*2^32 + a0, b = b1*2^32 + b0 */
|
|
49
|
+
uint64_t p00 = a0 * b0, p01 = a0 * b1, p10 = a1 * b0, p11 = a1 * b1; /* four 32x32 -> 64-bit partial products */
|
|
50
|
+
uint64_t mid = p10 + (p00 >> 32) + (uint32_t)p01; /* cannot overflow 64 bits: p10 <= (2^32-1)^2 and each of the two addends is < 2^32 */
|
|
51
|
+
*hi = p11 + (mid >> 32) + (p01 >> 32); /* high word: p11 plus the carries out of the middle column */
|
|
52
|
+
*lo = (mid << 32) | (uint32_t)p00; /* low word: low half of mid above the low half of p00 */
|
|
53
|
+
#endif
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
static inline double fj_el_bits2double(uint64_t bits) { /* reinterpret a 64-bit pattern as a double (assumes double is IEEE-754 binary64 -- not checked here) */
|
|
57
|
+
double d;
|
|
58
|
+
memcpy(&d, &bits, sizeof(d)); /* memcpy instead of a pointer cast: type-puns without violating strict aliasing */
|
|
59
|
+
return d;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/* Converts w * 10^q to a binary64. q = power of ten, w = mantissa (exact, fits in uint64; w == 0 returns a signed zero, as upstream compute_float does), neg = nonzero for a negative result. */
|
|
63
|
+
static inline double fj_eisel_lemire_s2d(int64_t q, uint64_t w, int neg) {
|
|
64
|
+
const uint64_t sign = (uint64_t)(neg != 0) << 63; /* sign bit already in its final position */
|
|
65
|
+
uint64_t mantissa, prod_hi, prod_lo, sp_hi, sp_lo;
|
|
66
|
+
int32_t power2;
|
|
67
|
+
int lz, upperbit, shift, index;
|
|
68
|
+
|
|
69
|
+
if (w == 0 || q < FJ_EL_SMALLEST_POW10) return fj_el_bits2double(sign); /* zero mantissa (__builtin_clzll(0) below would be undefined) or underflow -> 0 */
|
|
70
|
+
if (q > FJ_EL_LARGEST_POW10)
|
|
71
|
+
return fj_el_bits2double(sign | ((uint64_t)FJ_EL_INFINITE_POWER << FJ_EL_MANTISSA_BITS)); /* overflow -> +/-infinity */
|
|
72
|
+
|
|
73
|
+
lz = __builtin_clzll(w); /* w != 0 is guaranteed by the guard above */
|
|
74
|
+
w <<= lz; /* normalize: most significant bit of w is now set */
|
|
75
|
+
|
|
76
|
+
/* compute_product_approximation<mantissa_bits + 3 = 55>: precision_mask = 0x1FF. */
|
|
77
|
+
index = 2 * (int)(q - FJ_EL_SMALLEST_POWER_OF_FIVE); /* two uint64 table words per power */
|
|
78
|
+
fj_el_mul128(w, fj_power_of_five_128[index], &prod_hi, &prod_lo);
|
|
79
|
+
if ((prod_hi & 0x1FF) == 0x1FF) { /* low 9 bits of the high word are all ones: refine with the second table word */
|
|
80
|
+
fj_el_mul128(w, fj_power_of_five_128[index + 1], &sp_hi, &sp_lo);
|
|
81
|
+
prod_lo += sp_hi;
|
|
82
|
+
if (sp_hi > prod_lo) prod_hi++; /* the add above wrapped -> carry into the high word */
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
upperbit = (int)(prod_hi >> 63);
|
|
86
|
+
shift = upperbit + 64 - FJ_EL_MANTISSA_BITS - 3; /* upperbit + 9 */
|
|
87
|
+
mantissa = prod_hi >> shift; /* keeps the 52 explicit bits + implicit bit + one rounding bit */
|
|
88
|
+
power2 = (int32_t)(fj_el_power((int32_t)q) + upperbit - lz - FJ_EL_MIN_EXPONENT); /* candidate value for the exponent field */
|
|
89
|
+
|
|
90
|
+
if (power2 <= 0) { /* subnormal */
|
|
91
|
+
if (-power2 + 1 >= 64) return fj_el_bits2double(sign); /* far below min -> 0 */
|
|
92
|
+
mantissa >>= (-power2 + 1); /* shift count is < 64 thanks to the check above */
|
|
93
|
+
mantissa += (mantissa & 1); /* round up on the rounding bit */
|
|
94
|
+
mantissa >>= 1;
|
|
95
|
+
power2 = (mantissa < ((uint64_t)1 << FJ_EL_MANTISSA_BITS)) ? 0 : 1; /* rounding may have carried into bit 52, i.e. the value became the smallest normal */
|
|
96
|
+
return fj_el_bits2double(sign | ((uint64_t)power2 << FJ_EL_MANTISSA_BITS) | mantissa);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
/* round-to-even: if we land exactly between two doubles and the lower one is even, round down (clear the rounding bit so the add below is a no-op). */
|
|
100
|
+
if ((prod_lo <= 1) && (q >= FJ_EL_MIN_RTE) && (q <= FJ_EL_MAX_RTE) &&
|
|
101
|
+
((mantissa & 3) == 1) && ((mantissa << shift) == prod_hi)) { /* rounding bit set, bit above it clear, and only zeros were shifted out */
|
|
102
|
+
mantissa &= ~(uint64_t)1;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
mantissa += (mantissa & 1); /* round up on the rounding bit */
|
|
106
|
+
mantissa >>= 1;
|
|
107
|
+
if (mantissa >= ((uint64_t)2 << FJ_EL_MANTISSA_BITS)) { /* rounding overflowed the 53-bit significand: renormalize */
|
|
108
|
+
mantissa = (uint64_t)1 << FJ_EL_MANTISSA_BITS;
|
|
109
|
+
power2++;
|
|
110
|
+
}
|
|
111
|
+
mantissa &= ~((uint64_t)1 << FJ_EL_MANTISSA_BITS); /* drop implicit bit */
|
|
112
|
+
if (power2 >= FJ_EL_INFINITE_POWER)
|
|
113
|
+
return fj_el_bits2double(sign | ((uint64_t)FJ_EL_INFINITE_POWER << FJ_EL_MANTISSA_BITS)); /* exponent field saturated -> +/-infinity */
|
|
114
|
+
return fj_el_bits2double(sign | ((uint64_t)power2 << FJ_EL_MANTISSA_BITS) | mantissa);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
#endif
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Eisel-Lemire decimal→double, from fast_float
|
|
2
|
+
|
|
3
|
+
- The algorithm is **Eisel-Lemire**, named for **Michael Eisel** (who proposed/motivated the original approach) and **Daniel Lemire** (who formalized it, proved its bounds, and wrote the fast_float implementation). We use the later "without fallback" form proven by **Noble Mushtak & Daniel Lemire, _Fast Number Parsing Without Fallback_**. It converts a decimal mantissa+exponent to a correctly-rounded binary64 with no slow path, for any nonzero mantissa that fits exactly in a `uint64` (≤ 19 significant digits).
|
|
4
|
+
- Vendored from **fastfloat/fast_float** — https://github.com/fastfloat/fast_float — license Apache-2.0 / MIT / Boost-1.0 (your choice).
|
|
5
|
+
|
|
6
|
+
## What smarter_csv uses it for
|
|
7
|
+
|
|
8
|
+
`try_numeric_conversion` (in `smarter_csv.c`) routes decimal tokens with `m10digits ≤ 19 → fj_eisel_lemire_s2d`, and `> 19 / overflow / extreme exponent → strtod`. Eisel-Lemire is correctly-rounded across the whole ≤19-digit range — every mantissa that fits exactly in a uint64, no round-to-even tie loss — **and** fast on the common short-mantissa case. (smarter_json uses it the same way via `fj_float_from_parts`.)
|
|
9
|
+
|
|
10
|
+
## These two files are DERIVED, not verbatim copies
|
|
11
|
+
|
|
12
|
+
- **`eisel_lemire_powers.h`** — the `power_of_five_128` table (1302 × `uint64`), extracted **verbatim** (the constants are byte-for-byte) from fast_float `include/fast_float/fast_table.h`, but **rewrapped**: a plain C `static const uint64_t fj_power_of_five_128[...]` array instead of fast_float's C++ `powers_template` struct. `FJ_EL_SMALLEST_POWER_OF_FIVE` / `FJ_EL_LARGEST_POWER_OF_FIVE` are the `-342` / `308` bounds.
|
|
13
|
+
- **`eisel_lemire.h`** — a C **port** of `compute_float<binary_format<double>>` + `compute_product_approximation` from fast_float `include/fast_float/decimal_to_binary.h`. Adapted to: (a) plain C (no templates), (b) take our already-extracted `(q, w)` = `(e10, m10)` instead of re-parsing a string, (c) the binary64 constants inlined as `FJ_EL_*` macros, (d) a portable `fj_el_mul128` (`__uint128_t` when available, else a 32×32 split). The control flow — `compute_product_approximation<55>`, the `0x1FF` precision mask, `upperbit`/`shift`, the subnormal branch, and the round-to-even "land exactly between two doubles → round down" check — mirrors the source.
|
|
14
|
+
|
|
15
|
+
Because they're derived (rewrapped table, ported algorithm), they're named after the **algorithm** (Eisel-Lemire) rather than their upstream source (`fast_float`). The verification that they faithfully reproduce upstream is the bit-for-bit stress vs `JSON.parse` (8–10M random numbers incl. ties / subnormals / near-overflow, 0 mismatches).
|
|
16
|
+
|
|
17
|
+
## To refresh from upstream
|
|
18
|
+
|
|
19
|
+
Re-pull the two source files and re-derive:
|
|
20
|
+
|
|
21
|
+
- table: `curl -L https://raw.githubusercontent.com/fastfloat/fast_float/main/include/fast_float/fast_table.h` — copy the `power_of_five_128[...] = { ... };` body into `eisel_lemire_powers.h`'s array (constants only).
|
|
22
|
+
- algorithm: `curl -L https://raw.githubusercontent.com/fastfloat/fast_float/main/include/fast_float/decimal_to_binary.h` — re-check `compute_float` / `compute_product_approximation` against the port in `eisel_lemire.h`.
|
|
23
|
+
|
|
24
|
+
Then re-run the bit-exact stress (≥ several million random 1–19-digit numbers, with forced round-to-even ties and exponents spanning subnormal → overflow) vs `JSON.parse` before trusting it.
|
|
25
|
+
|
|
26
|
+
- origin: Eisel-Lemire (`fastfloat/fast_float`)
|
|
27
|
+
- vendored from: fast_float (upstream `main`)
|
|
28
|
+
- copyright: (c) 2021 The fast_float authors
|
|
29
|
+
- license: tri-licensed Apache-2.0 / MIT / BSL-1.0; vendored here under **MIT**, full text in `LICENSE-fast_float-MIT` (this directory)
|