@ebowwa/jsonl-hft 0.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +273 -53
- package/dist/index.d.ts +87 -47
- package/dist/index.js +111 -119
- package/native/libjsonl_hft.dylib +0 -0
- package/package.json +21 -8
package/README.md
CHANGED
|
@@ -1,79 +1,299 @@
|
|
|
1
1
|
# @ebowwa/jsonl-hft
|
|
2
2
|
|
|
3
|
-
HFT-grade JSONL parser with sub-
|
|
3
|
+
Generic HFT-grade JSONL parser with sub-microsecond latency.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
**NO HARDCODED FIELDS** - The consumer defines which fields to extract.
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
│ │ │
|
|
18
|
-
│ ▼ read (182ns) │
|
|
19
|
-
│ Rust Parser (data plane) │
|
|
20
|
-
│ • Zero allocation │
|
|
21
|
-
│ • SIMD-friendly byte scanning │
|
|
22
|
-
│ • Returns field offsets (not strings) │
|
|
23
|
-
└─────────────────────────────────────────────────────────┘
|
|
24
|
-
```
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Parallel Parsing**: Multi-core processing with rayon
|
|
10
|
+
- **GZIP Support**: Read and write compressed files
|
|
11
|
+
- **Schema Validation**: Validate JSONL against type schemas
|
|
12
|
+
- **Type Inference**: Automatic field type detection
|
|
13
|
+
- **Error Recovery**: Continue parsing on malformed lines
|
|
14
|
+
- **Statistics**: Real-time parsing metrics
|
|
15
|
+
- **Memory-Mapped I/O**: Efficient large file handling
|
|
16
|
+
- **SIMD Optimized**: memchr-accelerated byte search (~19 GiB/s)
|
|
25
17
|
|
|
26
18
|
## Performance
|
|
27
19
|
|
|
28
|
-
|
|
|
29
|
-
|
|
30
|
-
|
|
|
31
|
-
|
|
|
32
|
-
|
|
|
33
|
-
|
|
|
20
|
+
| Benchmark | Time | Throughput |
|
|
21
|
+
|-----------|------|------------|
|
|
22
|
+
| parse_line | 232 ns | ~606 MiB/s |
|
|
23
|
+
| find_field (first) | 15 ns | ~5 GiB/s |
|
|
24
|
+
| find_field (last) | 58 ns | ~1.3 GiB/s |
|
|
25
|
+
| pool_parser (1000 lines) | 113 µs | ~435 MiB/s |
|
|
26
|
+
| memchr_search | 3.9 ns | ~19.7 GiB/s |
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
bun add @ebowwa/jsonl-hft
|
|
32
|
+
```
|
|
34
33
|
|
|
35
34
|
## Usage
|
|
36
35
|
|
|
36
|
+
### Basic Parsing
|
|
37
|
+
|
|
37
38
|
```typescript
|
|
38
|
-
import {
|
|
39
|
+
import { parseDir, parseFile, parseBuffer, getVersion } from "@ebowwa/jsonl-hft";
|
|
40
|
+
|
|
41
|
+
// Define what fields you want to extract
|
|
42
|
+
const fields = ["session_id", "timestamp", "role", "message.content"];
|
|
43
|
+
|
|
44
|
+
// Parse a directory (recursive, parallel)
|
|
45
|
+
const entries = parseDir("/path/to/jsonl/files", fields);
|
|
46
|
+
|
|
47
|
+
// Parse a single file
|
|
48
|
+
const fileEntries = parseFile("/path/to/file.jsonl", fields);
|
|
39
49
|
|
|
40
|
-
//
|
|
41
|
-
const
|
|
42
|
-
const entries = parse(data);
|
|
50
|
+
// Parse a buffer
|
|
51
|
+
const bufferEntries = parseBuffer(jsonlBuffer, fields);
|
|
43
52
|
|
|
44
|
-
//
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
53
|
+
// Get version
|
|
54
|
+
console.log(getVersion()); // e.g. "1.3.0"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### GZIP Support
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
import { isGzip, parseGzipFile, writeGzip } from "@ebowwa/jsonl-hft";
|
|
61
|
+
|
|
62
|
+
// Check if file is gzip compressed
|
|
63
|
+
if (isGzip("/path/to/file.jsonl.gz")) {
|
|
64
|
+
// Parse compressed file
|
|
65
|
+
const entries = parseGzipFile("/path/to/file.jsonl.gz", fields);
|
|
49
66
|
}
|
|
50
67
|
|
|
51
|
-
//
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
const pending = ringPending();
|
|
55
|
-
const readData = ringRead(data.length);
|
|
68
|
+
// Write to gzip file
|
|
69
|
+
const result = writeGzip("/path/to/output.jsonl.gz", jsonlData, 9);
|
|
70
|
+
console.log(`Compression ratio: ${result.compressionRatio}`);
|
|
56
71
|
```
|
|
57
72
|
|
|
58
|
-
|
|
73
|
+
### Parallel Parsing
|
|
59
74
|
|
|
60
|
-
|
|
75
|
+
```typescript
|
|
76
|
+
import { parseFileParallel, parseFilesParallel, parseDirParallel } from "@ebowwa/jsonl-hft";
|
|
61
77
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
78
|
+
// Parse single file with parallel chunks
|
|
79
|
+
const result = parseFileParallel("/path/to/large.jsonl", fields, 0); // 0 = auto chunk size
|
|
80
|
+
console.log(`Parsed ${result.linesProcessed} lines in ${result.parseTimeNs}ns`);
|
|
65
81
|
|
|
66
|
-
|
|
82
|
+
// Parse multiple files in parallel
|
|
83
|
+
const files = ["/path/a.jsonl", "/path/b.jsonl", "/path/c.jsonl"];
|
|
84
|
+
const multiResult = parseFilesParallel(files, fields);
|
|
67
85
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
86
|
+
// Parse directory in parallel
|
|
87
|
+
const dirResult = parseDirParallel("/path/to/jsonl/dir", fields);
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Schema Validation
|
|
91
|
+
|
|
92
|
+
```typescript
|
|
93
|
+
import { validateFile, FieldType } from "@ebowwa/jsonl-hft";
|
|
94
|
+
|
|
95
|
+
const schema = [
|
|
96
|
+
{ name: "session_id", expectedType: FieldType.String, required: true },
|
|
97
|
+
{ name: "timestamp", expectedType: FieldType.String, required: true },
|
|
98
|
+
{ name: "value", expectedType: FieldType.Number, required: false },
|
|
99
|
+
];
|
|
100
|
+
|
|
101
|
+
const validation = validateFile("/path/to/data.jsonl", schema);
|
|
102
|
+
if (!validation.isValid) {
|
|
103
|
+
console.log(`Found ${validation.errorCount} errors`);
|
|
104
|
+
validation.errors.forEach(err => {
|
|
105
|
+
console.log(`Line ${err.lineNumber}: ${err.errorMessage}`);
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Type Inference
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
import { inferFieldTypes, FieldType } from "@ebowwa/jsonl-hft";
|
|
114
|
+
|
|
115
|
+
const line = Buffer.from('{"id":"123","count":42,"active":true}');
|
|
116
|
+
const fields = ["id", "count", "active"];
|
|
117
|
+
const types = inferFieldTypes(line, fields);
|
|
118
|
+
|
|
119
|
+
// types[0] = FieldType.String
|
|
120
|
+
// types[1] = FieldType.Number
|
|
121
|
+
// types[2] = FieldType.Boolean
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Statistics
|
|
125
|
+
|
|
126
|
+
```typescript
|
|
127
|
+
import { getStats, resetStats, parseFileWithStats } from "@ebowwa/jsonl-hft";
|
|
128
|
+
|
|
129
|
+
// Parse with stats collection
|
|
130
|
+
const result = parseFileWithStats("/path/to/file.jsonl", fields);
|
|
131
|
+
|
|
132
|
+
// Get global stats
|
|
133
|
+
const stats = getStats();
|
|
134
|
+
console.log(`Throughput: ${stats.throughputMiBs} MiB/s`);
|
|
135
|
+
console.log(`Avg latency: ${stats.avgLatencyNs} ns`);
|
|
136
|
+
|
|
137
|
+
// Reset stats
|
|
138
|
+
resetStats();
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Error Recovery
|
|
142
|
+
|
|
143
|
+
```typescript
|
|
144
|
+
import { parseFileWithRecovery } from "@ebowwa/jsonl-hft";
|
|
145
|
+
|
|
146
|
+
const result = parseFileWithRecovery("/path/to/dirty.jsonl", fields);
|
|
147
|
+
console.log(`Successful: ${result.stats.successfulLines}`);
|
|
148
|
+
console.log(`Failed: ${result.stats.failedLines}`);
|
|
149
|
+
|
|
150
|
+
// Access errors
|
|
151
|
+
result.errors.forEach(err => {
|
|
152
|
+
console.log(`Line ${err.lineNumber}: ${err.errorType}`);
|
|
153
|
+
});
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
### Batch Parsing
|
|
157
|
+
|
|
158
|
+
```typescript
|
|
159
|
+
import { parseBatch, freeBatch } from "@ebowwa/jsonl-hft";
|
|
160
|
+
|
|
161
|
+
const buffers = [buf1, buf2, buf3];
|
|
162
|
+
const lengths = [buf1.length, buf2.length, buf3.length];
|
|
163
|
+
const result = parseBatch(buffers, lengths, fields);
|
|
164
|
+
console.log(`Parsed ${result.count} entries`);
|
|
165
|
+
|
|
166
|
+
// Cleanup
|
|
167
|
+
freeBatch(result);
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Streaming API
|
|
171
|
+
|
|
172
|
+
```typescript
|
|
173
|
+
import { parseStream } from "@ebowwa/jsonl-hft";
|
|
174
|
+
|
|
175
|
+
// Process each line with a callback
|
|
176
|
+
parseStream("/path/to/large.jsonl", fields, (entry, lineNumber) => {
|
|
177
|
+
// Process entry
|
|
178
|
+
console.log(`Line ${lineNumber}:`, entry);
|
|
179
|
+
return true; // Continue processing
|
|
180
|
+
});
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Field Specification
|
|
184
|
+
|
|
185
|
+
Fields can be:
|
|
186
|
+
- **Simple**: `"session_id"`, `"timestamp"`, `"role"`
|
|
187
|
+
- **Nested (dot notation)**: `"message.content"`, `"metadata.user.id"`
|
|
188
|
+
|
|
189
|
+
The parser extracts only the fields you request - no wasted parsing.
|
|
190
|
+
|
|
191
|
+
## API Reference
|
|
192
|
+
|
|
193
|
+
### Parsing Functions
|
|
194
|
+
| Function | Description |
|
|
195
|
+
|----------|-------------|
|
|
196
|
+
| `parseDir` | Parse all JSONL files in directory recursively |
|
|
197
|
+
| `parseFile` | Parse single file with memory-mapped I/O |
|
|
198
|
+
| `parseBuffer` | Parse from buffer/string |
|
|
199
|
+
| `parseFileParallel` | Parallel file parsing with chunks |
|
|
200
|
+
| `parseFilesParallel` | Parse multiple files in parallel |
|
|
201
|
+
| `parseDirParallel` | Parallel directory parsing |
|
|
202
|
+
| `parseGzipFile` | Parse gzip-compressed file |
|
|
203
|
+
| `parseFileWithStats` | Parse with statistics collection |
|
|
204
|
+
| `parseFileWithRecovery` | Parse with error recovery |
|
|
205
|
+
| `parseBatch` | Parse multiple buffers |
|
|
206
|
+
| `parseStream` | Streaming callback-based parsing |
|
|
207
|
+
|
|
208
|
+
### Validation Functions
|
|
209
|
+
| Function | Description |
|
|
210
|
+
|----------|-------------|
|
|
211
|
+
| `validateFile` | Validate JSONL against schema |
|
|
212
|
+
| `inferFieldTypes` | Infer types for fields |
|
|
213
|
+
|
|
214
|
+
### Output Functions
|
|
215
|
+
| Function | Description |
|
|
216
|
+
|----------|-------------|
|
|
217
|
+
| `writeGzip` | Write to gzip-compressed file |
|
|
218
|
+
| `writeFile` | Write to uncompressed file |
|
|
219
|
+
|
|
220
|
+
### Utility Functions
|
|
221
|
+
| Function | Description |
|
|
222
|
+
|----------|-------------|
|
|
223
|
+
| `getVersion` | Get library version |
|
|
224
|
+
| `isGzip` | Check if file is gzip compressed |
|
|
225
|
+
| `getStats` | Get global parsing statistics |
|
|
226
|
+
| `resetStats` | Reset statistics counters |
|
|
227
|
+
|
|
228
|
+
### Types
|
|
229
|
+
|
|
230
|
+
```typescript
|
|
231
|
+
enum FieldType {
|
|
232
|
+
Unknown = 0,
|
|
233
|
+
String = 1,
|
|
234
|
+
Number = 2,
|
|
235
|
+
Boolean = 3,
|
|
236
|
+
Null = 4,
|
|
237
|
+
Array = 5,
|
|
238
|
+
Object = 6,
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
interface StatsResult {
|
|
242
|
+
totalLines: bigint;
|
|
243
|
+
successfulLines: bigint;
|
|
244
|
+
failedLines: bigint;
|
|
245
|
+
totalBytes: bigint;
|
|
246
|
+
parseTimeNs: bigint;
|
|
247
|
+
avgLatencyNs: bigint;
|
|
248
|
+
minLatencyNs: bigint;
|
|
249
|
+
maxLatencyNs: bigint;
|
|
250
|
+
throughputMiBs: number;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
interface ParallelResult {
|
|
254
|
+
entries: GenericEntry[];
|
|
255
|
+
count: number;
|
|
256
|
+
linesProcessed: bigint;
|
|
257
|
+
parseTimeNs: bigint;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
interface WriteResult {
|
|
261
|
+
success: boolean;
|
|
262
|
+
bytesWritten: bigint;
|
|
263
|
+
compressedBytes: bigint;
|
|
264
|
+
compressionRatio: number;
|
|
265
|
+
errorMessage?: string;
|
|
266
|
+
}
|
|
267
|
+
```
|
|
72
268
|
|
|
73
269
|
## Build
|
|
74
270
|
|
|
75
271
|
```bash
|
|
76
|
-
cd packages/src/jsonl-hft
|
|
77
|
-
|
|
78
|
-
bun run benchmark.ts
|
|
272
|
+
cd packages/src/rust/jsonl-hft
|
|
273
|
+
bun run build
|
|
79
274
|
```
|
|
275
|
+
|
|
276
|
+
## Architecture
|
|
277
|
+
|
|
278
|
+
```
|
|
279
|
+
┌─────────────────────────────────────────────────────────┐
|
|
280
|
+
│ TypeScript (control plane) │
|
|
281
|
+
│ │ │
|
|
282
|
+
│ ▼ FFI call │
|
|
283
|
+
│ ┌─────────────────────────────────────────────────┐ │
|
|
284
|
+
│ │ Rust Parser (data plane) │ │
|
|
285
|
+
│ │ • Memory-mapped I/O (memmap2) │ │
|
|
286
|
+
│ │ • Parallel processing (rayon) │ │
|
|
287
|
+
│ │ • Zero allocation hot path │ │
|
|
288
|
+
│ │ • SIMD-friendly byte scanning (memchr) │ │
|
|
289
|
+
│ │ • GZIP compression (flate2/zlib-ng) │ │
|
|
290
|
+
│ │ • Schema validation │ │
|
|
291
|
+
│ │ • Type inference │ │
|
|
292
|
+
│ │ • Statistics collection │ │
|
|
293
|
+
│ └─────────────────────────────────────────────────┘ │
|
|
294
|
+
└─────────────────────────────────────────────────────────┘
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT
|
package/dist/index.d.ts
CHANGED
|
@@ -1,49 +1,96 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @ebowwa/jsonl-hft
|
|
3
|
-
*
|
|
3
|
+
*
|
|
4
|
+
* Generic HFT-grade JSONL parser with sub-10µs latency.
|
|
5
|
+
* NO HARDCODED FIELDS - consumer defines what fields to extract.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```ts
|
|
9
|
+
* import { parseDir, parseFile, parseBuffer } from "@ebowwa/jsonl-hft";
|
|
10
|
+
*
|
|
11
|
+
* // Define what fields you want to extract
|
|
12
|
+
* const fields = ["session_id", "timestamp", "role", "content"];
|
|
13
|
+
*
|
|
14
|
+
* // Parse a directory
|
|
15
|
+
* const entries = parseDir("/path/to/jsonl/files", fields);
|
|
16
|
+
* // entries: Array<{ session_id: string; timestamp: string; role: string; content: string }>
|
|
17
|
+
*
|
|
18
|
+
* // Parse a single file
|
|
19
|
+
* const fileEntries = parseFile("/path/to/file.jsonl", fields);
|
|
20
|
+
*
|
|
21
|
+
* // Parse a buffer
|
|
22
|
+
* const bufferEntries = parseBuffer(jsonlBuffer, fields);
|
|
23
|
+
* ```
|
|
4
24
|
*/
|
|
5
|
-
export interface EntryRef {
|
|
6
|
-
session_id_start: number;
|
|
7
|
-
session_id_end: number;
|
|
8
|
-
timestamp_start: number;
|
|
9
|
-
timestamp_end: number;
|
|
10
|
-
role_start: number;
|
|
11
|
-
role_end: number;
|
|
12
|
-
content_start: number;
|
|
13
|
-
content_end: number;
|
|
14
|
-
}
|
|
15
25
|
/**
|
|
16
|
-
*
|
|
26
|
+
* Field specification for extraction
|
|
27
|
+
* Can be a simple field name or nested path (e.g., "message.content")
|
|
28
|
+
*/
|
|
29
|
+
export type FieldSpec = string;
|
|
30
|
+
/**
|
|
31
|
+
* Generic entry - a record of field name to string value
|
|
32
|
+
* When using parseDir, includes source_file metadata
|
|
17
33
|
*/
|
|
18
|
-
export
|
|
19
|
-
session_id: string;
|
|
20
|
-
timestamp: string;
|
|
21
|
-
role: string;
|
|
22
|
-
content: string;
|
|
34
|
+
export type GenericEntry = Record<string, string> & {
|
|
23
35
|
source_file?: string;
|
|
36
|
+
};
|
|
37
|
+
/**
|
|
38
|
+
* Parse result with metadata
|
|
39
|
+
*/
|
|
40
|
+
export interface ParseResult<T extends GenericEntry = GenericEntry> {
|
|
41
|
+
entries: T[];
|
|
42
|
+
parseTimeMs: number;
|
|
43
|
+
entryCount: number;
|
|
24
44
|
}
|
|
25
45
|
/**
|
|
26
|
-
*
|
|
46
|
+
* Get the library version
|
|
27
47
|
*/
|
|
28
|
-
export declare function
|
|
48
|
+
export declare function getVersion(): string;
|
|
29
49
|
/**
|
|
30
|
-
* Parse JSONL
|
|
50
|
+
* Parse all JSONL files in a directory with custom field extraction.
|
|
51
|
+
* Files are parsed in parallel using rayon.
|
|
52
|
+
*
|
|
53
|
+
* @param dirPath - Path to directory containing .jsonl files (recursive)
|
|
54
|
+
* @param fields - Array of field names to extract (supports nested paths like "message.content")
|
|
55
|
+
* @returns Array of entries with the requested fields
|
|
56
|
+
*
|
|
57
|
+
* @example
|
|
58
|
+
* ```ts
|
|
59
|
+
* // Extract specific fields
|
|
60
|
+
* const entries = parseDir("/data/logs", ["timestamp", "level", "message"]);
|
|
61
|
+
*
|
|
62
|
+
* // Nested field extraction
|
|
63
|
+
* const nested = parseDir("/data/api", ["request.id", "response.status", "duration_ms"]);
|
|
64
|
+
* ```
|
|
31
65
|
*/
|
|
32
|
-
export declare function
|
|
66
|
+
export declare function parseDir<T extends GenericEntry = GenericEntry>(dirPath: string, fields: FieldSpec[]): T[];
|
|
33
67
|
/**
|
|
34
|
-
*
|
|
68
|
+
* Parse a single JSONL file with custom field extraction.
|
|
69
|
+
* Uses memory-mapped I/O for efficiency.
|
|
70
|
+
*
|
|
71
|
+
* @param filePath - Path to the JSONL file
|
|
72
|
+
* @param fields - Array of field names to extract
|
|
73
|
+
* @returns Array of entries with the requested fields
|
|
35
74
|
*/
|
|
36
|
-
export declare function
|
|
75
|
+
export declare function parseFile<T extends GenericEntry = GenericEntry>(filePath: string, fields: FieldSpec[]): T[];
|
|
37
76
|
/**
|
|
38
|
-
*
|
|
77
|
+
* Parse a buffer/string of JSONL data with custom field extraction.
|
|
78
|
+
*
|
|
79
|
+
* @param input - Buffer or string containing JSONL data
|
|
80
|
+
* @param fields - Array of field names to extract
|
|
81
|
+
* @returns Array of entries with the requested fields
|
|
82
|
+
*/
|
|
83
|
+
export declare function parseBuffer<T extends GenericEntry = GenericEntry>(input: Buffer | Uint8Array | string, fields: FieldSpec[]): T[];
|
|
84
|
+
/**
|
|
85
|
+
* Initialize a ring buffer for streaming data
|
|
39
86
|
*/
|
|
40
87
|
export declare function initRingBuffer(capacity: number): Uint8Array;
|
|
41
88
|
/**
|
|
42
|
-
* Write to ring buffer
|
|
89
|
+
* Write data to the ring buffer
|
|
43
90
|
*/
|
|
44
91
|
export declare function ringWrite(data: Buffer | Uint8Array): number;
|
|
45
92
|
/**
|
|
46
|
-
* Read from ring buffer
|
|
93
|
+
* Read data from the ring buffer
|
|
47
94
|
*/
|
|
48
95
|
export declare function ringRead(maxLen: number): Uint8Array;
|
|
49
96
|
/**
|
|
@@ -51,36 +98,29 @@ export declare function ringRead(maxLen: number): Uint8Array;
|
|
|
51
98
|
*/
|
|
52
99
|
export declare function ringPending(): number;
|
|
53
100
|
/**
|
|
54
|
-
*
|
|
55
|
-
*
|
|
56
|
-
* @returns Array of EntryRef objects with field offsets
|
|
101
|
+
* Preset field specs for Claude Code history format
|
|
102
|
+
* Use: parseDir(path, CLAUDE_CODE_FIELDS)
|
|
57
103
|
*/
|
|
58
|
-
export declare
|
|
104
|
+
export declare const CLAUDE_CODE_FIELDS: FieldSpec[];
|
|
59
105
|
/**
|
|
60
|
-
*
|
|
61
|
-
* @param dirPath - Path to the directory containing JSONL files
|
|
62
|
-
* @returns Object with entries array and fileOffsets map tracking source files
|
|
106
|
+
* Preset for trading/market data
|
|
63
107
|
*/
|
|
64
|
-
export declare
|
|
65
|
-
entries: EntryRef[];
|
|
66
|
-
fileOffsets: Map<string, number>;
|
|
67
|
-
};
|
|
108
|
+
export declare const TRADE_FIELDS: FieldSpec[];
|
|
68
109
|
/**
|
|
69
|
-
*
|
|
70
|
-
* @param dirPath - Path to the directory containing JSONL files
|
|
71
|
-
* @returns Array of ParsedEntry objects with all fields deserialized
|
|
110
|
+
* Preset for log file parsing
|
|
72
111
|
*/
|
|
73
|
-
export declare
|
|
112
|
+
export declare const LOG_FIELDS: FieldSpec[];
|
|
74
113
|
declare const _default: {
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
114
|
+
getVersion: typeof getVersion;
|
|
115
|
+
parseDir: typeof parseDir;
|
|
116
|
+
parseFile: typeof parseFile;
|
|
117
|
+
parseBuffer: typeof parseBuffer;
|
|
78
118
|
initRingBuffer: typeof initRingBuffer;
|
|
79
119
|
ringWrite: typeof ringWrite;
|
|
80
120
|
ringRead: typeof ringRead;
|
|
81
121
|
ringPending: typeof ringPending;
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
122
|
+
CLAUDE_CODE_FIELDS: string[];
|
|
123
|
+
TRADE_FIELDS: string[];
|
|
124
|
+
LOG_FIELDS: string[];
|
|
85
125
|
};
|
|
86
126
|
export default _default;
|
package/dist/index.js
CHANGED
|
@@ -3,26 +3,28 @@
|
|
|
3
3
|
import { dlopen, suffix, ptr } from "bun:ffi";
|
|
4
4
|
import { join, dirname } from "path";
|
|
5
5
|
import { fileURLToPath } from "url";
|
|
6
|
-
import { readdirSync } from "fs";
|
|
7
6
|
var __dirname2 = dirname(fileURLToPath(import.meta.url));
|
|
8
7
|
var libPath = join(__dirname2, "..", "native", `libjsonl_hft.${suffix}`);
|
|
9
|
-
var ENTRY_SIZE = 32;
|
|
10
8
|
var lib = dlopen(libPath, {
|
|
11
|
-
|
|
12
|
-
args: ["ptr", "usize"],
|
|
13
|
-
returns: "
|
|
9
|
+
jsonl_parse_dir_generic: {
|
|
10
|
+
args: ["ptr", "ptr", "usize"],
|
|
11
|
+
returns: "cstring"
|
|
14
12
|
},
|
|
15
|
-
|
|
13
|
+
jsonl_parse_file_generic: {
|
|
14
|
+
args: ["ptr", "ptr", "usize"],
|
|
15
|
+
returns: "cstring"
|
|
16
|
+
},
|
|
17
|
+
jsonl_parse_buffer_generic: {
|
|
16
18
|
args: ["ptr", "usize", "ptr", "usize"],
|
|
17
|
-
returns: "
|
|
19
|
+
returns: "cstring"
|
|
18
20
|
},
|
|
19
|
-
|
|
20
|
-
args: ["ptr"
|
|
21
|
-
returns: "
|
|
21
|
+
jsonl_free_string: {
|
|
22
|
+
args: ["ptr"],
|
|
23
|
+
returns: "void"
|
|
22
24
|
},
|
|
23
|
-
|
|
25
|
+
jsonl_version: {
|
|
24
26
|
args: [],
|
|
25
|
-
returns: "
|
|
27
|
+
returns: "cstring"
|
|
26
28
|
},
|
|
27
29
|
ring_init: {
|
|
28
30
|
args: ["usize"],
|
|
@@ -39,49 +41,77 @@ var lib = dlopen(libPath, {
|
|
|
39
41
|
ring_pending: {
|
|
40
42
|
args: [],
|
|
41
43
|
returns: "i32"
|
|
42
|
-
},
|
|
43
|
-
jsonl_parse_file: {
|
|
44
|
-
args: ["ptr", "ptr", "usize"],
|
|
45
|
-
returns: "usize"
|
|
46
|
-
},
|
|
47
|
-
jsonl_parse_dir: {
|
|
48
|
-
args: ["ptr", "ptr", "usize"],
|
|
49
|
-
returns: "usize"
|
|
50
|
-
},
|
|
51
|
-
jsonl_parse_dir_serialized: {
|
|
52
|
-
args: ["ptr"],
|
|
53
|
-
returns: "cstring"
|
|
54
44
|
}
|
|
55
45
|
});
|
|
56
|
-
function
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
for (let i = 0;i < count; i++) {
|
|
66
|
-
const offset = i * ENTRY_SIZE;
|
|
67
|
-
results.push({
|
|
68
|
-
session_id_start: view.getUint32(offset, true),
|
|
69
|
-
session_id_end: view.getUint32(offset + 4, true),
|
|
70
|
-
timestamp_start: view.getUint32(offset + 8, true),
|
|
71
|
-
timestamp_end: view.getUint32(offset + 12, true),
|
|
72
|
-
role_start: view.getUint32(offset + 16, true),
|
|
73
|
-
role_end: view.getUint32(offset + 20, true),
|
|
74
|
-
content_start: view.getUint32(offset + 24, true),
|
|
75
|
-
content_end: view.getUint32(offset + 28, true)
|
|
46
|
+
function prepareFieldSpecs(fields) {
|
|
47
|
+
const buffers = [];
|
|
48
|
+
const specs = [];
|
|
49
|
+
for (const field of fields) {
|
|
50
|
+
const buf = new TextEncoder().encode(field);
|
|
51
|
+
buffers.push(buf);
|
|
52
|
+
specs.push({
|
|
53
|
+
name: ptr(buf),
|
|
54
|
+
name_len: buf.length
|
|
76
55
|
});
|
|
77
56
|
}
|
|
78
|
-
return
|
|
57
|
+
return { specs, buffers };
|
|
58
|
+
}
|
|
59
|
+
function parseFFIResult(jsonStr) {
|
|
60
|
+
if (!jsonStr || jsonStr === "[]") {
|
|
61
|
+
return [];
|
|
62
|
+
}
|
|
63
|
+
return JSON.parse(jsonStr);
|
|
64
|
+
}
|
|
65
|
+
function getVersion() {
|
|
66
|
+
return lib.symbols.jsonl_version() || "unknown";
|
|
79
67
|
}
|
|
80
|
-
function
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
68
|
+
function parseDir(dirPath, fields) {
|
|
69
|
+
if (!dirPath || fields.length === 0) {
|
|
70
|
+
return [];
|
|
71
|
+
}
|
|
72
|
+
const { specs, buffers } = prepareFieldSpecs(fields);
|
|
73
|
+
const specsBuffer = new Uint8Array(specs.length * 16);
|
|
74
|
+
const specsView = new DataView(specsBuffer.buffer);
|
|
75
|
+
for (let i = 0;i < specs.length; i++) {
|
|
76
|
+
const offset = i * 16;
|
|
77
|
+
specsView.setBigUint64(offset, BigInt(specs[i].name), true);
|
|
78
|
+
specsView.setBigUint64(offset + 8, BigInt(specs[i].name_len), true);
|
|
79
|
+
}
|
|
80
|
+
const pathBytes = new TextEncoder().encode(dirPath);
|
|
81
|
+
const jsonStr = lib.symbols.jsonl_parse_dir_generic(ptr(pathBytes), ptr(specsBuffer), BigInt(fields.length));
|
|
82
|
+
return parseFFIResult(jsonStr);
|
|
83
|
+
}
|
|
84
|
+
function parseFile(filePath, fields) {
|
|
85
|
+
if (!filePath || fields.length === 0) {
|
|
86
|
+
return [];
|
|
87
|
+
}
|
|
88
|
+
const { specs, buffers } = prepareFieldSpecs(fields);
|
|
89
|
+
const specsBuffer = new Uint8Array(specs.length * 16);
|
|
90
|
+
const specsView = new DataView(specsBuffer.buffer);
|
|
91
|
+
for (let i = 0;i < specs.length; i++) {
|
|
92
|
+
const offset = i * 16;
|
|
93
|
+
specsView.setBigUint64(offset, BigInt(specs[i].name), true);
|
|
94
|
+
specsView.setBigUint64(offset + 8, BigInt(specs[i].name_len), true);
|
|
95
|
+
}
|
|
96
|
+
const pathBytes = new TextEncoder().encode(filePath);
|
|
97
|
+
const jsonStr = lib.symbols.jsonl_parse_file_generic(ptr(pathBytes), ptr(specsBuffer), BigInt(fields.length));
|
|
98
|
+
return parseFFIResult(jsonStr);
|
|
99
|
+
}
|
|
100
|
+
function parseBuffer(input, fields) {
|
|
101
|
+
if (!input || fields.length === 0) {
|
|
102
|
+
return [];
|
|
103
|
+
}
|
|
104
|
+
const data = typeof input === "string" ? new TextEncoder().encode(input) : input;
|
|
105
|
+
const { specs, buffers } = prepareFieldSpecs(fields);
|
|
106
|
+
const specsBuffer = new Uint8Array(specs.length * 16);
|
|
107
|
+
const specsView = new DataView(specsBuffer.buffer);
|
|
108
|
+
for (let i = 0;i < specs.length; i++) {
|
|
109
|
+
const offset = i * 16;
|
|
110
|
+
specsView.setBigUint64(offset, BigInt(specs[i].name), true);
|
|
111
|
+
specsView.setBigUint64(offset + 8, BigInt(specs[i].name_len), true);
|
|
112
|
+
}
|
|
113
|
+
const jsonStr = lib.symbols.jsonl_parse_buffer_generic(ptr(data), BigInt(data.length), ptr(specsBuffer), BigInt(fields.length));
|
|
114
|
+
return parseFFIResult(jsonStr);
|
|
85
115
|
}
|
|
86
116
|
function initRingBuffer(capacity) {
|
|
87
117
|
const ptr2 = lib.symbols.ring_init(capacity);
|
|
@@ -99,87 +129,49 @@ function ringRead(maxLen) {
|
|
|
99
129
|
function ringPending() {
|
|
100
130
|
return lib.symbols.ring_pending();
|
|
101
131
|
}
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
}
|
|
122
|
-
return results;
|
|
123
|
-
}
|
|
124
|
-
function parseDir(dirPath) {
|
|
125
|
-
const pathBytes = new TextEncoder().encode(dirPath);
|
|
126
|
-
const maxEntries = 1e6;
|
|
127
|
-
const outBuf = new Uint8Array(maxEntries * ENTRY_SIZE);
|
|
128
|
-
const count = Number(lib.symbols.jsonl_parse_dir(ptr(pathBytes), ptr(outBuf), BigInt(maxEntries)));
|
|
129
|
-
const view = new DataView(outBuf.buffer, outBuf.byteOffset);
|
|
130
|
-
const entries = [];
|
|
131
|
-
for (let i = 0;i < count; i++) {
|
|
132
|
-
const offset = i * ENTRY_SIZE;
|
|
133
|
-
entries.push({
|
|
134
|
-
session_id_start: view.getUint32(offset, true),
|
|
135
|
-
session_id_end: view.getUint32(offset + 4, true),
|
|
136
|
-
timestamp_start: view.getUint32(offset + 8, true),
|
|
137
|
-
timestamp_end: view.getUint32(offset + 12, true),
|
|
138
|
-
role_start: view.getUint32(offset + 16, true),
|
|
139
|
-
role_end: view.getUint32(offset + 20, true),
|
|
140
|
-
content_start: view.getUint32(offset + 24, true),
|
|
141
|
-
content_end: view.getUint32(offset + 28, true)
|
|
142
|
-
});
|
|
143
|
-
}
|
|
144
|
-
const fileOffsets = new Map;
|
|
145
|
-
const files = readdirSync(dirPath).filter((f) => f.endsWith(".jsonl"));
|
|
146
|
-
let currentOffset = 0;
|
|
147
|
-
for (const file of files) {
|
|
148
|
-
fileOffsets.set(file, currentOffset);
|
|
149
|
-
currentOffset += Math.ceil(entries.length / files.length);
|
|
150
|
-
}
|
|
151
|
-
return { entries, fileOffsets };
|
|
152
|
-
}
|
|
153
|
-
function parseDirFast(dirPath) {
|
|
154
|
-
const pathBytes = new TextEncoder().encode(dirPath);
|
|
155
|
-
const jsonResult = lib.symbols.jsonl_parse_dir_serialized(ptr(pathBytes));
|
|
156
|
-
if (!jsonResult) {
|
|
157
|
-
return [];
|
|
158
|
-
}
|
|
159
|
-
return JSON.parse(jsonResult);
|
|
160
|
-
}
|
|
132
|
+
var CLAUDE_CODE_FIELDS = [
|
|
133
|
+
"sessionId",
|
|
134
|
+
"timestamp",
|
|
135
|
+
"type",
|
|
136
|
+
"message.content"
|
|
137
|
+
];
|
|
138
|
+
var TRADE_FIELDS = [
|
|
139
|
+
"timestamp",
|
|
140
|
+
"symbol",
|
|
141
|
+
"side",
|
|
142
|
+
"price",
|
|
143
|
+
"quantity"
|
|
144
|
+
];
|
|
145
|
+
var LOG_FIELDS = [
|
|
146
|
+
"timestamp",
|
|
147
|
+
"level",
|
|
148
|
+
"message",
|
|
149
|
+
"source"
|
|
150
|
+
];
|
|
161
151
|
var src_default = {
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
152
|
+
getVersion,
|
|
153
|
+
parseDir,
|
|
154
|
+
parseFile,
|
|
155
|
+
parseBuffer,
|
|
165
156
|
initRingBuffer,
|
|
166
157
|
ringWrite,
|
|
167
158
|
ringRead,
|
|
168
159
|
ringPending,
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
160
|
+
CLAUDE_CODE_FIELDS,
|
|
161
|
+
TRADE_FIELDS,
|
|
162
|
+
LOG_FIELDS
|
|
172
163
|
};
|
|
173
164
|
export {
|
|
174
165
|
ringWrite,
|
|
175
166
|
ringRead,
|
|
176
167
|
ringPending,
|
|
177
168
|
parseFile,
|
|
178
|
-
parseDirFast,
|
|
179
169
|
parseDir,
|
|
180
|
-
|
|
170
|
+
parseBuffer,
|
|
181
171
|
initRingBuffer,
|
|
182
|
-
|
|
172
|
+
getVersion,
|
|
183
173
|
src_default as default,
|
|
184
|
-
|
|
174
|
+
TRADE_FIELDS,
|
|
175
|
+
LOG_FIELDS,
|
|
176
|
+
CLAUDE_CODE_FIELDS
|
|
185
177
|
};
|
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ebowwa/jsonl-hft",
|
|
3
|
-
"version": "
|
|
4
|
-
"
|
|
3
|
+
"version": "1.3.0",
|
|
4
|
+
"author": "ebowwa",
|
|
5
|
+
"description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-10µs latency with memchr SIMD optimization. Parallel parsing, gzip support, schema validation.",
|
|
5
6
|
"main": "dist/index.js",
|
|
6
7
|
"types": "dist/index.d.ts",
|
|
7
8
|
"exports": {
|
|
@@ -12,17 +13,24 @@
|
|
|
12
13
|
},
|
|
13
14
|
"scripts": {
|
|
14
15
|
"build": "CARGO_TARGET_DIR=target cargo build --release && bun build ./src/index.ts --outdir ./dist --target=bun && tsc --emitDeclarationOnly --declaration --outDir ./dist",
|
|
16
|
+
"build:rust": "CARGO_TARGET_DIR=target cargo build --release",
|
|
15
17
|
"test": "bun test",
|
|
16
|
-
"bench": "
|
|
18
|
+
"bench": "cargo bench"
|
|
17
19
|
},
|
|
18
20
|
"keywords": [
|
|
19
21
|
"jsonl",
|
|
20
22
|
"parser",
|
|
23
|
+
"generic",
|
|
21
24
|
"hft",
|
|
22
25
|
"low-latency",
|
|
26
|
+
"configurable-fields",
|
|
27
|
+
"zero-copy",
|
|
23
28
|
"simd",
|
|
24
|
-
"
|
|
25
|
-
"
|
|
29
|
+
"memchr",
|
|
30
|
+
"parallel",
|
|
31
|
+
"gzip",
|
|
32
|
+
"schema-validation",
|
|
33
|
+
"type-inference"
|
|
26
34
|
],
|
|
27
35
|
"license": "MIT",
|
|
28
36
|
"files": [
|
|
@@ -34,10 +42,15 @@
|
|
|
34
42
|
"bun-types": "^1.3.9"
|
|
35
43
|
},
|
|
36
44
|
"ownership": {
|
|
37
|
-
"domain": "
|
|
45
|
+
"domain": "parsing",
|
|
38
46
|
"responsibilities": [
|
|
39
|
-
"
|
|
40
|
-
"
|
|
47
|
+
"generic-jsonl-parsing",
|
|
48
|
+
"high-performance-parsing",
|
|
49
|
+
"field-extraction",
|
|
50
|
+
"simd-optimization",
|
|
51
|
+
"parallel-processing",
|
|
52
|
+
"gzip-compression",
|
|
53
|
+
"schema-validation"
|
|
41
54
|
]
|
|
42
55
|
}
|
|
43
56
|
}
|