npm - @ebowwa/jsonl-hft - Versions diffs - 1.0.0 → 1.3.0 - Mend

@ebowwa/jsonl-hft 1.0.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -1,16 +1,29 @@
 # @ebowwa/jsonl-hft
-Generic HFT-grade JSONL parser with sub-10us latency.
+Generic HFT-grade JSONL parser with sub-microsecond latency.
 **NO HARDCODED FIELDS** - Consumer defines what fields to extract.
+## Features
+- **Parallel Parsing**: Multi-core processing with rayon
+- **GZIP Support**: Read and write compressed files
+- **Schema Validation**: Validate JSONL against type schemas
+- **Type Inference**: Automatic field type detection
+- **Error Recovery**: Continue parsing on malformed lines
+- **Statistics**: Real-time parsing metrics
+- **Memory-Mapped I/O**: Efficient large file handling
+- **SIMD Optimized**: memchr-accelerated byte search (~19.7 GiB/s)
 ## Performance
-| Metric | Value |
-|--------|-------|
-| Latency per entry | 6.21us |
-| Throughput | ~161K entries/sec |
-| vs old npm package | 1.13x faster |
+| Benchmark | Time | Throughput |
+|-----------|------|------------|
+| parse_line | 232 ns | ~606 MiB/s |
+| find_field (first) | 15 ns | ~5 GiB/s |
+| find_field (last) | 58 ns | ~1.3 GiB/s |
+| pool_parser (1000 lines) | 113 µs | ~435 MiB/s |
+| memchr_search | 3.9 ns | ~19.7 GiB/s |
 ## Installation
@@ -20,6 +33,8 @@ bun add @ebowwa/jsonl-hft
 ## Usage
+### Basic Parsing
 ```typescript
 import { parseDir, parseFile, parseBuffer, getVersion } from "@ebowwa/jsonl-hft";
@@ -28,7 +43,6 @@ const fields = ["session_id", "timestamp", "role", "message.content"];
 // Parse a directory (recursive, parallel)
 const entries = parseDir("/path/to/jsonl/files", fields);
-// entries: Array<{ session_id: string; timestamp: string; role: string; "message.content": string }>
 // Parse a single file
 const fileEntries = parseFile("/path/to/file.jsonl", fields);
@@ -37,54 +51,219 @@ const fileEntries = parseFile("/path/to/file.jsonl", fields);
 const bufferEntries = parseBuffer(jsonlBuffer, fields);
 // Get version
-console.log(getVersion()); // "1.0.0"
+console.log(getVersion()); // "1.3.0"
 ```
-## Field Specification
+### GZIP Support
-Fields can be:
-- Simple: `"session_id"`, `"timestamp"`, `"role"`
-- Nested (dot notation): `"message.content"`, `"metadata.user.id"`
+```typescript
+import { isGzip, parseGzipFile, writeGzip } from "@ebowwa/jsonl-hft";
-The parser extracts only the fields you request - no wasted parsing.
+// Check if file is gzip compressed
+if (isGzip("/path/to/file.jsonl.gz")) {
+  // Parse compressed file
+  const entries = parseGzipFile("/path/to/file.jsonl.gz", fields);
+}
+// Write to gzip file
+const result = writeGzip("/path/to/output.jsonl.gz", jsonlData, 9);
+console.log(`Compression ratio: ${result.compressionRatio}`);
+```
+### Parallel Parsing
+```typescript
+import { parseFileParallel, parseFilesParallel, parseDirParallel } from "@ebowwa/jsonl-hft";
+// Parse single file with parallel chunks
+const result = parseFileParallel("/path/to/large.jsonl", fields, 0); // 0 = auto chunk size
+console.log(`Parsed ${result.linesProcessed} lines in ${result.parseTimeNs}ns`);
+// Parse multiple files in parallel
+const files = ["/path/a.jsonl", "/path/b.jsonl", "/path/c.jsonl"];
+const multiResult = parseFilesParallel(files, fields);
+// Parse directory in parallel
+const dirResult = parseDirParallel("/path/to/jsonl/dir", fields);
+```
+### Schema Validation
+```typescript
+import { validateFile, FieldType } from "@ebowwa/jsonl-hft";
+const schema = [
+  { name: "session_id", expectedType: FieldType.String, required: true },
+  { name: "timestamp", expectedType: FieldType.String, required: true },
+  { name: "value", expectedType: FieldType.Number, required: false },
+];
+const validation = validateFile("/path/to/data.jsonl", schema);
+if (!validation.isValid) {
+  console.log(`Found ${validation.errorCount} errors`);
+  validation.errors.forEach(err => {
+    console.log(`Line ${err.lineNumber}: ${err.errorMessage}`);
+  });
+}
+```
+### Type Inference
+```typescript
+import { inferFieldTypes, FieldType } from "@ebowwa/jsonl-hft";
+const line = Buffer.from('{"id":"123","count":42,"active":true}');
+const fields = ["id", "count", "active"];
+const types = inferFieldTypes(line, fields);
+// types[0] = FieldType.String
+// types[1] = FieldType.Number
+// types[2] = FieldType.Boolean
+```
+### Statistics
+```typescript
+import { getStats, resetStats, parseFileWithStats } from "@ebowwa/jsonl-hft";
+// Parse with stats collection
+const result = parseFileWithStats("/path/to/file.jsonl", fields);
+// Get global stats
+const stats = getStats();
+console.log(`Throughput: ${stats.throughputMiBs} MiB/s`);
+console.log(`Avg latency: ${stats.avgLatencyNs} ns`);
+// Reset stats
+resetStats();
+```
-## API
+### Error Recovery
-### `parseDir<T>(dirPath: string, fields: FieldSpec[]): T[]`
-Parse all JSONL files in a directory recursively. Files are parsed in parallel using rayon.
+```typescript
+import { parseFileWithRecovery } from "@ebowwa/jsonl-hft";
+const result = parseFileWithRecovery("/path/to/dirty.jsonl", fields);
+console.log(`Successful: ${result.stats.successfulLines}`);
+console.log(`Failed: ${result.stats.failedLines}`);
-### `parseFile<T>(filePath: string, fields: FieldSpec[]): T[]`
-Parse a single JSONL file using memory-mapped I/O.
+// Access errors
+result.errors.forEach(err => {
+  console.log(`Line ${err.lineNumber}: ${err.errorType}`);
+});
+```
-### `parseBuffer<T>(input: Buffer | Uint8Array | string, fields: FieldSpec[]): T[]`
-Parse JSONL data from a buffer or string.
+### Batch Parsing
-### `getVersion(): string`
-Get the library version.
+```typescript
+import { parseBatch, freeBatch } from "@ebowwa/jsonl-hft";
-### Preset Field Arrays
-- `CLAUDE_CODE_FIELDS` - For Claude Code history format
-- `TRADE_FIELDS` - For trading/market data
-- `LOG_FIELDS` - For log file parsing
+const buffers = [buf1, buf2, buf3];
+const lengths = [buf1.length, buf2.length, buf3.length];
+const result = parseBatch(buffers, lengths, fields);
+console.log(`Parsed ${result.count} entries`);
-## Ring Buffer (HFT Streaming)
+// Cleanup
+freeBatch(result);
+```
-For high-frequency streaming scenarios:
+### Streaming API
 ```typescript
-import { initRingBuffer, ringWrite, ringRead, ringPending } from "@ebowwa/jsonl-hft";
+import { parseStream } from "@ebowwa/jsonl-hft";
+// Process each line with a callback
+parseStream("/path/to/large.jsonl", fields, (entry, lineNumber) => {
+  // Process entry
+  console.log(`Line ${lineNumber}:`, entry);
+  return true; // Continue processing
+});
+```
-// Initialize 1MB ring buffer
-initRingBuffer(1024 * 1024);
+## Field Specification
-// Write data
-ringWrite(data);
+Fields can be:
+- **Simple**: `"session_id"`, `"timestamp"`, `"role"`
+- **Nested (dot notation)**: `"message.content"`, `"metadata.user.id"`
-// Check pending bytes
-const pending = ringPending();
+The parser extracts only the fields you request - no wasted parsing.
-// Read data
-const readData = ringRead(maxLen);
+## API Reference
+### Parsing Functions
+| Function | Description |
+|----------|-------------|
+| `parseDir` | Parse all JSONL files in directory recursively |
+| `parseFile` | Parse single file with memory-mapped I/O |
+| `parseBuffer` | Parse from buffer/string |
+| `parseFileParallel` | Parallel file parsing with chunks |
+| `parseFilesParallel` | Parse multiple files in parallel |
+| `parseDirParallel` | Parallel directory parsing |
+| `parseGzipFile` | Parse gzip-compressed file |
+| `parseFileWithStats` | Parse with statistics collection |
+| `parseFileWithRecovery` | Parse with error recovery |
+| `parseBatch` | Parse multiple buffers |
+| `parseStream` | Streaming callback-based parsing |
+### Validation Functions
+| Function | Description |
+|----------|-------------|
+| `validateFile` | Validate JSONL against schema |
+| `inferFieldTypes` | Infer types for fields |
+### Output Functions
+| Function | Description |
+|----------|-------------|
+| `writeGzip` | Write to gzip-compressed file |
+| `writeFile` | Write to uncompressed file |
+### Utility Functions
+| Function | Description |
+|----------|-------------|
+| `getVersion` | Get library version |
+| `isGzip` | Check if file is gzip compressed |
+| `getStats` | Get global parsing statistics |
+| `resetStats` | Reset statistics counters |
+### Types
+```typescript
+enum FieldType {
+  Unknown = 0,
+  String = 1,
+  Number = 2,
+  Boolean = 3,
+  Null = 4,
+  Array = 5,
+  Object = 6,
+}
+interface StatsResult {
+  totalLines: bigint;
+  successfulLines: bigint;
+  failedLines: bigint;
+  totalBytes: bigint;
+  parseTimeNs: bigint;
+  avgLatencyNs: bigint;
+  minLatencyNs: bigint;
+  maxLatencyNs: bigint;
+  throughputMiBs: number;
+}
+interface ParallelResult {
+  entries: GenericEntry[];
+  count: number;
+  linesProcessed: bigint;
+  parseTimeNs: bigint;
+}
+interface WriteResult {
+  success: boolean;
+  bytesWritten: bigint;
+  compressedBytes: bigint;
+  compressionRatio: number;
+  errorMessage?: string;
+}
 ```
 ## Build
@@ -107,6 +286,10 @@ bun run build
 │  │  • Parallel processing (rayon)                   │    │
 │  │  • Zero allocation hot path                      │    │
 │  │  • SIMD-friendly byte scanning (memchr)          │    │
+│  │  • GZIP compression (flate2/zlib-ng)             │    │
+│  │  • Schema validation                             │    │
+│  │  • Type inference                                │    │
+│  │  • Statistics collection                         │    │
 │  └─────────────────────────────────────────────────┘    │
 └─────────────────────────────────────────────────────────┘
 ```

package/dist/index.d.ts CHANGED Viewed

@@ -29,8 +29,11 @@
 export type FieldSpec = string;
 /**
  * Generic entry - a record of field name to string value
+ * When using parseDir, includes source_file metadata
  */
-export type GenericEntry = Record<string, string>;
+export type GenericEntry = Record<string, string> & {
+    source_file?: string;
+};
 /**
  * Parse result with metadata
  */

package/native/libjsonl_hft.dylib CHANGED Viewed

Binary file

package/package.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
   "name": "@ebowwa/jsonl-hft",
-  "version": "1.0.0",
+  "version": "1.3.0",
   "author": "ebowwa",
-  "description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-10µs latency.",
+  "description": "Generic HFT-grade JSONL parser - NO hardcoded fields, consumer defines what to extract. Sub-microsecond latency with memchr SIMD optimization. Parallel parsing, gzip support, schema validation.",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "exports": {
@@ -15,7 +15,7 @@
     "build": "CARGO_TARGET_DIR=target cargo build --release && bun build ./src/index.ts --outdir ./dist --target=bun && tsc --emitDeclarationOnly --declaration --outDir ./dist",
     "build:rust": "CARGO_TARGET_DIR=target cargo build --release",
     "test": "bun test",
-    "bench": "bun run benchmark.ts"
+    "bench": "cargo bench"
   },
   "keywords": [
     "jsonl",
@@ -24,7 +24,13 @@
     "hft",
     "low-latency",
     "configurable-fields",
-    "zero-copy"
+    "zero-copy",
+    "simd",
+    "memchr",
+    "parallel",
+    "gzip",
+    "schema-validation",
+    "type-inference"
   ],
   "license": "MIT",
   "files": [
@@ -40,7 +46,11 @@
     "responsibilities": [
       "generic-jsonl-parsing",
       "high-performance-parsing",
-      "field-extraction"
+      "field-extraction",
+      "simd-optimization",
+      "parallel-processing",
+      "gzip-compression",
+      "schema-validation"
     ]
   }
 }