cisv 0.0.76__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cisv/__init__.py +28 -0
- cisv/libs/__init__.py +3 -0
- cisv/libs/libcisv.dylib +0 -0
- cisv/parser.py +421 -0
- cisv-0.0.76.dist-info/METADATA +214 -0
- cisv-0.0.76.dist-info/RECORD +9 -0
- cisv-0.0.76.dist-info/WHEEL +5 -0
- cisv-0.0.76.dist-info/licenses/LICENSE +21 -0
- cisv-0.0.76.dist-info/top_level.txt +1 -0
cisv/__init__.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CISV - High-performance CSV parser with SIMD optimizations
|
|
3
|
+
|
|
4
|
+
This module provides Python bindings to the CISV C library using ctypes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .parser import (
|
|
8
|
+
CisvParser,
|
|
9
|
+
parse_file,
|
|
10
|
+
parse_string,
|
|
11
|
+
count_rows,
|
|
12
|
+
CisvError,
|
|
13
|
+
CisvValidationError,
|
|
14
|
+
CisvParseError,
|
|
15
|
+
MAX_FILE_SIZE,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
# Version string of this package release.
__version__ = '0.0.76'

# Public names re-exported from cisv.parser for ``from cisv import *``.
__all__ = [
    'CisvParser', 'parse_file', 'parse_string', 'count_rows',
    'CisvError', 'CisvValidationError', 'CisvParseError',
    'MAX_FILE_SIZE',
]
|
cisv/libs/__init__.py
ADDED
cisv/libs/libcisv.dylib
ADDED
|
Binary file
|
cisv/parser.py
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CISV Parser - Python bindings using ctypes
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import ctypes
|
|
6
|
+
import os
|
|
7
|
+
import stat
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import List, Optional, Callable, Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class CisvError(Exception):
    """Root of the CISV exception hierarchy; catch it to handle any CISV failure."""


class CisvValidationError(CisvError):
    """Signals that caller-supplied input failed validation."""


class CisvParseError(CisvError):
    """Signals that parsing did not complete successfully."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Default upper bound on the size of a file to process: 10 GiB, in bytes.
MAX_FILE_SIZE = 10 * 1024 ** 3
|
|
29
|
+
|
|
30
|
+
# Find the shared library
|
|
31
|
+
def _find_library():
|
|
32
|
+
"""Find the cisv shared library."""
|
|
33
|
+
pkg_dir = Path(__file__).parent
|
|
34
|
+
|
|
35
|
+
# First, check for bundled library in package (installed via pip)
|
|
36
|
+
bundled_locations = [
|
|
37
|
+
pkg_dir / 'libs' / 'libcisv.so',
|
|
38
|
+
pkg_dir / 'libs' / 'libcisv.dylib',
|
|
39
|
+
pkg_dir / 'libcisv.so',
|
|
40
|
+
pkg_dir / 'libcisv.dylib',
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
for loc in bundled_locations:
|
|
44
|
+
if loc.exists():
|
|
45
|
+
return str(loc)
|
|
46
|
+
|
|
47
|
+
# Fallback to development locations (when running from source)
|
|
48
|
+
base_dir = pkg_dir.parent.parent.parent
|
|
49
|
+
|
|
50
|
+
dev_locations = [
|
|
51
|
+
base_dir / 'core' / 'build' / 'libcisv.so',
|
|
52
|
+
base_dir / 'core' / 'build' / 'libcisv.dylib',
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
for loc in dev_locations:
|
|
56
|
+
if loc.exists():
|
|
57
|
+
return str(loc)
|
|
58
|
+
|
|
59
|
+
# System library paths
|
|
60
|
+
system_locations = [
|
|
61
|
+
Path('/usr/local/lib/libcisv.so'),
|
|
62
|
+
Path('/usr/local/lib/libcisv.dylib'),
|
|
63
|
+
Path('/usr/lib/libcisv.so'),
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
for loc in system_locations:
|
|
67
|
+
if loc.exists():
|
|
68
|
+
return str(loc)
|
|
69
|
+
|
|
70
|
+
# Try system library path via ctypes
|
|
71
|
+
try:
|
|
72
|
+
lib_path = ctypes.util.find_library('cisv')
|
|
73
|
+
if lib_path:
|
|
74
|
+
return lib_path
|
|
75
|
+
except Exception:
|
|
76
|
+
pass
|
|
77
|
+
|
|
78
|
+
raise RuntimeError(
|
|
79
|
+
"Could not find libcisv shared library.\n"
|
|
80
|
+
"If you installed via pip, this may indicate a packaging issue.\n"
|
|
81
|
+
"If running from source, build the core library first:\n"
|
|
82
|
+
" cd core && make"
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Load the library
|
|
86
|
+
_lib = None
|
|
87
|
+
|
|
88
|
+
def _get_lib():
    """Return the shared libcisv handle, loading it on first use.

    The handle is cached in the module-level ``_lib`` so the library is
    located and its bindings are configured only once.

    Returns:
        The ``ctypes.CDLL`` instance with argument/return types configured.

    Raises:
        RuntimeError: If the shared library cannot be located.
        OSError: If the library is found but cannot be loaded.
    """
    global _lib
    if _lib is None:
        lib = ctypes.CDLL(_find_library())
        _setup_bindings(lib)
        # Publish the handle only after the bindings are installed, so a
        # failure in _setup_bindings cannot leave a half-configured library
        # cached for later callers.
        _lib = lib
    return _lib
|
|
94
|
+
|
|
95
|
+
# Callback types
|
|
96
|
+
# Field callback: (user, data, length).
# ``data`` is declared as a raw address (c_void_p) rather than c_char_p.
# ctypes converts a c_char_p callback argument into a ``bytes`` object by
# scanning for a NUL terminator before the Python callback runs, which ignores
# the explicit ``length`` and may read past the end of the field. With
# c_void_p the callback receives the address and copies exactly ``length``
# bytes via ``ctypes.string_at``.
FieldCallback = ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t)
# Row callback: (user), invoked at the end of each row.
RowCallback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
# Error callback: (user, line, message); the message arrives as bytes or None.
ErrorCallback = ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_int, ctypes.c_char_p)
|
|
99
|
+
|
|
100
|
+
# Config structure - must match cisv_config in parser.h exactly
|
|
101
|
+
class CisvConfig(ctypes.Structure):
    """ctypes mirror of the native ``cisv_config`` struct.

    The order and types of ``_fields_`` define the memory layout passed to
    the C library, so they must stay in sync with ``cisv_config`` in
    parser.h. Do not reorder, insert or retype fields without changing the
    C header accordingly.
    """
    _fields_ = [
        # Single-byte control characters and boolean flags.
        ('delimiter', ctypes.c_char),
        ('quote', ctypes.c_char),
        ('escape', ctypes.c_char),
        ('skip_empty_lines', ctypes.c_bool),
        ('comment', ctypes.c_char),
        ('trim', ctypes.c_bool),
        # The fields from 'relaxed' through 'skip_lines_with_error' are not
        # assigned by CisvParser; they keep whatever cisv_config_init stores.
        ('relaxed', ctypes.c_bool),
        ('max_row_size', ctypes.c_size_t),
        ('from_line', ctypes.c_int),
        ('to_line', ctypes.c_int),
        ('skip_lines_with_error', ctypes.c_bool),
        # Callbacks invoked by the native parser.
        ('field_cb', FieldCallback),
        ('row_cb', RowCallback),
        ('error_cb', ErrorCallback),
        # Opaque pointer passed as the first argument of every callback
        # (presumably; not assigned by CisvParser -- confirm in parser.h).
        ('user', ctypes.c_void_p),
    ]
|
|
119
|
+
|
|
120
|
+
def _setup_bindings(lib):
    """Declare argument and return types for each libcisv function used here.

    Args:
        lib: The loaded library object whose function prototypes are set.
    """
    config_ptr = ctypes.POINTER(CisvConfig)
    handle = ctypes.c_void_p
    c_string = ctypes.c_char_p

    # (function name, argtypes, restype), one entry per native entry point.
    prototypes = (
        ('cisv_config_init', [config_ptr], None),
        ('cisv_parser_create_with_config', [config_ptr], handle),
        ('cisv_parser_destroy', [handle], None),
        ('cisv_parser_parse_file', [handle, c_string], ctypes.c_int),
        ('cisv_parser_write', [handle, c_string, ctypes.c_size_t], None),
        ('cisv_parser_end', [handle], None),
        ('cisv_parser_count_rows', [c_string], ctypes.c_size_t),
    )
    for func_name, arg_types, result_type in prototypes:
        native_func = getattr(lib, func_name)
        native_func.argtypes = arg_types
        native_func.restype = result_type
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class CisvParser:
    """High-performance CSV parser with SIMD optimizations.

    Thin wrapper around the native libcisv parser. Rows are collected through
    callbacks invoked by the C library, and every ``parse_*`` call returns
    them as a list of lists of strings.

    Args:
        delimiter: Field separator; exactly one character that encodes to a
            single UTF-8 byte.
        quote: Quote character; same constraints as ``delimiter`` and must
            differ from it.
        escape: Optional escape character (at most one single-byte character).
        comment: Optional comment character (at most one single-byte character).
        trim: Passed to the native ``trim`` option.
        skip_empty_lines: Passed to the native ``skip_empty_lines`` option.
        max_file_size: Largest file size in bytes accepted by ``parse_file``
            when path validation is enabled.
        raise_on_error: If True, a ``CisvParseError`` is raised when the
            native parser reported at least one error during a parse call.
            If False, errors are only recorded and available via ``errors``.

    Raises:
        CisvValidationError: If any of the character options is invalid.
    """

    def __init__(
        self,
        delimiter: str = ',',
        quote: str = '"',
        escape: Optional[str] = None,
        comment: Optional[str] = None,
        trim: bool = False,
        skip_empty_lines: bool = False,
        max_file_size: int = MAX_FILE_SIZE,
        raise_on_error: bool = True,
    ):
        self._lib = _get_lib()
        self._rows: List[List[str]] = []
        self._current_row: List[str] = []
        self._parser = None
        self._parse_errors: List[tuple] = []

        # SECURITY: Validate delimiter
        if not delimiter:
            raise CisvValidationError("Delimiter cannot be empty")
        self._check_single_byte("Delimiter", delimiter, "delimiters")

        # SECURITY: Validate quote character
        if not quote:
            raise CisvValidationError("Quote character cannot be empty")
        self._check_single_byte("Quote", quote, "quote characters")

        # SECURITY: Validate escape character
        if escape is not None:
            self._check_single_byte("Escape", escape, "escape characters")

        # SECURITY: Validate comment character
        if comment is not None:
            self._check_single_byte("Comment", comment, "comment characters")

        # SECURITY: Validate delimiter/quote are different
        if delimiter == quote:
            raise CisvValidationError(
                f"Delimiter and quote character cannot be the same ('{delimiter}')"
            )

        # Store config
        self._delimiter = delimiter
        self._quote = quote
        self._escape = escape
        self._comment = comment
        self._trim = trim
        self._skip_empty_lines = skip_empty_lines
        self._max_file_size = max_file_size
        self._raise_on_error = raise_on_error

        # Keep references to the ctypes callback objects on the instance so
        # they are not garbage collected while the native parser uses them.
        self._field_cb = FieldCallback(self._on_field)
        self._row_cb = RowCallback(self._on_row)
        self._error_cb = ErrorCallback(self._on_error)

    @staticmethod
    def _check_single_byte(label: str, value: str, plural: str) -> None:
        """Reject option values that cannot be stored in one C ``char``.

        Args:
            label: Capitalized option name used in the error message.
            value: The option value to check.
            plural: Plural noun used in the error message.

        Raises:
            CisvValidationError: If ``value`` has more than one character or
                its UTF-8 encoding is longer than one byte.
        """
        if len(value) > 1:
            raise CisvValidationError(
                f"{label} must be a single character, got '{value}' "
                f"(length {len(value)}). Multi-byte {plural} are not supported."
            )
        # A single non-ASCII character has length 1 but encodes to several
        # bytes; the native config only holds one byte, so storing it would
        # silently truncate the character.
        encoded_length = len(value.encode('utf-8'))
        if encoded_length > 1:
            raise CisvValidationError(
                f"{label} must encode to a single byte, got '{value}' "
                f"({encoded_length} bytes in UTF-8). Multi-byte {plural} are not supported."
            )

    def _on_field(self, user, data, length):
        """Native callback: append one decoded field to the current row.

        ``data`` is only valid during this callback, so the bytes are copied
        immediately with ``ctypes.string_at``.
        """
        if data is None:
            # NULL pointer: nothing to copy, record an empty field.
            self._current_row.append("")
            return
        field_bytes = ctypes.string_at(data, length)
        self._current_row.append(field_bytes.decode('utf-8', errors='replace'))

    def _on_row(self, user):
        """Native callback: close the current row and start a new one."""
        self._rows.append(self._current_row)
        self._current_row = []

    def _on_error(self, user, line, msg):
        """Native callback: record a parse error reported by the library.

        The error is only recorded here. An exception raised inside a ctypes
        callback is not propagated to the code that called into the native
        library, so ``raise_on_error`` is honored by the ``parse_*`` methods
        once the native call has returned.
        """
        error_msg = msg.decode('utf-8', errors='replace') if msg else "Unknown error"
        self._parse_errors.append((line, error_msg))

    def _raise_recorded_error(self) -> None:
        """Raise ``CisvParseError`` for the first recorded error, if enabled."""
        if self._raise_on_error and self._parse_errors:
            line, error_msg = self._parse_errors[0]
            raise CisvParseError(f"Parse error at line {line}: {error_msg}")

    def _reset_state(self) -> None:
        """Clear the rows and errors collected by a previous parse call."""
        self._rows = []
        self._current_row = []
        self._parse_errors = []

    def _create_parser(self) -> ctypes.c_void_p:
        """Create a native parser configured from this instance's options.

        Returns:
            The native parser handle, or a falsy value if creation failed.
        """
        config = CisvConfig()
        self._lib.cisv_config_init(ctypes.byref(config))

        # c_char fields expect a bytes object of length 1.
        config.delimiter = self._delimiter.encode('utf-8')[0:1]
        config.quote = self._quote.encode('utf-8')[0:1]
        if self._escape:
            config.escape = self._escape.encode('utf-8')[0:1]
        if self._comment:
            config.comment = self._comment.encode('utf-8')[0:1]
        config.trim = self._trim
        config.skip_empty_lines = self._skip_empty_lines

        config.field_cb = self._field_cb
        config.row_cb = self._row_cb
        config.error_cb = self._error_cb

        return self._lib.cisv_parser_create_with_config(ctypes.byref(config))

    def _validate_file_path(self, path: str) -> Path:
        """
        SECURITY: Validate a file path before handing it to the native parser.

        The path is resolved (following symlinks) and the resolved target
        must be a regular file no larger than ``max_file_size``. Device
        files, FIFOs, sockets and directories are rejected.

        Args:
            path: Path to the file to validate.

        Returns:
            The resolved path.

        Raises:
            CisvValidationError: If the file does not exist, cannot be
                accessed, is not a regular file, or exceeds the size limit.
        """
        file_path = Path(path)

        # Check if file exists
        if not file_path.exists():
            raise CisvValidationError(f"File not found: {path}")

        # Resolve symlinks and get the real path
        real_path = file_path.resolve()

        try:
            file_stat = real_path.stat()
        except OSError as e:
            raise CisvValidationError(f"Cannot access file {path}: {e}") from e

        mode = file_stat.st_mode
        # SECURITY: Reject special files with a specific message each.
        if stat.S_ISBLK(mode) or stat.S_ISCHR(mode):
            raise CisvValidationError(f"Cannot parse device file: {path}")
        if stat.S_ISFIFO(mode):
            raise CisvValidationError(f"Cannot parse FIFO/pipe: {path}")
        if stat.S_ISSOCK(mode):
            raise CisvValidationError(f"Cannot parse socket: {path}")
        if not stat.S_ISREG(mode):
            raise CisvValidationError(f"Path is not a regular file: {path}")

        # SECURITY: Check file size limit
        if file_stat.st_size > self._max_file_size:
            raise CisvValidationError(
                f"File too large: {file_stat.st_size} bytes "
                f"(max {self._max_file_size} bytes). "
                f"Increase max_file_size if this is intentional."
            )

        return real_path

    def parse_file(self, path: str, validate_path: bool = True) -> List[List[str]]:
        """
        Parse a CSV file and return all rows.

        Args:
            path: Path to the CSV file (``str`` or ``os.PathLike``).
            validate_path: If True (default), validates the file path for security.
                Set to False only if you've already validated the path.

        Returns:
            List of rows, where each row is a list of field values.

        Raises:
            CisvValidationError: If path validation fails
            CisvParseError: If the native parser returns a negative result,
                or reported an error while ``raise_on_error`` is True
            RuntimeError: If parser creation fails
        """
        self._reset_state()

        # SECURITY: Validate file path
        if validate_path:
            path_to_parse = str(self._validate_file_path(path))
        else:
            path_to_parse = path

        parser = self._create_parser()
        if not parser:
            raise RuntimeError("Failed to create parser")

        try:
            # os.fsencode accepts str and os.PathLike and uses the
            # filesystem encoding.
            result = self._lib.cisv_parser_parse_file(parser, os.fsencode(path_to_parse))
        finally:
            self._lib.cisv_parser_destroy(parser)

        # Errors collected by the callback are surfaced here, outside the
        # native call, where raising actually reaches the caller.
        self._raise_recorded_error()
        if result < 0:
            raise CisvParseError(f"Parse error code: {result}")

        return self._rows

    @property
    def errors(self) -> List[tuple]:
        """Return list of (line_number, error_message) tuples from last parse."""
        return self._parse_errors.copy()

    def parse_string(self, content: str) -> List[List[str]]:
        """
        Parse a CSV string and return all rows.

        Args:
            content: CSV content as a string

        Returns:
            List of rows, where each row is a list of field values.

        Raises:
            CisvParseError: If parsing fails (when raise_on_error=True)
            RuntimeError: If parser creation fails
        """
        self._reset_state()

        parser = self._create_parser()
        if not parser:
            raise RuntimeError("Failed to create parser")

        try:
            data = content.encode('utf-8')
            self._lib.cisv_parser_write(parser, data, len(data))
            self._lib.cisv_parser_end(parser)
        finally:
            self._lib.cisv_parser_destroy(parser)

        self._raise_recorded_error()
        return self._rows
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def parse_file(
    path: str,
    delimiter: str = ',',
    quote: str = '"',
    **kwargs
) -> List[List[str]]:
    """Parse the CSV file at ``path`` with a one-off parser and return its rows.

    ``delimiter``, ``quote`` and any extra keyword arguments are forwarded
    to the ``CisvParser`` constructor.
    """
    return CisvParser(delimiter=delimiter, quote=quote, **kwargs).parse_file(path)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def parse_string(
    content: str,
    delimiter: str = ',',
    quote: str = '"',
    **kwargs
) -> List[List[str]]:
    """Parse CSV text with a one-off parser and return its rows.

    ``delimiter``, ``quote`` and any extra keyword arguments are forwarded
    to the ``CisvParser`` constructor.
    """
    return CisvParser(delimiter=delimiter, quote=quote, **kwargs).parse_string(content)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def count_rows(path: str) -> int:
    """Count the number of rows in a CSV file without full parsing.

    Args:
        path: Path to the CSV file (``str`` or ``os.PathLike``).

    Returns:
        The row count reported by the native ``cisv_parser_count_rows``.
    """
    lib = _get_lib()
    # os.fsencode accepts str and os.PathLike and uses the filesystem
    # encoding, so pathlib.Path arguments and surrogate-escaped file names
    # work as well.
    return lib.cisv_parser_count_rows(os.fsencode(path))
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cisv
|
|
3
|
+
Version: 0.0.76
|
|
4
|
+
Summary: High-performance CSV parser with SIMD optimizations (AVX-512/AVX2)
|
|
5
|
+
Home-page: https://github.com/sanix-darker/cisv
|
|
6
|
+
Author: Sanix Darker
|
|
7
|
+
Author-email: Sanix Darker <s4nixd@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Project-URL: Homepage, https://github.com/sanix-darker/cisv
|
|
10
|
+
Project-URL: Documentation, https://github.com/sanix-darker/cisv#readme
|
|
11
|
+
Project-URL: Repository, https://github.com/sanix-darker/cisv
|
|
12
|
+
Project-URL: Issues, https://github.com/sanix-darker/cisv/issues
|
|
13
|
+
Keywords: csv,parser,simd,avx,performance,fast,high-performance
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
18
|
+
Classifier: Operating System :: MacOS
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
27
|
+
Classifier: Topic :: Text Processing
|
|
28
|
+
Requires-Python: >=3.8
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-benchmark; extra == "dev"
|
|
34
|
+
Dynamic: author
|
|
35
|
+
Dynamic: home-page
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
Dynamic: requires-python
|
|
38
|
+
|
|
39
|
+
# CISV Python Binding
|
|
40
|
+
|
|
41
|
+
High-performance CSV parser with SIMD optimizations for Python.
|
|
42
|
+
|
|
43
|
+
## Requirements
|
|
44
|
+
|
|
45
|
+
- Python 3.8+
|
|
46
|
+
- CISV core library (`libcisv.so` on Linux, `libcisv.dylib` on macOS)
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
### Build Core Library First
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
cd ../../core
|
|
54
|
+
make
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Install Python Package
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install -e .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Or using the Makefile:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
make build
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from cisv import CisvParser, parse_file, parse_string, count_rows
|
|
73
|
+
|
|
74
|
+
# Simple file parsing
|
|
75
|
+
rows = parse_file('data.csv')
|
|
76
|
+
for row in rows:
|
|
77
|
+
print(row)
|
|
78
|
+
|
|
79
|
+
# Parse with custom options
|
|
80
|
+
parser = CisvParser(
|
|
81
|
+
delimiter=';',
|
|
82
|
+
quote="'",
|
|
83
|
+
trim=True
|
|
84
|
+
)
|
|
85
|
+
rows = parser.parse_file('data.csv')
|
|
86
|
+
|
|
87
|
+
# Parse from string
|
|
88
|
+
csv_data = """name,age,email
|
|
89
|
+
John,30,john@example.com
|
|
90
|
+
Jane,25,jane@example.com"""
|
|
91
|
+
|
|
92
|
+
rows = parse_string(csv_data)
|
|
93
|
+
|
|
94
|
+
# Fast row counting (without full parsing)
|
|
95
|
+
total = count_rows('large.csv')
|
|
96
|
+
print(f"Total rows: {total}")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## API Reference
|
|
100
|
+
|
|
101
|
+
### CisvParser Class
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
class CisvParser:
|
|
105
|
+
def __init__(
|
|
106
|
+
self,
|
|
107
|
+
delimiter: str = ',',
|
|
108
|
+
quote: str = '"',
|
|
109
|
+
escape: Optional[str] = None,
|
|
110
|
+
comment: Optional[str] = None,
|
|
111
|
+
trim: bool = False,
|
|
112
|
+
skip_empty_lines: bool = False,
|
|
113
|
+
):
|
|
114
|
+
"""
|
|
115
|
+
Create a new CSV parser.
|
|
116
|
+
|
|
117
|
+
Args:
|
|
118
|
+
delimiter: Field separator character (default: ',')
|
|
119
|
+
quote: Quote character for fields (default: '"')
|
|
120
|
+
escape: Escape character (default: None for RFC4180 "" style)
|
|
121
|
+
comment: Comment line prefix (default: None)
|
|
122
|
+
trim: Strip whitespace from fields (default: False)
|
|
123
|
+
skip_empty_lines: Skip empty lines (default: False)
|
|
124
|
+
"""
|
|
125
|
+
|
|
126
|
+
def parse_file(self, path: str) -> List[List[str]]:
|
|
127
|
+
"""Parse a CSV file and return all rows."""
|
|
128
|
+
|
|
129
|
+
def parse_string(self, content: str) -> List[List[str]]:
|
|
130
|
+
"""Parse a CSV string and return all rows."""
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Convenience Functions
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
def parse_file(
|
|
137
|
+
path: str,
|
|
138
|
+
delimiter: str = ',',
|
|
139
|
+
quote: str = '"',
|
|
140
|
+
**kwargs
|
|
141
|
+
) -> List[List[str]]:
|
|
142
|
+
"""Parse a CSV file with the given options."""
|
|
143
|
+
|
|
144
|
+
def parse_string(
|
|
145
|
+
content: str,
|
|
146
|
+
delimiter: str = ',',
|
|
147
|
+
quote: str = '"',
|
|
148
|
+
**kwargs
|
|
149
|
+
) -> List[List[str]]:
|
|
150
|
+
"""Parse a CSV string with the given options."""
|
|
151
|
+
|
|
152
|
+
def count_rows(path: str) -> int:
|
|
153
|
+
"""Count rows in a CSV file without full parsing."""
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Configuration Options
|
|
157
|
+
|
|
158
|
+
| Option | Type | Default | Description |
|
|
159
|
+
|--------|------|---------|-------------|
|
|
160
|
+
| `delimiter` | str | `','` | Field delimiter character |
|
|
161
|
+
| `quote` | str | `'"'` | Quote character |
|
|
162
|
+
| `escape` | str | `None` | Escape character |
|
|
163
|
+
| `comment` | str | `None` | Comment line prefix |
|
|
164
|
+
| `trim` | bool | `False` | Trim whitespace from fields |
|
|
165
|
+
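| `skip_empty_lines` | bool | `False` | Skip empty lines |
| `max_file_size` | int | `MAX_FILE_SIZE` (10 GiB) | Maximum size in bytes of a file accepted by `parse_file` |
| `raise_on_error` | bool | `True` | Raise `CisvParseError` on parse errors; errors are also recorded in `parser.errors` |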
|
|
166
|
+
|
|
167
|
+
## Examples
|
|
168
|
+
|
|
169
|
+
### TSV Parsing
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
from cisv import CisvParser
|
|
173
|
+
|
|
174
|
+
parser = CisvParser(delimiter='\t')
|
|
175
|
+
rows = parser.parse_file('data.tsv')
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Skip Comments and Empty Lines
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
parser = CisvParser(
|
|
182
|
+
comment='#',
|
|
183
|
+
skip_empty_lines=True,
|
|
184
|
+
trim=True
|
|
185
|
+
)
|
|
186
|
+
rows = parser.parse_file('config.csv')
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Parse CSV String
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from cisv import parse_string
|
|
193
|
+
|
|
194
|
+
data = """
|
|
195
|
+
id,name,value
|
|
196
|
+
1,foo,100
|
|
197
|
+
2,bar,200
|
|
198
|
+
"""
|
|
199
|
+
|
|
200
|
+
rows = parse_string(data, trim=True)
|
|
201
|
+
# [['id', 'name', 'value'], ['1', 'foo', '100'], ['2', 'bar', '200']]
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Performance
|
|
205
|
+
|
|
206
|
+
CISV uses SIMD optimizations (AVX-512, AVX2, SSE2) for high-performance parsing. The Python binding uses ctypes to call directly into the native C library with minimal overhead.
|
|
207
|
+
|
|
208
|
+
Typical performance on modern hardware:
|
|
209
|
+
- 500MB+ CSV files parsed in under 1 second
|
|
210
|
+
- 10-50x faster than pure Python CSV parsers
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
MIT
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
cisv/__init__.py,sha256=4EKMwARKli__E6hfjlAdVNIF8TIHHGD4QDyBnspI8yM,510
|
|
2
|
+
cisv/parser.py,sha256=aJlPqh6mzLh1E-KM0rO6R0rb6-VOet1Ow5-7XWOdmxw,13854
|
|
3
|
+
cisv/libs/__init__.py,sha256=EMdmDPn_RcTdaDJtjTIqSrpV5axLm9VzHaT9lOtqwQQ,185
|
|
4
|
+
cisv/libs/libcisv.dylib,sha256=RWSUioFXFssKyOe4e8o-jx2up16ealF8dSvU5OrxlG0,53952
|
|
5
|
+
cisv-0.0.76.dist-info/licenses/LICENSE,sha256=CBnIn1RPPcq_tLqt0e8uJ40t2qpHG6OtO_v94-ZN5iA,1079
|
|
6
|
+
cisv-0.0.76.dist-info/METADATA,sha256=rwOkEKePfjqm_FWjePzpPU4sVtInFAbUSS5t74u5O-M,5193
|
|
7
|
+
cisv-0.0.76.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
8
|
+
cisv-0.0.76.dist-info/top_level.txt,sha256=u5amhz2gvYUuFcRnDeGQYcv79eXNwPn-f5pxgI2jIzw,5
|
|
9
|
+
cisv-0.0.76.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 - cisv - sanix-darker.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cisv
|