fastparse 0.1.0rc13__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastparse-0.1.0rc13.data/purelib/fastparse/__init__.py +5 -0
- fastparse-0.1.0rc13.data/purelib/fastparse/_version.py +1 -0
- fastparse-0.1.0rc13.data/purelib/fastparse/native/fastparse.dll +0 -0
- fastparse-0.1.0rc13.data/purelib/tsmp/__init__.py +63 -0
- fastparse-0.1.0rc13.data/purelib/tsmp/native.py +465 -0
- fastparse-0.1.0rc13.dist-info/METADATA +88 -0
- fastparse-0.1.0rc13.dist-info/RECORD +9 -0
- fastparse-0.1.0rc13.dist-info/WHEEL +5 -0
- fastparse-0.1.0rc13.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0rc13"
|
|
Binary file
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
from .native import (
|
|
2
|
+
FIELD_NAMES,
|
|
3
|
+
TSMP_FIELD_ALL,
|
|
4
|
+
TSMP_FIELD_BYTE_RANGE,
|
|
5
|
+
TSMP_FIELD_CHILDREN,
|
|
6
|
+
TSMP_FIELD_CHILD_COUNT,
|
|
7
|
+
TSMP_FIELD_DIAGNOSTICS,
|
|
8
|
+
TSMP_FIELD_ID,
|
|
9
|
+
TSMP_FIELD_PARENT_ID,
|
|
10
|
+
TSMP_FIELD_RANGE,
|
|
11
|
+
TSMP_FIELD_RULE,
|
|
12
|
+
TSMP_FIELD_TEXT,
|
|
13
|
+
TSMP_FORMAT_CSV,
|
|
14
|
+
TSMP_FORMAT_DIAGNOSTICS,
|
|
15
|
+
TSMP_FORMAT_JSON,
|
|
16
|
+
TSMP_FORMAT_STATS,
|
|
17
|
+
TSMP_FORMAT_BINARY,
|
|
18
|
+
TSMP_NORMALIZATION_AUTO_SAFE,
|
|
19
|
+
TSMP_NORMALIZATION_COBOL_FIXED_LEGACY,
|
|
20
|
+
TSMP_NORMALIZATION_NONE,
|
|
21
|
+
NativeParseError,
|
|
22
|
+
LanguageLoadResult,
|
|
23
|
+
ParseResult,
|
|
24
|
+
ParseSummary,
|
|
25
|
+
FastParse,
|
|
26
|
+
Tsmp,
|
|
27
|
+
TsmpError,
|
|
28
|
+
TsmpLibrary,
|
|
29
|
+
default_library_path,
|
|
30
|
+
parse_field_mask,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Public names re-exported from ``.native``; the order is kept as published.
__all__ = [
    # Name table, result/error types and the primary binding class alias.
    "FIELD_NAMES",
    "NativeParseError",
    "LanguageLoadResult",
    "ParseResult",
    "ParseSummary",
    "FastParse",
    # Field-selection bit flags.
    "TSMP_FIELD_ALL",
    "TSMP_FIELD_BYTE_RANGE",
    "TSMP_FIELD_CHILDREN",
    "TSMP_FIELD_CHILD_COUNT",
    "TSMP_FIELD_DIAGNOSTICS",
    "TSMP_FIELD_ID",
    "TSMP_FIELD_PARENT_ID",
    "TSMP_FIELD_RANGE",
    "TSMP_FIELD_RULE",
    "TSMP_FIELD_TEXT",
    # Output format codes.
    "TSMP_FORMAT_CSV",
    "TSMP_FORMAT_DIAGNOSTICS",
    "TSMP_FORMAT_JSON",
    "TSMP_FORMAT_STATS",
    "TSMP_FORMAT_BINARY",
    # Normalization mode codes.
    "TSMP_NORMALIZATION_AUTO_SAFE",
    "TSMP_NORMALIZATION_COBOL_FIXED_LEGACY",
    "TSMP_NORMALIZATION_NONE",
    # Binding class, its aliases and helper functions.
    "Tsmp",
    "TsmpError",
    "TsmpLibrary",
    "default_library_path",
    "parse_field_mask",
]
|
|
@@ -0,0 +1,465 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ctypes
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Iterable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Output format codes placed in the ``format`` member of the options struct.
TSMP_FORMAT_JSON = 1
TSMP_FORMAT_CSV = 2
TSMP_FORMAT_STATS = 3
TSMP_FORMAT_BINARY = 4
TSMP_FORMAT_DIAGNOSTICS = 5

# Normalization mode codes placed in the ``normalization`` member of the
# v2 options struct.
TSMP_NORMALIZATION_AUTO_SAFE = 0
TSMP_NORMALIZATION_NONE = 1
TSMP_NORMALIZATION_COBOL_FIXED_LEGACY = 2

# Field-selection bit flags; individual flags can be OR-ed into one mask.
TSMP_FIELD_ID = 0x001
TSMP_FIELD_PARENT_ID = 0x002
TSMP_FIELD_RULE = 0x004
TSMP_FIELD_TEXT = 0x008
TSMP_FIELD_RANGE = 0x010
TSMP_FIELD_BYTE_RANGE = 0x020
TSMP_FIELD_CHILD_COUNT = 0x040
TSMP_FIELD_CHILDREN = 0x080
TSMP_FIELD_DIAGNOSTICS = 0x100
TSMP_FIELD_ALL = 0xFFFFFFFF

# Accepted format names -> format code. Insertion order matters: reverse
# lookups by code return the first matching name ("binary", not "msgpack").
FORMAT_NAMES = {
    "json": TSMP_FORMAT_JSON,
    "csv": TSMP_FORMAT_CSV,
    "stats": TSMP_FORMAT_STATS,
    "binary": TSMP_FORMAT_BINARY,
    "msgpack": TSMP_FORMAT_BINARY,
    "diagnostics": TSMP_FORMAT_DIAGNOSTICS,
}

# Accepted field names (including short aliases) -> field bit flag.
FIELD_NAMES = {
    "id": TSMP_FIELD_ID,
    "parent_id": TSMP_FIELD_PARENT_ID,
    "parent": TSMP_FIELD_PARENT_ID,
    "rule": TSMP_FIELD_RULE,
    "text": TSMP_FIELD_TEXT,
    "range": TSMP_FIELD_RANGE,
    "byte_range": TSMP_FIELD_BYTE_RANGE,
    "bytes": TSMP_FIELD_BYTE_RANGE,
    "child_count": TSMP_FIELD_CHILD_COUNT,
    "children": TSMP_FIELD_CHILDREN,
    "diagnostics": TSMP_FIELD_DIAGNOSTICS,
    "all": TSMP_FIELD_ALL,
}

# Accepted normalization names (including aliases) -> normalization code.
# Insertion order matters for reverse lookups by code.
NORMALIZATION_NAMES = {
    "auto": TSMP_NORMALIZATION_AUTO_SAFE,
    "auto_safe": TSMP_NORMALIZATION_AUTO_SAFE,
    "safe": TSMP_NORMALIZATION_AUTO_SAFE,
    "none": TSMP_NORMALIZATION_NONE,
    "off": TSMP_NORMALIZATION_NONE,
    "cobol_fixed_legacy": TSMP_NORMALIZATION_COBOL_FIXED_LEGACY,
    "cobol": TSMP_NORMALIZATION_COBOL_FIXED_LEGACY,
}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TsmpError(RuntimeError):
    """Base class for errors raised by this binding."""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class NativeParseError(TsmpError):
    """Raised when a native library call reports a non-zero status."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass(frozen=True)
class ParseResult:
    """Immutable record holding the output of one parse call."""

    # Raw output bytes produced by the parse call.
    data: bytes
    # Node count reported alongside the output.
    node_count: int
    # Name of the output format the data was produced in.
    output_format: str

    @property
    def text(self) -> str:
        """Return ``data`` decoded as UTF-8 (strict error handling)."""
        raw = self.data
        return raw.decode("utf-8")

    def json(self) -> Any:
        """Deserialize ``data`` as JSON and return the resulting object."""
        payload = json.loads(self.data)
        return payload
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass(frozen=True)
class ParseSummary:
    """Immutable record of a parse call's sizes, without the output bytes."""

    # Length of the generated output, in bytes.
    output_length: int
    # Node count reported alongside the output.
    node_count: int
    # Name of the output format that was requested.
    output_format: str
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass(frozen=True)
class LanguageLoadResult:
    """Immutable record describing a successfully loaded language extension."""

    # Language identifier string returned by the native loader.
    language: str
    # Display name string returned by the native loader.
    display_name: str
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class _TsmpOptions(ctypes.Structure):
|
|
104
|
+
_fields_ = [
|
|
105
|
+
("language", ctypes.c_char_p),
|
|
106
|
+
("format", ctypes.c_int),
|
|
107
|
+
("include_rules", ctypes.c_char_p),
|
|
108
|
+
("fields", ctypes.c_uint),
|
|
109
|
+
("include_tokens", ctypes.c_int),
|
|
110
|
+
("pretty", ctypes.c_int),
|
|
111
|
+
]
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class _TsmpOptionsV2(ctypes.Structure):
|
|
115
|
+
_fields_ = [
|
|
116
|
+
("language", ctypes.c_char_p),
|
|
117
|
+
("format", ctypes.c_int),
|
|
118
|
+
("include_rules", ctypes.c_char_p),
|
|
119
|
+
("fields", ctypes.c_uint),
|
|
120
|
+
("include_tokens", ctypes.c_int),
|
|
121
|
+
("pretty", ctypes.c_int),
|
|
122
|
+
("normalization", ctypes.c_int),
|
|
123
|
+
]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class _TsmpResult(ctypes.Structure):
|
|
127
|
+
_fields_ = [
|
|
128
|
+
("status", ctypes.c_int),
|
|
129
|
+
("data", ctypes.c_void_p),
|
|
130
|
+
("length", ctypes.c_size_t),
|
|
131
|
+
("node_count", ctypes.c_size_t),
|
|
132
|
+
("error_message", ctypes.c_void_p),
|
|
133
|
+
]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class _LanguageLoadResult(ctypes.Structure):
|
|
137
|
+
_fields_ = [
|
|
138
|
+
("status", ctypes.c_int),
|
|
139
|
+
("language", ctypes.c_void_p),
|
|
140
|
+
("display_name", ctypes.c_void_p),
|
|
141
|
+
("error_message", ctypes.c_void_p),
|
|
142
|
+
]
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def default_library_path() -> Path:
    """Return the path of the native shared library to load.

    Resolution order:

    1. ``FASTPARSE_LIBRARY_PATH``, then ``TSMP_LIBRARY_PATH``. A non-empty
       value is returned as-is, without checking that it exists.
    2. A ``native`` directory next to this module, then ``fastparse/native``
       in the directory that contains this module's package.
    3. ``bin`` and then ``lib`` under the directory three levels above this
       module's directory.
    4. If nothing exists, the first candidate name under that directory's
       ``bin`` (Windows) or ``lib`` (elsewhere) folder. This path does not
       exist, so loading it fails with an error naming the expected location.

    Returns:
        The selected library path.
    """
    env_path = os.environ.get("FASTPARSE_LIBRARY_PATH") or os.environ.get("TSMP_LIBRARY_PATH")
    if env_path:
        return Path(env_path)

    if sys.platform == "darwin":
        candidates = ("libfastparse.dylib", "libtsmp.dylib", "libts_multi_parser.dylib")
    elif sys.platform == "win32":
        candidates = ("fastparse.dll", "tsmp.dll")
    else:
        candidates = ("libfastparse.so", "libtsmp.so", "libts_multi_parser.so")

    module_file = Path(__file__).resolve()
    ancestors = module_file.parents

    def ancestor(depth: int) -> Path:
        # Clamp to the filesystem root: indexing ``parents`` directly raises
        # IndexError when the module is installed at a shallow path.
        return ancestors[min(depth, len(ancestors) - 1)]

    site_packages = ancestor(1)
    project_root = ancestor(3)

    # Bundled wheel locations are searched before the bin/lib layout.
    search_dirs = (
        module_file.parent / "native",
        site_packages / "fastparse" / "native",
        project_root / "bin",
        project_root / "lib",
    )
    for directory in search_dirs:
        for name in candidates:
            candidate = directory / name
            if candidate.exists():
                return candidate

    return project_root / ("bin" if sys.platform == "win32" else "lib") / candidates[0]
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def parse_field_mask(fields: int | str | Iterable[str] | None) -> int:
    """Convert a field selection into an integer bit mask.

    Args:
        fields: ``None`` or ``""`` (yields 0), an integer (returned
            unchanged), a comma-separated string of field names, or an
            iterable of field names. Names are stripped, lower-cased and
            have ``-`` replaced by ``_`` before lookup; blank names are
            ignored.

    Returns:
        The OR of the flags for every named field. As soon as a name maps
        to ``TSMP_FIELD_ALL`` the function returns 0, the same value it
        returns for ``None``.

    Raises:
        ValueError: If a name is not a key of ``FIELD_NAMES``.
    """
    if fields is None or fields == "":
        return 0
    if isinstance(fields, int):
        return fields

    requested = fields.split(",") if isinstance(fields, str) else fields

    combined = 0
    for entry in requested:
        key = str(entry).strip().lower().replace("-", "_")
        if key:
            try:
                flag = FIELD_NAMES[key]
            except KeyError as exc:
                valid = ", ".join(sorted(FIELD_NAMES))
                raise ValueError(f"unknown field '{entry}'. Valid fields: {valid}") from exc
            if flag == TSMP_FIELD_ALL:
                return 0
            combined |= flag
    return combined
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _format_value(output_format: str | int) -> tuple[str, int]:
    """Resolve an output format given by name or code to ``(name, code)``.

    An integer is matched against the values of ``FORMAT_NAMES`` and the
    first name with that code is returned. A string is stripped,
    lower-cased, has ``-`` replaced by ``_`` and is then looked up by name.

    Raises:
        ValueError: If the code or name is not present in ``FORMAT_NAMES``.
    """
    if isinstance(output_format, int):
        hit = next(
            ((label, code) for label, code in FORMAT_NAMES.items() if code == output_format),
            None,
        )
        if hit is None:
            raise ValueError(f"unknown TSMP format value: {output_format}")
        return hit

    key = output_format.strip().lower().replace("-", "_")
    try:
        code = FORMAT_NAMES[key]
    except KeyError as exc:
        valid = ", ".join(sorted(FORMAT_NAMES))
        raise ValueError(f"unknown format '{output_format}'. Valid formats: {valid}") from exc
    return key, code
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _normalization_value(normalization: str | int | None) -> tuple[str, int]:
    """Resolve a normalization mode given by name or code to ``(name, code)``.

    ``None`` selects ``("auto_safe", TSMP_NORMALIZATION_AUTO_SAFE)``. An
    integer is matched against the values of ``NORMALIZATION_NAMES`` and the
    first name with that code is returned. A string is stripped,
    lower-cased, has ``-`` replaced by ``_`` and is then looked up by name.

    Raises:
        ValueError: If the code or name is not present in
            ``NORMALIZATION_NAMES``.
    """
    if normalization is None:
        return "auto_safe", TSMP_NORMALIZATION_AUTO_SAFE

    if isinstance(normalization, int):
        hit = next(
            ((label, code) for label, code in NORMALIZATION_NAMES.items() if code == normalization),
            None,
        )
        if hit is None:
            raise ValueError(f"unknown TSMP normalization value: {normalization}")
        return hit

    key = normalization.strip().lower().replace("-", "_")
    try:
        code = NORMALIZATION_NAMES[key]
    except KeyError as exc:
        valid = ", ".join(sorted(NORMALIZATION_NAMES))
        raise ValueError(f"unknown normalization '{normalization}'. Valid values: {valid}") from exc
    return key, code
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _rules_value(include_rules: str | Iterable[str] | None) -> bytes | None:
|
|
236
|
+
if include_rules is None:
|
|
237
|
+
return None
|
|
238
|
+
if isinstance(include_rules, str):
|
|
239
|
+
value = include_rules
|
|
240
|
+
else:
|
|
241
|
+
value = "|".join(rule for rule in include_rules if rule)
|
|
242
|
+
return value.encode("utf-8") if value else None
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _native_string(pointer: Any) -> str:
|
|
246
|
+
if not pointer:
|
|
247
|
+
return ""
|
|
248
|
+
return ctypes.string_at(pointer).decode("utf-8", errors="replace")
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _native_function(lib: ctypes.CDLL, preferred: str, fallback: str) -> Any:
|
|
252
|
+
try:
|
|
253
|
+
return getattr(lib, preferred)
|
|
254
|
+
except AttributeError:
|
|
255
|
+
return getattr(lib, fallback)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class Tsmp:
    """ctypes wrapper around the FastParse native library.

    The constructor loads the shared library, resolves the exported
    functions (preferring ``fastparse_*`` names and falling back to
    ``tsmp_*`` for version, parse and result-free) and declares their
    argument and return types.
    """

    def __init__(self, library_path: str | Path | None = None) -> None:
        """Load the native library and bind its functions.

        Args:
            library_path: Path of the shared library. When falsy, the path
                from ``default_library_path()`` is used.

        Raises:
            OSError: If the library cannot be loaded.
            AttributeError: If a required function is not exported.
        """
        self.library_path = Path(library_path) if library_path else default_library_path()
        self._dll_directory = None
        if sys.platform == "win32" and hasattr(os, "add_dll_directory"):
            # The Win32 AddDllDirectory call requires an absolute path, so a
            # relative library path must be made absolute first.
            self._dll_directory = os.add_dll_directory(os.path.abspath(self.library_path.parent))
        self._lib = ctypes.CDLL(str(self.library_path))

        self._version_fn = _native_function(self._lib, "fastparse_version", "tsmp_version")
        self._parse_fn = _native_function(self._lib, "fastparse_parse", "tsmp_parse")
        # The v2 entry point is optional; None selects the original one.
        self._parse_v2_fn = getattr(self._lib, "fastparse_parse_v2", None)
        self._free_fn = _native_function(self._lib, "fastparse_result_free", "tsmp_result_free")
        self._load_language_extension_fn = getattr(self._lib, "fastparse_load_language_extension")
        self._language_available_fn = getattr(self._lib, "fastparse_language_available")
        self._language_load_result_free_fn = getattr(self._lib, "fastparse_language_load_result_free")

        self._version_fn.argtypes = []
        self._version_fn.restype = ctypes.c_char_p

        self._parse_fn.argtypes = [
            ctypes.c_void_p,
            ctypes.c_size_t,
            ctypes.POINTER(_TsmpOptions),
            ctypes.POINTER(_TsmpResult),
        ]
        self._parse_fn.restype = ctypes.c_int

        if self._parse_v2_fn is not None:
            self._parse_v2_fn.argtypes = [
                ctypes.c_void_p,
                ctypes.c_size_t,
                ctypes.POINTER(_TsmpOptionsV2),
                ctypes.POINTER(_TsmpResult),
            ]
            self._parse_v2_fn.restype = ctypes.c_int

        self._free_fn.argtypes = [ctypes.POINTER(_TsmpResult)]
        self._free_fn.restype = None

        self._load_language_extension_fn.argtypes = [ctypes.c_char_p, ctypes.POINTER(_LanguageLoadResult)]
        self._load_language_extension_fn.restype = ctypes.c_int

        self._language_available_fn.argtypes = [ctypes.c_char_p]
        self._language_available_fn.restype = ctypes.c_int

        self._language_load_result_free_fn.argtypes = [ctypes.POINTER(_LanguageLoadResult)]
        self._language_load_result_free_fn.restype = None

    @property
    def version(self) -> str:
        """Version string reported by the native library, decoded as UTF-8."""
        return self._version_fn().decode("utf-8")

    def language_available(self, language: str) -> bool:
        """Return whether the native library reports ``language`` as available."""
        return bool(self._language_available_fn(language.encode("utf-8")))

    def load_language_extension(self, path: str | Path) -> LanguageLoadResult:
        """Ask the native library to load a language extension from ``path``.

        Returns:
            The language and display name strings reported by the loader.

        Raises:
            NativeParseError: If the call or its result reports a non-zero
                status.
        """
        result = _LanguageLoadResult()
        status = self._load_language_extension_fn(str(path).encode("utf-8"), ctypes.byref(result))
        try:
            if status != 0 or result.status != 0:
                message = _native_string(result.error_message)
                raise NativeParseError(
                    f"fastparse_load_language_extension failed with status {status}/{result.status}: "
                    f"{message or 'no error detail'}"
                )
            return LanguageLoadResult(
                language=_native_string(result.language),
                display_name=_native_string(result.display_name),
            )
        finally:
            # Strings are copied above, so the native result is always freed.
            self._language_load_result_free_fn(ctypes.byref(result))

    def parse_bytes(
        self,
        source: bytes | bytearray | memoryview,
        *,
        language: str = "java",
        output_format: str | int = "json",
        include_rules: str | Iterable[str] | None = None,
        fields: int | str | Iterable[str] | None = None,
        include_tokens: bool = False,
        pretty: bool = False,
        normalization: str | int | None = None,
    ) -> ParseResult:
        """Parse ``source`` and return the generated output copied into Python.

        Args:
            source: Source bytes to parse.
            language: Language name passed to the native library.
            output_format: Format name or ``TSMP_FORMAT_*`` code.
            include_rules: Rule filter; a string is used as-is, an iterable
                is joined with ``|``.
            fields: Field selection accepted by ``parse_field_mask``.
            include_tokens: Passed to the native library as a 0/1 flag.
            pretty: Passed to the native library as a 0/1 flag.
            normalization: Normalization name or code; ``None`` selects
                ``auto_safe``.

        Raises:
            ValueError: For an unknown format, field or normalization.
            TsmpError: If a non-default normalization is requested but the
                library has no v2 parse function.
            NativeParseError: If the native call reports a failure.
        """
        data, node_count, format_name, _output_length = self._parse_native(
            source,
            language=language,
            output_format=output_format,
            include_rules=include_rules,
            fields=fields,
            include_tokens=include_tokens,
            pretty=pretty,
            normalization=normalization,
            copy_data=True,
        )
        return ParseResult(data, node_count, format_name)

    def parse_bytes_summary(
        self,
        source: bytes | bytearray | memoryview,
        *,
        language: str = "java",
        output_format: str | int = "json",
        include_rules: str | Iterable[str] | None = None,
        fields: int | str | Iterable[str] | None = None,
        include_tokens: bool = False,
        pretty: bool = False,
        normalization: str | int | None = None,
    ) -> ParseSummary:
        """Parse ``source`` and return only the output length and node count.

        Takes the same arguments and raises the same errors as
        ``parse_bytes``, but does not copy the generated output into Python.
        """
        _data, node_count, format_name, output_length = self._parse_native(
            source,
            language=language,
            output_format=output_format,
            include_rules=include_rules,
            fields=fields,
            include_tokens=include_tokens,
            pretty=pretty,
            normalization=normalization,
            copy_data=False,
        )
        return ParseSummary(output_length, node_count, format_name)

    def _parse_native(
        self,
        source: bytes | bytearray | memoryview,
        *,
        language: str,
        output_format: str | int,
        include_rules: str | Iterable[str] | None,
        fields: int | str | Iterable[str] | None,
        include_tokens: bool,
        pretty: bool,
        normalization: str | int | None,
        copy_data: bool,
    ) -> tuple[bytes, int, str, int]:
        """Run one native parse call and unpack its result.

        Uses the v2 parse function when the library exports it, otherwise
        the original one.

        Returns:
            ``(data, node_count, format_name, output_length)``. ``data`` is
            ``b""`` when ``copy_data`` is false or no output was produced.

        Raises:
            ValueError: For an unknown format, field or normalization.
            TsmpError: If a non-default normalization is requested but the
                library has no v2 parse function.
            NativeParseError: If the call or its result reports a non-zero
                status.
        """
        # Non-bytes buffers are copied so ctypes receives an immutable bytes
        # object.
        source_bytes = source if isinstance(source, bytes) else bytes(source)
        format_name, format_code = _format_value(output_format)
        _normalization_name, normalization_code = _normalization_value(normalization)
        # Empty input is passed as a NULL pointer with length 0.
        source_pointer = ctypes.c_char_p(source_bytes) if source_bytes else None
        if self._parse_v2_fn is not None:
            options = _TsmpOptionsV2(
                language.encode("utf-8"),
                format_code,
                _rules_value(include_rules),
                parse_field_mask(fields),
                1 if include_tokens else 0,
                1 if pretty else 0,
                normalization_code,
            )
            parse_fn = self._parse_v2_fn
        else:
            # The original options struct has no normalization member.
            if normalization_code != TSMP_NORMALIZATION_AUTO_SAFE:
                raise TsmpError("native FastParse library does not support explicit normalization options")
            options = _TsmpOptions(
                language.encode("utf-8"),
                format_code,
                _rules_value(include_rules),
                parse_field_mask(fields),
                1 if include_tokens else 0,
                1 if pretty else 0,
            )
            parse_fn = self._parse_fn
        result = _TsmpResult()

        status = parse_fn(
            ctypes.cast(source_pointer, ctypes.c_void_p) if source_pointer is not None else None,
            len(source_bytes),
            ctypes.byref(options),
            ctypes.byref(result),
        )

        try:
            if status != 0 or result.status != 0:
                message = _native_string(result.error_message)
                raise NativeParseError(
                    f"tsmp_parse failed with status {status}/{result.status}: "
                    f"{message or 'no error detail'}"
                )
            output_length = int(result.length)
            # string_at copies the buffer, so the bytes stay valid after the
            # native result is freed below.
            data = (
                ctypes.string_at(result.data, result.length)
                if copy_data and result.data and result.length > 0
                else b""
            )
            return data, int(result.node_count), format_name, output_length
        finally:
            self._free_fn(ctypes.byref(result))

    def parse_text(
        self,
        source: str,
        *,
        encoding: str = "utf-8",
        errors: str = "strict",
        **kwargs: Any,
    ) -> ParseResult:
        """Encode ``source`` and parse it with ``parse_bytes``.

        Args:
            source: Text to parse.
            encoding: Encoding used to turn ``source`` into bytes.
            errors: Error handler passed to ``str.encode``.
            **kwargs: Forwarded to ``parse_bytes``.
        """
        return self.parse_bytes(source.encode(encoding, errors=errors), **kwargs)

    def parse_result(self, source: bytes | bytearray | memoryview, **kwargs: Any) -> tuple[bytes, int]:
        """Parse ``source`` and return ``(data, node_count)``.

        ``kwargs`` are forwarded to ``parse_bytes``.
        """
        result = self.parse_bytes(source, **kwargs)
        return result.data, result.node_count

    def parse(self, source: bytes | bytearray | memoryview, **kwargs: Any) -> bytes:
        """Parse ``source`` and return only the output bytes.

        ``kwargs`` are forwarded to ``parse_bytes``.
        """
        return self.parse_bytes(source, **kwargs).data


# Alternate names for the same class.
TsmpLibrary = Tsmp
FastParse = Tsmp
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fastparse
|
|
3
|
+
Version: 0.1.0rc13
|
|
4
|
+
Summary: Python binding for the FastParse Tree-sitter parser library with bundled native runtime
|
|
5
|
+
Project-URL: Homepage, https://github.com/natan-sysview/fast_parser
|
|
6
|
+
Project-URL: Repository, https://github.com/natan-sysview/fast_parser
|
|
7
|
+
Project-URL: Documentation, https://github.com/natan-sysview/fast_parser/tree/main/docs
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# FastParse Python Binding
|
|
12
|
+
|
|
13
|
+
Small `ctypes` binding for the FastParse native C library.
|
|
14
|
+
|
|
15
|
+
The binding keeps the same design boundary as the native library:
|
|
16
|
+
|
|
17
|
+
- Python owns file I/O.
|
|
18
|
+
- Python passes bytes already loaded in RAM.
|
|
19
|
+
- FastParse returns JSON, CSV, MessagePack binary, diagnostics, or stats in RAM.
|
|
20
|
+
- Python copies the returned native buffer and frees the native result.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
From PyPI, once the package is published:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install fastparse
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
From a downloaded wheel:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install fastparse-0.1.0rc13-py3-none-macosx_11_0_arm64.whl
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The wheel includes the native library for its platform, so normal users do not need `FASTPARSE_LIBRARY_PATH`.
|
|
37
|
+
|
|
38
|
+
## Quick Use
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from fastparse import FastParse
|
|
42
|
+
|
|
43
|
+
parser = FastParse()
|
|
44
|
+
source = b"class Demo { void run() {} }"
|
|
45
|
+
|
|
46
|
+
result = parser.parse_bytes(
|
|
47
|
+
source,
|
|
48
|
+
language="java",
|
|
49
|
+
output_format="json",
|
|
50
|
+
fields=["rule", "text", "byte_range"],
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
print(result.node_count)
|
|
54
|
+
print(result.json())
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Binary MessagePack output:
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
result = parser.parse_bytes(
|
|
61
|
+
source,
|
|
62
|
+
language="java",
|
|
63
|
+
output_format="binary",
|
|
64
|
+
fields=["rule", "text", "byte_range"],
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
print(result.node_count)
|
|
68
|
+
print(result.data) # MessagePack bytes
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
If the caller only needs node/output counts and does not need to copy the generated JSON/CSV into Python:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
summary = parser.parse_bytes_summary(
|
|
75
|
+
source,
|
|
76
|
+
language="java",
|
|
77
|
+
output_format="json",
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
print(summary.node_count)
|
|
81
|
+
print(summary.output_length)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Use `FASTPARSE_LIBRARY_PATH` to point at a specific native library:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
FASTPARSE_LIBRARY_PATH=/path/to/libfastparse.dylib python your_script.py
|
|
88
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
fastparse-0.1.0rc13.data/purelib/fastparse/__init__.py,sha256=yIeab8-qQRYh5VvFD6ShWqd6UWSipx8ShnOtmHqlJ3Q,167
|
|
2
|
+
fastparse-0.1.0rc13.data/purelib/fastparse/_version.py,sha256=ijZLxffeYWZW82GGb4k0zVqfeY9nlv4RHD7fCgRu3qc,27
|
|
3
|
+
fastparse-0.1.0rc13.data/purelib/fastparse/native/fastparse.dll,sha256=tgiaCUU9RCVsZmEwaQovVhLUKflAtyLEneOL06HydBY,504320
|
|
4
|
+
fastparse-0.1.0rc13.data/purelib/tsmp/__init__.py,sha256=FWB3rjwfihsD1zyZtBkxDD1ow7GUA17BR4jHoZXmCEA,1490
|
|
5
|
+
fastparse-0.1.0rc13.data/purelib/tsmp/native.py,sha256=fJNldsIcCcoyNojdvWTUh_tY2keMO4jr-m9LV6sgqBQ,16057
|
|
6
|
+
fastparse-0.1.0rc13.dist-info/METADATA,sha256=xUWB7xEVnjXebxvU6GZYUVC-q-UP_d6tTHQSbMhorHw,2192
|
|
7
|
+
fastparse-0.1.0rc13.dist-info/WHEEL,sha256=GjDPPQwEcripVP6P2r3RxLa-h5Lb9ifGB7FYYtbLDT0,98
|
|
8
|
+
fastparse-0.1.0rc13.dist-info/top_level.txt,sha256=latRbd5VWXx8kFVXF17DbmnDlmH15LvC995ajzu_AAY,15
|
|
9
|
+
fastparse-0.1.0rc13.dist-info/RECORD,,
|