pyjsonfrag 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/customers.json +16 -0
- examples/parse_frag.py +43 -0
- examples/parse_stream.py +59 -0
- examples/parse_tree.py +4 -0
- examples/pyjsonfrag/__init__.py +765 -0
- pyjsonfrag-0.0.1.dist-info/METADATA +230 -0
- pyjsonfrag-0.0.1.dist-info/RECORD +9 -0
- pyjsonfrag-0.0.1.dist-info/WHEEL +4 -0
- pyjsonfrag-0.0.1.dist-info/licenses/LICENSE +19 -0
examples/customers.json
ADDED
examples/parse_frag.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import pyjsonfrag
|
|
2
|
+
|
|
3
|
+
class Customer(object):
    """Plain record describing one customer read from the JSON input.

    ``customerId`` and ``accountCount`` are coerced with ``int()`` and
    ``totalBalance`` with ``float()`` when given; any field that is left
    as ``None`` stays ``None``.
    """

    def __init__(self, customerId = None, name = None, accountCount = None, totalBalance = None):
        self.customerId = None if customerId is None else int(customerId)
        self.name = name
        self.accountCount = None if accountCount is None else int(accountCount)
        self.totalBalance = None if totalBalance is None else float(totalBalance)

    def __repr__(s):
        """Return ``Customer(id,name,accountCount,totalBalance)``.

        Unset (``None``) numeric fields are rendered as ``None``; formatting
        them with ``%d`` / ``%.2f`` would raise TypeError.
        """
        def fmt(spec, value):
            return "None" if value is None else spec % (value,)

        return "Customer(%s,%s,%s,%s)" % (
            fmt("%d", s.customerId),
            s.name,
            fmt("%d", s.accountCount),
            fmt("%.2f", s.totalBalance),
        )
|
|
20
|
+
|
|
21
|
+
# Customers collected by MyHandler.end_frag_dict, keyed by customerId.
cs = {}
|
|
22
|
+
|
|
23
|
+
class MyHandler(pyjsonfrag.FragmentHandler):
    """Collect each object directly under the top-level "customers" array.

    Every such object is requested as a fragment when it starts and is
    turned into a Customer, stored in the module-level ``cs`` dict, when
    it ends.
    """

    def start_frag_dict(self, key):
        # Path: root dict (None) -> "customers" array -> array element (None).
        at_customer = self.path_is([None, "customers", None])
        if at_customer:
            self.start_frag_collection()

    def end_frag_dict(self, key, val):
        if not self.path_is([None, "customers", None]):
            return
        customer = Customer(
            customerId=val["id"],
            name=val["name"],
            accountCount=val["accountCount"],
            totalBalance=val["totalBalance"],
        )
        cs[customer.customerId] = customer
|
|
32
|
+
|
|
33
|
+
# Stream customers.json through the parser in 4096-character chunks.
handler = MyHandler()
stream = pyjsonfrag.JsonStream(handler)
with open("customers.json", "r") as f:
    at_eof = False
    while not at_eof:
        buf = f.read(4096)
        # An empty read marks end of file; it is still fed, flagged as EOF.
        at_eof = buf == ''
        stream.feed(buf, 0, len(buf), at_eof)
print(cs)
|
examples/parse_stream.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import pyjsonfrag
|
|
2
|
+
|
|
3
|
+
class Customer(object):
    """Plain record describing one customer read from the JSON input.

    ``customerId`` and ``accountCount`` are coerced with ``int()`` and
    ``totalBalance`` with ``float()`` when given; any field that is left
    as ``None`` stays ``None``.
    """

    def __init__(self, customerId = None, name = None, accountCount = None, totalBalance = None):
        self.customerId = None if customerId is None else int(customerId)
        self.name = name
        self.accountCount = None if accountCount is None else int(accountCount)
        self.totalBalance = None if totalBalance is None else float(totalBalance)

    def __repr__(s):
        """Return ``Customer(id,name,accountCount,totalBalance)``.

        Unset (``None``) numeric fields are rendered as ``None``; formatting
        them with ``%d`` / ``%.2f`` would raise TypeError, which matters here
        because instances start out empty and are filled in field by field.
        """
        def fmt(spec, value):
            return "None" if value is None else spec % (value,)

        return "Customer(%s,%s,%s,%s)" % (
            fmt("%d", s.customerId),
            s.name,
            fmt("%d", s.accountCount),
            fmt("%.2f", s.totalBalance),
        )
|
|
20
|
+
|
|
21
|
+
# Keys of the containers currently open, outermost first
# (pushed by MyHandler.start_dict/start_array, popped by end_dict/end_array).
context = []
|
|
22
|
+
# Customers seen so far, keyed by the integer value of their "id" field.
cs = {}
|
|
23
|
+
# Customer currently being filled in; replaced whenever a dict directly
# under the top-level "customers" array starts.
c = None
|
|
24
|
+
|
|
25
|
+
class MyHandler(pyjsonfrag.JsonHandler):
    """Build Customer objects directly from low-level parse events.

    The module-level ``context`` list tracks the keys of the open containers.
    A new Customer is started for every dict at the path
    root -> "customers" -> element, and scalar fields are copied into it
    only while that dict is the innermost open container. Without that
    check, a "name"/"id" key elsewhere in the document would either hit
    ``c is None`` (AttributeError) or overwrite the current customer.
    """

    def _in_customer(self):
        # Scalar callbacks arrive while the customer's own dict is still the
        # innermost entry of ``context``.
        return c is not None and context == [None, "customers", None]

    def start_dict(self, key):
        global c
        context.append(key)
        if context == [None, "customers", None]:
            c = Customer()

    def start_array(self, key):
        context.append(key)

    def end_dict(self, key):
        context.pop()

    def end_array(self, key):
        context.pop()

    def handle_string(self, key, val):
        if key == "name" and self._in_customer():
            c.name = val

    def handle_number(self, key, num, is_integer):
        if not self._in_customer():
            return
        if key == "id":
            cs[int(num)] = c
            c.customerId = int(num)
        elif key == "accountCount":
            c.accountCount = int(num)
        elif key == "totalBalance":
            c.totalBalance = num
|
|
48
|
+
|
|
49
|
+
# Stream customers.json through the parser in 4096-character chunks.
handler = MyHandler()
stream = pyjsonfrag.JsonStream(handler)
with open("customers.json", "r") as f:
    at_eof = False
    while not at_eof:
        buf = f.read(4096)
        # An empty read marks end of file; it is still fed, flagged as EOF.
        at_eof = buf == ''
        stream.feed(buf, 0, len(buf), at_eof)
print(cs)
|
examples/parse_tree.py
ADDED
|
@@ -0,0 +1,765 @@
|
|
|
1
|
+
import io
|
|
2
|
+
|
|
3
|
+
# Parser states ("modes") of JsonStream, numbered consecutively from 1.
(
    JSONSTREAM_MODE_KEYSTRING,          # inside a dict key string
    JSONSTREAM_MODE_KEYSTRING_ESCAPE,   # just after a backslash in a key
    JSONSTREAM_MODE_KEYSTRING_UESCAPE,  # reading the hex digits of \uXXXX in a key
    JSONSTREAM_MODE_STRING,             # inside a string value
    JSONSTREAM_MODE_STRING_ESCAPE,      # just after a backslash in a string value
    JSONSTREAM_MODE_STRING_UESCAPE,     # reading the hex digits of \uXXXX in a value
    JSONSTREAM_MODE_TRUE,               # inside the literal "true"
    JSONSTREAM_MODE_FALSE,              # inside the literal "false"
    JSONSTREAM_MODE_NULL,               # inside the literal "null"
    JSONSTREAM_MODE_FIRSTKEY,           # right after '{'
    JSONSTREAM_MODE_KEY,                # after ',' inside a dict
    JSONSTREAM_MODE_FIRSTVAL,           # right after '['
    JSONSTREAM_MODE_VAL,                # a value is expected
    JSONSTREAM_MODE_COLON,              # ':' is expected after a key
    JSONSTREAM_MODE_COMMA,              # after a value: ',' or a closing bracket
    JSONSTREAM_MODE_NUMBER,             # inside a number
    JSONSTREAM_MODE_ENDWS,              # root value done; only whitespace/comments
) = range(1, 18)
|
|
20
|
+
|
|
21
|
+
class JsonHandler(object):
    """Base class for the event callbacks invoked by JsonStream.

    Every callback here does nothing and returns ``None``; subclasses
    override the ones they need. ``key`` is the dict key the value is
    stored under, or ``None`` when the value is not keyed (array element
    or root value).
    """

    def handle_comment(self, comma_seen, val, is_multiline):
        """Receive the text of a finished comment.

        ``is_multiline`` is True for ``/* */`` comments and False for
        ``//`` comments; ``comma_seen`` is the parser's comma flag at the
        time the comment ended.
        """
        return None

    def start_dict(self, key):
        """Called when a '{' opens a dict."""
        return None

    def start_array(self, key):
        """Called when a '[' opens an array."""
        return None

    def end_dict(self, key):
        """Called when a '}' closes a dict."""
        return None

    def end_array(self, key):
        """Called when a ']' closes an array."""
        return None

    def handle_number(self, key, num, is_integer):
        """Called for a number; ``is_integer`` is False once '.', 'e' or 'E' was seen."""
        return None

    def handle_string(self, key, val):
        """Called for a string value."""
        return None

    def handle_boolean(self, key, val):
        """Called for ``true`` / ``false`` with ``val`` as a bool."""
        return None

    def handle_null(self, key):
        """Called for ``null``."""
        return None
|
|
40
|
+
|
|
41
|
+
class FragmentHandler(JsonHandler):
|
|
42
|
+
def __init__(self):
|
|
43
|
+
self.stack = []
|
|
44
|
+
self.fragstack = []
|
|
45
|
+
self.collect = False
|
|
46
|
+
self.val = None
|
|
47
|
+
def path_is(self, path):
|
|
48
|
+
return self.stack == path
|
|
49
|
+
def start_frag_collection(self):
|
|
50
|
+
self.collect = True
|
|
51
|
+
def handle_frag_comment(self, comma_seen, val, is_multiline):
|
|
52
|
+
pass
|
|
53
|
+
def handle_comment(self, comma_seen, val, is_multiline):
|
|
54
|
+
self.handle_frag_comment(comma_seen, val, is_multiline)
|
|
55
|
+
def start_frag_dict(self, key):
|
|
56
|
+
pass
|
|
57
|
+
def start_dict(self, key):
|
|
58
|
+
self.stack.append(key)
|
|
59
|
+
if not self.collect:
|
|
60
|
+
self.start_frag_dict(key)
|
|
61
|
+
if self.collect:
|
|
62
|
+
obj = {}
|
|
63
|
+
if self.fragstack:
|
|
64
|
+
if key is not None:
|
|
65
|
+
self.fragstack[-1][key] = obj
|
|
66
|
+
else:
|
|
67
|
+
self.fragstack[-1].append(obj)
|
|
68
|
+
self.fragstack.append(obj)
|
|
69
|
+
def start_frag_array(self, key):
|
|
70
|
+
pass
|
|
71
|
+
def start_array(self, key):
|
|
72
|
+
self.stack.append(key)
|
|
73
|
+
if not self.collect:
|
|
74
|
+
self.start_frag_array(key)
|
|
75
|
+
if self.collect:
|
|
76
|
+
obj = []
|
|
77
|
+
if self.fragstack:
|
|
78
|
+
if key is not None:
|
|
79
|
+
self.fragstack[-1][key] = obj
|
|
80
|
+
else:
|
|
81
|
+
self.fragstack[-1].append(obj)
|
|
82
|
+
self.fragstack.append(obj)
|
|
83
|
+
def end_frag_dict(self, key, val):
|
|
84
|
+
pass
|
|
85
|
+
def end_dict(self, key):
|
|
86
|
+
val = None
|
|
87
|
+
if self.fragstack:
|
|
88
|
+
val1 = self.fragstack.pop()
|
|
89
|
+
if not self.fragstack:
|
|
90
|
+
val = val1
|
|
91
|
+
self.collect = False
|
|
92
|
+
if not self.collect:
|
|
93
|
+
self.end_frag_dict(key, val)
|
|
94
|
+
self.stack.pop()
|
|
95
|
+
def end_frag_array(self, key, val):
|
|
96
|
+
pass
|
|
97
|
+
def end_array(self, key):
|
|
98
|
+
val = None
|
|
99
|
+
if self.fragstack:
|
|
100
|
+
val1 = self.fragstack.pop()
|
|
101
|
+
if not self.fragstack:
|
|
102
|
+
val = val1
|
|
103
|
+
self.collect = False
|
|
104
|
+
if not self.collect:
|
|
105
|
+
self.end_frag_array(key, val)
|
|
106
|
+
self.stack.pop()
|
|
107
|
+
def handle_frag_number(self, key, num, is_integer):
|
|
108
|
+
pass
|
|
109
|
+
def handle_frag_string(self, key, val):
|
|
110
|
+
pass
|
|
111
|
+
def handle_frag_boolean(self, key, val):
|
|
112
|
+
pass
|
|
113
|
+
def handle_frag_null(self, key):
|
|
114
|
+
pass
|
|
115
|
+
def handle_number(self, key, num, is_integer):
|
|
116
|
+
if is_integer:
|
|
117
|
+
num = int(num)
|
|
118
|
+
if self.collect and self.fragstack:
|
|
119
|
+
if key is not None:
|
|
120
|
+
self.fragstack[-1][key] = num
|
|
121
|
+
else:
|
|
122
|
+
self.fragstack[-1].append(num)
|
|
123
|
+
if self.collect:
|
|
124
|
+
return
|
|
125
|
+
self.handle_frag_number(key, num, is_integer)
|
|
126
|
+
def handle_string(self, key, val):
|
|
127
|
+
if self.collect and self.fragstack:
|
|
128
|
+
if key is not None:
|
|
129
|
+
self.fragstack[-1][key] = val
|
|
130
|
+
else:
|
|
131
|
+
self.fragstack[-1].append(val)
|
|
132
|
+
if self.collect:
|
|
133
|
+
return
|
|
134
|
+
self.handle_frag_string(key, val)
|
|
135
|
+
def handle_boolean(self, key, val):
|
|
136
|
+
if self.collect and self.fragstack:
|
|
137
|
+
if key is not None:
|
|
138
|
+
self.fragstack[-1][key] = val
|
|
139
|
+
else:
|
|
140
|
+
self.fragstack[-1].append(val)
|
|
141
|
+
if self.collect:
|
|
142
|
+
return
|
|
143
|
+
self.handle_frag_boolean(key, val)
|
|
144
|
+
def handle_null(self, key):
|
|
145
|
+
val = None
|
|
146
|
+
if self.collect and self.fragstack:
|
|
147
|
+
if key is not None:
|
|
148
|
+
self.fragstack[-1][key] = val
|
|
149
|
+
else:
|
|
150
|
+
self.fragstack[-1].append(val)
|
|
151
|
+
if self.collect:
|
|
152
|
+
return
|
|
153
|
+
self.handle_frag_null(key)
|
|
154
|
+
|
|
155
|
+
class JsonStream(object):
    """Incremental (push) JSON parser that reports events to a handler.

    Text is supplied in arbitrary chunks through ``feed()``; the parser
    keeps its state between calls and invokes the ``JsonHandler``-style
    callbacks of ``handler`` as values are recognised. A callback attribute
    that is falsy (e.g. ``None``) is skipped.

    Options (off by default): ``allow_comments()`` accepts ``//`` and
    ``/* */`` comments wherever whitespace is accepted;
    ``allow_trailing_comma()`` accepts a ',' before a closing '}' or ']'.

    All syntax errors are raised as ``Exception``; ``errloc`` then holds the
    offset, relative to ``start`` of the failing ``feed()`` call, at which
    the error was detected.

    NOTE: numbers are always delivered as ``float`` (see ``handle_number``),
    and each ``\\uXXXX`` escape is converted on its own with ``chr()``, so
    surrogate pairs are not combined.
    """

    # Single-character escapes allowed after a backslash inside a string.
    _SIMPLE_ESCAPES = {
        '"': '"', '\\': '\\', '/': '/',
        'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
    }

    def __init__(self, handler):
        self.mode = JSONSTREAM_MODE_VAL
        self.sz = 0                      # characters matched of true/false/null
        self.uescape = io.StringIO()     # hex digits of the current \uXXXX escape
        self.c_comment_seen = False      # inside /* ... */
        self.c_comment_seen_star = False # last char inside /* */ was '*'
        self.cpp_comment_seen = False    # inside // ...
        self.comment_seen_preliminary = False  # saw the first '/' of a comment
        self.comments = False
        self.trailing_commas = False
        self.keypresent = False          # current container is a dict with a pending key
        self.key = io.StringIO()
        self.keystack = []               # saved key (or None) per open container
        self.val = io.StringIO()         # text of the current string/number/comment
        self.handler = handler
        self.is_integer = False
        self.comma_seen = False
        self.errloc = None               # set just before a syntax error is raised

    def allow_comments(self):
        """Accept ``//`` and ``/* */`` comments in the input."""
        self.comments = True

    def allow_trailing_comma(self):
        """Accept a trailing ',' before '}' and ']'."""
        self.trailing_commas = True

    def get_keystack(self): # for internal use only
        # Restore the key state that was saved when the container was opened.
        saved = self.keystack.pop()
        if saved is None:
            self.keypresent = False
            return
        self.keypresent = True
        self.key = saved

    def put_keystack_1(self): # for internal use only
        # Save the key the new container is stored under (None if unkeyed).
        self.keystack.append(self.key if self.keypresent else None)

    def put_keystack_2(self): # for internal use only
        self.keypresent = False

    def get_key(self): # for internal use only
        if not self.keypresent:
            return None
        return self.key.getvalue()

    # -- private helpers ----------------------------------------------------

    @staticmethod
    def _in_ws_mode(mode):
        # Modes in which whitespace and comments may appear between tokens.
        return mode in (JSONSTREAM_MODE_COLON, JSONSTREAM_MODE_COMMA,
                        JSONSTREAM_MODE_FIRSTKEY, JSONSTREAM_MODE_FIRSTVAL,
                        JSONSTREAM_MODE_KEY, JSONSTREAM_MODE_VAL)

    def _emit(self, callback, *args):
        # Invoke a value/container callback with the current key, if it is set.
        if callback:
            callback(self.get_key(), *args)

    def _emit_comment(self, is_multiline):
        callback = self.handler.handle_comment
        if callback:
            callback(self.comma_seen, self.val.getvalue(), is_multiline)

    def _comment_char(self, ch, i, can_start):
        # Feed one character to the comment recogniser. Returns True when the
        # character belonged to a comment (or started one) and was consumed.
        if (can_start and self.comments and ch == '/'
                and not self.comment_seen_preliminary
                and not self.cpp_comment_seen
                and not self.c_comment_seen):
            self.comment_seen_preliminary = True
            self.val = io.StringIO()
            return True
        if self.comment_seen_preliminary:
            if ch != '*' and ch != '/':
                self.errloc = i
                raise Exception("illegal comment")
            self.comment_seen_preliminary = False
            if ch == '*':
                self.c_comment_seen = True
                self.c_comment_seen_star = False
            else:
                self.cpp_comment_seen = True
            self.val = io.StringIO()
            return True
        if self.c_comment_seen:
            if ch == '/' and self.c_comment_seen_star:
                self.c_comment_seen = False
                self.c_comment_seen_star = False
                self._emit_comment(True)
            elif ch == '*':
                # A '*' is held back until it is known not to close the
                # comment; a second '*' releases the first into the text.
                if self.c_comment_seen_star:
                    self.val.write('*')
                self.c_comment_seen_star = True
            else:
                if self.c_comment_seen_star:
                    self.val.write('*')
                    self.c_comment_seen_star = False
                self.val.write(ch)
            return True
        if self.cpp_comment_seen:
            if ch == '\n':
                self.cpp_comment_seen = False
                self._emit_comment(False)
            else:
                self.val.write(ch)
            return True
        return False

    def _finish_document(self, buf, start, i, sz, eof):
        # The root value is complete: check the rest of the chunk and report.
        self.strip_comment(buf, start, i, sz, eof)
        return 0 if eof else -1

    def _literal_spec(self, mode):
        # (expected text, handler callback name, extra callback args) or None.
        if mode == JSONSTREAM_MODE_TRUE:
            return "true", "handle_boolean", (True,)
        if mode == JSONSTREAM_MODE_FALSE:
            return "false", "handle_boolean", (False,)
        if mode == JSONSTREAM_MODE_NULL:
            return "null", "handle_null", ()
        return None

    def _number_accepts(self, ch):
        # True when ``ch`` may extend the number text collected so far.
        text = self.val.getvalue()
        if ch == '-' and text == "":
            return True
        # No further digits directly after a leading zero.
        if '0' <= ch <= '9' and text != "0" and text != "-0":
            return True
        has_exponent = 'E' in text or 'e' in text
        if ch == '.' and '.' not in text and not has_exponent:
            self.is_integer = False
            return True
        if (ch == 'E' or ch == 'e') and not has_exponent:
            self.is_integer = False
            return True
        if (ch == '-' or ch == '+') and text and (text[-1] == 'E' or text[-1] == 'e'):
            return True
        return False

    def _number_value(self, i):
        # Convert the collected number text; a malformed number (e.g. "-" or
        # "1e") is still a ValueError, but now with errloc set.
        try:
            return float(self.val.getvalue())
        except ValueError:
            self.errloc = i
            raise ValueError("invalid JSON")

    # -- parsing ------------------------------------------------------------

    def strip_comment(self, buf, start, i, sz, eof): # for internal use only
        """Consume what follows the root value: only whitespace and comments.

        ``i`` is the offset of the last character already consumed; scanning
        starts at ``i + 1``. Raises ``Exception("Overflow")`` on any other
        character.
        """
        i += 1
        self.mode = JSONSTREAM_MODE_ENDWS
        while i < sz:
            ch = buf[start+i]
            if not self._comment_char(ch, i, True):
                if ch != ' ' and ch != '\n' and ch != '\r' and ch != '\t':
                    self.errloc = i
                    raise Exception("Overflow")
            i += 1
        if eof and (self.c_comment_seen or self.comment_seen_preliminary):
            self.errloc = i
            raise Exception("Unterminated beginning of comment")

    def feed(self, buf, start, sz, eof):
        """Parse ``buf[start:start+sz]`` as the next chunk of the document.

        ``eof`` must be True for the final chunk (which may be empty).

        Returns 0 when called with ``eof`` true and the document is
        complete, and -1 otherwise (more input may follow). Raises
        ``Exception`` on invalid input, including a document that is still
        incomplete at ``eof``.
        """
        if start < 0 or sz < 0 or start+sz > len(buf):
            raise Exception("out of bounds")
        if self.mode == JSONSTREAM_MODE_ENDWS:
            return self._finish_document(buf, start, -1, sz, eof)
        i = 0
        while i < sz:
            ch = buf[start+i]
            mode = self.mode
            if mode == JSONSTREAM_MODE_ENDWS:
                return self._finish_document(buf, start, i-1, sz, eof)

            # ---- inside a key or value string ----
            if mode == JSONSTREAM_MODE_KEYSTRING or mode == JSONSTREAM_MODE_STRING:
                in_key = mode == JSONSTREAM_MODE_KEYSTRING
                if ch == '\\':
                    if in_key:
                        self.mode = JSONSTREAM_MODE_KEYSTRING_ESCAPE
                    else:
                        self.mode = JSONSTREAM_MODE_STRING_ESCAPE
                elif ch != '"':
                    (self.key if in_key else self.val).write(ch)
                elif in_key:
                    self.keypresent = True
                    self.mode = JSONSTREAM_MODE_COLON
                else:
                    self.mode = JSONSTREAM_MODE_COMMA
                    self._emit(self.handler.handle_string, self.val.getvalue())
                    if not self.keystack:
                        return self._finish_document(buf, start, i, sz, eof)
                i += 1
                continue
            if mode == JSONSTREAM_MODE_KEYSTRING_ESCAPE or mode == JSONSTREAM_MODE_STRING_ESCAPE:
                in_key = mode == JSONSTREAM_MODE_KEYSTRING_ESCAPE
                if ch == 'u':
                    if in_key:
                        self.mode = JSONSTREAM_MODE_KEYSTRING_UESCAPE
                    else:
                        self.mode = JSONSTREAM_MODE_STRING_UESCAPE
                    self.uescape = io.StringIO()
                elif ch in self._SIMPLE_ESCAPES:
                    (self.key if in_key else self.val).write(self._SIMPLE_ESCAPES[ch])
                    # Back to plain string mode; otherwise the next character
                    # would be parsed as another escape.
                    if in_key:
                        self.mode = JSONSTREAM_MODE_KEYSTRING
                    else:
                        self.mode = JSONSTREAM_MODE_STRING
                else:
                    self.errloc = i
                    raise Exception("Illegal sequence")
                i += 1
                continue
            if mode == JSONSTREAM_MODE_KEYSTRING_UESCAPE or mode == JSONSTREAM_MODE_STRING_UESCAPE:
                in_key = mode == JSONSTREAM_MODE_KEYSTRING_UESCAPE
                is_hex = ('0' <= ch <= '9') or ('A' <= ch <= 'F') or ('a' <= ch <= 'f')
                if not is_hex:
                    self.errloc = i
                    raise Exception("Illegal unicode escape")
                self.uescape.write(ch)
                digits = self.uescape.getvalue()
                if len(digits) == 4:
                    (self.key if in_key else self.val).write(chr(int(digits, 16)))
                    if in_key:
                        self.mode = JSONSTREAM_MODE_KEYSTRING
                    else:
                        self.mode = JSONSTREAM_MODE_STRING
                i += 1
                continue

            # ---- comments and whitespace between tokens ----
            between_tokens = self._in_ws_mode(mode)
            if self._comment_char(ch, i, between_tokens):
                i += 1
                continue
            if between_tokens and (ch == ' ' or ch == '\n' or ch == '\r' or ch == '\t'):
                i += 1
                continue

            # ---- punctuation ----
            if mode == JSONSTREAM_MODE_COLON:
                if ch != ':':
                    self.errloc = i
                    raise Exception("invalid JSON")
                self.mode = JSONSTREAM_MODE_VAL
                i += 1
                continue
            if mode == JSONSTREAM_MODE_COMMA and ch == ',':
                self.comma_seen = True
                if self.keypresent:
                    self.mode = JSONSTREAM_MODE_KEY
                    self.keypresent = False
                else:
                    self.mode = JSONSTREAM_MODE_VAL
                i += 1
                continue
            # Any other token ends the "directly after a comma" state that is
            # reported to handle_comment.
            self.comma_seen = False

            if ch == '}' and (mode == JSONSTREAM_MODE_COMMA
                              or mode == JSONSTREAM_MODE_FIRSTKEY
                              or (self.trailing_commas and mode == JSONSTREAM_MODE_KEY)):
                # After a value, '}' is only valid if we are inside a dict.
                if mode == JSONSTREAM_MODE_COMMA and not self.keypresent:
                    self.errloc = i
                    raise Exception("invalid JSON")
                self.mode = JSONSTREAM_MODE_COMMA
                self.get_keystack()
                self._emit(self.handler.end_dict)
                if not self.keystack:
                    return self._finish_document(buf, start, i, sz, eof)
                i += 1
                continue
            if ch == ']' and (mode == JSONSTREAM_MODE_COMMA
                              or mode == JSONSTREAM_MODE_FIRSTVAL
                              or (self.trailing_commas and mode == JSONSTREAM_MODE_VAL)):
                # After a value or a trailing comma, ']' is only valid inside
                # an array (no pending key, at least one open container).
                if mode != JSONSTREAM_MODE_FIRSTVAL and (self.keypresent or not self.keystack):
                    self.errloc = i
                    raise Exception("invalid JSON")
                self.mode = JSONSTREAM_MODE_COMMA
                self.get_keystack()
                self._emit(self.handler.end_array)
                if not self.keystack:
                    return self._finish_document(buf, start, i, sz, eof)
                i += 1
                continue

            value_expected = mode == JSONSTREAM_MODE_FIRSTVAL or mode == JSONSTREAM_MODE_VAL
            if value_expected and ch == '{':
                self.put_keystack_1()
                self.mode = JSONSTREAM_MODE_FIRSTKEY
                self._emit(self.handler.start_dict)
                self.put_keystack_2()
                i += 1
                continue
            if value_expected and ch == '[':
                self.put_keystack_1()
                self.mode = JSONSTREAM_MODE_FIRSTVAL
                self._emit(self.handler.start_array)
                self.put_keystack_2()
                i += 1
                continue

            # ---- true / false / null ----
            literal = self._literal_spec(mode)
            if literal is not None:
                word, callback_name, extra = literal
                if ch != word[self.sz]:
                    self.errloc = i
                    raise Exception("invalid JSON")
                self.sz += 1
                if self.sz == len(word):
                    self.mode = JSONSTREAM_MODE_COMMA
                    self._emit(getattr(self.handler, callback_name), *extra)
                    if not self.keystack:
                        return self._finish_document(buf, start, i, sz, eof)
                i += 1
                continue
            if value_expected and (ch == 'n' or ch == 'f' or ch == 't'):
                if ch == 'n':
                    self.mode = JSONSTREAM_MODE_NULL
                elif ch == 'f':
                    self.mode = JSONSTREAM_MODE_FALSE
                else:
                    self.mode = JSONSTREAM_MODE_TRUE
                self.sz = 1
                i += 1
                continue

            # ---- start of a string ----
            if ch == '"' and (mode == JSONSTREAM_MODE_KEY or mode == JSONSTREAM_MODE_FIRSTKEY):
                self.mode = JSONSTREAM_MODE_KEYSTRING
                self.key = io.StringIO()
                i += 1
                continue
            if ch == '"' and value_expected:
                self.mode = JSONSTREAM_MODE_STRING
                self.val = io.StringIO()
                i += 1
                continue

            # ---- numbers ----
            if value_expected and (ch == '-' or '0' <= ch <= '9'):
                self.mode = mode = JSONSTREAM_MODE_NUMBER
                self.is_integer = True
                self.val = io.StringIO()
            if mode == JSONSTREAM_MODE_NUMBER:
                if self._number_accepts(ch):
                    self.val.write(ch)
                    i += 1
                    continue
                # ``ch`` ends the number and is not part of it: report the
                # number, then re-examine ``ch`` in COMMA mode.
                numval = self._number_value(i)
                self.mode = JSONSTREAM_MODE_COMMA
                self._emit(self.handler.handle_number, numval, self.is_integer)
                if not self.keystack:
                    return self._finish_document(buf, start, i-1, sz, eof)
                continue # without i += 1 on purpose

            self.errloc = i
            raise Exception("invalid JSON")

        # ---- end of chunk ----
        if self.mode == JSONSTREAM_MODE_NUMBER and eof:
            # A number is only terminated by the character after it; at EOF
            # it ends here, which is valid only for a root-level number.
            self.mode = JSONSTREAM_MODE_COMMA
            callback = self.handler.handle_number
            if callback:
                callback(self.get_key(), self._number_value(i), self.is_integer)
            if not self.keystack:
                return 0
            self.errloc = i
            raise Exception("invalid JSON")
        if eof and (self.c_comment_seen or self.comment_seen_preliminary):
            self.errloc = i
            raise Exception("Unterminated beginning of comment")
        if not self.keystack and eof and self.mode == JSONSTREAM_MODE_ENDWS:
            return 0
        if eof:
            self.errloc = i
            raise Exception("invalid JSON, parsing not finished at end")
        return -1
|
|
665
|
+
|
|
666
|
+
def jsonstream_tree_parse(buf, allow_comments=False, allow_trailing_comma=False):
    """Parse the complete JSON document in ``buf`` into Python objects.

    Dicts become ``dict``, arrays ``list``, numbers ``int`` (when written
    without '.', 'e' or 'E') or ``float``, and ``null`` becomes ``None``.
    ``allow_comments`` / ``allow_trailing_comma`` switch on the matching
    JsonStream options.

    Raises ``Exception`` if the input is not valid JSON.
    """
    root = []             # receives the root value; empty until one is seen
    open_containers = []  # dicts/lists currently open, innermost last

    def place(key, value):
        # The very first value is the root; later ones go into the innermost
        # open container, by key for dicts and by appending for arrays.
        if not root:
            root.append(value)
        elif key is not None:
            open_containers[-1][key] = value
        else:
            open_containers[-1].append(value)

    class TreeBuilder(JsonHandler):
        def start_dict(self, key):
            node = {}
            place(key, node)
            open_containers.append(node)

        def start_array(self, key):
            node = []
            place(key, node)
            open_containers.append(node)

        def end_dict(self, key):
            open_containers.pop()

        def end_array(self, key):
            open_containers.pop()

        def handle_number(self, key, num, is_integer):
            place(key, int(num) if is_integer else num)

        def handle_string(self, key, val):
            place(key, val)

        def handle_boolean(self, key, val):
            place(key, val)

        def handle_null(self, key):
            place(key, None)

    stream = JsonStream(TreeBuilder())
    if allow_comments:
        stream.allow_comments()
    if allow_trailing_comma:
        stream.allow_trailing_comma()
    stream.feed(buf, 0, len(buf), True)
    if not root:
        raise Exception("invalid JSON")
    return root[0]
|
|
757
|
+
|
|
758
|
+
if __name__ == '__main__':
    # Self-test: run one sample document (with comments and trailing commas)
    # through a JsonStream driven by a plain JsonHandler and print what
    # feed() returns, then print the tree built by jsonstream_tree_parse.
    sample = "//foo\n /* fof */ { //bar\n \"foo\\u03a9\": [1 //baz\n, /*2,*/ 3 //quux\n], \"bar\": 4.0, \"baz\": {}, \"barf\": [] , \"quux\": [true, false, null,], } // endcomment"
    event_stream = JsonStream(JsonHandler())
    event_stream.allow_comments()
    event_stream.allow_trailing_comma()
    feed_result = event_stream.feed(sample, 0, len(sample), True)
    print(feed_result)
    tree = jsonstream_tree_parse(sample, allow_comments=True, allow_trailing_comma=True)
    print(tree)
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyjsonfrag
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Combined tree/event based JSON parser
|
|
5
|
+
Project-URL: Homepage, https://github.com/jmtilli/pyjsonfrag
|
|
6
|
+
Project-URL: Issues, https://github.com/jmtilli/pyjsonfrag/issues
|
|
7
|
+
Author-email: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# PyJsonFrag: a powerful combined tree-based and event-based parser for JSON
|
|
16
|
+
|
|
17
|
+
Typically, JSON is parsed by a tree-based parser, unlike XML, which can be parsed by either a tree-based parser or an event-based parser. Event-based parsers are fast and have a low memory footprint, but their drawback is that the required event handlers are cumbersome to write. Tree-based parsers make the code easier to write, understand and maintain, but their drawback is a large memory footprint. Sometimes JSON is used for huge files such as database dumps. At first glance, such files would preferably be parsed by an event-based parser, because a tree-based parser cannot hold the whole parse tree of a huge file in memory at once.
|
|
18
|
+
|
|
19
|
+
## How to install: PyJsonFrag at PyPI
|
|
20
|
+
|
|
21
|
+
PyJsonFrag is available at [PyPI](https://pypi.org/project/pyjsonfrag/).
|
|
22
|
+
|
|
23
|
+
How to install:
|
|
24
|
+
```
|
|
25
|
+
python3 -m pip install pyjsonfrag
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Example application: customers in a major bank
|
|
29
|
+
|
|
30
|
+
Let us consider an example application: a listing of the customers of a major bank that has 30 million customers. The test file is in the following format:
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
{
|
|
34
|
+
"customers": [
|
|
35
|
+
{
|
|
36
|
+
"id": 1,
|
|
37
|
+
"name": "Clark Henson",
|
|
38
|
+
"accountCount": 1,
|
|
39
|
+
"totalBalance": 5085.96
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"id": 2,
|
|
43
|
+
"name": "Elnora Ericson",
|
|
44
|
+
"accountCount": 3,
|
|
45
|
+
"totalBalance": 3910.11
|
|
46
|
+
},
|
|
47
|
+
...
|
|
48
|
+
]
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The example format requires about 100 bytes per customer plus customer name length. If we assume an average customer name is 15 characters long, the required storage is about 115 bytes per customer. For 30 million customers, this is 3.5 gigabytes. In the example, the file is read to the following structure:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
class Customer(object):
|
|
56
|
+
def __init__(self, customerId = None, name = None, accountCount = None, totalBalance = None):
|
|
57
|
+
if customerId is not None:
|
|
58
|
+
self.customerId = int(customerId)
|
|
59
|
+
else:
|
|
60
|
+
self.customerId = None
|
|
61
|
+
self.name = name
|
|
62
|
+
if accountCount is not None:
|
|
63
|
+
self.accountCount = int(accountCount)
|
|
64
|
+
else:
|
|
65
|
+
self.accountCount = None
|
|
66
|
+
if totalBalance is not None:
|
|
67
|
+
self.totalBalance = float(totalBalance)
|
|
68
|
+
else:
|
|
69
|
+
self.totalBalance = None
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Python jsonstream API
|
|
73
|
+
|
|
74
|
+
For XML, there is Simple API for XML (SAX). However, for JSON the usual parse
|
|
75
|
+
methods read the whole data into memory at once, not supporting event-driven
|
|
76
|
+
parsing. Thus, we provide the Python jsonstream API, which offers the possibility
|
|
77
|
+
for event-driven parsing. It is faster and less memory-hungry than the "read
|
|
78
|
+
all at once" parsing methods, but it is cumbersome.
|
|
79
|
+
|
|
80
|
+
A jsonstream-based parser is implemented here:
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
import pyjsonfrag
|
|
84
|
+
|
|
85
|
+
class Customer(object):
|
|
86
|
+
def __init__(self, customerId = None, name = None, accountCount = None, totalBalance = None):
|
|
87
|
+
if customerId is not None:
|
|
88
|
+
self.customerId = int(customerId)
|
|
89
|
+
else:
|
|
90
|
+
self.customerId = None
|
|
91
|
+
self.name = name
|
|
92
|
+
if accountCount is not None:
|
|
93
|
+
self.accountCount = int(accountCount)
|
|
94
|
+
else:
|
|
95
|
+
self.accountCount = None
|
|
96
|
+
if totalBalance is not None:
|
|
97
|
+
self.totalBalance = float(totalBalance)
|
|
98
|
+
else:
|
|
99
|
+
self.totalBalance = None
|
|
100
|
+
def __repr__(s):
|
|
101
|
+
return ("Customer(%d,%s,%d,%.2f)" % (s.customerId,s.name,s.accountCount,s.totalBalance))
|
|
102
|
+
|
|
103
|
+
context = []
|
|
104
|
+
cs = {}
|
|
105
|
+
c = None
|
|
106
|
+
|
|
107
|
+
class MyHandler(pyjsonfrag.JsonHandler):
|
|
108
|
+
def start_dict(stream, key):
|
|
109
|
+
global c
|
|
110
|
+
context.append(key)
|
|
111
|
+
if context == [None, "customers", None]:
|
|
112
|
+
c = Customer()
|
|
113
|
+
def start_array(stream, key):
|
|
114
|
+
context.append(key)
|
|
115
|
+
def end_dict(stream, key):
|
|
116
|
+
context.pop()
|
|
117
|
+
def end_array(stream, key):
|
|
118
|
+
context.pop()
|
|
119
|
+
def handle_string(stream, key, val):
|
|
120
|
+
if key == "name":
|
|
121
|
+
c.name = val
|
|
122
|
+
def handle_number(stream, key, num, is_integer):
|
|
123
|
+
if key == "id":
|
|
124
|
+
cs[int(num)] = c
|
|
125
|
+
c.customerId = int(num)
|
|
126
|
+
elif key == "accountCount":
|
|
127
|
+
c.accountCount = int(num)
|
|
128
|
+
elif key == "totalBalance":
|
|
129
|
+
c.totalBalance = num
|
|
130
|
+
|
|
131
|
+
handler = MyHandler()
|
|
132
|
+
stream = pyjsonfrag.JsonStream(handler)
|
|
133
|
+
with open("customers.json", "r") as f:
|
|
134
|
+
while True:
|
|
135
|
+
buf = f.read(4096)
|
|
136
|
+
if buf == '':
|
|
137
|
+
stream.feed(buf, 0, len(buf), True)
|
|
138
|
+
break
|
|
139
|
+
else:
|
|
140
|
+
stream.feed(buf, 0, len(buf), False)
|
|
141
|
+
print(cs)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
It can be seen that the parser is quite cumbersome and that the code to construct a customer is scattered across two different places. Yet it is fast and has a low memory footprint.
|
|
145
|
+
|
|
146
|
+
## Parser with the new library
|
|
147
|
+
|
|
148
|
+
What if we could combine the benefits of the jsonstream-based approach with the benefits of the "read whole parse tree into memory" based approach? A parse tree fragment for a single customer dictionary is small enough to be kept in memory. This is what the new library is about. Here is the code to parse the customer file with the new library:
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
import pyjsonfrag
|
|
152
|
+
|
|
153
|
+
class Customer(object):
|
|
154
|
+
def __init__(self, customerId = None, name = None, accountCount = None, totalBalance = None):
|
|
155
|
+
if customerId is not None:
|
|
156
|
+
self.customerId = int(customerId)
|
|
157
|
+
else:
|
|
158
|
+
self.customerId = None
|
|
159
|
+
self.name = name
|
|
160
|
+
if accountCount is not None:
|
|
161
|
+
self.accountCount = int(accountCount)
|
|
162
|
+
else:
|
|
163
|
+
self.accountCount = None
|
|
164
|
+
if totalBalance is not None:
|
|
165
|
+
self.totalBalance = float(totalBalance)
|
|
166
|
+
else:
|
|
167
|
+
self.totalBalance = None
|
|
168
|
+
def __repr__(s):
|
|
169
|
+
return ("Customer(%d,%s,%d,%.2f)" % (s.customerId,s.name,s.accountCount,s.totalBalance))
|
|
170
|
+
|
|
171
|
+
cs = {}
|
|
172
|
+
|
|
173
|
+
class MyHandler(pyjsonfrag.FragmentHandler):
|
|
174
|
+
def start_frag_dict(self, key):
|
|
175
|
+
if self.path_is([None, "customers", None]):
|
|
176
|
+
self.start_frag_collection()
|
|
177
|
+
def end_frag_dict(self, key, val):
|
|
178
|
+
if self.path_is([None, "customers", None]):
|
|
179
|
+
c = Customer(customerId=val["id"], name=val["name"],
|
|
180
|
+
accountCount=val["accountCount"], totalBalance=val["totalBalance"])
|
|
181
|
+
cs[c.customerId] = c
|
|
182
|
+
|
|
183
|
+
handler = MyHandler()
|
|
184
|
+
stream = pyjsonfrag.JsonStream(handler)
|
|
185
|
+
with open("customers.json", "r") as f:
|
|
186
|
+
while True:
|
|
187
|
+
buf = f.read(4096)
|
|
188
|
+
if buf == '':
|
|
189
|
+
stream.feed(buf, 0, len(buf), True)
|
|
190
|
+
break
|
|
191
|
+
else:
|
|
192
|
+
stream.feed(buf, 0, len(buf), False)
|
|
193
|
+
print(cs)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
Note how the code is significantly simpler than for the event-based approach. Performance is close to that of the event-based approach, and memory consumption is essentially the same.
|
|
197
|
+
|
|
198
|
+
Of course, the new library supports getting the whole parse tree in memory:
|
|
199
|
+
|
|
200
|
+
```
|
|
201
|
+
import pyjsonfrag
|
|
202
|
+
|
|
203
|
+
with open("customers.json", "r") as f:
|
|
204
|
+
print(pyjsonfrag.jsonstream_tree_parse(f.read()))
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
All of the material related to PyJsonFrag is licensed under the following MIT
|
|
210
|
+
license:
|
|
211
|
+
|
|
212
|
+
Copyright (C) 2026 Juha-Matti Tilli
|
|
213
|
+
|
|
214
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
215
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
216
|
+
the Software without restriction, including without limitation the rights to
|
|
217
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
218
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
|
219
|
+
so, subject to the following conditions:
|
|
220
|
+
|
|
221
|
+
The above copyright notice and this permission notice shall be included in all
|
|
222
|
+
copies or substantial portions of the Software.
|
|
223
|
+
|
|
224
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
225
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
226
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
227
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
228
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
229
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
230
|
+
SOFTWARE.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
examples/customers.json,sha256=pjKFcftqntyImW0W7UgNZ8deW_3F7v7pcNjftVCLvyE,252
|
|
2
|
+
examples/parse_frag.py,sha256=ZNmHqOOMrRg0tY32sepaT27zKrXkBac9sl6JE0nLoqA,1324
|
|
3
|
+
examples/parse_stream.py,sha256=07KUrvTQANoVZ9BxIxjspgrUwWfhIv9abDfpFfZ0Vew,1579
|
|
4
|
+
examples/parse_tree.py,sha256=wBz4PbRFyOZEb2rOTTwzL8nAdt2HEoQwpDidp85MVsM,110
|
|
5
|
+
examples/pyjsonfrag/__init__.py,sha256=dlwC7DG10gm9jj6_Pua8H0QKwJdyH_qScRsSrjIMBWg,26055
|
|
6
|
+
pyjsonfrag-0.0.1.dist-info/METADATA,sha256=HMuO6PFU-7187hQBNM6LGhFDby89z4QiKnlULyBtwDs,8255
|
|
7
|
+
pyjsonfrag-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
8
|
+
pyjsonfrag-0.0.1.dist-info/licenses/LICENSE,sha256=znzHZMVYPnIJF9OxEwqUY2MrMf8NyFEaAWIVsmWvB_w,1060
|
|
9
|
+
pyjsonfrag-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2026 Juha-Matti Tilli
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
4
|
+
this software and associated documentation files (the "Software"), to deal in
|
|
5
|
+
the Software without restriction, including without limitation the rights to
|
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
7
|
+
of the Software, and to permit persons to whom the Software is furnished to do
|
|
8
|
+
so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|