json-repair 0.47.4__py3-none-any.whl → 0.47.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,25 @@
1
- from typing import Any, ClassVar, Literal, TextIO
1
+ from typing import Literal, TextIO
2
2
 
3
- from .json_context import ContextValues, JsonContext
3
+ from .constants import STRING_DELIMITERS, JSONReturnType
4
+ from .json_context import JsonContext
4
5
  from .object_comparer import ObjectComparer
6
+ from .parse_array import parse_array
7
+ from .parse_boolean_or_null import parse_boolean_or_null
8
+ from .parse_comment import parse_comment
9
+ from .parse_number import parse_number
10
+ from .parse_object import parse_object
11
+ from .parse_string import parse_string
5
12
  from .string_file_wrapper import StringFileWrapper
6
13
 
7
- JSONReturnType = dict[str, Any] | list[Any] | str | float | int | bool | None
8
-
9
14
 
10
15
  class JSONParser:
11
- # Constants
12
- STRING_DELIMITERS: ClassVar[list[str]] = ['"', "'", "“", "”"]
13
- NUMBER_CHARS: ClassVar[set[str]] = set("0123456789-.eE/,")
16
+ # Split the parse methods into separate files because this one was like 3000 lines
17
+ parse_array = parse_array
18
+ parse_boolean_or_null = parse_boolean_or_null
19
+ parse_comment = parse_comment
20
+ parse_number = parse_number
21
+ parse_object = parse_object
22
+ parse_string = parse_string
14
23
 
15
24
  def __init__(
16
25
  self,
@@ -98,7 +107,7 @@ class JSONParser:
98
107
  self.index += 1
99
108
  return self.parse_array()
100
109
  # <string> starts with a quote
101
- elif not self.context.empty and (char in self.STRING_DELIMITERS or char.isalpha()):
110
+ elif not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
102
111
  return self.parse_string()
103
112
  # <number> starts with [0-9] or minus
104
113
  elif not self.context.empty and (char.isdigit() or char == "-" or char == "."):
@@ -109,672 +118,6 @@ class JSONParser:
109
118
  else:
110
119
  self.index += 1
111
120
 
112
- def parse_object(self) -> dict[str, JSONReturnType]:
113
- # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
114
- obj: dict[str, JSONReturnType] = {}
115
- # Stop when you either find the closing parentheses or you have iterated over the entire string
116
- while (self.get_char_at() or "}") != "}":
117
- # This is what we expect to find:
118
- # <member> ::= <string> ': ' <json>
119
-
120
- # Skip filler whitespaces
121
- self.skip_whitespaces_at()
122
-
123
- # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
124
- if (self.get_char_at() or "") == ":":
125
- self.log(
126
- "While parsing an object we found a : before a key, ignoring",
127
- )
128
- self.index += 1
129
-
130
- # We are now searching for they string key
131
- # Context is used in the string parser to manage the lack of quotes
132
- self.context.set(ContextValues.OBJECT_KEY)
133
-
134
- # Save this index in case we need find a duplicate key
135
- rollback_index = self.index
136
-
137
- # <member> starts with a <string>
138
- key = ""
139
- while self.get_char_at():
140
- # The rollback index needs to be updated here in case the key is empty
141
- rollback_index = self.index
142
- if self.get_char_at() == "[" and key == "":
143
- # Is this an array?
144
- # Need to check if the previous parsed value contained in obj is an array and in that case parse and merge the two
145
- prev_key = list(obj.keys())[-1] if obj else None
146
- if prev_key and isinstance(obj[prev_key], list):
147
- # If the previous key's value is an array, parse the new array and merge
148
- self.index += 1
149
- new_array = self.parse_array()
150
- if isinstance(new_array, list):
151
- # Merge and flatten the arrays
152
- prev_value = obj[prev_key]
153
- if isinstance(prev_value, list):
154
- prev_value.extend(
155
- new_array[0]
156
- if len(new_array) == 1 and isinstance(new_array[0], list)
157
- else new_array
158
- )
159
- self.skip_whitespaces_at()
160
- if self.get_char_at() == ",":
161
- self.index += 1
162
- self.skip_whitespaces_at()
163
- continue
164
- key = str(self.parse_string())
165
- if key == "":
166
- self.skip_whitespaces_at()
167
- if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
168
- # If the string is empty but there is a object divider, we are done here
169
- break
170
- if ContextValues.ARRAY in self.context.context and key in obj:
171
- self.log(
172
- "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
173
- )
174
- self.index = rollback_index - 1
175
- # add an opening curly brace to make this work
176
- self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
177
- break
178
-
179
- # Skip filler whitespaces
180
- self.skip_whitespaces_at()
181
-
182
- # We reached the end here
183
- if (self.get_char_at() or "}") == "}":
184
- continue
185
-
186
- self.skip_whitespaces_at()
187
-
188
- # An extreme case of missing ":" after a key
189
- if (self.get_char_at() or "") != ":":
190
- self.log(
191
- "While parsing an object we missed a : after a key",
192
- )
193
-
194
- self.index += 1
195
- self.context.reset()
196
- self.context.set(ContextValues.OBJECT_VALUE)
197
- # The value can be any valid json
198
- self.skip_whitespaces_at()
199
- # Corner case, a lone comma
200
- value: JSONReturnType = ""
201
- if (self.get_char_at() or "") in [",", "}"]:
202
- self.log(
203
- "While parsing an object value we found a stray , ignoring it",
204
- )
205
- else:
206
- value = self.parse_json()
207
-
208
- # Reset context since our job is done
209
- self.context.reset()
210
- obj[key] = value
211
-
212
- if (self.get_char_at() or "") in [",", "'", '"']:
213
- self.index += 1
214
-
215
- # Remove trailing spaces
216
- self.skip_whitespaces_at()
217
-
218
- self.index += 1
219
- return obj
220
-
221
- def parse_array(self) -> list[JSONReturnType]:
222
- # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
223
- arr = []
224
- self.context.set(ContextValues.ARRAY)
225
- # Stop when you either find the closing parentheses or you have iterated over the entire string
226
- char = self.get_char_at()
227
- while char and char not in ["]", "}"]:
228
- self.skip_whitespaces_at()
229
- value: JSONReturnType = ""
230
- if char in self.STRING_DELIMITERS:
231
- # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
232
- # So we are going to check if this string is followed by a : or not
233
- # And either parse the string or parse the object
234
- i = 1
235
- i = self.skip_to_character(char, i)
236
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
237
- value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
238
- else:
239
- value = self.parse_json()
240
-
241
- # It is possible that parse_json() returns nothing valid, so we increase by 1
242
- if value == "":
243
- self.index += 1
244
- elif value == "..." and self.get_char_at(-1) == ".":
245
- self.log(
246
- "While parsing an array, found a stray '...'; ignoring it",
247
- )
248
- else:
249
- arr.append(value)
250
-
251
- # skip over whitespace after a value but before closing ]
252
- char = self.get_char_at()
253
- while char and char != "]" and (char.isspace() or char == ","):
254
- self.index += 1
255
- char = self.get_char_at()
256
-
257
- # Especially at the end of an LLM generated json you might miss the last "]"
258
- if char and char != "]":
259
- self.log(
260
- "While parsing an array we missed the closing ], ignoring it",
261
- )
262
-
263
- self.index += 1
264
-
265
- self.context.reset()
266
- return arr
267
-
268
- def parse_string(self) -> str | bool | None:
269
- # <string> is a string of valid characters enclosed in quotes
270
- # i.e. { name: "John" }
271
- # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
272
-
273
- # Flag to manage corner cases related to missing starting quote
274
- missing_quotes = False
275
- doubled_quotes = False
276
- lstring_delimiter = rstring_delimiter = '"'
277
-
278
- char = self.get_char_at()
279
- if char in ["#", "/"]:
280
- return self.parse_comment()
281
- # A valid string can only start with a valid quote or, in our case, with a literal
282
- while char and char not in self.STRING_DELIMITERS and not char.isalnum():
283
- self.index += 1
284
- char = self.get_char_at()
285
-
286
- if not char:
287
- # This is an empty string
288
- return ""
289
-
290
- # Ensuring we use the right delimiter
291
- if char == "'":
292
- lstring_delimiter = rstring_delimiter = "'"
293
- elif char == "“":
294
- lstring_delimiter = "“"
295
- rstring_delimiter = "”"
296
- elif char.isalnum():
297
- # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
298
- # But remember, object keys are only of type string
299
- if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
300
- value = self.parse_boolean_or_null()
301
- if value != "":
302
- return value
303
- self.log(
304
- "While parsing a string, we found a literal instead of a quote",
305
- )
306
- missing_quotes = True
307
-
308
- if not missing_quotes:
309
- self.index += 1
310
-
311
- # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
312
- if self.get_char_at() in self.STRING_DELIMITERS and self.get_char_at() == lstring_delimiter:
313
- # If it's an empty key, this was easy
314
- if (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":") or (
315
- self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"]
316
- ):
317
- self.index += 1
318
- return ""
319
- elif self.get_char_at(1) == lstring_delimiter:
320
- # There's something fishy about this, we found doubled quotes and then again quotes
321
- self.log(
322
- "While parsing a string, we found a doubled quote and then a quote again, ignoring it",
323
- )
324
- return ""
325
- # Find the next delimiter
326
- i = self.skip_to_character(character=rstring_delimiter, idx=1)
327
- next_c = self.get_char_at(i)
328
- # Now check that the next character is also a delimiter to ensure that we have "".....""
329
- # In that case we ignore this rstring delimiter
330
- if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
331
- self.log(
332
- "While parsing a string, we found a valid starting doubled quote",
333
- )
334
- doubled_quotes = True
335
- self.index += 1
336
- else:
337
- # Ok this is not a doubled quote, check if this is an empty string or not
338
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
339
- next_c = self.get_char_at(i)
340
- if next_c in self.STRING_DELIMITERS + ["{", "["]:
341
- # something fishy is going on here
342
- self.log(
343
- "While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
344
- )
345
- self.index += 1
346
- return ""
347
- elif next_c not in [",", "]", "}"]:
348
- self.log(
349
- "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
350
- )
351
- self.index += 1
352
-
353
- # Initialize our return value
354
- string_acc = ""
355
-
356
- # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
357
- # In that case we need to use the ":|,|}" characters as terminators of the string
358
- # So this will stop if:
359
- # * It finds a closing quote
360
- # * It iterated over the entire sequence
361
- # * If we are fixing missing quotes in an object, when it finds the special terminators
362
- char = self.get_char_at()
363
- unmatched_delimiter = False
364
- while char and char != rstring_delimiter:
365
- if missing_quotes and self.context.current == ContextValues.OBJECT_KEY and (char == ":" or char.isspace()):
366
- self.log(
367
- "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
368
- )
369
- break
370
- if (
371
- not self.stream_stable
372
- and self.context.current == ContextValues.OBJECT_VALUE
373
- and char
374
- in [
375
- ",",
376
- "}",
377
- ]
378
- and (not string_acc or string_acc[-1] != rstring_delimiter)
379
- ):
380
- rstring_delimiter_missing = True
381
- # check if this is a case in which the closing comma is NOT missing instead
382
- self.skip_whitespaces_at()
383
- if self.get_char_at(1) == "\\":
384
- # Ok this is a quoted string, skip
385
- rstring_delimiter_missing = False
386
- i = self.skip_to_character(character=rstring_delimiter, idx=1)
387
- next_c = self.get_char_at(i)
388
- if next_c:
389
- i += 1
390
- # found a delimiter, now we need to check that is followed strictly by a comma or brace
391
- # or the string ended
392
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
393
- next_c = self.get_char_at(i)
394
- if not next_c or next_c in [",", "}"]:
395
- rstring_delimiter_missing = False
396
- else:
397
- # OK but this could still be some garbage at the end of the string
398
- # So we need to check if we find a new lstring_delimiter afterwards
399
- # If we do, maybe this is a missing delimiter
400
- i = self.skip_to_character(character=lstring_delimiter, idx=i)
401
- next_c = self.get_char_at(i)
402
- if not next_c:
403
- rstring_delimiter_missing = False
404
- else:
405
- # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
406
- # Check if we find a : afterwards (skipping space)
407
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
408
- next_c = self.get_char_at(i)
409
- if next_c and next_c != ":":
410
- rstring_delimiter_missing = False
411
- else:
412
- # There could be a case in which even the next key:value is missing delimeters
413
- # because it might be a systemic issue with the output
414
- # So let's check if we can find a : in the string instead
415
- i = self.skip_to_character(character=":", idx=1)
416
- next_c = self.get_char_at(i)
417
- if next_c:
418
- # OK then this is a systemic issue with the output
419
- break
420
- else:
421
- # skip any whitespace first
422
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
423
- # We couldn't find any rstring_delimeter before the end of the string
424
- # check if this is the last string of an object and therefore we can keep going
425
- # make an exception if this is the last char before the closing brace
426
- j = self.skip_to_character(character="}", idx=i)
427
- if j - i > 1:
428
- # Ok it's not right after the comma
429
- # Let's ignore
430
- rstring_delimiter_missing = False
431
- # Check that j was not out of bound
432
- elif self.get_char_at(j):
433
- # Check for an unmatched opening brace in string_acc
434
- for c in reversed(string_acc):
435
- if c == "{":
436
- # Ok then this is part of the string
437
- rstring_delimiter_missing = False
438
- break
439
- if rstring_delimiter_missing:
440
- self.log(
441
- "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
442
- )
443
- break
444
- if (
445
- not self.stream_stable
446
- and char == "]"
447
- and ContextValues.ARRAY in self.context.context
448
- and string_acc[-1] != rstring_delimiter
449
- ):
450
- # We found the end of an array and we are in array context
451
- # So let's check if we find a rstring_delimiter forward otherwise end early
452
- i = self.skip_to_character(rstring_delimiter)
453
- if not self.get_char_at(i):
454
- # No delimiter found
455
- break
456
- string_acc += char
457
- self.index += 1
458
- char = self.get_char_at()
459
- # Unclosed string ends with a \ character. This character is ignored if stream_stable = True.
460
- if self.stream_stable and not char and string_acc[-1] == "\\":
461
- string_acc = string_acc[:-1]
462
- if char and string_acc[-1] == "\\":
463
- # This is a special case, if people use real strings this might happen
464
- self.log("Found a stray escape sequence, normalizing it")
465
- if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
466
- string_acc = string_acc[:-1]
467
- escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
468
- string_acc += escape_seqs.get(char, char)
469
- self.index += 1
470
- char = self.get_char_at()
471
- while char and string_acc[-1] == "\\" and char in [rstring_delimiter, "\\"]:
472
- # this is a bit of a special case, if I don't do this it will close the loop or create a train of \\
473
- # I don't love it though
474
- string_acc = string_acc[:-1]
475
- string_acc += char
476
- self.index += 1
477
- char = self.get_char_at()
478
- continue
479
- elif char in ["u", "x"]:
480
- # If we find a unicode escape sequence, normalize it
481
- num_chars = 4 if char == "u" else 2
482
- next_chars = self.json_str[self.index + 1 : self.index + 1 + num_chars]
483
- if len(next_chars) == num_chars and all(c in "0123456789abcdefABCDEF" for c in next_chars):
484
- self.log("Found a unicode escape sequence, normalizing it")
485
- string_acc = string_acc[:-1]
486
- string_acc += chr(int(next_chars, 16))
487
- self.index += 1 + num_chars
488
- char = self.get_char_at()
489
- continue
490
- # If we are in object key context and we find a colon, it could be a missing right quote
491
- if char == ":" and not missing_quotes and self.context.current == ContextValues.OBJECT_KEY:
492
- # Ok now we need to check if this is followed by a value like "..."
493
- i = self.skip_to_character(character=lstring_delimiter, idx=1)
494
- next_c = self.get_char_at(i)
495
- if next_c:
496
- i += 1
497
- # found the first delimiter
498
- i = self.skip_to_character(character=rstring_delimiter, idx=i)
499
- next_c = self.get_char_at(i)
500
- if next_c:
501
- # found a second delimiter
502
- i += 1
503
- # Skip spaces
504
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
505
- next_c = self.get_char_at(i)
506
- if next_c and next_c in [",", "}"]:
507
- # Ok then this is a missing right quote
508
- self.log(
509
- "While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
510
- )
511
- break
512
- else:
513
- # The string ended without finding a lstring_delimiter, I will assume this is a missing right quote
514
- self.log(
515
- "While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
516
- )
517
- break
518
- # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
519
- if char == rstring_delimiter and string_acc[-1] != "\\":
520
- # Special case here, in case of double quotes one after another
521
- if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
522
- self.log("While parsing a string, we found a doubled quote, ignoring it")
523
- self.index += 1
524
- elif missing_quotes and self.context.current == ContextValues.OBJECT_VALUE:
525
- # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
526
- i = 1
527
- next_c = self.get_char_at(i)
528
- while next_c and next_c not in [
529
- rstring_delimiter,
530
- lstring_delimiter,
531
- ]:
532
- i += 1
533
- next_c = self.get_char_at(i)
534
- if next_c:
535
- # We found a quote, now let's make sure there's a ":" following
536
- i += 1
537
- # found a delimiter, now we need to check that is followed strictly by a comma or brace
538
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
539
- next_c = self.get_char_at(i)
540
- if next_c and next_c == ":":
541
- # Reset the cursor
542
- self.index -= 1
543
- char = self.get_char_at()
544
- self.log(
545
- "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
546
- )
547
- break
548
- elif unmatched_delimiter:
549
- unmatched_delimiter = False
550
- string_acc += str(char)
551
- self.index += 1
552
- char = self.get_char_at()
553
- else:
554
- # Check if eventually there is a rstring delimiter, otherwise we bail
555
- i = 1
556
- next_c = self.get_char_at(i)
557
- check_comma_in_object_value = True
558
- while next_c and next_c not in [
559
- rstring_delimiter,
560
- lstring_delimiter,
561
- ]:
562
- # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
563
- # This is because the routine after will make sure to correct any bad guess and this solves a corner case
564
- if check_comma_in_object_value and next_c.isalpha():
565
- check_comma_in_object_value = False
566
- # If we are in an object context, let's check for the right delimiters
567
- if (
568
- (ContextValues.OBJECT_KEY in self.context.context and next_c in [":", "}"])
569
- or (ContextValues.OBJECT_VALUE in self.context.context and next_c == "}")
570
- or (ContextValues.ARRAY in self.context.context and next_c in ["]", ","])
571
- or (
572
- check_comma_in_object_value
573
- and self.context.current == ContextValues.OBJECT_VALUE
574
- and next_c == ","
575
- )
576
- ):
577
- break
578
- i += 1
579
- next_c = self.get_char_at(i)
580
- # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
581
- if next_c == "," and self.context.current == ContextValues.OBJECT_VALUE:
582
- i += 1
583
- i = self.skip_to_character(character=rstring_delimiter, idx=i)
584
- next_c = self.get_char_at(i)
585
- # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
586
- i += 1
587
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
588
- next_c = self.get_char_at(i)
589
- elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
590
- # Check if self.index:self.index+i is only whitespaces, break if that's the case
591
- if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
592
- break
593
- if self.context.current == ContextValues.OBJECT_VALUE:
594
- # But this might not be it! This could be just a missing comma
595
- # We found a delimiter and we need to check if this is a key
596
- # so find a rstring_delimiter and a colon after
597
- i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
598
- i += 1
599
- next_c = self.get_char_at(i)
600
- while next_c and next_c != ":":
601
- if next_c in [",", "]", "}"] or (
602
- next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
603
- ):
604
- break
605
- i += 1
606
- next_c = self.get_char_at(i)
607
- # Only if we fail to find a ':' then we know this is misplaced quote
608
- if next_c != ":":
609
- self.log(
610
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
611
- )
612
- unmatched_delimiter = not unmatched_delimiter
613
- string_acc += str(char)
614
- self.index += 1
615
- char = self.get_char_at()
616
- elif self.context.current == ContextValues.ARRAY:
617
- # If we got up to here it means that this is a situation like this:
618
- # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
619
- # So we need to ignore this quote
620
- self.log(
621
- "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
622
- )
623
- unmatched_delimiter = not unmatched_delimiter
624
- string_acc += str(char)
625
- self.index += 1
626
- char = self.get_char_at()
627
- elif self.context.current == ContextValues.OBJECT_KEY:
628
- # In this case we just ignore this and move on
629
- self.log(
630
- "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
631
- )
632
- string_acc += str(char)
633
- self.index += 1
634
- char = self.get_char_at()
635
- if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
636
- self.log(
637
- "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
638
- )
639
- self.skip_whitespaces_at()
640
- if self.get_char_at() not in [":", ","]:
641
- return ""
642
-
643
- # A fallout of the previous special case in the while loop,
644
- # we need to update the index only if we had a closing quote
645
- if char != rstring_delimiter:
646
- # if stream_stable = True, unclosed strings do not trim trailing whitespace characters
647
- if not self.stream_stable:
648
- self.log(
649
- "While parsing a string, we missed the closing quote, ignoring",
650
- )
651
- string_acc = string_acc.rstrip()
652
- else:
653
- self.index += 1
654
-
655
- if not self.stream_stable and (missing_quotes or (string_acc and string_acc[-1] == "\n")):
656
- # Clean the whitespaces for some corner cases
657
- string_acc = string_acc.rstrip()
658
-
659
- return string_acc
660
-
661
- def parse_number(self) -> float | int | str | JSONReturnType:
662
- # <number> is a valid real number expressed in one of a number of given formats
663
- number_str = ""
664
- char = self.get_char_at()
665
- is_array = self.context.current == ContextValues.ARRAY
666
- while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
667
- number_str += char
668
- self.index += 1
669
- char = self.get_char_at()
670
- if number_str and number_str[-1] in "-eE/,":
671
- # The number ends with a non valid character for a number/currency, rolling back one
672
- number_str = number_str[:-1]
673
- self.index -= 1
674
- elif (self.get_char_at() or "").isalpha():
675
- # this was a string instead, sorry
676
- self.index -= len(number_str)
677
- return self.parse_string()
678
- try:
679
- if "," in number_str:
680
- return str(number_str)
681
- if "." in number_str or "e" in number_str or "E" in number_str:
682
- return float(number_str)
683
- else:
684
- return int(number_str)
685
- except ValueError:
686
- return number_str
687
-
688
- def parse_boolean_or_null(self) -> bool | str | None:
689
- # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
690
- starting_index = self.index
691
- char = (self.get_char_at() or "").lower()
692
- value: tuple[str, bool | None] | None = None
693
- if char == "t":
694
- value = ("true", True)
695
- elif char == "f":
696
- value = ("false", False)
697
- elif char == "n":
698
- value = ("null", None)
699
-
700
- if value:
701
- i = 0
702
- while char and i < len(value[0]) and char == value[0][i]:
703
- i += 1
704
- self.index += 1
705
- char = (self.get_char_at() or "").lower()
706
- if i == len(value[0]):
707
- return value[1]
708
-
709
- # If nothing works reset the index before returning
710
- self.index = starting_index
711
- return ""
712
-
713
- def parse_comment(self) -> str:
714
- """
715
- Parse code-like comments:
716
-
717
- - "# comment": A line comment that continues until a newline.
718
- - "// comment": A line comment that continues until a newline.
719
- - "/* comment */": A block comment that continues until the closing delimiter "*/".
720
-
721
- The comment is skipped over and an empty string is returned so that comments do not interfere
722
- with the actual JSON elements.
723
- """
724
- char = self.get_char_at()
725
- termination_characters = ["\n", "\r"]
726
- if ContextValues.ARRAY in self.context.context:
727
- termination_characters.append("]")
728
- if ContextValues.OBJECT_VALUE in self.context.context:
729
- termination_characters.append("}")
730
- if ContextValues.OBJECT_KEY in self.context.context:
731
- termination_characters.append(":")
732
- # Line comment starting with #
733
- if char == "#":
734
- comment = ""
735
- while char and char not in termination_characters:
736
- comment += char
737
- self.index += 1
738
- char = self.get_char_at()
739
- self.log(f"Found line comment: {comment}")
740
- return ""
741
-
742
- # Comments starting with '/'
743
- elif char == "/":
744
- next_char = self.get_char_at(1)
745
- # Handle line comment starting with //
746
- if next_char == "/":
747
- comment = "//"
748
- self.index += 2 # Skip both slashes.
749
- char = self.get_char_at()
750
- while char and char not in termination_characters:
751
- comment += char
752
- self.index += 1
753
- char = self.get_char_at()
754
- self.log(f"Found line comment: {comment}")
755
- return ""
756
- # Handle block comment starting with /*
757
- elif next_char == "*":
758
- comment = "/*"
759
- self.index += 2 # Skip '/*'
760
- while True:
761
- char = self.get_char_at()
762
- if not char:
763
- self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
764
- break
765
- comment += char
766
- self.index += 1
767
- if comment.endswith("*/"):
768
- break
769
- self.log(f"Found block comment: {comment}")
770
- return ""
771
- else:
772
- # Skip standalone '/' characters that are not part of a comment
773
- # to avoid getting stuck in an infinite loop
774
- self.index += 1
775
- return ""
776
- return "" # pragma: no cover
777
-
778
121
  def get_char_at(self, count: int = 0) -> str | Literal[False]:
779
122
  # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
780
123
  try: