sparrow-parse 1.0.9__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/PKG-INFO +1 -1
  2. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/setup.py +1 -1
  3. sparrow-parse-1.1.0/sparrow_parse/__init__.py +1 -0
  4. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/mlx_inference.py +39 -10
  5. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/PKG-INFO +1 -1
  6. sparrow-parse-1.0.9/sparrow_parse/__init__.py +0 -1
  7. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/README.md +0 -0
  8. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/setup.cfg +0 -0
  9. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/__main__.py +0 -0
  10. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/extractors/__init__.py +0 -0
  11. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/extractors/vllm_extractor.py +0 -0
  12. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/helpers/__init__.py +0 -0
  13. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/helpers/image_optimizer.py +0 -0
  14. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
  15. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/processors/__init__.py +0 -0
  16. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/processors/table_structure_processor.py +0 -0
  17. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/text_extraction.py +0 -0
  18. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/__init__.py +0 -0
  19. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/huggingface_inference.py +0 -0
  20. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/inference_base.py +0 -0
  21. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/inference_factory.py +0 -0
  22. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  23. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/SOURCES.txt +0 -0
  24. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  25. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/entry_points.txt +0 -0
  26. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/requires.txt +0 -0
  27. {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 1.0.9
3
+ Version: 1.1.0
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
8
8
 
9
9
  setup(
10
10
  name="sparrow-parse",
11
- version="1.0.9",
11
+ version="1.1.0",
12
12
  author="Andrej Baranovskij",
13
13
  author_email="andrejus.baranovskis@gmail.com",
14
14
  description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
@@ -0,0 +1 @@
1
+ __version__ = '1.1.0'
@@ -249,6 +249,7 @@ class MLXInference(ModelInference):
249
249
  Transform JSON schema in text_input to include value, bbox, and confidence.
250
250
  Works with formats like: "retrieve field1, field2. return response in JSON format,
251
251
  by strictly following this JSON schema: [{...}]."
252
+ Handles complex nested structures including arrays.
252
253
 
253
254
  Args:
254
255
  text_input (str): The input text containing a JSON schema
@@ -256,29 +257,57 @@ class MLXInference(ModelInference):
256
257
  Returns:
257
258
  str: Text with transformed JSON including value, bbox, and confidence
258
259
  """
260
+ # Find where the schema starts
261
+ schema_start_marker = "JSON schema:"
262
+ schema_start_pos = text_input.find(schema_start_marker)
259
263
 
260
- schema_pattern = r'JSON schema:\s*(\[.*?\]|\{.*?\})'
261
- schema_match = re.search(schema_pattern, text_input, re.DOTALL)
264
+ if schema_start_pos == -1:
265
+ return text_input # Return original if marker not found
262
266
 
263
- if not schema_match:
264
- return text_input # Return original if pattern not found
267
+ # Find the actual schema by tracking opening and closing braces
268
+ start_pos = schema_start_pos + len(schema_start_marker)
265
269
 
266
- # Extract the schema part and its position
267
- schema_str = schema_match.group(1).strip()
268
- schema_start = schema_match.start(1)
269
- schema_end = schema_match.end(1)
270
+ # Skip whitespace to find first opening brace or bracket
271
+ while start_pos < len(text_input) and text_input[start_pos] not in ['{', '[']:
272
+ start_pos += 1
273
+
274
+ if start_pos >= len(text_input):
275
+ return text_input # No opening brace found
276
+
277
+ # Determine if we're dealing with an object or array
278
+ is_object = text_input[start_pos] == '{'
279
+
280
+ # Now extract the full JSON schema by counting braces
281
+ open_char = '{' if is_object else '['
282
+ close_char = '}' if is_object else ']'
283
+ count = 1 # Already found one opening brace/bracket
284
+ end_pos = start_pos + 1
285
+
286
+ while end_pos < len(text_input) and count > 0:
287
+ if text_input[end_pos] == open_char:
288
+ count += 1
289
+ elif text_input[end_pos] == close_char:
290
+ count -= 1
291
+ end_pos += 1
292
+
293
+ if count != 0:
294
+ print("Warning: Unbalanced braces in JSON schema")
295
+ return text_input # Unbalanced braces, return original
296
+
297
+ # Extract the schema
298
+ schema_str = text_input[start_pos:end_pos]
270
299
 
271
- # Parse and transform the JSON
272
300
  try:
273
301
  # Handle single quotes if needed
274
302
  schema_str = schema_str.replace("'", '"')
275
303
 
304
+ # Parse and transform the JSON
276
305
  json_obj = json.loads(schema_str)
277
306
  transformed_json = self.transform_query_structure(json_obj)
278
307
  transformed_json_str = json.dumps(transformed_json)
279
308
 
280
309
  # Rebuild the text by replacing just the schema portion
281
- result = text_input[:schema_start] + transformed_json_str + text_input[schema_end:]
310
+ result = text_input[:start_pos] + transformed_json_str + text_input[end_pos:]
282
311
 
283
312
  return result
284
313
  except json.JSONDecodeError as e:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 1.0.9
3
+ Version: 1.1.0
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -1 +0,0 @@
1
- __version__ = '1.0.9'
File without changes
File without changes