sparrow-parse 1.0.9__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/PKG-INFO +1 -1
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/setup.py +1 -1
- sparrow-parse-1.1.0/sparrow_parse/__init__.py +1 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/mlx_inference.py +39 -10
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/PKG-INFO +1 -1
- sparrow-parse-1.0.9/sparrow_parse/__init__.py +0 -1
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/README.md +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/setup.cfg +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/__main__.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/extractors/__init__.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/extractors/vllm_extractor.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/helpers/__init__.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/helpers/image_optimizer.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/processors/__init__.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/processors/table_structure_processor.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/text_extraction.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/__init__.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/huggingface_inference.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/inference_base.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/inference_factory.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/SOURCES.txt +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/dependency_links.txt +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/entry_points.txt +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/requires.txt +0 -0
- {sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 1.0.9
|
3
|
+
Version: 1.1.0
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
|
|
8
8
|
|
9
9
|
setup(
|
10
10
|
name="sparrow-parse",
|
11
|
-
version="1.0.9",
|
11
|
+
version="1.1.0",
|
12
12
|
author="Andrej Baranovskij",
|
13
13
|
author_email="andrejus.baranovskis@gmail.com",
|
14
14
|
description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '1.1.0'
|
@@ -249,6 +249,7 @@ class MLXInference(ModelInference):
|
|
249
249
|
Transform JSON schema in text_input to include value, bbox, and confidence.
|
250
250
|
Works with formats like: "retrieve field1, field2. return response in JSON format,
|
251
251
|
by strictly following this JSON schema: [{...}]."
|
252
|
+
Handles complex nested structures including arrays.
|
252
253
|
|
253
254
|
Args:
|
254
255
|
text_input (str): The input text containing a JSON schema
|
@@ -256,29 +257,57 @@ class MLXInference(ModelInference):
|
|
256
257
|
Returns:
|
257
258
|
str: Text with transformed JSON including value, bbox, and confidence
|
258
259
|
"""
|
260
|
+
# Find where the schema starts
|
261
|
+
schema_start_marker = "JSON schema:"
|
262
|
+
schema_start_pos = text_input.find(schema_start_marker)
|
259
263
|
|
260
|
-
|
261
|
-
|
264
|
+
if schema_start_pos == -1:
|
265
|
+
return text_input # Return original if marker not found
|
262
266
|
|
263
|
-
|
264
|
-
|
267
|
+
# Find the actual schema by tracking opening and closing braces
|
268
|
+
start_pos = schema_start_pos + len(schema_start_marker)
|
265
269
|
|
266
|
-
#
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
+
# Skip whitespace to find first opening brace or bracket
|
271
|
+
while start_pos < len(text_input) and text_input[start_pos] not in ['{', '[']:
|
272
|
+
start_pos += 1
|
273
|
+
|
274
|
+
if start_pos >= len(text_input):
|
275
|
+
return text_input # No opening brace found
|
276
|
+
|
277
|
+
# Determine if we're dealing with an object or array
|
278
|
+
is_object = text_input[start_pos] == '{'
|
279
|
+
|
280
|
+
# Now extract the full JSON schema by counting braces
|
281
|
+
open_char = '{' if is_object else '['
|
282
|
+
close_char = '}' if is_object else ']'
|
283
|
+
count = 1 # Already found one opening brace/bracket
|
284
|
+
end_pos = start_pos + 1
|
285
|
+
|
286
|
+
while end_pos < len(text_input) and count > 0:
|
287
|
+
if text_input[end_pos] == open_char:
|
288
|
+
count += 1
|
289
|
+
elif text_input[end_pos] == close_char:
|
290
|
+
count -= 1
|
291
|
+
end_pos += 1
|
292
|
+
|
293
|
+
if count != 0:
|
294
|
+
print("Warning: Unbalanced braces in JSON schema")
|
295
|
+
return text_input # Unbalanced braces, return original
|
296
|
+
|
297
|
+
# Extract the schema
|
298
|
+
schema_str = text_input[start_pos:end_pos]
|
270
299
|
|
271
|
-
# Parse and transform the JSON
|
272
300
|
try:
|
273
301
|
# Handle single quotes if needed
|
274
302
|
schema_str = schema_str.replace("'", '"')
|
275
303
|
|
304
|
+
# Parse and transform the JSON
|
276
305
|
json_obj = json.loads(schema_str)
|
277
306
|
transformed_json = self.transform_query_structure(json_obj)
|
278
307
|
transformed_json_str = json.dumps(transformed_json)
|
279
308
|
|
280
309
|
# Rebuild the text by replacing just the schema portion
|
281
|
-
result = text_input[:
|
310
|
+
result = text_input[:start_pos] + transformed_json_str + text_input[end_pos:]
|
282
311
|
|
283
312
|
return result
|
284
313
|
except json.JSONDecodeError as e:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 1.0.9
|
3
|
+
Version: 1.1.0
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = '1.0.9'
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sparrow-parse-1.0.9 → sparrow-parse-1.1.0}/sparrow_parse/processors/table_structure_processor.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|