sparrow-parse 1.0.6__tar.gz → 1.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/PKG-INFO +1 -1
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/setup.py +1 -1
- sparrow-parse-1.0.7/sparrow_parse/__init__.py +1 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/vllm/mlx_inference.py +14 -19
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse.egg-info/PKG-INFO +1 -1
- sparrow-parse-1.0.6/sparrow_parse/__init__.py +0 -1
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/README.md +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/setup.cfg +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/__main__.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/extractors/__init__.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/extractors/vllm_extractor.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/helpers/__init__.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/helpers/image_optimizer.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/processors/__init__.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/processors/table_structure_processor.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/text_extraction.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/vllm/__init__.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/vllm/huggingface_inference.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/vllm/inference_base.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/vllm/inference_factory.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse.egg-info/SOURCES.txt +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse.egg-info/dependency_links.txt +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse.egg-info/entry_points.txt +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse.egg-info/requires.txt +0 -0
- {sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 1.0.6
|
3
|
+
Version: 1.0.7
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
|
|
8
8
|
|
9
9
|
setup(
|
10
10
|
name="sparrow-parse",
|
11
|
-
version="1.0.6",
|
11
|
+
version="1.0.7",
|
12
12
|
author="Andrej Baranovskij",
|
13
13
|
author_email="andrejus.baranovskis@gmail.com",
|
14
14
|
description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '1.0.7'
|
@@ -188,11 +188,11 @@ class MLXInference(ModelInference):
|
|
188
188
|
|
189
189
|
return results
|
190
190
|
|
191
|
-
|
192
191
|
def transform_query_with_bbox(self, text_input):
|
193
192
|
"""
|
194
193
|
Transform JSON schema in text_input to include value, bbox, and confidence.
|
195
|
-
Works with
|
194
|
+
Works with formats like: "retrieve field1, field2. return response in JSON format,
|
195
|
+
by strictly following this JSON schema: [{...}]."
|
196
196
|
|
197
197
|
Args:
|
198
198
|
text_input (str): The input text containing a JSON schema
|
@@ -200,38 +200,33 @@ class MLXInference(ModelInference):
|
|
200
200
|
Returns:
|
201
201
|
str: Text with transformed JSON including value, bbox, and confidence
|
202
202
|
"""
|
203
|
-
# Split text into parts - find the JSON portion between "retrieve" and "return response"
|
204
|
-
retrieve_pattern = r'retrieve\s+'
|
205
|
-
return_pattern = r'\.\s+return\s+response'
|
206
203
|
|
207
|
-
|
208
|
-
|
204
|
+
schema_pattern = r'JSON schema:\s*(\[.*?\]|\{.*?\})'
|
205
|
+
schema_match = re.search(schema_pattern, text_input, re.DOTALL)
|
209
206
|
|
210
|
-
if not
|
207
|
+
if not schema_match:
|
211
208
|
return text_input # Return original if pattern not found
|
212
209
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
json_str = text_input[json_start:json_end].strip()
|
218
|
-
suffix = text_input[json_end:]
|
210
|
+
# Extract the schema part and its position
|
211
|
+
schema_str = schema_match.group(1).strip()
|
212
|
+
schema_start = schema_match.start(1)
|
213
|
+
schema_end = schema_match.end(1)
|
219
214
|
|
220
215
|
# Parse and transform the JSON
|
221
216
|
try:
|
222
217
|
# Handle single quotes if needed
|
223
|
-
|
218
|
+
schema_str = schema_str.replace("'", '"')
|
224
219
|
|
225
|
-
json_obj = json.loads(json_str)
|
220
|
+
json_obj = json.loads(schema_str)
|
226
221
|
transformed_json = self.transform_query_structure(json_obj)
|
227
222
|
transformed_json_str = json.dumps(transformed_json)
|
228
223
|
|
229
|
-
# Rebuild the text
|
230
|
-
result =
|
224
|
+
# Rebuild the text by replacing just the schema portion
|
225
|
+
result = text_input[:schema_start] + transformed_json_str + text_input[schema_end:]
|
231
226
|
|
232
227
|
return result
|
233
228
|
except json.JSONDecodeError as e:
|
234
|
-
print(f"Error parsing JSON: {e}")
|
229
|
+
print(f"Error parsing JSON schema: {e}")
|
235
230
|
return text_input # Return original if parsing fails
|
236
231
|
|
237
232
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sparrow-parse
|
3
|
-
Version: 1.0.6
|
3
|
+
Version: 1.0.7
|
4
4
|
Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
|
5
5
|
Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
|
6
6
|
Author: Andrej Baranovskij
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = '1.0.6'
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sparrow-parse-1.0.6 → sparrow-parse-1.0.7}/sparrow_parse/processors/table_structure_processor.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|