sparrow-parse 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/PKG-INFO +1 -1
  2. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/setup.py +1 -1
  3. sparrow-parse-0.4.2/sparrow_parse/__init__.py +1 -0
  4. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/extractors/vllm_extractor.py +13 -4
  5. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse.egg-info/PKG-INFO +1 -1
  6. sparrow-parse-0.4.1/sparrow_parse/__init__.py +0 -1
  7. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/README.md +0 -0
  8. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/setup.cfg +0 -0
  9. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/__main__.py +0 -0
  10. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/extractors/__init__.py +0 -0
  11. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/helpers/__init__.py +0 -0
  12. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/helpers/pdf_optimizer.py +0 -0
  13. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/processors/__init__.py +0 -0
  14. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/processors/table_structure_processor.py +0 -0
  15. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/vllm/__init__.py +0 -0
  16. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/vllm/huggingface_inference.py +0 -0
  17. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/vllm/inference_base.py +0 -0
  18. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/vllm/inference_factory.py +0 -0
  19. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  20. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse/vllm/mlx_inference.py +0 -0
  21. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse.egg-info/SOURCES.txt +0 -0
  22. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  23. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse.egg-info/entry_points.txt +0 -0
  24. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse.egg-info/requires.txt +0 -0
  25. {sparrow-parse-0.4.1 → sparrow-parse-0.4.2}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
8
8
 
9
9
  setup(
10
10
  name="sparrow-parse",
11
- version="0.4.1",
11
+ version="0.4.2",
12
12
  author="Andrej Baranovskij",
13
13
  author_email="andrejus.baranovskis@gmail.com",
14
14
  description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
@@ -0,0 +1 @@
1
+ __version__ = '0.4.2'
@@ -54,7 +54,7 @@ class VLLMExtractor(object):
54
54
  """
55
55
  file_path = input_data[0]["file_path"]
56
56
  if tables_only:
57
- return [self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir)], 1
57
+ return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
58
58
  else:
59
59
  input_data[0]["file_path"] = [file_path]
60
60
  results = model_inference_instance.inference(input_data)
@@ -85,7 +85,8 @@ class VLLMExtractor(object):
85
85
  tables_result = self._extract_tables(
86
86
  model_inference_instance, file_path, input_data, debug, debug_dir, page_index=i
87
87
  )
88
- results_array.append(tables_result)
88
+ # Since _extract_tables returns a list with one JSON string, unpack it
89
+ results_array.extend(tables_result) # Unpack the single JSON string
89
90
  else:
90
91
  if debug:
91
92
  print(f"Processing {len(output_files)} pages for inference at once.")
@@ -118,7 +119,15 @@ class VLLMExtractor(object):
118
119
  results_array.append(result)
119
120
 
120
121
  shutil.rmtree(temp_dir, ignore_errors=True)
121
- return json.dumps(results_array, indent=4)
122
+
123
+ # Merge results_array elements into a single JSON structure
124
+ merged_results = {"page_tables": results_array}
125
+
126
+ # Format the merged results as a JSON string with indentation
127
+ formatted_results = json.dumps(merged_results, indent=4)
128
+
129
+ # Return the formatted JSON string wrapped in a list
130
+ return [formatted_results]
122
131
 
123
132
 
124
133
  @staticmethod
@@ -166,7 +175,7 @@ if __name__ == "__main__":
166
175
  # ]
167
176
  #
168
177
  # # Now you can run inference without knowing which implementation is used
169
- # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
178
+ # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
170
179
  # generic_query=False,
171
180
  # debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
172
181
  # debug=True,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -1 +0,0 @@
1
- __version__ = '0.4.1'
File without changes
File without changes