sparrow-parse 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/PKG-INFO +26 -3
  2. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/README.md +25 -2
  3. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/setup.py +1 -1
  4. sparrow-parse-0.5.2/sparrow_parse/__init__.py +1 -0
  5. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/extractors/vllm_extractor.py +55 -11
  6. sparrow-parse-0.5.2/sparrow_parse/helpers/image_optimizer.py +59 -0
  7. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/helpers/pdf_optimizer.py +9 -7
  8. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/processors/table_structure_processor.py +2 -2
  9. sparrow-parse-0.5.2/sparrow_parse/text_extraction.py +30 -0
  10. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/mlx_inference.py +1 -1
  11. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/PKG-INFO +26 -3
  12. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/SOURCES.txt +2 -0
  13. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/requires.txt +2 -1
  14. sparrow-parse-0.5.0/sparrow_parse/__init__.py +0 -1
  15. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/setup.cfg +0 -0
  16. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/__main__.py +0 -0
  17. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/extractors/__init__.py +0 -0
  18. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/helpers/__init__.py +0 -0
  19. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/processors/__init__.py +0 -0
  20. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/__init__.py +0 -0
  21. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/huggingface_inference.py +0 -0
  22. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/inference_base.py +0 -0
  23. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/inference_factory.py +0 -0
  24. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse/vllm/local_gpu_inference.py +0 -0
  25. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/dependency_links.txt +0 -0
  26. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/entry_points.txt +0 -0
  27. {sparrow-parse-0.5.0 → sparrow-parse-0.5.2}/sparrow_parse.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -60,6 +60,7 @@ input_data = [
60
60
  # Now you can run inference without knowing which implementation is used
61
61
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
62
62
  generic_query=False,
63
+ crop_size=80,
63
64
  debug_dir=None,
64
65
  debug=True,
65
66
  mode=None)
@@ -71,6 +72,8 @@ print(f"Number of pages: {num_pages}")
71
72
 
72
73
  Use `tables_only=True` if you want to extract only tables.
73
74
 
75
+ Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
76
+
74
77
  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
75
78
 
76
79
  Method `run_inference` will return results and number of pages processed.
@@ -95,7 +98,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
95
98
  pdf_optimizer = PDFOptimizer()
96
99
 
97
100
  num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
98
- output_directory,
101
+ debug_dir,
99
102
  convert_to_images)
100
103
 
101
104
  ```
@@ -104,10 +107,30 @@ Example:
104
107
 
105
108
  *file_path* - `/data/invoice_1.pdf`
106
109
 
107
- *output_directory* - set to not `None`, for debug purposes only
110
+ *debug_dir* - set to not `None`, for debug purposes only
108
111
 
109
112
  *convert_to_images* - default `False`, to split into PDF files
110
113
 
114
+ ## Image cropping
115
+
116
+ ```
117
+ from sparrow_parse.helpers.image_optimizer import ImageOptimizer
118
+
119
+ image_optimizer = ImageOptimizer()
120
+
121
+ cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
122
+ ```
123
+
124
+ Example:
125
+
126
+ *file_path* - `/data/invoice_1.jpg`
127
+
128
+ *temp_dir* - directory to store cropped files
129
+
130
+ *debug_dir* - set to not `None`, for debug purposes only
131
+
132
+ *crop_size* - Number of pixels to crop from each border
133
+
111
134
  ## Library build
112
135
 
113
136
  Create Python virtual environment
@@ -41,6 +41,7 @@ input_data = [
41
41
  # Now you can run inference without knowing which implementation is used
42
42
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
43
43
  generic_query=False,
44
+ crop_size=80,
44
45
  debug_dir=None,
45
46
  debug=True,
46
47
  mode=None)
@@ -52,6 +53,8 @@ print(f"Number of pages: {num_pages}")
52
53
 
53
54
  Use `tables_only=True` if you want to extract only tables.
54
55
 
56
+ Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
57
+
55
58
  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
56
59
 
57
60
  Method `run_inference` will return results and number of pages processed.
@@ -76,7 +79,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
76
79
  pdf_optimizer = PDFOptimizer()
77
80
 
78
81
  num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
79
- output_directory,
82
+ debug_dir,
80
83
  convert_to_images)
81
84
 
82
85
  ```
@@ -85,10 +88,30 @@ Example:
85
88
 
86
89
  *file_path* - `/data/invoice_1.pdf`
87
90
 
88
- *output_directory* - set to not `None`, for debug purposes only
91
+ *debug_dir* - set to not `None`, for debug purposes only
89
92
 
90
93
  *convert_to_images* - default `False`, to split into PDF files
91
94
 
95
+ ## Image cropping
96
+
97
+ ```
98
+ from sparrow_parse.helpers.image_optimizer import ImageOptimizer
99
+
100
+ image_optimizer = ImageOptimizer()
101
+
102
+ cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
103
+ ```
104
+
105
+ Example:
106
+
107
+ *file_path* - `/data/invoice_1.jpg`
108
+
109
+ *temp_dir* - directory to store cropped files
110
+
111
+ *debug_dir* - set to not `None`, for debug purposes only
112
+
113
+ *crop_size* - Number of pixels to crop from each border
114
+
92
115
  ## Library build
93
116
 
94
117
  Create Python virtual environment
@@ -8,7 +8,7 @@ with open("requirements.txt", "r", encoding="utf-8") as fh:
8
8
 
9
9
  setup(
10
10
  name="sparrow-parse",
11
- version="0.5.0",
11
+ version="0.5.2",
12
12
  author="Andrej Baranovskij",
13
13
  author_email="andrejus.baranovskis@gmail.com",
14
14
  description="Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.",
@@ -0,0 +1 @@
1
+ __version__ = '0.5.2'
@@ -1,7 +1,7 @@
1
1
  import json
2
-
3
2
  from sparrow_parse.vllm.inference_factory import InferenceFactory
4
3
  from sparrow_parse.helpers.pdf_optimizer import PDFOptimizer
4
+ from sparrow_parse.helpers.image_optimizer import ImageOptimizer
5
5
  from sparrow_parse.processors.table_structure_processor import TableDetector
6
6
  from rich import print
7
7
  import os
@@ -14,7 +14,7 @@ class VLLMExtractor(object):
14
14
  pass
15
15
 
16
16
  def run_inference(self, model_inference_instance, input_data, tables_only=False,
17
- generic_query=False, debug_dir=None, debug=False, mode=None):
17
+ generic_query=False, crop_size=None, debug_dir=None, debug=False, mode=None):
18
18
  """
19
19
  Main entry point for processing input data using a model inference instance.
20
20
  Handles generic queries, PDFs, and table extraction.
@@ -27,12 +27,12 @@ class VLLMExtractor(object):
27
27
 
28
28
  file_path = input_data[0]["file_path"]
29
29
  if self.is_pdf(file_path):
30
- return self._process_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir, mode)
30
+ return self._process_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode)
31
31
 
32
- return self._process_non_pdf(model_inference_instance, input_data, tables_only, debug, debug_dir)
32
+ return self._process_non_pdf(model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir)
33
33
 
34
34
 
35
- def _process_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir, mode):
35
+ def _process_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir, mode):
36
36
  """
37
37
  Handles processing and inference for PDF files, including page splitting and optional table extraction.
38
38
  """
@@ -40,26 +40,40 @@ class VLLMExtractor(object):
40
40
  num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(input_data[0]["file_path"],
41
41
  debug_dir, convert_to_images=True)
42
42
 
43
- results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, debug, debug_dir)
43
+ results = self._process_pages(model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir)
44
44
 
45
45
  # Clean up temporary directory
46
46
  shutil.rmtree(temp_dir, ignore_errors=True)
47
47
  return results, num_pages
48
48
 
49
49
 
50
- def _process_non_pdf(self, model_inference_instance, input_data, tables_only, debug, debug_dir):
50
+ def _process_non_pdf(self, model_inference_instance, input_data, tables_only, crop_size, debug, debug_dir):
51
51
  """
52
52
  Handles processing and inference for non-PDF files, with optional table extraction.
53
53
  """
54
54
  file_path = input_data[0]["file_path"]
55
+
55
56
  if tables_only:
56
57
  return self._extract_tables(model_inference_instance, file_path, input_data, debug, debug_dir), 1
57
58
  else:
59
+ temp_dir = tempfile.mkdtemp()
60
+
61
+ if crop_size:
62
+ if debug:
63
+ print(f"Cropping image borders by {crop_size} pixels.")
64
+ image_optimizer = ImageOptimizer()
65
+ cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
66
+ input_data[0]["file_path"] = cropped_file_path
67
+
68
+ file_path = input_data[0]["file_path"]
58
69
  input_data[0]["file_path"] = [file_path]
59
70
  results = model_inference_instance.inference(input_data)
71
+
72
+ shutil.rmtree(temp_dir, ignore_errors=True)
73
+
60
74
  return results, 1
61
75
 
62
- def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, debug, debug_dir):
76
+ def _process_pages(self, model_inference_instance, output_files, input_data, tables_only, crop_size, debug, debug_dir):
63
77
  """
64
78
  Processes individual pages (PDF split) and handles table extraction or inference.
65
79
 
@@ -68,6 +82,7 @@ class VLLMExtractor(object):
68
82
  output_files: List of file paths for the split PDF pages.
69
83
  input_data: Input data for inference.
70
84
  tables_only: Whether to only process tables.
85
+ crop_size: Size for cropping image borders.
71
86
  debug: Debug flag for logging.
72
87
  debug_dir: Directory for saving debug information.
73
88
 
@@ -89,11 +104,39 @@ class VLLMExtractor(object):
89
104
  else:
90
105
  if debug:
91
106
  print(f"Processing {len(output_files)} pages for inference at once.")
92
- # Pass all output files to the inference method for processing at once
93
- input_data[0]["file_path"] = output_files
107
+
108
+ temp_dir = tempfile.mkdtemp()
109
+ cropped_files = []
110
+
111
+ if crop_size:
112
+ if debug:
113
+ print(f"Cropping image borders by {crop_size} pixels from {len(output_files)} images.")
114
+
115
+ image_optimizer = ImageOptimizer()
116
+
117
+ # Process each file in the output_files array
118
+ for file_path in output_files:
119
+ cropped_file_path = image_optimizer.crop_image_borders(
120
+ file_path,
121
+ temp_dir,
122
+ debug_dir,
123
+ crop_size
124
+ )
125
+ cropped_files.append(cropped_file_path)
126
+
127
+ # Use the cropped files for inference
128
+ input_data[0]["file_path"] = cropped_files
129
+ else:
130
+ # If no cropping needed, use original files directly
131
+ input_data[0]["file_path"] = output_files
132
+
133
+ # Process all files at once
94
134
  results = model_inference_instance.inference(input_data)
95
135
  results_array.extend(results)
96
136
 
137
+ # Clean up temporary directory
138
+ shutil.rmtree(temp_dir, ignore_errors=True)
139
+
97
140
  return results_array
98
141
 
99
142
 
@@ -174,8 +217,9 @@ if __name__ == "__main__":
174
217
  # ]
175
218
  #
176
219
  # # Now you can run inference without knowing which implementation is used
177
- # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=True,
220
+ # results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
178
221
  # generic_query=False,
222
+ # crop_size=80,
179
223
  # debug_dir="/Users/andrejb/Work/katana-git/sparrow/sparrow-ml/llm/data/",
180
224
  # debug=True,
181
225
  # mode=None)
@@ -0,0 +1,59 @@
1
+ from PIL import Image
2
+ import os
3
+
4
+
5
+ class ImageOptimizer(object):
6
+ def __init__(self):
7
+ pass
8
+
9
+ def crop_image_borders(self, file_path, temp_dir, debug_dir=None, crop_size=60):
10
+ """
11
+ Crops all four borders of an image by the specified size.
12
+
13
+ Args:
14
+ file_path (str): Path to the input image
15
+ temp_dir (str): Temporary directory to store the cropped image
16
+ debug_dir (str, optional): Directory to save a debug copy of the cropped image
17
+ crop_size (int): Number of pixels to crop from each border
18
+
19
+ Returns:
20
+ str: Path to the cropped image in temp_dir
21
+ """
22
+ try:
23
+ # Open the image
24
+ with Image.open(file_path) as img:
25
+ # Get image dimensions
26
+ width, height = img.size
27
+
28
+ # Calculate the crop box
29
+ left = crop_size
30
+ top = crop_size
31
+ right = width - crop_size
32
+ bottom = height - crop_size
33
+
34
+ # Ensure we're not trying to crop more than the image size
35
+ if right <= left or bottom <= top:
36
+ raise ValueError("Crop size is too large for the image dimensions")
37
+
38
+ # Perform the crop
39
+ cropped_img = img.crop((left, top, right, bottom))
40
+
41
+ # Get original filename without path
42
+ filename = os.path.basename(file_path)
43
+ name, ext = os.path.splitext(filename)
44
+
45
+ # Save cropped image in temp_dir
46
+ output_path = os.path.join(temp_dir, f"{name}_cropped{ext}")
47
+ cropped_img.save(output_path)
48
+
49
+ # If debug_dir is provided, save a debug copy
50
+ if debug_dir:
51
+ os.makedirs(debug_dir, exist_ok=True)
52
+ debug_path = os.path.join(debug_dir, f"{name}_cropped_debug{ext}")
53
+ cropped_img.save(debug_path)
54
+ print(f"Debug cropped image saved to: {debug_path}")
55
+
56
+ return output_path
57
+
58
+ except Exception as e:
59
+ raise Exception(f"Error processing image: {str(e)}")
@@ -9,7 +9,7 @@ class PDFOptimizer(object):
9
9
  def __init__(self):
10
10
  pass
11
11
 
12
- def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
12
+ def split_pdf_to_pages(self, file_path, debug_dir=None, convert_to_images=False):
13
13
  # Create a temporary directory
14
14
  temp_dir = tempfile.mkdtemp()
15
15
  output_files = []
@@ -30,9 +30,9 @@ class PDFOptimizer(object):
30
30
  writer.write(output_file)
31
31
  output_files.append(output_filename)
32
32
 
33
- if output_dir:
33
+ if debug_dir:
34
34
  # Save each page to the debug folder
35
- debug_output_filename = os.path.join(output_dir, f'page_{page_num + 1}.pdf')
35
+ debug_output_filename = os.path.join(debug_dir, f'page_{page_num + 1}.pdf')
36
36
  with open(debug_output_filename, 'wb') as output_file:
37
37
  writer.write(output_file)
38
38
 
@@ -49,10 +49,12 @@ class PDFOptimizer(object):
49
49
  image.save(output_filename, 'JPEG')
50
50
  output_files.append(output_filename)
51
51
 
52
- if output_dir:
52
+ if debug_dir:
53
53
  # Save each image to the debug folder
54
- debug_output_filename = os.path.join(output_dir, f'{base_name}_page_{i + 1}.jpg')
54
+ os.makedirs(debug_dir, exist_ok=True)
55
+ debug_output_filename = os.path.join(debug_dir, f'{base_name}_page_{i + 1}_debug.jpg')
55
56
  image.save(debug_output_filename, 'JPEG')
57
+ print(f"Debug image saved to: {debug_output_filename}")
56
58
 
57
59
  # Return the number of pages, the list of file paths, and the temporary directory
58
60
  return len(images), output_files, temp_dir
@@ -61,13 +63,13 @@ class PDFOptimizer(object):
61
63
  if __name__ == "__main__":
62
64
  pdf_optimizer = PDFOptimizer()
63
65
 
64
- # output_directory = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
66
+ # debug_dir = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
65
67
  # # Ensure the output directory exists
66
68
  # os.makedirs(output_directory, exist_ok=True)
67
69
  #
68
70
  # # Split the optimized PDF into separate pages
69
71
  # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
70
- # output_directory,
72
+ # debug_dir,
71
73
  # True)
72
74
  #
73
75
  # print(f"Number of pages: {num_pages}")
@@ -131,7 +131,7 @@ class TableDetector(object):
131
131
  cropped_tables.append(cropped_table)
132
132
 
133
133
  if debug_dir:
134
- file_name_table = self.append_filename(file_path, debug_dir, f"cropped_{i + 1}")
134
+ file_name_table = self.append_filename(file_path, debug_dir, f"table_cropped_{i + 1}")
135
135
  cropped_table.save(file_name_table)
136
136
  else:
137
137
  if debug:
@@ -141,7 +141,7 @@ class TableDetector(object):
141
141
  cropped_tables.append(cropped_table)
142
142
 
143
143
  if debug_dir:
144
- file_name_table = self.append_filename(file_path, debug_dir, "cropped")
144
+ file_name_table = self.append_filename(file_path, debug_dir, "table_cropped")
145
145
  cropped_table.save(file_name_table)
146
146
 
147
147
  return cropped_tables
@@ -0,0 +1,30 @@
1
+ from mlx_vlm import load, apply_chat_template, generate
2
+ from mlx_vlm.utils import load_image
3
+
4
+ # For test purposes, we will use a sample image
5
+
6
+ # Load model and processor
7
+ qwen_vl_model, qwen_vl_processor = load("mlx-community/Qwen2-VL-7B-Instruct-8bit")
8
+ qwen_vl_config = qwen_vl_model.config
9
+
10
+ image = load_image("images/graph.png")
11
+
12
+ messages = [
13
+ {"role": "system", "content": "You are an expert at extracting text from images. Format your response in json."},
14
+ {"role": "user", "content": "Extract the names, labels and y coordinates from the image."}
15
+ ]
16
+
17
+ # Apply chat template
18
+ prompt = apply_chat_template(qwen_vl_processor, qwen_vl_config, messages)
19
+
20
+ # Generate text
21
+ qwen_vl_output = generate(
22
+ qwen_vl_model,
23
+ qwen_vl_processor,
24
+ prompt,
25
+ image,
26
+ max_tokens=1000,
27
+ temperature=0.7,
28
+ )
29
+
30
+ print(qwen_vl_output)
@@ -112,8 +112,8 @@ class MLXInference(ModelInference):
112
112
  response = generate(
113
113
  model,
114
114
  processor,
115
- image,
116
115
  prompt,
116
+ image,
117
117
  resize_shape=(width, height),
118
118
  max_tokens=4000,
119
119
  temperature=0.0,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Sparrow Parse is a Python package (part of Sparrow) for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  Author: Andrej Baranovskij
@@ -60,6 +60,7 @@ input_data = [
60
60
  # Now you can run inference without knowing which implementation is used
61
61
  results_array, num_pages = extractor.run_inference(model_inference_instance, input_data, tables_only=False,
62
62
  generic_query=False,
63
+ crop_size=80,
63
64
  debug_dir=None,
64
65
  debug=True,
65
66
  mode=None)
@@ -71,6 +72,8 @@ print(f"Number of pages: {num_pages}")
71
72
 
72
73
  Use `tables_only=True` if you want to extract only tables.
73
74
 
75
+ Use `crop_size=N` (where `N` is an integer) to crop N pixels from all borders of the input images. This can be helpful for removing unwanted borders or frame artifacts from scanned documents.
76
+
74
77
  Use `mode="static"` if you want to simulate LLM call, without executing LLM backend.
75
78
 
76
79
  Method `run_inference` will return results and number of pages processed.
@@ -95,7 +98,7 @@ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
95
98
  pdf_optimizer = PDFOptimizer()
96
99
 
97
100
  num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
98
- output_directory,
101
+ debug_dir,
99
102
  convert_to_images)
100
103
 
101
104
  ```
@@ -104,10 +107,30 @@ Example:
104
107
 
105
108
  *file_path* - `/data/invoice_1.pdf`
106
109
 
107
- *output_directory* - set to not `None`, for debug purposes only
110
+ *debug_dir* - set to not `None`, for debug purposes only
108
111
 
109
112
  *convert_to_images* - default `False`, to split into PDF files
110
113
 
114
+ ## Image cropping
115
+
116
+ ```
117
+ from sparrow_parse.helpers.image_optimizer import ImageOptimizer
118
+
119
+ image_optimizer = ImageOptimizer()
120
+
121
+ cropped_file_path = image_optimizer.crop_image_borders(file_path, temp_dir, debug_dir, crop_size)
122
+ ```
123
+
124
+ Example:
125
+
126
+ *file_path* - `/data/invoice_1.jpg`
127
+
128
+ *temp_dir* - directory to store cropped files
129
+
130
+ *debug_dir* - set to not `None`, for debug purposes only
131
+
132
+ *crop_size* - Number of pixels to crop from each border
133
+
111
134
  ## Library build
112
135
 
113
136
  Create Python virtual environment
@@ -2,6 +2,7 @@ README.md
2
2
  setup.py
3
3
  sparrow_parse/__init__.py
4
4
  sparrow_parse/__main__.py
5
+ sparrow_parse/text_extraction.py
5
6
  sparrow_parse.egg-info/PKG-INFO
6
7
  sparrow_parse.egg-info/SOURCES.txt
7
8
  sparrow_parse.egg-info/dependency_links.txt
@@ -11,6 +12,7 @@ sparrow_parse.egg-info/top_level.txt
11
12
  sparrow_parse/extractors/__init__.py
12
13
  sparrow_parse/extractors/vllm_extractor.py
13
14
  sparrow_parse/helpers/__init__.py
15
+ sparrow_parse/helpers/image_optimizer.py
14
16
  sparrow_parse/helpers/pdf_optimizer.py
15
17
  sparrow_parse/processors/__init__.py
16
18
  sparrow_parse/processors/table_structure_processor.py
@@ -7,4 +7,5 @@ gradio_client
7
7
  pdf2image
8
8
 
9
9
  [:sys_platform == "darwin" and platform_machine == "arm64"]
10
- mlx-vlm==0.1.10
10
+ mlx>=0.22.0
11
+ mlx-vlm==0.1.11
@@ -1 +0,0 @@
1
- __version__ = '0.5.0'
File without changes