sparrow-parse 0.2.9__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sparrow-parse
3
- Version: 0.2.9
3
+ Version: 0.3.0
4
4
  Summary: Sparrow Parse is a Python package for parsing and extracting information from documents.
5
5
  Home-page: https://github.com/katanaml/sparrow/tree/main/sparrow-data/parse
6
6
  License: GPL-3.0
@@ -15,8 +15,9 @@ Classifier: Programming Language :: Python :: 3.9
15
15
  Classifier: Programming Language :: Python :: 3.10
16
16
  Classifier: Programming Language :: Python :: 3.11
17
17
  Classifier: Topic :: Software Development
18
+ Requires-Dist: PyPDF2 (==3.0.1)
18
19
  Requires-Dist: numpy (==1.26.4)
19
- Requires-Dist: pymupdf4llm (==0.0.5)
20
+ Requires-Dist: pymupdf4llm (==0.0.6)
20
21
  Requires-Dist: rich (>=13.7.1,<14.0.0)
21
22
  Requires-Dist: sentence-transformers (==3.0.1)
22
23
  Requires-Dist: torch (==2.2.2)
@@ -128,6 +129,27 @@ Example:
128
129
 
129
130
  *debug* - `True`
130
131
 
132
+ ## PDF optimization
133
+
134
+ ```
135
+ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
136
+
137
+ pdf_optimizer = PDFOptimizer()
138
+
139
+ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
140
+ output_directory,
141
+ convert_to_images)
142
+
143
+ ```
144
+
145
+ Example:
146
+
147
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
148
+
149
+ *output_directory* - optional; when set to a directory path (not `None`), a copy of every split page is also saved there, for debug purposes only
150
+
151
+ *convert_to_images* - default `False`, which splits the document into one PDF file per page; set to `True` to get one JPEG image per page instead
152
+
131
153
  ## Library build
132
154
 
133
155
  ```
@@ -100,6 +100,27 @@ Example:
100
100
 
101
101
  *debug* - `True`
102
102
 
103
+ ## PDF optimization
104
+
105
+ ```
106
+ from sparrow_parse.extractor.pdf_optimizer import PDFOptimizer
107
+
108
+ pdf_optimizer = PDFOptimizer()
109
+
110
+ num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(file_path,
111
+ output_directory,
112
+ convert_to_images)
113
+
114
+ ```
115
+
116
+ Example:
117
+
118
+ *file_path* - `/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/invoice_1.pdf`
119
+
120
+ *output_directory* - optional; when set to a directory path (not `None`), a copy of every split page is also saved there, for debug purposes only
121
+
122
+ *convert_to_images* - default `False`, which splits the document into one PDF file per page; set to `True` to get one JPEG image per page instead
123
+
103
124
  ## Library build
104
125
 
105
126
  ```
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sparrow-parse"
3
- version = "0.2.9"
3
+ version = "0.3.0"
4
4
  description = "Sparrow Parse is a Python package for parsing and extracting information from documents."
5
5
  authors = ["Andrej Baranovskij <andrejus.baranovskis@gmail.com>"]
6
6
  license = "GPL-3.0"
@@ -25,10 +25,11 @@ torch = {version = "2.2.2", source = "pypi"}
25
25
  unstructured = {version = "0.14.5", extras = ["all-docs"]}
26
26
  unstructured-inference = "0.7.33"
27
27
  rich = "^13.7.1"
28
- pymupdf4llm = "0.0.5"
28
+ pymupdf4llm = "0.0.6"
29
29
  transformers = "4.41.2"
30
30
  sentence-transformers = "3.0.1"
31
31
  numpy = "1.26.4"
32
+ PyPDF2 = "3.0.1"
32
33
 
33
34
 
34
35
  [tool.poetry.scripts]
@@ -0,0 +1 @@
1
+ __version__ = '0.3.0'
@@ -244,7 +244,7 @@ if __name__ == "__main__":
244
244
  # ['deposits', 'account_number', 'od_limit', 'currency_balance', 'sgd_balance', 'transaction_date',
245
245
  # 'value_date', 'description', 'cheque', 'withdrawal', 'deposit', 'balance', 'maturity_date'],
246
246
  # True,
247
- # True,
247
+ # False,
248
248
  # True,
249
249
  # True)
250
250
  #
@@ -0,0 +1,72 @@
1
+ import PyPDF2
2
+ from pdf2image import convert_from_path
3
+ import os
4
+ import tempfile
5
+ import shutil
6
+
7
+
8
class PDFOptimizer(object):
    """Split a PDF document into one file per page.

    Pages are written to a freshly created temporary directory, either as
    single-page PDF files or as JPEG images. The caller owns that directory
    and is responsible for deleting it (for example with ``shutil.rmtree``)
    once the page files are no longer needed.
    """

    # Resolution (dots per inch) used when rasterising pages to JPEG.
    IMAGE_DPI = 400

    def __init__(self):
        pass

    def split_pdf_to_pages(self, file_path, output_dir=None, convert_to_images=False):
        """Split ``file_path`` into per-page files.

        Args:
            file_path: Path of the PDF document to split.
            output_dir: Optional debug directory. When truthy, it is created
                if missing and a copy of every page file is placed in it.
            convert_to_images: When ``False`` (default) each page is saved as
                ``page_<n>.pdf``; when ``True`` each page is rendered and
                saved as ``page_<n>.jpg``.

        Returns:
            A tuple ``(number_of_pages, output_files, temp_dir)`` where
            ``output_files`` lists the page files inside ``temp_dir`` in page
            order (numbering starts at 1).

        Raises:
            Whatever opening, parsing, rendering or writing raises (e.g.
            ``FileNotFoundError`` for a missing input). In that case the
            temporary directory is removed before the error propagates, since
            the caller never receives its path and could not clean it up.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            if convert_to_images:
                output_files = self._split_to_images(file_path, temp_dir)
            else:
                output_files = self._split_to_pdfs(file_path, temp_dir)

            if output_dir:
                # Copy the files already written instead of serialising each
                # page a second time; the debug copies are byte-identical.
                os.makedirs(output_dir, exist_ok=True)
                for page_file in output_files:
                    shutil.copy(page_file, output_dir)
        except BaseException:
            # Do not leak the temporary directory on failure.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        return len(output_files), output_files, temp_dir

    @staticmethod
    def _split_to_pdfs(file_path, target_dir):
        """Write every page of ``file_path`` as ``page_<n>.pdf`` in ``target_dir``."""
        page_files = []
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page_number, page in enumerate(reader.pages, start=1):
                writer = PyPDF2.PdfWriter()
                writer.add_page(page)

                page_path = os.path.join(target_dir, f'page_{page_number}.pdf')
                with open(page_path, 'wb') as page_file:
                    writer.write(page_file)
                page_files.append(page_path)
        return page_files

    def _split_to_images(self, file_path, target_dir):
        """Render every page of ``file_path`` as ``page_<n>.jpg`` in ``target_dir``."""
        page_files = []
        images = convert_from_path(file_path, dpi=self.IMAGE_DPI)
        for page_number, image in enumerate(images, start=1):
            page_path = os.path.join(target_dir, f'page_{page_number}.jpg')
            image.save(page_path, 'JPEG')
            page_files.append(page_path)
        return page_files
58
+
59
+
60
if __name__ == "__main__":
    # Manual entry point: only instantiates the optimizer. The example below
    # is intentionally disabled; enable it and adjust the paths to split a
    # local PDF into per-page files for inspection.
    pdf_optimizer = PDFOptimizer()

    # debug_dir = "/Users/andrejb/Documents/work/bankstatement/output_pages"
    # os.makedirs(debug_dir, exist_ok=True)  # the debug folder must exist
    #
    # source_pdf = "/Users/andrejb/Documents/work/bankstatement/statement.pdf"
    # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages(
    #     source_pdf,
    #     debug_dir,
    #     False)
    #
    # # The temporary directory belongs to the caller; remove it when done.
    # shutil.rmtree(temp_dir, ignore_errors=True)
@@ -176,4 +176,3 @@ if __name__ == "__main__":
176
176
  # ['tables', 'unstructured'],
177
177
  # True,
178
178
  # True)
179
-
@@ -0,0 +1,27 @@
1
+ # content, table_content = processor.extract_data(
2
+ # '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
3
+ # 'hi_res',
4
+ # 'yolox',
5
+ # # 'detectron2_onnx',
6
+ # ['tables', 'unstructured'],
7
+ # True,
8
+ # True)
9
+
10
+ # content, table_content = processor.extract_data(
11
+ # '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
12
+ # 'hi_res',
13
+ # 'yolox',
14
+ # ['tables', 'unstructured'],
15
+ # True,
16
+ # True)
17
+
18
+ # output_directory = "/Users/andrejb/Documents/work/epik/bankstatement/output_pages"
19
+ # # Ensure the output directory exists
20
+ # os.makedirs(output_directory, exist_ok=True)
21
+ #
22
+ # # Split the optimized PDF into separate pages
23
+ # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_statement.pdf",
24
+ # output_directory,
25
+ # False)
26
+ #
27
+ # shutil.rmtree(temp_dir, ignore_errors=True)
@@ -1 +0,0 @@
1
- __version__ = '0.2.9'
@@ -1,16 +0,0 @@
1
- # content, table_content = processor.extract_data(
2
- # '/Users/andrejb/Documents/work/schreiber/invoice_data/test/2618407.pdf',
3
- # 'hi_res',
4
- # 'yolox',
5
- # # 'detectron2_onnx',
6
- # ['tables', 'unstructured'],
7
- # True,
8
- # True)
9
-
10
- # content, table_content = processor.extract_data(
11
- # '/Users/andrejb/Documents/work/epik/bankstatement/OCBC_1_1.pdf',
12
- # 'hi_res',
13
- # 'yolox',
14
- # ['tables', 'unstructured'],
15
- # True,
16
- # True)