abstract-pdfs 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_pdfs-0.0.1/PKG-INFO +243 -0
- abstract_pdfs-0.0.1/README.md +216 -0
- abstract_pdfs-0.0.1/pyproject.toml +3 -0
- abstract_pdfs-0.0.1/setup.cfg +21 -0
- abstract_pdfs-0.0.1/setup.py +31 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/AbstractPDFManager.py +136 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/SliceManager.py +154 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/__init__.py +4 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/imports/__init__.py +2 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/imports/imports.py +9 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/imports/manifest_utils.py +45 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/pdf_utils/__init__.py +2 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/pdf_utils/imports.py +1 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/pdf_utils/pdf_to_text.py +170 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs/pdf_utils/pdf_tools.py +138 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs.egg-info/PKG-INFO +243 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs.egg-info/SOURCES.txt +19 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs.egg-info/dependency_links.txt +1 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs.egg-info/requires.txt +2 -0
- abstract_pdfs-0.0.1/src/abstract_pdfs.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: abstract_pdfs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A modular OCR and PDF-processing toolkit for automated text extraction, deduplication, and multi-engine column-aware OCR using Tesseract, EasyOCR, and PaddleOCR
|
|
5
|
+
Home-page: https://github.com/AbstractEndeavors/abstract_pdfs
|
|
6
|
+
Author: putkoff
|
|
7
|
+
Author-email: partners@abstractendeavors.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Requires-Python: >=3.6
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
Requires-Dist: abstract_ocr
|
|
17
|
+
Requires-Dist: abstract_utilities
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: author-email
|
|
20
|
+
Dynamic: classifier
|
|
21
|
+
Dynamic: description
|
|
22
|
+
Dynamic: description-content-type
|
|
23
|
+
Dynamic: home-page
|
|
24
|
+
Dynamic: requires-dist
|
|
25
|
+
Dynamic: requires-python
|
|
26
|
+
Dynamic: summary
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## **Abstract PDFs**
|
|
31
|
+
|
|
32
|
+
[PyPI version](https://badge.fury.io/py/abstract-pdfs)
|
|
33
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
34
|
+
[Python 3.6+](https://www.python.org/downloads/)
|
|
35
|
+
|
|
36
|
+
**Abstract PDFs** is a modular OCR and PDF-processing toolkit built for automation pipelines.
|
|
37
|
+
It provides a structured way to **ingest**, **deduplicate**, **split**, and **extract text** from PDF documents — including **column-aware OCR** through multiple engines (PaddleOCR, EasyOCR, and Tesseract).
|
|
38
|
+
|
|
39
|
+
Designed to integrate seamlessly with other *Abstract* modules (like `abstract_ocr` and `abstract_utilities`), it forms the foundation for scalable document analysis, digital archiving, and machine learning dataset preparation.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
### **Table of Contents**
|
|
44
|
+
|
|
45
|
+
* [Features](#features)
|
|
46
|
+
* [Installation](#installation)
|
|
47
|
+
* [Usage](#usage)
|
|
48
|
+
* [Architecture](#architecture)
|
|
49
|
+
* [Classes](#classes)
|
|
50
|
+
* [Dependencies](#dependencies)
|
|
51
|
+
* [Example Workflow](#example-workflow)
|
|
52
|
+
* [License](#license-and-summary)
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
## Architecture
|
|
56
|
+
|
|
57
|
+
`Architecture
|
|
58
|
+
|
|
59
|
+
This module is organized in a straightforward way. It consists of two main folders: 'pdf_utils', responsible for various PDF handling operations, and 'imports', managing the necessary import functions.
|
|
60
|
+
|
|
61
|
+
`
|
|
62
|
+
```bash
|
|
63
|
+
├── home
|
|
64
|
+
│ └── computron
|
|
65
|
+
│ └── Documents
|
|
66
|
+
│ └── pythonTools
|
|
67
|
+
│ └── modules
|
|
68
|
+
│ └── src
|
|
69
|
+
│ └── modules
|
|
70
|
+
│ └── abstract_pdfs
|
|
71
|
+
│ ├── abstract_pdfs
|
|
72
|
+
│ │ ├── AbstractPDFManager.py # Primary module for managing PDF operations
|
|
73
|
+
│ │ ├── SliceManager.py # Module for handling slice operations
|
|
74
|
+
│ │ ├── imports
|
|
75
|
+
│ │ │ ├── imports.py # General import functions used across the module
|
|
76
|
+
│ │ │ ├── manifest_utils.py # Utility functions for manifest handling
|
|
77
|
+
│ │ ├── pdf_utils
|
|
78
|
+
│ │ │ ├── imports.py # Import functions for PDF utilities
|
|
79
|
+
│ │ │ ├── pdf_to_text.py # Converts PDF files to text
|
|
80
|
+
│ │ │ ├── pdf_tools.py # Utility functions for PDF manipulation
|
|
81
|
+
│ │ ├── __init__.py # Initializer file for the abstract_pdfs module
|
|
82
|
+
│ ├── __init__.py # Initializer file for the outer structure
|
|
83
|
+
│
|
|
84
|
+
└── # End of structure
|
|
85
|
+
```
|
|
86
|
+
The structure provided allows for modularity and separation of concerns. Each Python file serves a specific purpose, like converting PDFs to text, or managing PDF operations. This makes the module easy to maintain and extend.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## Classes
|
|
93
|
+
|
|
94
|
+
### Classes & API
|
|
95
|
+
The utilized classes and their key methods in this module are as follows:
|
|
96
|
+
|
|
97
|
+
1. **AbstractPDFManager:** This class manages various PDF operations.
|
|
98
|
+
* `convert_to_images()`: Converts the split PDF pages to PNG images.
|
|
99
|
+
* `extract_text()`: Runs OCR on the page images and writes one `.txt` file per image.
|
|
100
|
+
* `split_pdf()`: Splits a PDF into separate pages.
|
|
101
|
+
|
|
102
|
+
2. **SliceManager:** This class handles slice operations.
|
|
103
|
+
* `generate_slices()`: Generates slices from an image.
|
|
104
|
+
* `save_slices()`: Saves the generated slices to a directory.
|
|
105
|
+
|
|
106
|
+
3. **PDFTools:** This class contains utility functions for PDF manipulation.
|
|
107
|
+
* `merge_pdfs()`: Merges multiple PDFs into a single PDF.
|
|
108
|
+
* `rotate_pdf()`: Rotates pages in a PDF.
|
|
109
|
+
* `resize_pdf()`: Resizes pages in a PDF.
|
|
110
|
+
|
|
111
|
+
These classes are designed to achieve modularity and separation of concerns, each serving a specific purpose like converting PDFs to text, splitting PDFs into separate pages, or managing slice operations. The methods contained within these classes provide easy access to the available functionalities of the module. The module encourages code reuse and simplifies complex tasks related to PDFs and image processing which ultimately makes it easy to maintain and extend.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
## Features
|
|
119
|
+
|
|
120
|
+
- **PDF to Image Conversion:** Convert PDF pages to PNG images using the `convert_to_images()` method, which works with the `PIL` and `pdf2image` libraries.
|
|
121
|
+
- **Text Extraction from PDFs:** Extract text from PDF files with the `extract_text()` method. The package relies on `abstract_ocr` for this functionality.
|
|
122
|
+
- **PDF Manipulation:** Aggregate operations like splitting, merging, rotating, and resizing PDF files are possible using the `split_pdf()`, `merge_pdfs()`, `rotate_pdf()`, and `resize_pdf()` methods.
|
|
123
|
+
- **Slice Management:** Generate and save slices from an image with `generate_slices()` and `save_slices()` methods. This feature is integral to the OCR process.
|
|
124
|
+
- **Modular Architecture:** The architecture of this module is designed to be modular, which promotes code reuse and simplifies complex tasks related to PDFs and image processing.
|
|
125
|
+
- **Compatibility:** The module requires Python 3.6 or higher, supporting compatibility with modern Python versions.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
## Installation
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
### Prerequisites
|
|
135
|
+
|
|
136
|
+
The `abstract_pdfs` module requires Python 3.6 or higher for working compatibility. Also, make sure you have the following system libraries installed:
|
|
137
|
+
|
|
138
|
+
- poppler-utils
|
|
139
|
+
- tesseract
|
|
140
|
+
|
|
141
|
+
### Installation with pip
|
|
142
|
+
|
|
143
|
+
You can install `abstract_pdfs` via pip.
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
pip install abstract_pdfs
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Installation from source
|
|
150
|
+
|
|
151
|
+
If you prefer to install from the source, you can clone the repository and use pip to handle the installation:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
# Clone the repository
|
|
155
|
+
|
|
156
|
+
git clone https://github.com/AbstractEndeavors/abstract_pdfs.git
|
|
157
|
+
|
|
158
|
+
# Navigate to the project directory
|
|
159
|
+
|
|
160
|
+
cd abstract_pdfs
|
|
161
|
+
|
|
162
|
+
# Install the package
|
|
163
|
+
|
|
164
|
+
pip install .
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
## License and Summary
|
|
173
|
+
|
|
174
|
+
The `abstract_pdfs` module is released under the [MIT License](https://opensource.org/licenses/MIT) and is authored by `putkoff`. An abstract endeavor, this module enables powerful and flexible handling of PDF operations, ranging from conversion of PDFs into images, text extraction from PDF files, to a rich variety of aggregate operations like splitting, merging, rotating, and resizing PDF files.
|
|
175
|
+
|
|
176
|
+
For more information on this module, visit the official repository [here](https://github.com/AbstractEndeavors/abstract_pdfs). For other abstract projects, refer to the [AbstractEndeavors](https://github.com/AbstractEndeavors) Github page.
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
## Overview
|
|
183
|
+
|
|
184
|
+
## `abstract_pdfs` - Powerful PDF Handling for the Modern Python Developer - Version 0.0.1
|
|
185
|
+
|
|
186
|
+
Welcome to the documentation for the `abstract_pdfs` Python module. Authored by `putkoff` and maintained by [`AbstractEndeavors`](https://github.com/AbstractEndeavors), this module is a part of a larger ecosystem of Python tools designed for tackling a host of programming challenges.
|
|
187
|
+
|
|
188
|
+
The primary purpose of `abstract_pdfs` is to provide developers with a powerful, flexible interface for managing PDF files. With dependency packages like `PIL`, `abstract_ocr`, and `abstract_utilities`, this module allows you to convert PDFs to images, extract text from PDFs, and perform various aggregate operations such as splitting, merging, rotating, and resizing PDF files.
|
|
189
|
+
|
|
190
|
+
The `abstract_pdfs` module fits within the broader Abstract ecosystem as a go-to solution for PDF management. Its functionality synergistically integrates with modules like `abstract_ocr` for optical character recognition, leveraging the power of the Abstract tools collection.
|
|
191
|
+
|
|
192
|
+
This module is in Alpha stage (Development Status 3) and is ready for integration by developers. It requires Python 3.6 or higher for optimal use. You will find further details on installation and features in the subsequent sections of this README.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
## Usage
|
|
199
|
+
|
|
200
|
+
Here is a basic python sample of how to use the `abstract_pdfs` module.
|
|
201
|
+
|
|
202
|
+
``` python
|
|
203
|
+
# Import required modules
|
|
204
|
+
from abstract_pdfs import PdfHandler
|
|
205
|
+
|
|
206
|
+
# Specify the required PDF file
|
|
207
|
+
path_to_pdf = "/path/to/your/pdf"
|
|
208
|
+
|
|
209
|
+
# Create an instance of PdfHandler
|
|
210
|
+
pdf_handler = PdfHandler(path_to_pdf)
|
|
211
|
+
|
|
212
|
+
# Now you can perform various operations like
|
|
213
|
+
# Converting PDF to Image
|
|
214
|
+
image_path = pdf_handler.to_image()
|
|
215
|
+
|
|
216
|
+
# Extract text from PDF
|
|
217
|
+
text = pdf_handler.extract_text()
|
|
218
|
+
|
|
219
|
+
# The extracted text will be in string format
|
|
220
|
+
print(text)
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
The `abstract_pdfs` module also comes with built-in support for handling multiple PDF files at once.
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
# Import required modules
|
|
227
|
+
from abstract_pdfs import PdfHandler
|
|
228
|
+
|
|
229
|
+
# Specify a list of PDF files
|
|
230
|
+
paths_to_pdfs = ["/path/to/your/pdf1", "/path/to/your/pdf2", "/path/to/your/pdf3"]
|
|
231
|
+
|
|
232
|
+
# Create instances of PdfHandler in a single line
|
|
233
|
+
pdf_handlers = [PdfHandler(path) for path in paths_to_pdfs]
|
|
234
|
+
|
|
235
|
+
# Now you can iterate over pdf_handlers to do various operations. For example:
|
|
236
|
+
for handler in pdf_handlers:
|
|
237
|
+
# Print the number of pages in each PDF
|
|
238
|
+
print(handler.number_of_pages)
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
These are just a few basic usage examples. The `abstract_pdfs` module exposes a rich API for manipulating and interrogating PDF files. Check out the API documentation for a full list of available methods and their descriptions.
|
|
242
|
+
|
|
243
|
+
---
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
---
|
|
2
|
+
|
|
3
|
+
## **Abstract PDFs**
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://badge.fury.io/py/abstract-pdfs)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[Python 3.6+](https://www.python.org/downloads/)
|
|
8
|
+
|
|
9
|
+
**Abstract PDFs** is a modular OCR and PDF-processing toolkit built for automation pipelines.
|
|
10
|
+
It provides a structured way to **ingest**, **deduplicate**, **split**, and **extract text** from PDF documents — including **column-aware OCR** through multiple engines (PaddleOCR, EasyOCR, and Tesseract).
|
|
11
|
+
|
|
12
|
+
Designed to integrate seamlessly with other *Abstract* modules (like `abstract_ocr` and `abstract_utilities`), it forms the foundation for scalable document analysis, digital archiving, and machine learning dataset preparation.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
### **Table of Contents**
|
|
17
|
+
|
|
18
|
+
* [Features](#features)
|
|
19
|
+
* [Installation](#installation)
|
|
20
|
+
* [Usage](#usage)
|
|
21
|
+
* [Architecture](#architecture)
|
|
22
|
+
* [Classes](#classes)
|
|
23
|
+
* [Dependencies](#dependencies)
|
|
24
|
+
* [Example Workflow](#example-workflow)
|
|
25
|
+
* [License](#license-and-summary)
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
## Architecture
|
|
29
|
+
|
|
30
|
+
`Architecture
|
|
31
|
+
|
|
32
|
+
This module is organized in a straightforward way. It consists of two main folders: 'pdf_utils', responsible for various PDF handling operations, and 'imports', managing the necessary import functions.
|
|
33
|
+
|
|
34
|
+
`
|
|
35
|
+
```bash
|
|
36
|
+
├── home
|
|
37
|
+
│ └── computron
|
|
38
|
+
│ └── Documents
|
|
39
|
+
│ └── pythonTools
|
|
40
|
+
│ └── modules
|
|
41
|
+
│ └── src
|
|
42
|
+
│ └── modules
|
|
43
|
+
│ └── abstract_pdfs
|
|
44
|
+
│ ├── abstract_pdfs
|
|
45
|
+
│ │ ├── AbstractPDFManager.py # Primary module for managing PDF operations
|
|
46
|
+
│ │ ├── SliceManager.py # Module for handling slice operations
|
|
47
|
+
│ │ ├── imports
|
|
48
|
+
│ │ │ ├── imports.py # General import functions used across the module
|
|
49
|
+
│ │ │ ├── manifest_utils.py # Utility functions for manifest handling
|
|
50
|
+
│ │ ├── pdf_utils
|
|
51
|
+
│ │ │ ├── imports.py # Import functions for PDF utilities
|
|
52
|
+
│ │ │ ├── pdf_to_text.py # Converts PDF files to text
|
|
53
|
+
│ │ │ ├── pdf_tools.py # Utility functions for PDF manipulation
|
|
54
|
+
│ │ ├── __init__.py # Initializer file for the abstract_pdfs module
|
|
55
|
+
│ ├── __init__.py # Initializer file for the outer structure
|
|
56
|
+
│
|
|
57
|
+
└── # End of structure
|
|
58
|
+
```
|
|
59
|
+
The structure provided allows for modularity and separation of concerns. Each Python file serves a specific purpose, like converting PDFs to text, or managing PDF operations. This makes the module easy to maintain and extend.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
## Classes
|
|
66
|
+
|
|
67
|
+
### Classes & API
|
|
68
|
+
The utilized classes and their key methods in this module are as follows:
|
|
69
|
+
|
|
70
|
+
1. **AbstractPDFManager:** This class manages various PDF operations.
|
|
71
|
+
* `convert_to_images()`: Converts the split PDF pages to PNG images.
|
|
72
|
+
* `extract_text()`: Runs OCR on the page images and writes one `.txt` file per image.
|
|
73
|
+
* `split_pdf()`: Splits a PDF into separate pages.
|
|
74
|
+
|
|
75
|
+
2. **SliceManager:** This class handles slice operations.
|
|
76
|
+
* `generate_slices()`: Generates slices from an image.
|
|
77
|
+
* `save_slices()`: Saves the generated slices to a directory.
|
|
78
|
+
|
|
79
|
+
3. **PDFTools:** This class contains utility functions for PDF manipulation.
|
|
80
|
+
* `merge_pdfs()`: Merges multiple PDFs into a single PDF.
|
|
81
|
+
* `rotate_pdf()`: Rotates pages in a PDF.
|
|
82
|
+
* `resize_pdf()`: Resizes pages in a PDF.
|
|
83
|
+
|
|
84
|
+
These classes are designed to achieve modularity and separation of concerns, each serving a specific purpose like converting PDFs to text, splitting PDFs into separate pages, or managing slice operations. The methods contained within these classes provide easy access to the available functionalities of the module. The module encourages code reuse and simplifies complex tasks related to PDFs and image processing which ultimately makes it easy to maintain and extend.
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
## Features
|
|
92
|
+
|
|
93
|
+
- **PDF to Image Conversion:** Convert PDF pages to PNG images using the `convert_to_images()` method, which works with the `PIL` and `pdf2image` libraries.
|
|
94
|
+
- **Text Extraction from PDFs:** Extract text from PDF files with the `extract_text()` method. The package relies on `abstract_ocr` for this functionality.
|
|
95
|
+
- **PDF Manipulation:** Aggregate operations like splitting, merging, rotating, and resizing PDF files are possible using the `split_pdf()`, `merge_pdfs()`, `rotate_pdf()`, and `resize_pdf()` methods.
|
|
96
|
+
- **Slice Management:** Generate and save slices from an image with `generate_slices()` and `save_slices()` methods. This feature is integral to the OCR process.
|
|
97
|
+
- **Modular Architecture:** The architecture of this module is designed to be modular, which promotes code reuse and simplifies complex tasks related to PDFs and image processing.
|
|
98
|
+
- **Compatibility:** The module requires Python 3.6 or higher, supporting compatibility with modern Python versions.
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
## Installation
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
### Prerequisites
|
|
108
|
+
|
|
109
|
+
The `abstract_pdfs` module requires Python 3.6 or higher for working compatibility. Also, make sure you have the following system libraries installed:
|
|
110
|
+
|
|
111
|
+
- poppler-utils
|
|
112
|
+
- tesseract
|
|
113
|
+
|
|
114
|
+
### Installation with pip
|
|
115
|
+
|
|
116
|
+
You can install `abstract_pdfs` via pip.
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
pip install abstract_pdfs
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Installation from source
|
|
123
|
+
|
|
124
|
+
If you prefer to install from the source, you can clone the repository and use pip to handle the installation:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Clone the repository
|
|
128
|
+
|
|
129
|
+
git clone https://github.com/AbstractEndeavors/abstract_pdfs.git
|
|
130
|
+
|
|
131
|
+
# Navigate to the project directory
|
|
132
|
+
|
|
133
|
+
cd abstract_pdfs
|
|
134
|
+
|
|
135
|
+
# Install the package
|
|
136
|
+
|
|
137
|
+
pip install .
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
## License and Summary
|
|
146
|
+
|
|
147
|
+
The `abstract_pdfs` module is released under the [MIT License](https://opensource.org/licenses/MIT) and is authored by `putkoff`. An abstract endeavor, this module enables powerful and flexible handling of PDF operations, ranging from conversion of PDFs into images, text extraction from PDF files, to a rich variety of aggregate operations like splitting, merging, rotating, and resizing PDF files.
|
|
148
|
+
|
|
149
|
+
For more information on this module, visit the official repository [here](https://github.com/AbstractEndeavors/abstract_pdfs). For other abstract projects, refer to the [AbstractEndeavors](https://github.com/AbstractEndeavors) Github page.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
## Overview
|
|
156
|
+
|
|
157
|
+
## `abstract_pdfs` - Powerful PDF Handling for the Modern Python Developer - Version 0.0.1
|
|
158
|
+
|
|
159
|
+
Welcome to the documentation for the `abstract_pdfs` Python module. Authored by `putkoff` and maintained by [`AbstractEndeavors`](https://github.com/AbstractEndeavors), this module is a part of a larger ecosystem of Python tools designed for tackling a host of programming challenges.
|
|
160
|
+
|
|
161
|
+
The primary purpose of `abstract_pdfs` is to provide developers with a powerful, flexible interface for managing PDF files. With dependency packages like `PIL`, `abstract_ocr`, and `abstract_utilities`, this module allows you to convert PDFs to images, extract text from PDFs, and perform various aggregate operations such as splitting, merging, rotating, and resizing PDF files.
|
|
162
|
+
|
|
163
|
+
The `abstract_pdfs` module fits within the broader Abstract ecosystem as a go-to solution for PDF management. Its functionality synergistically integrates with modules like `abstract_ocr` for optical character recognition, leveraging the power of the Abstract tools collection.
|
|
164
|
+
|
|
165
|
+
This module is in Alpha stage (Development Status 3) and is ready for integration by developers. It requires Python 3.6 or higher for optimal use. You will find further details on installation and features in the subsequent sections of this README.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
## Usage
|
|
172
|
+
|
|
173
|
+
Here is a basic python sample of how to use the `abstract_pdfs` module.
|
|
174
|
+
|
|
175
|
+
``` python
|
|
176
|
+
# Import required modules
|
|
177
|
+
from abstract_pdfs import PdfHandler
|
|
178
|
+
|
|
179
|
+
# Specify the required PDF file
|
|
180
|
+
path_to_pdf = "/path/to/your/pdf"
|
|
181
|
+
|
|
182
|
+
# Create an instance of PdfHandler
|
|
183
|
+
pdf_handler = PdfHandler(path_to_pdf)
|
|
184
|
+
|
|
185
|
+
# Now you can perform various operations like
|
|
186
|
+
# Converting PDF to Image
|
|
187
|
+
image_path = pdf_handler.to_image()
|
|
188
|
+
|
|
189
|
+
# Extract text from PDF
|
|
190
|
+
text = pdf_handler.extract_text()
|
|
191
|
+
|
|
192
|
+
# The extracted text will be in string format
|
|
193
|
+
print(text)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
The `abstract_pdfs` module also comes with built-in support for handling multiple PDF files at once.
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
# Import required modules
|
|
200
|
+
from abstract_pdfs import PdfHandler
|
|
201
|
+
|
|
202
|
+
# Specify a list of PDF files
|
|
203
|
+
paths_to_pdfs = ["/path/to/your/pdf1", "/path/to/your/pdf2", "/path/to/your/pdf3"]
|
|
204
|
+
|
|
205
|
+
# Create instances of PdfHandler in a single line
|
|
206
|
+
pdf_handlers = [PdfHandler(path) for path in paths_to_pdfs]
|
|
207
|
+
|
|
208
|
+
# Now you can iterate over pdf_handlers to do various operations. For example:
|
|
209
|
+
for handler in pdf_handlers:
|
|
210
|
+
# Print the number of pages in each PDF
|
|
211
|
+
print(handler.number_of_pages)
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
These are just a few basic usage examples. The `abstract_pdfs` module exposes a rich API for manipulating and interrogating PDF files. Check out the API documentation for a full list of available methods and their descriptions.
|
|
215
|
+
|
|
216
|
+
---
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[metadata]
|
|
2
|
+
name = abstract_pdfs
|
|
3
|
+
version = 0.0.0.001
|
|
4
|
+
author = putkoff
|
|
5
|
+
author_email = partners@abstractendeavors.com
|
|
6
|
+
description = A modular OCR and PDF-processing toolkit for automated text extraction, deduplication, and multi-engine column-aware OCR using Tesseract, EasyOCR, and PaddleOCR.
|
|
7
|
+
long_description = file: README.md
|
|
8
|
+
long_description_content_type = text/markdown
|
|
9
|
+
url = https://github.com/AbstractEndeavors/abstract_pdfs
|
|
10
|
+
license = MIT
|
|
11
|
+
classifiers =
|
|
12
|
+
Development Status :: 3 - Alpha
|
|
13
|
+
Intended Audience :: Developers
|
|
14
|
+
License :: OSI Approved :: MIT License
|
|
15
|
+
Programming Language :: Python :: 3
|
|
16
|
+
Programming Language :: Python :: 3.11
|
|
17
|
+
|
|
18
|
+
[egg_info]
|
|
19
|
+
tag_build =
|
|
20
|
+
tag_date = 0
|
|
21
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from time import time
|
|
2
|
+
import setuptools
|
|
3
|
+
# The README doubles as the long description published with the package.
with open("README.md", mode="r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Package metadata, collected in one mapping and handed to setuptools below.
metadata = {
    "name": "abstract_pdfs",
    "version": "0.0.1",
    "author": "putkoff",
    "author_email": "partners@abstractendeavors.com",
    "description": (
        "A modular OCR and PDF-processing toolkit for automated text extraction, "
        "deduplication, and multi-engine column-aware OCR using Tesseract, "
        "EasyOCR, and PaddleOCR"
    ),
    "long_description": long_description,
    "long_description_content_type": "text/markdown",
    "url": "https://github.com/AbstractEndeavors/abstract_pdfs",
    "classifiers": [
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.11",
    ],
    "install_requires": [
        "abstract_ocr",
        "abstract_utilities",
    ],
    # Sources live under src/, so package discovery is rooted there.
    "package_dir": {"": "src"},
    "packages": setuptools.find_packages(where="src"),
    "python_requires": ">=3.6",
}

setuptools.setup(**metadata)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from .imports import *
|
|
2
|
+
from .pdf_utils import pdf_tools, pdf_to_text
|
|
3
|
+
from .SliceManager import SliceManager
|
|
4
|
+
|
|
5
|
+
class AbstractPDFManager:
    """
    Coordinate the processing of a single PDF.

    The manager wires together page splitting, page-to-image conversion,
    OCR text extraction, directory-wide batch extraction, slice-aware OCR
    (delegated to SliceManager) and manifest persistence. Everything it
    generates is written beneath ``out_root``.

    Example:
        mgr = AbstractPDFManager("/mnt/24T/media/docs/sample.pdf")
        mgr.run_basic_extraction()
        mgr.run_slice_analysis(engine='paddle')
    """

    def __init__(self, pdf_path: str, out_root: Optional[str] = None, engines: Union[str, List[str]] = 'paddle'):
        """
        Resolve the PDF, prepare the output directory and load the manifest.

        Args:
            pdf_path: Location handed to ``get_pdf_path`` to find the PDF.
            out_root: Directory for generated artifacts. When falsy it
                defaults to ``<dirname>/<filename>_output``.
            engines: One engine name or a list of names; normalized with
                ``make_list`` and used by ``run_full_pipeline``.

        Raises:
            FileNotFoundError: If ``get_pdf_path`` returns a falsy value.
        """
        resolved_pdf = get_pdf_path(pdf_path)
        self.pdf_path = resolved_pdf
        if not resolved_pdf:
            raise FileNotFoundError(f"❌ Could not locate valid PDF in {pdf_path}")

        # The directory is derived from the caller's argument, not from the
        # resolved path stored above.
        self.pdf_dir = get_pdf_dir(pdf_path)
        parts = get_file_parts(self.pdf_path)
        self.file_parts = parts
        self.filename = parts.get("filename")
        self.dirname = parts.get("dirname")
        if out_root:
            self.out_root = out_root
        else:
            self.out_root = os.path.join(self.dirname, f"{self.filename}_output")
        mkdirs(self.out_root)

        self.engines = make_list(engines)
        self.manifest = load_manifest(pdf_dir=self.pdf_dir)
        logger.info(f"📁 Initialized AbstractPDFManager for {self.filename}")

    # ------------------------------------------------------------
    # 1. Basic splitting and conversion
    # ------------------------------------------------------------
    def split_pdf(self) -> List[str]:
        """Split the PDF via ``pdf_to_text.split_pdf`` into ``out_root/pdf_pages`` and return its result."""
        pages_dir = os.path.join(self.out_root, "pdf_pages")
        mkdirs(pages_dir)
        page_files = pdf_to_text.split_pdf(self.pdf_path, pages_dir, self.filename)
        logger.info(f"📄 Split {self.filename} into {len(page_files)} pages")
        return page_files

    def convert_to_images(self, pdf_pages: Optional[List[str]] = None) -> List[str]:
        """
        Render each page PDF to a PNG under ``out_root/images``.

        Args:
            pdf_pages: Paths of page PDFs. When falsy, ``split_pdf()`` is
                called to produce them.

        Returns:
            Paths of the PNG files that were written. Pages that fail to
            convert are logged and skipped.
        """
        if not pdf_pages:
            pdf_pages = self.split_pdf()
        images_dir = os.path.join(self.out_root, "images")
        mkdirs(images_dir)

        saved_images = []
        for page_file in pdf_pages:
            try:
                rendered = convert_from_path(page_file)
                if rendered:
                    png_path = os.path.join(images_dir, f"{Path(page_file).stem}.png")
                    # Only the first rendered image of each input is kept.
                    rendered[0].save(png_path, "PNG")
                    saved_images.append(png_path)
            except Exception as e:
                logger.error(f"❌ Error converting {page_file} to image: {e}")
        logger.info(f"🖼️ Converted {len(saved_images)} page(s) to PNGs")
        return saved_images

    # ------------------------------------------------------------
    # 2. Text extraction (image_to_text)
    # ------------------------------------------------------------
    def extract_text(self, img_paths: Optional[List[str]] = None) -> List[str]:
        """
        Run ``image_to_text`` on each image and store the result as text.

        Args:
            img_paths: Image paths to process. When falsy,
                ``convert_to_images()`` is called to produce them.

        Returns:
            Paths of the ``.txt`` files written under ``out_root/text``.
            Images that fail are logged and skipped.
        """
        if not img_paths:
            img_paths = self.convert_to_images()
        text_dir = os.path.join(self.out_root, "text")
        mkdirs(text_dir)

        written_files = []
        for image_file in img_paths:
            try:
                page_text = image_to_text(image_file)
                text_file = os.path.join(text_dir, f"{Path(image_file).stem}.txt")
                write_to_file(text_file, page_text)
                written_files.append(text_file)
            except Exception as e:
                logger.error(f"❌ Error extracting text from {image_file}: {e}")
        logger.info(f"✍️ Extracted text from {len(written_files)} images")
        return written_files

    # ------------------------------------------------------------
    # 3. Full batch (per-folder extraction)
    # ------------------------------------------------------------
    def run_batch_extraction(self, src_dir: str = None, dest_dir: str = None):
        """
        Call ``pdf_to_text.pdf_to_text_in_folders`` for a source directory.

        Args:
            src_dir: Directory to process; defaults to ``self.pdf_dir``.
            dest_dir: Output directory; defaults to ``out_root/pdf_convert``.
        """
        source = src_dir if src_dir else self.pdf_dir
        target = dest_dir if dest_dir else os.path.join(self.out_root, "pdf_convert")
        pdf_to_text.pdf_to_text_in_folders(source, target)
        logger.info(f"📚 Completed batch extraction for {source}")

    # ------------------------------------------------------------
    # 4. Slice-aware column OCR
    # ------------------------------------------------------------
    def run_slice_analysis(self, engine: str = "paddle"):
        """
        Build a SliceManager for this PDF and run its ``process_pdf()``.

        Any exception raised along the way is logged and not re-raised.

        Args:
            engine: Value forwarded to SliceManager as ``engines``.
        """
        try:
            slice_manager = SliceManager(pdf_path=self.pdf_path, out_root=self.out_root, engines=engine)
            slice_manager.process_pdf()
            logger.info(f"🏁 Slice analysis complete for {self.filename} [{engine}]")
        except Exception as e:
            logger.error(f"❌ Slice analysis failed for {self.filename}: {e}")

    # ------------------------------------------------------------
    # 5. Manifest handling
    # ------------------------------------------------------------
    def save_manifest(self, override=False):
        """Persist ``self.manifest`` through ``save_manifest_data``, forwarding ``override``."""
        save_manifest_data(data=self.manifest, pdf_dir=self.pdf_dir, override=override)
        logger.info(f"🧾 Manifest saved at {self.pdf_dir}")

    # ------------------------------------------------------------
    # 6. Orchestrated high-level runners
    # ------------------------------------------------------------
    def run_basic_extraction(self):
        """Run the base chain: split the PDF, convert pages to images, extract text."""
        page_files = self.split_pdf()
        page_images = self.convert_to_images(page_files)
        self.extract_text(page_images)
        logger.info(f"✅ Completed base extraction for {self.filename}")

    def run_full_pipeline(self, include_slice=True):
        """
        Run the base extraction, optional slice analysis, then save the manifest.

        Args:
            include_slice: When truthy, ``run_slice_analysis`` is invoked
                once for every entry in ``self.engines``.
        """
        self.run_basic_extraction()
        if include_slice:
            for ocr_engine in self.engines:
                self.run_slice_analysis(ocr_engine)
        self.save_manifest()
        logger.info(f"🚀 Full pipeline completed for {self.filename}")
|