pdfstructx 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfstructx-0.2.0/LICENSE +38 -0
- pdfstructx-0.2.0/MANIFEST.in +4 -0
- pdfstructx-0.2.0/PKG-INFO +217 -0
- pdfstructx-0.2.0/README.md +150 -0
- pdfstructx-0.2.0/pyproject.toml +52 -0
- pdfstructx-0.2.0/setup.cfg +4 -0
- pdfstructx-0.2.0/src/pdfstruct/__init__.py +76 -0
- pdfstructx-0.2.0/src/pdfstruct/extractors/__init__.py +0 -0
- pdfstructx-0.2.0/src/pdfstruct/extractors/images.py +607 -0
- pdfstructx-0.2.0/src/pdfstruct/extractors/text.py +269 -0
- pdfstructx-0.2.0/src/pdfstruct/layout/__init__.py +0 -0
- pdfstructx-0.2.0/src/pdfstruct/layout/analyzer.py +210 -0
- pdfstructx-0.2.0/src/pdfstruct/models/__init__.py +27 -0
- pdfstructx-0.2.0/src/pdfstruct/models/document.py +475 -0
- pdfstructx-0.2.0/src/pdfstruct/models/metadata.py +96 -0
- pdfstructx-0.2.0/src/pdfstruct/output/__init__.py +0 -0
- pdfstructx-0.2.0/src/pdfstruct/output/json_output.py +18 -0
- pdfstructx-0.2.0/src/pdfstruct/output/markdown.py +162 -0
- pdfstructx-0.2.0/src/pdfstruct/output/text_output.py +65 -0
- pdfstructx-0.2.0/src/pdfstruct/parser.py +413 -0
- pdfstructx-0.2.0/src/pdfstruct/structure/__init__.py +0 -0
- pdfstructx-0.2.0/src/pdfstruct/structure/headers_footers.py +110 -0
- pdfstructx-0.2.0/src/pdfstruct/structure/headings.py +151 -0
- pdfstructx-0.2.0/src/pdfstruct/structure/lists.py +124 -0
- pdfstructx-0.2.0/src/pdfstruct/structure/sections.py +132 -0
- pdfstructx-0.2.0/src/pdfstruct/tables/__init__.py +0 -0
- pdfstructx-0.2.0/src/pdfstruct/tables/detector.py +347 -0
- pdfstructx-0.2.0/src/pdfstruct/utils/__init__.py +0 -0
- pdfstructx-0.2.0/src/pdfstruct/utils/fonts.py +153 -0
- pdfstructx-0.2.0/src/pdfstruct/utils/geometry.py +146 -0
- pdfstructx-0.2.0/src/pdfstruct/utils/language.py +84 -0
- pdfstructx-0.2.0/src/pdfstructx.egg-info/PKG-INFO +217 -0
- pdfstructx-0.2.0/src/pdfstructx.egg-info/SOURCES.txt +35 -0
- pdfstructx-0.2.0/src/pdfstructx.egg-info/dependency_links.txt +1 -0
- pdfstructx-0.2.0/src/pdfstructx.egg-info/requires.txt +7 -0
- pdfstructx-0.2.0/src/pdfstructx.egg-info/top_level.txt +1 -0
- pdfstructx-0.2.0/tests/test_image_extraction.py +362 -0
pdfstructx-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
pdfstruct - Personal Use License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kyros Groupe. All rights reserved.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to use,
|
|
7
|
+
copy, modify, and distribute the Software for PERSONAL, EDUCATIONAL, and
|
|
8
|
+
NON-COMMERCIAL purposes only, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
1. PERSONAL USE: You may use the Software for personal projects, learning,
|
|
11
|
+
research, and non-commercial open-source projects.
|
|
12
|
+
|
|
13
|
+
2. COMMERCIAL USE PROHIBITED: You may NOT use the Software, in whole or in
|
|
14
|
+
part, for any commercial purpose without obtaining a separate commercial
|
|
15
|
+
license from Kyros Groupe. Commercial use includes, but is not limited to:
|
|
16
|
+
- Incorporating the Software into a product or service sold for profit
|
|
17
|
+
- Using the Software to provide paid services or SaaS offerings
|
|
18
|
+
- Using the Software within a for-profit business for internal operations
|
|
19
|
+
- Distributing the Software as part of a commercial product
|
|
20
|
+
|
|
21
|
+
3. ATTRIBUTION: All copies or substantial portions of the Software must
|
|
22
|
+
include this license notice and the above copyright notice.
|
|
23
|
+
|
|
24
|
+
4. NO SUBLICENSING: You may not sublicense the Software under different terms.
|
|
25
|
+
|
|
26
|
+
5. MODIFICATIONS: You may modify the Software for personal use. Modified
|
|
27
|
+
versions must retain this license and cannot be distributed under
|
|
28
|
+
different terms.
|
|
29
|
+
|
|
30
|
+
For commercial licensing inquiries, contact: licensing@kyros-groupe.com
|
|
31
|
+
|
|
32
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
33
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
34
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
35
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
36
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
37
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
38
|
+
SOFTWARE.
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: pdfstructx
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support
|
|
5
|
+
Author: Kyros Groupe
|
|
6
|
+
License: pdfstruct - Personal Use License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Kyros Groupe. All rights reserved.
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to use,
|
|
12
|
+
copy, modify, and distribute the Software for PERSONAL, EDUCATIONAL, and
|
|
13
|
+
NON-COMMERCIAL purposes only, subject to the following conditions:
|
|
14
|
+
|
|
15
|
+
1. PERSONAL USE: You may use the Software for personal projects, learning,
|
|
16
|
+
research, and non-commercial open-source projects.
|
|
17
|
+
|
|
18
|
+
2. COMMERCIAL USE PROHIBITED: You may NOT use the Software, in whole or in
|
|
19
|
+
part, for any commercial purpose without obtaining a separate commercial
|
|
20
|
+
license from Kyros Groupe. Commercial use includes, but is not limited to:
|
|
21
|
+
- Incorporating the Software into a product or service sold for profit
|
|
22
|
+
- Using the Software to provide paid services or SaaS offerings
|
|
23
|
+
- Using the Software within a for-profit business for internal operations
|
|
24
|
+
- Distributing the Software as part of a commercial product
|
|
25
|
+
|
|
26
|
+
3. ATTRIBUTION: All copies or substantial portions of the Software must
|
|
27
|
+
include this license notice and the above copyright notice.
|
|
28
|
+
|
|
29
|
+
4. NO SUBLICENSING: You may not sublicense the Software under different terms.
|
|
30
|
+
|
|
31
|
+
5. MODIFICATIONS: You may modify the Software for personal use. Modified
|
|
32
|
+
versions must retain this license and cannot be distributed under
|
|
33
|
+
different terms.
|
|
34
|
+
|
|
35
|
+
For commercial licensing inquiries, contact: licensing@kyros-groupe.com
|
|
36
|
+
|
|
37
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
38
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
39
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
40
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
41
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
42
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
43
|
+
SOFTWARE.
|
|
44
|
+
|
|
45
|
+
Project-URL: Homepage, https://github.com/kyros-groupe/pdfstruct
|
|
46
|
+
Project-URL: Documentation, https://github.com/kyros-groupe/pdfstruct#readme
|
|
47
|
+
Project-URL: Issues, https://github.com/kyros-groupe/pdfstruct/issues
|
|
48
|
+
Keywords: pdf,parser,document,extraction,tables,structure
|
|
49
|
+
Classifier: Development Status :: 3 - Alpha
|
|
50
|
+
Classifier: Intended Audience :: Developers
|
|
51
|
+
Classifier: Programming Language :: Python :: 3
|
|
52
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
53
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
54
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
55
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
56
|
+
Classifier: Topic :: Text Processing :: General
|
|
57
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
58
|
+
Requires-Python: >=3.10
|
|
59
|
+
Description-Content-Type: text/markdown
|
|
60
|
+
License-File: LICENSE
|
|
61
|
+
Requires-Dist: pdfminer.six>=20231228
|
|
62
|
+
Requires-Dist: Pillow>=10.0.0
|
|
63
|
+
Provides-Extra: dev
|
|
64
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
65
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
66
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
67
|
+
|
|
68
|
+
# pdfstruct
|
|
69
|
+
|
|
70
|
+
Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support.
|
|
71
|
+
|
|
72
|
+
[Python 3.10+](https://www.python.org/)
|
|
73
|
+
|
|
74
|
+
## Overview
|
|
75
|
+
|
|
76
|
+
**pdfstruct** is a Python library that extracts structured content from PDF documents. Unlike basic text extraction tools, pdfstruct understands document layout — detecting headings, sections, tables, lists, headers/footers, and multi-column layouts using font analysis and geometric reasoning.
|
|
77
|
+
|
|
78
|
+
## Features
|
|
79
|
+
|
|
80
|
+
- **Font-aware heading detection**: Uses font size, weight, and frequency analysis to classify headings (H1–H6)
|
|
81
|
+
- **Table extraction**: Detects tables from grid lines and whitespace-aligned columns
|
|
82
|
+
- **Section hierarchy**: Builds a document tree from headings and content
|
|
83
|
+
- **Multi-column support**: Handles two-column and multi-column layouts
|
|
84
|
+
- **Header/footer removal**: Identifies and filters repeating page content
|
|
85
|
+
- **List detection**: Recognizes bulleted, numbered, lettered, and Roman numeral lists
|
|
86
|
+
- **Multiple output formats**: JSON, Markdown, and plain text
|
|
87
|
+
- **Rich metadata**: Word count, language detection, reading time, font statistics
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install pdfstructx
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Or install from source:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
git clone https://github.com/kyros-groupe/pdfstruct.git
|
|
99
|
+
cd pdfstruct
|
|
100
|
+
pip install -e .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Quickstart
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
import pdfstruct
|
|
107
|
+
|
|
108
|
+
# Parse a PDF
|
|
109
|
+
doc = pdfstruct.parse("contract.pdf")
|
|
110
|
+
|
|
111
|
+
# Access structured content
|
|
112
|
+
print(doc.title)
|
|
113
|
+
print(f"{doc.page_count} pages, {doc.metadata.word_count} words")
|
|
114
|
+
|
|
115
|
+
# Browse sections
|
|
116
|
+
for section in doc.sections:
|
|
117
|
+
    print(f"{section.heading} ({len(section.content)} chars)")
|
|
118
|
+
    for sub in section.subsections:
|
|
119
|
+
        print(f" {sub.heading}")
|
|
120
|
+
|
|
121
|
+
# Get tables
|
|
122
|
+
for table in doc.tables:
|
|
123
|
+
    print(table.to_dicts())  # List of row dicts
|
|
124
|
+
|
|
125
|
+
# Export to different formats
|
|
126
|
+
print(pdfstruct.to_markdown(doc))
|
|
127
|
+
print(pdfstruct.to_text(doc))
|
|
128
|
+
print(pdfstruct.to_json(doc))
|
|
129
|
+
|
|
130
|
+
# Full dict for programmatic use
|
|
131
|
+
data = pdfstruct.to_dict(doc)
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## API Reference
|
|
135
|
+
|
|
136
|
+
### `pdfstruct.parse(source, **options) -> Document`
|
|
137
|
+
|
|
138
|
+
Parse a PDF file, bytes, or file-like object.
|
|
139
|
+
|
|
140
|
+
**Options:**
|
|
141
|
+
- `detect_tables` (bool, default True) — Enable table detection
|
|
142
|
+
- `detect_headers_footers` (bool, default True) — Remove repeating headers/footers
|
|
143
|
+
- `detect_lists` (bool, default True) — Detect list structures
|
|
144
|
+
- `detect_columns` (bool, default True) — Handle multi-column layouts
|
|
145
|
+
|
|
146
|
+
### Document
|
|
147
|
+
|
|
148
|
+
- `doc.title` — Detected document title
|
|
149
|
+
- `doc.pages` — List of Page objects
|
|
150
|
+
- `doc.sections` — Hierarchical section tree
|
|
151
|
+
- `doc.tables` — All detected tables
|
|
152
|
+
- `doc.metadata` — DocumentMetadata with statistics
|
|
153
|
+
- `doc.text` — Full document text (concatenated from pages)
|
|
154
|
+
- `doc.to_dict()` — JSON-serializable dictionary
|
|
155
|
+
|
|
156
|
+
### Section
|
|
157
|
+
|
|
158
|
+
- `section.heading` — Section heading text
|
|
159
|
+
- `section.heading_level` — HeadingLevel enum (H1–H6)
|
|
160
|
+
- `section.content` — Section body text
|
|
161
|
+
- `section.paragraphs` — List of Paragraph objects
|
|
162
|
+
- `section.subsections` — Nested subsections
|
|
163
|
+
|
|
164
|
+
### Table
|
|
165
|
+
|
|
166
|
+
- `table.rows` — List of TableRow objects
|
|
167
|
+
- `table.to_list()` — 2D list of cell text
|
|
168
|
+
- `table.to_dicts()` — List of dicts (header row as keys)
|
|
169
|
+
- `table.num_rows`, `table.num_cols` — Dimensions
|
|
170
|
+
|
|
171
|
+
### Metadata
|
|
172
|
+
|
|
173
|
+
- `metadata.word_count`, `metadata.char_count` — Text statistics
|
|
174
|
+
- `metadata.language` — Detected language code
|
|
175
|
+
- `metadata.page_count` — Number of pages
|
|
176
|
+
- `metadata.is_scanned` — Whether PDF appears to be scanned
|
|
177
|
+
- `metadata.has_tables`, `metadata.has_images` — Content flags
|
|
178
|
+
- `metadata.primary_font`, `metadata.primary_font_size` — Font info
|
|
179
|
+
|
|
180
|
+
## Architecture
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
pdfstruct/
|
|
184
|
+
├── parser.py # Main PDFParser class and parse() entry point
|
|
185
|
+
├── models/
|
|
186
|
+
│ ├── document.py # Core models: Document, Page, Section, TextLine, Table, etc.
|
|
187
|
+
│ └── metadata.py # DocumentMetadata with computed statistics
|
|
188
|
+
├── extractors/
|
|
189
|
+
│ └── text.py # PDF text extraction via pdfminer.six
|
|
190
|
+
├── layout/
|
|
191
|
+
│ └── analyzer.py # Paragraph grouping, reading order, margins
|
|
192
|
+
├── structure/
|
|
193
|
+
│ ├── headings.py # Font-aware heading detection
|
|
194
|
+
│ ├── headers_footers.py # Repeating content detection
|
|
195
|
+
│ ├── lists.py # List structure detection
|
|
196
|
+
│ └── sections.py # Section hierarchy builder
|
|
197
|
+
├── tables/
|
|
198
|
+
│ └── detector.py # Grid and whitespace table detection
|
|
199
|
+
├── output/
|
|
200
|
+
│ ├── json_output.py # JSON/dict export
|
|
201
|
+
│ ├── markdown.py # Markdown export
|
|
202
|
+
│ └── text_output.py # Plain text export
|
|
203
|
+
└── utils/
|
|
204
|
+
    ├── fonts.py # Font analysis and heading classification
|
|
205
|
+
    ├── geometry.py # Bounding box utilities, column detection
|
|
206
|
+
    └── language.py # Language detection heuristics
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Requirements
|
|
210
|
+
|
|
211
|
+
- Python >= 3.10
|
|
212
|
+
- pdfminer.six >= 20231228
|
|
213
|
+
- Pillow >= 10.0.0
|
|
214
|
+
|
|
215
|
+
## License
|
|
216
|
+
|
|
217
|
+
Personal Use License by Kyros Groupe. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# pdfstruct
|
|
2
|
+
|
|
3
|
+
Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support.
|
|
4
|
+
|
|
5
|
+
[Python 3.10+](https://www.python.org/)
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
**pdfstruct** is a Python library that extracts structured content from PDF documents. Unlike basic text extraction tools, pdfstruct understands document layout — detecting headings, sections, tables, lists, headers/footers, and multi-column layouts using font analysis and geometric reasoning.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- **Font-aware heading detection**: Uses font size, weight, and frequency analysis to classify headings (H1–H6)
|
|
14
|
+
- **Table extraction**: Detects tables from grid lines and whitespace-aligned columns
|
|
15
|
+
- **Section hierarchy**: Builds a document tree from headings and content
|
|
16
|
+
- **Multi-column support**: Handles two-column and multi-column layouts
|
|
17
|
+
- **Header/footer removal**: Identifies and filters repeating page content
|
|
18
|
+
- **List detection**: Recognizes bulleted, numbered, lettered, and Roman numeral lists
|
|
19
|
+
- **Multiple output formats**: JSON, Markdown, and plain text
|
|
20
|
+
- **Rich metadata**: Word count, language detection, reading time, font statistics
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install pdfstructx
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Or install from source:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
git clone https://github.com/kyros-groupe/pdfstruct.git
|
|
32
|
+
cd pdfstruct
|
|
33
|
+
pip install -e .
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Quickstart
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import pdfstruct
|
|
40
|
+
|
|
41
|
+
# Parse a PDF
|
|
42
|
+
doc = pdfstruct.parse("contract.pdf")
|
|
43
|
+
|
|
44
|
+
# Access structured content
|
|
45
|
+
print(doc.title)
|
|
46
|
+
print(f"{doc.page_count} pages, {doc.metadata.word_count} words")
|
|
47
|
+
|
|
48
|
+
# Browse sections
|
|
49
|
+
for section in doc.sections:
|
|
50
|
+
    print(f"{section.heading} ({len(section.content)} chars)")
|
|
51
|
+
    for sub in section.subsections:
|
|
52
|
+
        print(f" {sub.heading}")
|
|
53
|
+
|
|
54
|
+
# Get tables
|
|
55
|
+
for table in doc.tables:
|
|
56
|
+
    print(table.to_dicts())  # List of row dicts
|
|
57
|
+
|
|
58
|
+
# Export to different formats
|
|
59
|
+
print(pdfstruct.to_markdown(doc))
|
|
60
|
+
print(pdfstruct.to_text(doc))
|
|
61
|
+
print(pdfstruct.to_json(doc))
|
|
62
|
+
|
|
63
|
+
# Full dict for programmatic use
|
|
64
|
+
data = pdfstruct.to_dict(doc)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## API Reference
|
|
68
|
+
|
|
69
|
+
### `pdfstruct.parse(source, **options) -> Document`
|
|
70
|
+
|
|
71
|
+
Parse a PDF file, bytes, or file-like object.
|
|
72
|
+
|
|
73
|
+
**Options:**
|
|
74
|
+
- `detect_tables` (bool, default True) — Enable table detection
|
|
75
|
+
- `detect_headers_footers` (bool, default True) — Remove repeating headers/footers
|
|
76
|
+
- `detect_lists` (bool, default True) — Detect list structures
|
|
77
|
+
- `detect_columns` (bool, default True) — Handle multi-column layouts
|
|
78
|
+
|
|
79
|
+
### Document
|
|
80
|
+
|
|
81
|
+
- `doc.title` — Detected document title
|
|
82
|
+
- `doc.pages` — List of Page objects
|
|
83
|
+
- `doc.sections` — Hierarchical section tree
|
|
84
|
+
- `doc.tables` — All detected tables
|
|
85
|
+
- `doc.metadata` — DocumentMetadata with statistics
|
|
86
|
+
- `doc.text` — Full document text (concatenated from pages)
|
|
87
|
+
- `doc.to_dict()` — JSON-serializable dictionary
|
|
88
|
+
|
|
89
|
+
### Section
|
|
90
|
+
|
|
91
|
+
- `section.heading` — Section heading text
|
|
92
|
+
- `section.heading_level` — HeadingLevel enum (H1–H6)
|
|
93
|
+
- `section.content` — Section body text
|
|
94
|
+
- `section.paragraphs` — List of Paragraph objects
|
|
95
|
+
- `section.subsections` — Nested subsections
|
|
96
|
+
|
|
97
|
+
### Table
|
|
98
|
+
|
|
99
|
+
- `table.rows` — List of TableRow objects
|
|
100
|
+
- `table.to_list()` — 2D list of cell text
|
|
101
|
+
- `table.to_dicts()` — List of dicts (header row as keys)
|
|
102
|
+
- `table.num_rows`, `table.num_cols` — Dimensions
|
|
103
|
+
|
|
104
|
+
### Metadata
|
|
105
|
+
|
|
106
|
+
- `metadata.word_count`, `metadata.char_count` — Text statistics
|
|
107
|
+
- `metadata.language` — Detected language code
|
|
108
|
+
- `metadata.page_count` — Number of pages
|
|
109
|
+
- `metadata.is_scanned` — Whether PDF appears to be scanned
|
|
110
|
+
- `metadata.has_tables`, `metadata.has_images` — Content flags
|
|
111
|
+
- `metadata.primary_font`, `metadata.primary_font_size` — Font info
|
|
112
|
+
|
|
113
|
+
## Architecture
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
pdfstruct/
|
|
117
|
+
├── parser.py # Main PDFParser class and parse() entry point
|
|
118
|
+
├── models/
|
|
119
|
+
│ ├── document.py # Core models: Document, Page, Section, TextLine, Table, etc.
|
|
120
|
+
│ └── metadata.py # DocumentMetadata with computed statistics
|
|
121
|
+
├── extractors/
|
|
122
|
+
│ └── text.py # PDF text extraction via pdfminer.six
|
|
123
|
+
├── layout/
|
|
124
|
+
│ └── analyzer.py # Paragraph grouping, reading order, margins
|
|
125
|
+
├── structure/
|
|
126
|
+
│ ├── headings.py # Font-aware heading detection
|
|
127
|
+
│ ├── headers_footers.py # Repeating content detection
|
|
128
|
+
│ ├── lists.py # List structure detection
|
|
129
|
+
│ └── sections.py # Section hierarchy builder
|
|
130
|
+
├── tables/
|
|
131
|
+
│ └── detector.py # Grid and whitespace table detection
|
|
132
|
+
├── output/
|
|
133
|
+
│ ├── json_output.py # JSON/dict export
|
|
134
|
+
│ ├── markdown.py # Markdown export
|
|
135
|
+
│ └── text_output.py # Plain text export
|
|
136
|
+
└── utils/
|
|
137
|
+
    ├── fonts.py # Font analysis and heading classification
|
|
138
|
+
    ├── geometry.py # Bounding box utilities, column detection
|
|
139
|
+
    └── language.py # Language detection heuristics
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Requirements
|
|
143
|
+
|
|
144
|
+
- Python >= 3.10
|
|
145
|
+
- pdfminer.six >= 20231228
|
|
146
|
+
- Pillow >= 10.0.0
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
Personal Use License by Kyros Groupe. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0,<77", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pdfstructx"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Kyros Groupe"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["pdf", "parser", "document", "extraction", "tables", "structure"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Topic :: Text Processing :: General",
|
|
25
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"pdfminer.six>=20231228",
|
|
29
|
+
"Pillow>=10.0.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=8.0",
|
|
35
|
+
"pytest-cov>=4.0",
|
|
36
|
+
"ruff>=0.4.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://github.com/kyros-groupe/pdfstruct"
|
|
41
|
+
Documentation = "https://github.com/kyros-groupe/pdfstruct#readme"
|
|
42
|
+
Issues = "https://github.com/kyros-groupe/pdfstruct/issues"
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.packages.find]
|
|
45
|
+
where = ["src"]
|
|
46
|
+
|
|
47
|
+
[tool.pytest.ini_options]
|
|
48
|
+
testpaths = ["tests"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
line-length = 100
|
|
52
|
+
target-version = "py310"
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""pdfstruct — Intelligent PDF parser with structure detection.
|
|
2
|
+
|
|
3
|
+
Parse PDFs into structured documents with headings, sections, tables,
|
|
4
|
+
lists, and metadata. Font-aware heading detection. Multi-column support.
|
|
5
|
+
Grid and whitespace table extraction.
|
|
6
|
+
|
|
7
|
+
Basic usage:
|
|
8
|
+
>>> import pdfstruct
|
|
9
|
+
>>> doc = pdfstruct.parse("document.pdf")
|
|
10
|
+
>>> print(doc.title)
|
|
11
|
+
>>> print(doc.sections)
|
|
12
|
+
>>> print(doc.tables)
|
|
13
|
+
|
|
14
|
+
Export formats:
|
|
15
|
+
>>> pdfstruct.to_json(doc)
|
|
16
|
+
>>> pdfstruct.to_markdown(doc)
|
|
17
|
+
>>> pdfstruct.to_text(doc)
|
|
18
|
+
>>> pdfstruct.to_dict(doc)
|
|
19
|
+
|
|
20
|
+
License: Personal Use Only — see LICENSE file.
|
|
21
|
+
Copyright (c) 2026 Kyros Groupe.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
__version__ = "0.2.0"
|
|
25
|
+
|
|
26
|
+
from pdfstruct.parser import parse, PDFParser
|
|
27
|
+
from pdfstruct.models.document import (
|
|
28
|
+
Document,
|
|
29
|
+
Page,
|
|
30
|
+
Section,
|
|
31
|
+
Paragraph,
|
|
32
|
+
TextLine,
|
|
33
|
+
Table,
|
|
34
|
+
TableRow,
|
|
35
|
+
TableCell,
|
|
36
|
+
BBox,
|
|
37
|
+
FontInfo,
|
|
38
|
+
HeadingLevel,
|
|
39
|
+
ListType,
|
|
40
|
+
ListItem,
|
|
41
|
+
ImageInfo,
|
|
42
|
+
)
|
|
43
|
+
from pdfstruct.models.metadata import DocumentMetadata
|
|
44
|
+
from pdfstruct.output.json_output import to_json, to_dict
|
|
45
|
+
from pdfstruct.output.markdown import to_markdown
|
|
46
|
+
from pdfstruct.output.text_output import to_text
|
|
47
|
+
from pdfstruct.extractors.images import generate_thumbnail
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
# Core API
|
|
51
|
+
"parse",
|
|
52
|
+
"PDFParser",
|
|
53
|
+
# Models
|
|
54
|
+
"Document",
|
|
55
|
+
"Page",
|
|
56
|
+
"Section",
|
|
57
|
+
"Paragraph",
|
|
58
|
+
"TextLine",
|
|
59
|
+
"Table",
|
|
60
|
+
"TableRow",
|
|
61
|
+
"TableCell",
|
|
62
|
+
"BBox",
|
|
63
|
+
"FontInfo",
|
|
64
|
+
"HeadingLevel",
|
|
65
|
+
"ListType",
|
|
66
|
+
"ListItem",
|
|
67
|
+
"ImageInfo",
|
|
68
|
+
"DocumentMetadata",
|
|
69
|
+
# Output
|
|
70
|
+
"to_json",
|
|
71
|
+
"to_dict",
|
|
72
|
+
"to_markdown",
|
|
73
|
+
"to_text",
|
|
74
|
+
# Utilities
|
|
75
|
+
"generate_thumbnail",
|
|
76
|
+
]
|
|
File without changes
|