pdfstructx 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. pdfstructx-0.2.0/LICENSE +38 -0
  2. pdfstructx-0.2.0/MANIFEST.in +4 -0
  3. pdfstructx-0.2.0/PKG-INFO +217 -0
  4. pdfstructx-0.2.0/README.md +150 -0
  5. pdfstructx-0.2.0/pyproject.toml +52 -0
  6. pdfstructx-0.2.0/setup.cfg +4 -0
  7. pdfstructx-0.2.0/src/pdfstruct/__init__.py +76 -0
  8. pdfstructx-0.2.0/src/pdfstruct/extractors/__init__.py +0 -0
  9. pdfstructx-0.2.0/src/pdfstruct/extractors/images.py +607 -0
  10. pdfstructx-0.2.0/src/pdfstruct/extractors/text.py +269 -0
  11. pdfstructx-0.2.0/src/pdfstruct/layout/__init__.py +0 -0
  12. pdfstructx-0.2.0/src/pdfstruct/layout/analyzer.py +210 -0
  13. pdfstructx-0.2.0/src/pdfstruct/models/__init__.py +27 -0
  14. pdfstructx-0.2.0/src/pdfstruct/models/document.py +475 -0
  15. pdfstructx-0.2.0/src/pdfstruct/models/metadata.py +96 -0
  16. pdfstructx-0.2.0/src/pdfstruct/output/__init__.py +0 -0
  17. pdfstructx-0.2.0/src/pdfstruct/output/json_output.py +18 -0
  18. pdfstructx-0.2.0/src/pdfstruct/output/markdown.py +162 -0
  19. pdfstructx-0.2.0/src/pdfstruct/output/text_output.py +65 -0
  20. pdfstructx-0.2.0/src/pdfstruct/parser.py +413 -0
  21. pdfstructx-0.2.0/src/pdfstruct/structure/__init__.py +0 -0
  22. pdfstructx-0.2.0/src/pdfstruct/structure/headers_footers.py +110 -0
  23. pdfstructx-0.2.0/src/pdfstruct/structure/headings.py +151 -0
  24. pdfstructx-0.2.0/src/pdfstruct/structure/lists.py +124 -0
  25. pdfstructx-0.2.0/src/pdfstruct/structure/sections.py +132 -0
  26. pdfstructx-0.2.0/src/pdfstruct/tables/__init__.py +0 -0
  27. pdfstructx-0.2.0/src/pdfstruct/tables/detector.py +347 -0
  28. pdfstructx-0.2.0/src/pdfstruct/utils/__init__.py +0 -0
  29. pdfstructx-0.2.0/src/pdfstruct/utils/fonts.py +153 -0
  30. pdfstructx-0.2.0/src/pdfstruct/utils/geometry.py +146 -0
  31. pdfstructx-0.2.0/src/pdfstruct/utils/language.py +84 -0
  32. pdfstructx-0.2.0/src/pdfstructx.egg-info/PKG-INFO +217 -0
  33. pdfstructx-0.2.0/src/pdfstructx.egg-info/SOURCES.txt +35 -0
  34. pdfstructx-0.2.0/src/pdfstructx.egg-info/dependency_links.txt +1 -0
  35. pdfstructx-0.2.0/src/pdfstructx.egg-info/requires.txt +7 -0
  36. pdfstructx-0.2.0/src/pdfstructx.egg-info/top_level.txt +1 -0
  37. pdfstructx-0.2.0/tests/test_image_extraction.py +362 -0
@@ -0,0 +1,38 @@
1
+ pdfstruct - Personal Use License
2
+
3
+ Copyright (c) 2026 Kyros Groupe. All rights reserved.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to use,
7
+ copy, modify, and distribute the Software for PERSONAL, EDUCATIONAL, and
8
+ NON-COMMERCIAL purposes only, subject to the following conditions:
9
+
10
+ 1. PERSONAL USE: You may use the Software for personal projects, learning,
11
+ research, and non-commercial open-source projects.
12
+
13
+ 2. COMMERCIAL USE PROHIBITED: You may NOT use the Software, in whole or in
14
+ part, for any commercial purpose without obtaining a separate commercial
15
+ license from Kyros Groupe. Commercial use includes, but is not limited to:
16
+ - Incorporating the Software into a product or service sold for profit
17
+ - Using the Software to provide paid services or SaaS offerings
18
+ - Using the Software within a for-profit business for internal operations
19
+ - Distributing the Software as part of a commercial product
20
+
21
+ 3. ATTRIBUTION: All copies or substantial portions of the Software must
22
+ include this license notice and the above copyright notice.
23
+
24
+ 4. NO SUBLICENSING: You may not sublicense the Software under different terms.
25
+
26
+ 5. MODIFICATIONS: You may modify the Software for personal use. Modified
27
+ versions must retain this license and cannot be distributed under
28
+ different terms.
29
+
30
+ For commercial licensing inquiries, contact: licensing@kyros-groupe.com
31
+
32
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
33
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
34
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
35
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
36
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
37
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
38
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+ recursive-include src/pdfstruct *.py
@@ -0,0 +1,217 @@
1
+ Metadata-Version: 2.2
2
+ Name: pdfstructx
3
+ Version: 0.2.0
4
+ Summary: Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support
5
+ Author: Kyros Groupe
6
+ License: pdfstruct - Personal Use License
7
+
8
+ Copyright (c) 2026 Kyros Groupe. All rights reserved.
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to use,
12
+ copy, modify, and distribute the Software for PERSONAL, EDUCATIONAL, and
13
+ NON-COMMERCIAL purposes only, subject to the following conditions:
14
+
15
+ 1. PERSONAL USE: You may use the Software for personal projects, learning,
16
+ research, and non-commercial open-source projects.
17
+
18
+ 2. COMMERCIAL USE PROHIBITED: You may NOT use the Software, in whole or in
19
+ part, for any commercial purpose without obtaining a separate commercial
20
+ license from Kyros Groupe. Commercial use includes, but is not limited to:
21
+ - Incorporating the Software into a product or service sold for profit
22
+ - Using the Software to provide paid services or SaaS offerings
23
+ - Using the Software within a for-profit business for internal operations
24
+ - Distributing the Software as part of a commercial product
25
+
26
+ 3. ATTRIBUTION: All copies or substantial portions of the Software must
27
+ include this license notice and the above copyright notice.
28
+
29
+ 4. NO SUBLICENSING: You may not sublicense the Software under different terms.
30
+
31
+ 5. MODIFICATIONS: You may modify the Software for personal use. Modified
32
+ versions must retain this license and cannot be distributed under
33
+ different terms.
34
+
35
+ For commercial licensing inquiries, contact: licensing@kyros-groupe.com
36
+
37
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
38
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
39
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
40
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
41
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
42
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
43
+ SOFTWARE.
44
+
45
+ Project-URL: Homepage, https://github.com/kyros-groupe/pdfstruct
46
+ Project-URL: Documentation, https://github.com/kyros-groupe/pdfstruct#readme
47
+ Project-URL: Issues, https://github.com/kyros-groupe/pdfstruct/issues
48
+ Keywords: pdf,parser,document,extraction,tables,structure
49
+ Classifier: Development Status :: 3 - Alpha
50
+ Classifier: Intended Audience :: Developers
51
+ Classifier: Programming Language :: Python :: 3
52
+ Classifier: Programming Language :: Python :: 3.10
53
+ Classifier: Programming Language :: Python :: 3.11
54
+ Classifier: Programming Language :: Python :: 3.12
55
+ Classifier: Programming Language :: Python :: 3.13
56
+ Classifier: Topic :: Text Processing :: General
57
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
58
+ Requires-Python: >=3.10
59
+ Description-Content-Type: text/markdown
60
+ License-File: LICENSE
61
+ Requires-Dist: pdfminer.six>=20231228
62
+ Requires-Dist: Pillow>=10.0.0
63
+ Provides-Extra: dev
64
+ Requires-Dist: pytest>=8.0; extra == "dev"
65
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
66
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
67
+
68
+ # pdfstruct
69
+
70
+ Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support.
71
+
72
+ [![Python](https://img.shields.io/badge/python-≥3.10-blue.svg)](https://www.python.org/)
73
+
74
+ ## Overview
75
+
76
+ **pdfstruct** is a Python library that extracts structured content from PDF documents. Unlike basic text extraction tools, pdfstruct understands document layout — detecting headings, sections, tables, lists, headers/footers, and multi-column layouts using font analysis and geometric reasoning.
77
+
78
+ ## Features
79
+
80
+ - **Font-aware heading detection**: Uses font size, weight, and frequency analysis to classify headings (H1–H6)
81
+ - **Table extraction**: Detects tables from grid lines and whitespace-aligned columns
82
+ - **Section hierarchy**: Builds a document tree from headings and content
83
+ - **Multi-column support**: Handles two-column and multi-column layouts
84
+ - **Header/footer removal**: Identifies and filters repeating page content
85
+ - **List detection**: Recognizes bulleted, numbered, lettered, and Roman numeral lists
86
+ - **Multiple output formats**: JSON, Markdown, and plain text
87
+ - **Rich metadata**: Word count, language detection, reading time, font statistics
88
+
89
+ ## Installation
90
+
91
+ ```bash
92
+ pip install pdfstructx
93
+ ```
94
+
95
+ Or install from source:
96
+
97
+ ```bash
98
+ git clone https://github.com/kyros-groupe/pdfstruct.git
99
+ cd pdfstruct
100
+ pip install -e .
101
+ ```
102
+
103
+ ## Quickstart
104
+
105
+ ```python
106
+ import pdfstruct
107
+
108
+ # Parse a PDF
109
+ doc = pdfstruct.parse("contract.pdf")
110
+
111
+ # Access structured content
112
+ print(doc.title)
113
+ print(f"{doc.page_count} pages, {doc.metadata.word_count} words")
114
+
115
+ # Browse sections
116
+ for section in doc.sections:
117
+     print(f"{section.heading} ({len(section.content)} chars)")
118
+     for sub in section.subsections:
119
+         print(f" {sub.heading}")
120
+
121
+ # Get tables
122
+ for table in doc.tables:
123
+     print(table.to_dicts()) # List of row dicts
124
+
125
+ # Export to different formats
126
+ print(pdfstruct.to_markdown(doc))
127
+ print(pdfstruct.to_text(doc))
128
+ print(pdfstruct.to_json(doc))
129
+
130
+ # Full dict for programmatic use
131
+ data = pdfstruct.to_dict(doc)
132
+ ```
133
+
134
+ ## API Reference
135
+
136
+ ### `pdfstruct.parse(source, **options) -> Document`
137
+
138
+ Parse a PDF file, bytes, or file-like object.
139
+
140
+ **Options:**
141
+ - `detect_tables` (bool, default True) — Enable table detection
142
+ - `detect_headers_footers` (bool, default True) — Remove repeating headers/footers
143
+ - `detect_lists` (bool, default True) — Detect list structures
144
+ - `detect_columns` (bool, default True) — Handle multi-column layouts
145
+
146
+ ### Document
147
+
148
+ - `doc.title` — Detected document title
149
+ - `doc.pages` — List of Page objects
150
+ - `doc.sections` — Hierarchical section tree
151
+ - `doc.tables` — All detected tables
152
+ - `doc.metadata` — DocumentMetadata with statistics
153
+ - `doc.text` — Full document text (concatenated from pages)
154
+ - `doc.to_dict()` — JSON-serializable dictionary
155
+
156
+ ### Section
157
+
158
+ - `section.heading` — Section heading text
159
+ - `section.heading_level` — HeadingLevel enum (H1–H6)
160
+ - `section.content` — Section body text
161
+ - `section.paragraphs` — List of Paragraph objects
162
+ - `section.subsections` — Nested subsections
163
+
164
+ ### Table
165
+
166
+ - `table.rows` — List of TableRow objects
167
+ - `table.to_list()` — 2D list of cell text
168
+ - `table.to_dicts()` — List of dicts (header row as keys)
169
+ - `table.num_rows`, `table.num_cols` — Dimensions
170
+
171
+ ### Metadata
172
+
173
+ - `metadata.word_count`, `metadata.char_count` — Text statistics
174
+ - `metadata.language` — Detected language code
175
+ - `metadata.page_count` — Number of pages
176
+ - `metadata.is_scanned` — Whether PDF appears to be scanned
177
+ - `metadata.has_tables`, `metadata.has_images` — Content flags
178
+ - `metadata.primary_font`, `metadata.primary_font_size` — Font info
179
+
180
+ ## Architecture
181
+
182
+ ```
183
+ pdfstruct/
184
+ ├── parser.py # Main PDFParser class and parse() entry point
185
+ ├── models/
186
+ │ ├── document.py # Core models: Document, Page, Section, TextLine, Table, etc.
187
+ │ └── metadata.py # DocumentMetadata with computed statistics
188
+ ├── extractors/
189
+ │ └── text.py # PDF text extraction via pdfminer.six
190
+ ├── layout/
191
+ │ └── analyzer.py # Paragraph grouping, reading order, margins
192
+ ├── structure/
193
+ │ ├── headings.py # Font-aware heading detection
194
+ │ ├── headers_footers.py # Repeating content detection
195
+ │ ├── lists.py # List structure detection
196
+ │ └── sections.py # Section hierarchy builder
197
+ ├── tables/
198
+ │ └── detector.py # Grid and whitespace table detection
199
+ ├── output/
200
+ │ ├── json_output.py # JSON/dict export
201
+ │ ├── markdown.py # Markdown export
202
+ │ └── text_output.py # Plain text export
203
+ └── utils/
204
+ ├── fonts.py # Font analysis and heading classification
205
+ ├── geometry.py # Bounding box utilities, column detection
206
+ └── language.py # Language detection heuristics
207
+ ```
208
+
209
+ ## Requirements
210
+
211
+ - Python >= 3.10
212
+ - pdfminer.six >= 20231228
213
+ - Pillow >= 10.0.0
214
+
215
+ ## License
216
+
217
+ Personal Use License by Kyros Groupe. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,150 @@
1
+ # pdfstruct
2
+
3
+ Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support.
4
+
5
+ [![Python](https://img.shields.io/badge/python-≥3.10-blue.svg)](https://www.python.org/)
6
+
7
+ ## Overview
8
+
9
+ **pdfstruct** is a Python library that extracts structured content from PDF documents. Unlike basic text extraction tools, pdfstruct understands document layout — detecting headings, sections, tables, lists, headers/footers, and multi-column layouts using font analysis and geometric reasoning.
10
+
11
+ ## Features
12
+
13
+ - **Font-aware heading detection**: Uses font size, weight, and frequency analysis to classify headings (H1–H6)
14
+ - **Table extraction**: Detects tables from grid lines and whitespace-aligned columns
15
+ - **Section hierarchy**: Builds a document tree from headings and content
16
+ - **Multi-column support**: Handles two-column and multi-column layouts
17
+ - **Header/footer removal**: Identifies and filters repeating page content
18
+ - **List detection**: Recognizes bulleted, numbered, lettered, and Roman numeral lists
19
+ - **Multiple output formats**: JSON, Markdown, and plain text
20
+ - **Rich metadata**: Word count, language detection, reading time, font statistics
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ pip install pdfstructx
26
+ ```
27
+
28
+ Or install from source:
29
+
30
+ ```bash
31
+ git clone https://github.com/kyros-groupe/pdfstruct.git
32
+ cd pdfstruct
33
+ pip install -e .
34
+ ```
35
+
36
+ ## Quickstart
37
+
38
+ ```python
39
+ import pdfstruct
40
+
41
+ # Parse a PDF
42
+ doc = pdfstruct.parse("contract.pdf")
43
+
44
+ # Access structured content
45
+ print(doc.title)
46
+ print(f"{doc.page_count} pages, {doc.metadata.word_count} words")
47
+
48
+ # Browse sections
49
+ for section in doc.sections:
50
+     print(f"{section.heading} ({len(section.content)} chars)")
51
+     for sub in section.subsections:
52
+         print(f" {sub.heading}")
53
+
54
+ # Get tables
55
+ for table in doc.tables:
56
+     print(table.to_dicts()) # List of row dicts
57
+
58
+ # Export to different formats
59
+ print(pdfstruct.to_markdown(doc))
60
+ print(pdfstruct.to_text(doc))
61
+ print(pdfstruct.to_json(doc))
62
+
63
+ # Full dict for programmatic use
64
+ data = pdfstruct.to_dict(doc)
65
+ ```
66
+
67
+ ## API Reference
68
+
69
+ ### `pdfstruct.parse(source, **options) -> Document`
70
+
71
+ Parse a PDF file, bytes, or file-like object.
72
+
73
+ **Options:**
74
+ - `detect_tables` (bool, default True) — Enable table detection
75
+ - `detect_headers_footers` (bool, default True) — Remove repeating headers/footers
76
+ - `detect_lists` (bool, default True) — Detect list structures
77
+ - `detect_columns` (bool, default True) — Handle multi-column layouts
78
+
79
+ ### Document
80
+
81
+ - `doc.title` — Detected document title
82
+ - `doc.pages` — List of Page objects
83
+ - `doc.sections` — Hierarchical section tree
84
+ - `doc.tables` — All detected tables
85
+ - `doc.metadata` — DocumentMetadata with statistics
86
+ - `doc.text` — Full document text (concatenated from pages)
87
+ - `doc.to_dict()` — JSON-serializable dictionary
88
+
89
+ ### Section
90
+
91
+ - `section.heading` — Section heading text
92
+ - `section.heading_level` — HeadingLevel enum (H1–H6)
93
+ - `section.content` — Section body text
94
+ - `section.paragraphs` — List of Paragraph objects
95
+ - `section.subsections` — Nested subsections
96
+
97
+ ### Table
98
+
99
+ - `table.rows` — List of TableRow objects
100
+ - `table.to_list()` — 2D list of cell text
101
+ - `table.to_dicts()` — List of dicts (header row as keys)
102
+ - `table.num_rows`, `table.num_cols` — Dimensions
103
+
104
+ ### Metadata
105
+
106
+ - `metadata.word_count`, `metadata.char_count` — Text statistics
107
+ - `metadata.language` — Detected language code
108
+ - `metadata.page_count` — Number of pages
109
+ - `metadata.is_scanned` — Whether PDF appears to be scanned
110
+ - `metadata.has_tables`, `metadata.has_images` — Content flags
111
+ - `metadata.primary_font`, `metadata.primary_font_size` — Font info
112
+
113
+ ## Architecture
114
+
115
+ ```
116
+ pdfstruct/
117
+ ├── parser.py # Main PDFParser class and parse() entry point
118
+ ├── models/
119
+ │ ├── document.py # Core models: Document, Page, Section, TextLine, Table, etc.
120
+ │ └── metadata.py # DocumentMetadata with computed statistics
121
+ ├── extractors/
122
+ │ └── text.py # PDF text extraction via pdfminer.six
123
+ ├── layout/
124
+ │ └── analyzer.py # Paragraph grouping, reading order, margins
125
+ ├── structure/
126
+ │ ├── headings.py # Font-aware heading detection
127
+ │ ├── headers_footers.py # Repeating content detection
128
+ │ ├── lists.py # List structure detection
129
+ │ └── sections.py # Section hierarchy builder
130
+ ├── tables/
131
+ │ └── detector.py # Grid and whitespace table detection
132
+ ├── output/
133
+ │ ├── json_output.py # JSON/dict export
134
+ │ ├── markdown.py # Markdown export
135
+ │ └── text_output.py # Plain text export
136
+ └── utils/
137
+ ├── fonts.py # Font analysis and heading classification
138
+ ├── geometry.py # Bounding box utilities, column detection
139
+ └── language.py # Language detection heuristics
140
+ ```
141
+
142
+ ## Requirements
143
+
144
+ - Python >= 3.10
145
+ - pdfminer.six >= 20231228
146
+ - Pillow >= 10.0.0
147
+
148
+ ## License
149
+
150
+ Personal Use License by Kyros Groupe. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0,<77", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pdfstructx"
7
+ version = "0.2.0"
8
+ description = "Intelligent PDF parser with font-aware structure detection, table extraction, and multi-column support"
9
+ readme = "README.md"
10
+ license = {file = "LICENSE"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Kyros Groupe"},
14
+ ]
15
+ keywords = ["pdf", "parser", "document", "extraction", "tables", "structure"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: Text Processing :: General",
25
+ "Topic :: Software Development :: Libraries :: Python Modules",
26
+ ]
27
+ dependencies = [
28
+ "pdfminer.six>=20231228",
29
+ "Pillow>=10.0.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=8.0",
35
+ "pytest-cov>=4.0",
36
+ "ruff>=0.4.0",
37
+ ]
38
+
39
+ [project.urls]
40
+ Homepage = "https://github.com/kyros-groupe/pdfstruct"
41
+ Documentation = "https://github.com/kyros-groupe/pdfstruct#readme"
42
+ Issues = "https://github.com/kyros-groupe/pdfstruct/issues"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["src"]
46
+
47
+ [tool.pytest.ini_options]
48
+ testpaths = ["tests"]
49
+
50
+ [tool.ruff]
51
+ line-length = 100
52
+ target-version = "py310"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,76 @@
1
+ """pdfstruct — Intelligent PDF parser with structure detection.
2
+
3
+ Parse PDFs into structured documents with headings, sections, tables,
4
+ lists, and metadata. Font-aware heading detection. Multi-column support.
5
+ Grid and whitespace table extraction.
6
+
7
+ Basic usage:
8
+ >>> import pdfstruct
9
+ >>> doc = pdfstruct.parse("document.pdf")
10
+ >>> print(doc.title)
11
+ >>> print(doc.sections)
12
+ >>> print(doc.tables)
13
+
14
+ Export formats:
15
+ >>> pdfstruct.to_json(doc)
16
+ >>> pdfstruct.to_markdown(doc)
17
+ >>> pdfstruct.to_text(doc)
18
+ >>> pdfstruct.to_dict(doc)
19
+
20
+ License: Personal Use Only — see LICENSE file.
21
+ Copyright (c) 2026 Kyros Groupe.
22
+ """
23
+
24
+ __version__ = "0.2.0"
25
+
26
+ from pdfstruct.parser import parse, PDFParser
27
+ from pdfstruct.models.document import (
28
+ Document,
29
+ Page,
30
+ Section,
31
+ Paragraph,
32
+ TextLine,
33
+ Table,
34
+ TableRow,
35
+ TableCell,
36
+ BBox,
37
+ FontInfo,
38
+ HeadingLevel,
39
+ ListType,
40
+ ListItem,
41
+ ImageInfo,
42
+ )
43
+ from pdfstruct.models.metadata import DocumentMetadata
44
+ from pdfstruct.output.json_output import to_json, to_dict
45
+ from pdfstruct.output.markdown import to_markdown
46
+ from pdfstruct.output.text_output import to_text
47
+ from pdfstruct.extractors.images import generate_thumbnail
48
+
49
+ __all__ = [
50
+ # Core API
51
+ "parse",
52
+ "PDFParser",
53
+ # Models
54
+ "Document",
55
+ "Page",
56
+ "Section",
57
+ "Paragraph",
58
+ "TextLine",
59
+ "Table",
60
+ "TableRow",
61
+ "TableCell",
62
+ "BBox",
63
+ "FontInfo",
64
+ "HeadingLevel",
65
+ "ListType",
66
+ "ListItem",
67
+ "ImageInfo",
68
+ "DocumentMetadata",
69
+ # Output
70
+ "to_json",
71
+ "to_dict",
72
+ "to_markdown",
73
+ "to_text",
74
+ # Utilities
75
+ "generate_thumbnail",
76
+ ]
File without changes