layoutir 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- layoutir-1.0.0/LICENSE +21 -0
- layoutir-1.0.0/PKG-INFO +314 -0
- layoutir-1.0.0/README.md +282 -0
- layoutir-1.0.0/pyproject.toml +72 -0
- layoutir-1.0.0/setup.cfg +4 -0
- layoutir-1.0.0/src/layoutir/__init__.py +33 -0
- layoutir-1.0.0/src/layoutir/adapters/__init__.py +9 -0
- layoutir-1.0.0/src/layoutir/adapters/base.py +50 -0
- layoutir-1.0.0/src/layoutir/adapters/docling_adapter.py +132 -0
- layoutir-1.0.0/src/layoutir/chunking/__init__.py +15 -0
- layoutir-1.0.0/src/layoutir/chunking/strategies.py +282 -0
- layoutir-1.0.0/src/layoutir/cli.py +183 -0
- layoutir-1.0.0/src/layoutir/exporters/__init__.py +15 -0
- layoutir-1.0.0/src/layoutir/exporters/asset_writer.py +108 -0
- layoutir-1.0.0/src/layoutir/exporters/base.py +27 -0
- layoutir-1.0.0/src/layoutir/exporters/markdown_exporter.py +148 -0
- layoutir-1.0.0/src/layoutir/exporters/parquet_exporter.py +154 -0
- layoutir-1.0.0/src/layoutir/exporters/text_exporter.py +63 -0
- layoutir-1.0.0/src/layoutir/extraction/__init__.py +17 -0
- layoutir-1.0.0/src/layoutir/extraction/docling_extractor.py +320 -0
- layoutir-1.0.0/src/layoutir/normalization/__init__.py +5 -0
- layoutir-1.0.0/src/layoutir/normalization/normalizer.py +332 -0
- layoutir-1.0.0/src/layoutir/pipeline.py +263 -0
- layoutir-1.0.0/src/layoutir/schema.py +153 -0
- layoutir-1.0.0/src/layoutir/utils/__init__.py +26 -0
- layoutir-1.0.0/src/layoutir/utils/hashing.py +188 -0
- layoutir-1.0.0/src/layoutir/utils/logging_config.py +125 -0
- layoutir-1.0.0/src/layoutir.egg-info/PKG-INFO +314 -0
- layoutir-1.0.0/src/layoutir.egg-info/SOURCES.txt +31 -0
- layoutir-1.0.0/src/layoutir.egg-info/dependency_links.txt +1 -0
- layoutir-1.0.0/src/layoutir.egg-info/entry_points.txt +2 -0
- layoutir-1.0.0/src/layoutir.egg-info/requires.txt +11 -0
- layoutir-1.0.0/src/layoutir.egg-info/top_level.txt +1 -0
layoutir-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Rahul Patnaik
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
layoutir-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: layoutir
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Production-grade Document Ingestion & Canonicalization Engine
|
|
5
|
+
Author-email: Rahul Patnaik <rahulpatnaik@example.com>
|
|
6
|
+
Maintainer-email: Rahul Patnaik <rahulpatnaik@example.com>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/RahulPatnaik/layoutir
|
|
9
|
+
Project-URL: Documentation, https://github.com/RahulPatnaik/layoutir/blob/main/README.md
|
|
10
|
+
Project-URL: Repository, https://github.com/RahulPatnaik/layoutir
|
|
11
|
+
Keywords: pdf,document,parsing,ir,layout,extraction,chunking
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Requires-Python: >=3.12
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pydantic>=2.0.0
|
|
22
|
+
Requires-Dist: docling>=1.0.0
|
|
23
|
+
Requires-Dist: torch>=2.0.0
|
|
24
|
+
Requires-Dist: pandas>=2.0.0
|
|
25
|
+
Requires-Dist: pyarrow>=10.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# Document IR - Production Document Ingestion Engine
|
|
34
|
+
|
|
35
|
+
**An IR-first, extensible document compiler for AI systems.**
|
|
36
|
+
|
|
37
|
+
This is NOT a PDF-to-Markdown script. It is a production-grade document ingestion and canonicalization engine designed with a compiler-like architecture: Input → IR → Backends.
|
|
38
|
+
|
|
39
|
+
## Architecture
|
|
40
|
+
|
|
41
|
+
### Design Philosophy
|
|
42
|
+
|
|
43
|
+
Think like a compiler engineer:
|
|
44
|
+
- **Input Layer**: Format-specific parsers (currently PDF via Docling)
|
|
45
|
+
- **AST/IR**: Canonical intermediate representation with strict schema
|
|
46
|
+
- **Backends**: Multiple export formats (Markdown, Text, Parquet)
|
|
47
|
+
|
|
48
|
+
### Layer Separation (Non-Negotiable)
|
|
49
|
+
|
|
50
|
+
```mermaid
|
|
51
|
+
flowchart TD
|
|
52
|
+
A[Input Adapter Layer<br/>Format-specific parsing only]
|
|
53
|
+
B[Extraction Layer<br/>Extract raw structural elements]
|
|
54
|
+
C[Normalization Layer<br/>Convert to canonical IR with hashing]
|
|
55
|
+
D[Canonical IR Layer<br/>Typed schema, stable IDs, relationships]
|
|
56
|
+
E[Export Layer<br/>Markdown, Text, Parquet, Assets]
|
|
57
|
+
|
|
58
|
+
A --> B
|
|
59
|
+
B --> C
|
|
60
|
+
C --> D
|
|
61
|
+
D --> E
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Key Features
|
|
65
|
+
|
|
66
|
+
### ✅ Deterministic & Idempotent
|
|
67
|
+
- Hash-based stable IDs (document, block, table, image, chunk)
|
|
68
|
+
- Running pipeline twice produces identical output
|
|
69
|
+
- No UUIDs, no randomness
|
|
70
|
+
|
|
71
|
+
### ✅ Canonical IR Schema
|
|
72
|
+
```text
|
|
73
|
+
Document
|
|
74
|
+
├── document_id: str (hash-based)
|
|
75
|
+
├── schema_version: str
|
|
76
|
+
├── parser_version: str
|
|
77
|
+
├── metadata: DocumentMetadata
|
|
78
|
+
├── blocks: List[Block]
|
|
79
|
+
│ ├── block_id: str (deterministic)
|
|
80
|
+
│ ├── type: BlockType (heading, paragraph, table, image, etc.)
|
|
81
|
+
│ ├── content: str
|
|
82
|
+
│ ├── page_number: int
|
|
83
|
+
│ ├── bbox: BoundingBox
|
|
84
|
+
│ └── metadata: dict
|
|
85
|
+
└── relationships: List[Relationship]
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### ✅ Pluggable Chunking
|
|
89
|
+
- `SemanticSectionChunker`: Section-based (headings)
|
|
90
|
+
- `TokenWindowChunker`: Fixed token windows with overlap
|
|
91
|
+
- `LayoutAwareChunker`: Layout-aware (stub)
|
|
92
|
+
|
|
93
|
+
All chunking operates on IR, not raw text.
|
|
94
|
+
|
|
95
|
+
### ✅ Multiple Export Formats
|
|
96
|
+
- **Markdown**: Human-readable with formatting
|
|
97
|
+
- **Plain Text**: Simple text extraction
|
|
98
|
+
- **Parquet**: Efficient structured storage for tables/blocks
|
|
99
|
+
- **Assets**: Extracted images (PNG) and tables (CSV)
|
|
100
|
+
|
|
101
|
+
### ✅ Structured Output
|
|
102
|
+
```
|
|
103
|
+
/<document_id>/
|
|
104
|
+
manifest.json # Processing metadata
|
|
105
|
+
ir.json # Canonical IR
|
|
106
|
+
chunks.json # Chunk definitions
|
|
107
|
+
/assets/
|
|
108
|
+
/images/ # Extracted images
|
|
109
|
+
/tables/ # Tables as CSV
|
|
110
|
+
/exports/
|
|
111
|
+
/markdown/ # Markdown output
|
|
112
|
+
/text/ # Plain text output
|
|
113
|
+
/parquet/ # Parquet datasets
|
|
114
|
+
/logs/ # Processing logs
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Installation
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
# Install from PyPI
|
|
121
|
+
pip install layoutir
|
|
122
|
+
|
|
123
|
+
# Or install from source
|
|
124
|
+
git clone https://github.com/RahulPatnaik/layoutir.git
|
|
125
|
+
cd layoutir
|
|
126
|
+
pip install -e .
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Usage
|
|
130
|
+
|
|
131
|
+
### Basic Usage
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# Using the CLI
|
|
135
|
+
layoutir --input file.pdf --output ./out
|
|
136
|
+
|
|
137
|
+
# Or using Python directly
|
|
138
|
+
python -m layoutir.cli --input file.pdf --output ./out
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Advanced Options
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
# Semantic chunking (default)
|
|
145
|
+
layoutir --input file.pdf --output ./out --chunk-strategy semantic
|
|
146
|
+
|
|
147
|
+
# Token-based chunking with custom size
|
|
148
|
+
layoutir --input file.pdf --output ./out \
|
|
149
|
+
--chunk-strategy token \
|
|
150
|
+
--chunk-size 1024 \
|
|
151
|
+
--chunk-overlap 128
|
|
152
|
+
|
|
153
|
+
# Enable GPU acceleration
|
|
154
|
+
layoutir --input file.pdf --output ./out --use-gpu
|
|
155
|
+
|
|
156
|
+
# Debug mode with structured logging
|
|
157
|
+
layoutir --input file.pdf --output ./out \
|
|
158
|
+
--log-level DEBUG \
|
|
159
|
+
--structured-logs
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Python API
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from pathlib import Path
|
|
166
|
+
from layoutir import Pipeline
|
|
167
|
+
from layoutir.adapters import DoclingAdapter
|
|
168
|
+
from layoutir.chunking import SemanticSectionChunker
|
|
169
|
+
|
|
170
|
+
# Create pipeline
|
|
171
|
+
adapter = DoclingAdapter(use_gpu=True)
|
|
172
|
+
chunker = SemanticSectionChunker(max_heading_level=2)
|
|
173
|
+
pipeline = Pipeline(adapter=adapter, chunk_strategy=chunker)
|
|
174
|
+
|
|
175
|
+
# Process document
|
|
176
|
+
document = pipeline.process(
|
|
177
|
+
input_path=Path("document.pdf"),
|
|
178
|
+
output_dir=Path("./output")
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
# Access results
|
|
182
|
+
print(f"Extracted {len(document.blocks)} blocks")
|
|
183
|
+
print(f"Document ID: {document.document_id}")
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Project Structure
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
src/layoutir/
|
|
190
|
+
├── schema.py # Canonical IR schema (Pydantic)
|
|
191
|
+
├── pipeline.py # Main orchestrator
|
|
192
|
+
│
|
|
193
|
+
├── adapters/ # Input adapters
|
|
194
|
+
│ ├── base.py # Abstract interface
|
|
195
|
+
│ └── docling_adapter.py # PDF via Docling
|
|
196
|
+
│
|
|
197
|
+
├── extraction/ # Raw element extraction
|
|
198
|
+
│ └── docling_extractor.py
|
|
199
|
+
│
|
|
200
|
+
├── normalization/ # IR normalization
|
|
201
|
+
│ └── normalizer.py
|
|
202
|
+
│
|
|
203
|
+
├── chunking/ # Chunking strategies
|
|
204
|
+
│ └── strategies.py
|
|
205
|
+
│
|
|
206
|
+
├── exporters/ # Export backends
|
|
207
|
+
│ ├── markdown_exporter.py
|
|
208
|
+
│ ├── text_exporter.py
|
|
209
|
+
│ ├── parquet_exporter.py
|
|
210
|
+
│ └── asset_writer.py
|
|
211
|
+
│
|
|
212
|
+
└── utils/
|
|
213
|
+
├── hashing.py # Deterministic ID generation
|
|
214
|
+
└── logging_config.py # Structured logging
|
|
215
|
+
|
|
216
|
+
ingest.py # CLI entrypoint
|
|
217
|
+
benchmark.py # Performance benchmark
|
|
218
|
+
test_pipeline.py # Integration test
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
## Design Constraints
|
|
222
|
+
|
|
223
|
+
### ✅ What We DO
|
|
224
|
+
- Strict layer separation
|
|
225
|
+
- Deterministic processing
|
|
226
|
+
- Schema validation
|
|
227
|
+
- Pluggable strategies
|
|
228
|
+
- Observability/timing
|
|
229
|
+
- Efficient storage (Parquet)
|
|
230
|
+
|
|
231
|
+
### ❌ What We DON'T DO
|
|
232
|
+
- Mix business logic into adapters
|
|
233
|
+
- Hardcode paths or configurations
|
|
234
|
+
- Use non-deterministic IDs (UUIDs)
|
|
235
|
+
- Combine IR and export logic
|
|
236
|
+
- Skip schema validation
|
|
237
|
+
- Load entire files into memory unnecessarily
|
|
238
|
+
|
|
239
|
+
## Extensibility
|
|
240
|
+
|
|
241
|
+
### Adding New Input Formats
|
|
242
|
+
|
|
243
|
+
1. Implement the `InputAdapter` interface:
|
|
244
|
+
```python
|
|
245
|
+
class DocxAdapter(InputAdapter):
|
|
246
|
+
def parse(self, file_path: Path) -> Any: ...
|
|
247
|
+
def supports_format(self, file_path: Path) -> bool: ...
|
|
248
|
+
def get_parser_version(self) -> str: ...
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
2. Implement corresponding extractor
|
|
252
|
+
3. Update pipeline to use new adapter
|
|
253
|
+
|
|
254
|
+
### Adding New Chunk Strategies
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
class CustomChunker(ChunkStrategy):
|
|
258
|
+
def chunk(self, document: Document) -> List[Chunk]:
|
|
259
|
+
# Operate on IR blocks
|
|
260
|
+
...
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### Adding New Export Formats
|
|
264
|
+
|
|
265
|
+
```python
|
|
266
|
+
class JsonExporter(Exporter):
|
|
267
|
+
def export(self, document: Document, output_dir: Path, chunks: List[Chunk]):
|
|
268
|
+
# Export from canonical IR
|
|
269
|
+
...
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
## Performance
|
|
273
|
+
|
|
274
|
+
Designed to handle 200+ page PDFs efficiently:
|
|
275
|
+
- Streaming processing where possible
|
|
276
|
+
- Lazy loading of heavy dependencies
|
|
277
|
+
- GPU acceleration support
|
|
278
|
+
- Parallel export operations
|
|
279
|
+
- Efficient Parquet storage for tables
|
|
280
|
+
|
|
281
|
+
## Observability
|
|
282
|
+
|
|
283
|
+
- Structured JSON logging
|
|
284
|
+
- Stage-level timing metrics
|
|
285
|
+
- Extraction statistics
|
|
286
|
+
- Deterministic output for debugging
|
|
287
|
+
|
|
288
|
+
## Schema Versioning
|
|
289
|
+
|
|
290
|
+
Current schema version: `1.0.0`
|
|
291
|
+
|
|
292
|
+
Future schema changes will be tracked via semantic versioning:
|
|
293
|
+
- Major: Breaking changes to IR structure
|
|
294
|
+
- Minor: Backwards-compatible additions
|
|
295
|
+
- Patch: Bug fixes
|
|
296
|
+
|
|
297
|
+
## Future Enhancements
|
|
298
|
+
|
|
299
|
+
- [ ] DOCX input adapter
|
|
300
|
+
- [ ] HTML input adapter
|
|
301
|
+
- [ ] Advanced layout-aware chunking
|
|
302
|
+
- [ ] Parallel page processing
|
|
303
|
+
- [ ] Incremental updates (only reprocess changed pages)
|
|
304
|
+
- [ ] Vector embeddings export
|
|
305
|
+
- [ ] OCR fallback for scanned PDFs
|
|
306
|
+
|
|
307
|
+
## License
|
|
308
|
+
|
|
309
|
+
Released under the MIT License. See the `LICENSE` file in the project root for the full text.
|
|
310
|
+
|
|
311
|
+
## Contributing
|
|
312
|
+
|
|
313
|
+
This project is in a research/prototype phase. See the main project README for contribution guidelines.
|
|
314
|
+
# layoutir
|
layoutir-1.0.0/README.md
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# Document IR - Production Document Ingestion Engine
|
|
2
|
+
|
|
3
|
+
**An IR-first, extensible document compiler for AI systems.**
|
|
4
|
+
|
|
5
|
+
This is NOT a PDF-to-Markdown script. It is a production-grade document ingestion and canonicalization engine designed with a compiler-like architecture: Input → IR → Backends.
|
|
6
|
+
|
|
7
|
+
## Architecture
|
|
8
|
+
|
|
9
|
+
### Design Philosophy
|
|
10
|
+
|
|
11
|
+
Think like a compiler engineer:
|
|
12
|
+
- **Input Layer**: Format-specific parsers (currently PDF via Docling)
|
|
13
|
+
- **AST/IR**: Canonical intermediate representation with strict schema
|
|
14
|
+
- **Backends**: Multiple export formats (Markdown, Text, Parquet)
|
|
15
|
+
|
|
16
|
+
### Layer Separation (Non-Negotiable)
|
|
17
|
+
|
|
18
|
+
```mermaid
|
|
19
|
+
flowchart TD
|
|
20
|
+
A[Input Adapter Layer<br/>Format-specific parsing only]
|
|
21
|
+
B[Extraction Layer<br/>Extract raw structural elements]
|
|
22
|
+
C[Normalization Layer<br/>Convert to canonical IR with hashing]
|
|
23
|
+
D[Canonical IR Layer<br/>Typed schema, stable IDs, relationships]
|
|
24
|
+
E[Export Layer<br/>Markdown, Text, Parquet, Assets]
|
|
25
|
+
|
|
26
|
+
A --> B
|
|
27
|
+
B --> C
|
|
28
|
+
C --> D
|
|
29
|
+
D --> E
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Key Features
|
|
33
|
+
|
|
34
|
+
### ✅ Deterministic & Idempotent
|
|
35
|
+
- Hash-based stable IDs (document, block, table, image, chunk)
|
|
36
|
+
- Running pipeline twice produces identical output
|
|
37
|
+
- No UUIDs, no randomness
|
|
38
|
+
|
|
39
|
+
### ✅ Canonical IR Schema
|
|
40
|
+
```text
|
|
41
|
+
Document
|
|
42
|
+
├── document_id: str (hash-based)
|
|
43
|
+
├── schema_version: str
|
|
44
|
+
├── parser_version: str
|
|
45
|
+
├── metadata: DocumentMetadata
|
|
46
|
+
├── blocks: List[Block]
|
|
47
|
+
│ ├── block_id: str (deterministic)
|
|
48
|
+
│ ├── type: BlockType (heading, paragraph, table, image, etc.)
|
|
49
|
+
│ ├── content: str
|
|
50
|
+
│ ├── page_number: int
|
|
51
|
+
│ ├── bbox: BoundingBox
|
|
52
|
+
│ └── metadata: dict
|
|
53
|
+
└── relationships: List[Relationship]
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### ✅ Pluggable Chunking
|
|
57
|
+
- `SemanticSectionChunker`: Section-based (headings)
|
|
58
|
+
- `TokenWindowChunker`: Fixed token windows with overlap
|
|
59
|
+
- `LayoutAwareChunker`: Layout-aware (stub)
|
|
60
|
+
|
|
61
|
+
All chunking operates on IR, not raw text.
|
|
62
|
+
|
|
63
|
+
### ✅ Multiple Export Formats
|
|
64
|
+
- **Markdown**: Human-readable with formatting
|
|
65
|
+
- **Plain Text**: Simple text extraction
|
|
66
|
+
- **Parquet**: Efficient structured storage for tables/blocks
|
|
67
|
+
- **Assets**: Extracted images (PNG) and tables (CSV)
|
|
68
|
+
|
|
69
|
+
### ✅ Structured Output
|
|
70
|
+
```
|
|
71
|
+
/<document_id>/
|
|
72
|
+
manifest.json # Processing metadata
|
|
73
|
+
ir.json # Canonical IR
|
|
74
|
+
chunks.json # Chunk definitions
|
|
75
|
+
/assets/
|
|
76
|
+
/images/ # Extracted images
|
|
77
|
+
/tables/ # Tables as CSV
|
|
78
|
+
/exports/
|
|
79
|
+
/markdown/ # Markdown output
|
|
80
|
+
/text/ # Plain text output
|
|
81
|
+
/parquet/ # Parquet datasets
|
|
82
|
+
/logs/ # Processing logs
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Install from PyPI
|
|
89
|
+
pip install layoutir
|
|
90
|
+
|
|
91
|
+
# Or install from source
|
|
92
|
+
git clone https://github.com/RahulPatnaik/layoutir.git
|
|
93
|
+
cd layoutir
|
|
94
|
+
pip install -e .
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Usage
|
|
98
|
+
|
|
99
|
+
### Basic Usage
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Using the CLI
|
|
103
|
+
layoutir --input file.pdf --output ./out
|
|
104
|
+
|
|
105
|
+
# Or using Python directly
|
|
106
|
+
python -m layoutir.cli --input file.pdf --output ./out
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Advanced Options
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# Semantic chunking (default)
|
|
113
|
+
layoutir --input file.pdf --output ./out --chunk-strategy semantic
|
|
114
|
+
|
|
115
|
+
# Token-based chunking with custom size
|
|
116
|
+
layoutir --input file.pdf --output ./out \
|
|
117
|
+
--chunk-strategy token \
|
|
118
|
+
--chunk-size 1024 \
|
|
119
|
+
--chunk-overlap 128
|
|
120
|
+
|
|
121
|
+
# Enable GPU acceleration
|
|
122
|
+
layoutir --input file.pdf --output ./out --use-gpu
|
|
123
|
+
|
|
124
|
+
# Debug mode with structured logging
|
|
125
|
+
layoutir --input file.pdf --output ./out \
|
|
126
|
+
--log-level DEBUG \
|
|
127
|
+
--structured-logs
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Python API
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from pathlib import Path
|
|
134
|
+
from layoutir import Pipeline
|
|
135
|
+
from layoutir.adapters import DoclingAdapter
|
|
136
|
+
from layoutir.chunking import SemanticSectionChunker
|
|
137
|
+
|
|
138
|
+
# Create pipeline
|
|
139
|
+
adapter = DoclingAdapter(use_gpu=True)
|
|
140
|
+
chunker = SemanticSectionChunker(max_heading_level=2)
|
|
141
|
+
pipeline = Pipeline(adapter=adapter, chunk_strategy=chunker)
|
|
142
|
+
|
|
143
|
+
# Process document
|
|
144
|
+
document = pipeline.process(
|
|
145
|
+
input_path=Path("document.pdf"),
|
|
146
|
+
output_dir=Path("./output")
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# Access results
|
|
150
|
+
print(f"Extracted {len(document.blocks)} blocks")
|
|
151
|
+
print(f"Document ID: {document.document_id}")
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## Project Structure
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
src/layoutir/
|
|
158
|
+
├── schema.py # Canonical IR schema (Pydantic)
|
|
159
|
+
├── pipeline.py # Main orchestrator
|
|
160
|
+
│
|
|
161
|
+
├── adapters/ # Input adapters
|
|
162
|
+
│ ├── base.py # Abstract interface
|
|
163
|
+
│ └── docling_adapter.py # PDF via Docling
|
|
164
|
+
│
|
|
165
|
+
├── extraction/ # Raw element extraction
|
|
166
|
+
│ └── docling_extractor.py
|
|
167
|
+
│
|
|
168
|
+
├── normalization/ # IR normalization
|
|
169
|
+
│ └── normalizer.py
|
|
170
|
+
│
|
|
171
|
+
├── chunking/ # Chunking strategies
|
|
172
|
+
│ └── strategies.py
|
|
173
|
+
│
|
|
174
|
+
├── exporters/ # Export backends
|
|
175
|
+
│ ├── markdown_exporter.py
|
|
176
|
+
│ ├── text_exporter.py
|
|
177
|
+
│ ├── parquet_exporter.py
|
|
178
|
+
│ └── asset_writer.py
|
|
179
|
+
│
|
|
180
|
+
└── utils/
|
|
181
|
+
├── hashing.py # Deterministic ID generation
|
|
182
|
+
└── logging_config.py # Structured logging
|
|
183
|
+
|
|
184
|
+
ingest.py # CLI entrypoint
|
|
185
|
+
benchmark.py # Performance benchmark
|
|
186
|
+
test_pipeline.py # Integration test
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
## Design Constraints
|
|
190
|
+
|
|
191
|
+
### ✅ What We DO
|
|
192
|
+
- Strict layer separation
|
|
193
|
+
- Deterministic processing
|
|
194
|
+
- Schema validation
|
|
195
|
+
- Pluggable strategies
|
|
196
|
+
- Observability/timing
|
|
197
|
+
- Efficient storage (Parquet)
|
|
198
|
+
|
|
199
|
+
### ❌ What We DON'T DO
|
|
200
|
+
- Mix business logic into adapters
|
|
201
|
+
- Hardcode paths or configurations
|
|
202
|
+
- Use non-deterministic IDs (UUIDs)
|
|
203
|
+
- Combine IR and export logic
|
|
204
|
+
- Skip schema validation
|
|
205
|
+
- Load entire files into memory unnecessarily
|
|
206
|
+
|
|
207
|
+
## Extensibility
|
|
208
|
+
|
|
209
|
+
### Adding New Input Formats
|
|
210
|
+
|
|
211
|
+
1. Implement the `InputAdapter` interface:
|
|
212
|
+
```python
|
|
213
|
+
class DocxAdapter(InputAdapter):
|
|
214
|
+
def parse(self, file_path: Path) -> Any: ...
|
|
215
|
+
def supports_format(self, file_path: Path) -> bool: ...
|
|
216
|
+
def get_parser_version(self) -> str: ...
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
2. Implement corresponding extractor
|
|
220
|
+
3. Update pipeline to use new adapter
|
|
221
|
+
|
|
222
|
+
### Adding New Chunk Strategies
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
class CustomChunker(ChunkStrategy):
|
|
226
|
+
def chunk(self, document: Document) -> List[Chunk]:
|
|
227
|
+
# Operate on IR blocks
|
|
228
|
+
...
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Adding New Export Formats
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
class JsonExporter(Exporter):
|
|
235
|
+
def export(self, document: Document, output_dir: Path, chunks: List[Chunk]):
|
|
236
|
+
# Export from canonical IR
|
|
237
|
+
...
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Performance
|
|
241
|
+
|
|
242
|
+
Designed to handle 200+ page PDFs efficiently:
|
|
243
|
+
- Streaming processing where possible
|
|
244
|
+
- Lazy loading of heavy dependencies
|
|
245
|
+
- GPU acceleration support
|
|
246
|
+
- Parallel export operations
|
|
247
|
+
- Efficient Parquet storage for tables
|
|
248
|
+
|
|
249
|
+
## Observability
|
|
250
|
+
|
|
251
|
+
- Structured JSON logging
|
|
252
|
+
- Stage-level timing metrics
|
|
253
|
+
- Extraction statistics
|
|
254
|
+
- Deterministic output for debugging
|
|
255
|
+
|
|
256
|
+
## Schema Versioning
|
|
257
|
+
|
|
258
|
+
Current schema version: `1.0.0`
|
|
259
|
+
|
|
260
|
+
Future schema changes will be tracked via semantic versioning:
|
|
261
|
+
- Major: Breaking changes to IR structure
|
|
262
|
+
- Minor: Backwards-compatible additions
|
|
263
|
+
- Patch: Bug fixes
|
|
264
|
+
|
|
265
|
+
## Future Enhancements
|
|
266
|
+
|
|
267
|
+
- [ ] DOCX input adapter
|
|
268
|
+
- [ ] HTML input adapter
|
|
269
|
+
- [ ] Advanced layout-aware chunking
|
|
270
|
+
- [ ] Parallel page processing
|
|
271
|
+
- [ ] Incremental updates (only reprocess changed pages)
|
|
272
|
+
- [ ] Vector embeddings export
|
|
273
|
+
- [ ] OCR fallback for scanned PDFs
|
|
274
|
+
|
|
275
|
+
## License
|
|
276
|
+
|
|
277
|
+
Released under the MIT License. See the `LICENSE` file in the project root for the full text.
|
|
278
|
+
|
|
279
|
+
## Contributing
|
|
280
|
+
|
|
281
|
+
This project is in a research/prototype phase. See the main project README for contribution guidelines.
|
|
282
|
+
# layoutir
|
layoutir-1.0.0/pyproject.toml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "layoutir"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Production-grade Document Ingestion & Canonicalization Engine"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Rahul Patnaik", email = "rahulpatnaik@example.com"}
|
|
14
|
+
]
|
|
15
|
+
maintainers = [
|
|
16
|
+
{name = "Rahul Patnaik", email = "rahulpatnaik@example.com"}
|
|
17
|
+
]
|
|
18
|
+
keywords = ["pdf", "document", "parsing", "ir", "layout", "extraction", "chunking"]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Programming Language :: Python :: 3.13",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
dependencies = [
|
|
29
|
+
"pydantic>=2.0.0",
|
|
30
|
+
"docling>=1.0.0",
|
|
31
|
+
"torch>=2.0.0",
|
|
32
|
+
"pandas>=2.0.0",
|
|
33
|
+
"pyarrow>=10.0.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
dev = [
|
|
38
|
+
"pytest>=7.0",
|
|
39
|
+
"black>=23.0",
|
|
40
|
+
"ruff>=0.1.0",
|
|
41
|
+
"mypy>=1.0",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.scripts]
|
|
45
|
+
layoutir = "layoutir.cli:main"
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Homepage = "https://github.com/RahulPatnaik/layoutir"
|
|
49
|
+
Documentation = "https://github.com/RahulPatnaik/layoutir/blob/main/README.md"
|
|
50
|
+
Repository = "https://github.com/RahulPatnaik/layoutir"
|
|
51
|
+
|
|
52
|
+
[tool.setuptools]
|
|
53
|
+
package-dir = {"" = "src"}
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.packages.find]
|
|
56
|
+
where = ["src"]
|
|
57
|
+
include = ["layoutir*"]
|
|
58
|
+
exclude = ["tests*"]
|
|
59
|
+
|
|
60
|
+
[tool.black]
|
|
61
|
+
line-length = 100
|
|
62
|
+
target-version = ['py312']
|
|
63
|
+
|
|
64
|
+
[tool.ruff]
|
|
65
|
+
line-length = 100
|
|
66
|
+
target-version = "py312"
|
|
67
|
+
|
|
68
|
+
[tool.mypy]
|
|
69
|
+
python_version = "3.12"
|
|
70
|
+
warn_return_any = true
|
|
71
|
+
warn_unused_configs = true
|
|
72
|
+
disallow_untyped_defs = false
|
layoutir-1.0.0/setup.cfg
ADDED