layoutir 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. layoutir-1.0.0/LICENSE +21 -0
  2. layoutir-1.0.0/PKG-INFO +314 -0
  3. layoutir-1.0.0/README.md +282 -0
  4. layoutir-1.0.0/pyproject.toml +72 -0
  5. layoutir-1.0.0/setup.cfg +4 -0
  6. layoutir-1.0.0/src/layoutir/__init__.py +33 -0
  7. layoutir-1.0.0/src/layoutir/adapters/__init__.py +9 -0
  8. layoutir-1.0.0/src/layoutir/adapters/base.py +50 -0
  9. layoutir-1.0.0/src/layoutir/adapters/docling_adapter.py +132 -0
  10. layoutir-1.0.0/src/layoutir/chunking/__init__.py +15 -0
  11. layoutir-1.0.0/src/layoutir/chunking/strategies.py +282 -0
  12. layoutir-1.0.0/src/layoutir/cli.py +183 -0
  13. layoutir-1.0.0/src/layoutir/exporters/__init__.py +15 -0
  14. layoutir-1.0.0/src/layoutir/exporters/asset_writer.py +108 -0
  15. layoutir-1.0.0/src/layoutir/exporters/base.py +27 -0
  16. layoutir-1.0.0/src/layoutir/exporters/markdown_exporter.py +148 -0
  17. layoutir-1.0.0/src/layoutir/exporters/parquet_exporter.py +154 -0
  18. layoutir-1.0.0/src/layoutir/exporters/text_exporter.py +63 -0
  19. layoutir-1.0.0/src/layoutir/extraction/__init__.py +17 -0
  20. layoutir-1.0.0/src/layoutir/extraction/docling_extractor.py +320 -0
  21. layoutir-1.0.0/src/layoutir/normalization/__init__.py +5 -0
  22. layoutir-1.0.0/src/layoutir/normalization/normalizer.py +332 -0
  23. layoutir-1.0.0/src/layoutir/pipeline.py +263 -0
  24. layoutir-1.0.0/src/layoutir/schema.py +153 -0
  25. layoutir-1.0.0/src/layoutir/utils/__init__.py +26 -0
  26. layoutir-1.0.0/src/layoutir/utils/hashing.py +188 -0
  27. layoutir-1.0.0/src/layoutir/utils/logging_config.py +125 -0
  28. layoutir-1.0.0/src/layoutir.egg-info/PKG-INFO +314 -0
  29. layoutir-1.0.0/src/layoutir.egg-info/SOURCES.txt +31 -0
  30. layoutir-1.0.0/src/layoutir.egg-info/dependency_links.txt +1 -0
  31. layoutir-1.0.0/src/layoutir.egg-info/entry_points.txt +2 -0
  32. layoutir-1.0.0/src/layoutir.egg-info/requires.txt +11 -0
  33. layoutir-1.0.0/src/layoutir.egg-info/top_level.txt +1 -0
layoutir-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Rahul Patnaik
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,314 @@
1
+ Metadata-Version: 2.4
2
+ Name: layoutir
3
+ Version: 1.0.0
4
+ Summary: Production-grade Document Ingestion & Canonicalization Engine
5
+ Author-email: Rahul Patnaik <rahulpatnaik@example.com>
6
+ Maintainer-email: Rahul Patnaik <rahulpatnaik@example.com>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/RahulPatnaik/layoutir
9
+ Project-URL: Documentation, https://github.com/RahulPatnaik/layoutir/blob/main/README.md
10
+ Project-URL: Repository, https://github.com/RahulPatnaik/layoutir
11
+ Keywords: pdf,document,parsing,ir,layout,extraction,chunking
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Requires-Python: >=3.12
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: pydantic>=2.0.0
22
+ Requires-Dist: docling>=1.0.0
23
+ Requires-Dist: torch>=2.0.0
24
+ Requires-Dist: pandas>=2.0.0
25
+ Requires-Dist: pyarrow>=10.0.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: black>=23.0; extra == "dev"
29
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
30
+ Requires-Dist: mypy>=1.0; extra == "dev"
31
+ Dynamic: license-file
32
+
33
+ # Document IR - Production Document Ingestion Engine
34
+
35
+ **An IR-first, extensible document compiler for AI systems.**
36
+
37
+ This is NOT a PDF-to-Markdown script. It is a production-grade document ingestion and canonicalization engine designed with a compiler-like architecture: Input → IR → Backends.
38
+
39
+ ## Architecture
40
+
41
+ ### Design Philosophy
42
+
43
+ Think like a compiler engineer:
44
+ - **Input Layer**: Format-specific parsers (currently PDF via Docling)
45
+ - **AST/IR**: Canonical intermediate representation with strict schema
46
+ - **Backends**: Multiple export formats (Markdown, Text, Parquet)
47
+
48
+ ### Layer Separation (Non-Negotiable)
49
+
50
+ ```mermaid
51
+ flowchart TD
52
+ A[Input Adapter Layer<br/>Format-specific parsing only]
53
+ B[Extraction Layer<br/>Extract raw structural elements]
54
+ C[Normalization Layer<br/>Convert to canonical IR with hashing]
55
+ D[Canonical IR Layer<br/>Typed schema, stable IDs, relationships]
56
+ E[Export Layer<br/>Markdown, Text, Parquet, Assets]
57
+
58
+ A --> B
59
+ B --> C
60
+ C --> D
61
+ D --> E
62
+ ```
63
+
64
+ ## Key Features
65
+
66
+ ### ✅ Deterministic & Idempotent
67
+ - Hash-based stable IDs (document, block, table, image, chunk)
68
+ - Running the pipeline twice produces identical output
69
+ - No UUIDs, no randomness
70
+
71
+ ### ✅ Canonical IR Schema
72
+ ```python
73
+ Document
74
+ ├── document_id: str (hash-based)
75
+ ├── schema_version: str
76
+ ├── parser_version: str
77
+ ├── metadata: DocumentMetadata
78
+ ├── blocks: List[Block]
79
+ │ ├── block_id: str (deterministic)
80
+ │ ├── type: BlockType (heading, paragraph, table, image, etc.)
81
+ │ ├── content: str
82
+ │ ├── page_number: int
83
+ │ ├── bbox: BoundingBox
84
+ │ └── metadata: dict
85
+ └── relationships: List[Relationship]
86
+ ```
87
+
88
+ ### ✅ Pluggable Chunking
89
+ - `SemanticSectionChunker`: Section-based (headings)
90
+ - `TokenWindowChunker`: Fixed token windows with overlap
91
+ - `LayoutAwareChunker`: Layout-aware (stub)
92
+
93
+ All chunking operates on IR, not raw text.
94
+
95
+ ### ✅ Multiple Export Formats
96
+ - **Markdown**: Human-readable with formatting
97
+ - **Plain Text**: Simple text extraction
98
+ - **Parquet**: Efficient structured storage for tables/blocks
99
+ - **Assets**: Extracted images (PNG) and tables (CSV)
100
+
101
+ ### ✅ Structured Output
102
+ ```
103
+ /<document_id>/
104
+ manifest.json # Processing metadata
105
+ ir.json # Canonical IR
106
+ chunks.json # Chunk definitions
107
+ /assets/
108
+ /images/ # Extracted images
109
+ /tables/ # Tables as CSV
110
+ /exports/
111
+ /markdown/ # Markdown output
112
+ /text/ # Plain text output
113
+ /parquet/ # Parquet datasets
114
+ /logs/ # Processing logs
115
+ ```
116
+
117
+ ## Installation
118
+
119
+ ```bash
120
+ # Install from PyPI
121
+ pip install layoutir
122
+
123
+ # Or install from source
124
+ git clone https://github.com/RahulPatnaik/layoutir.git
125
+ cd layoutir
126
+ pip install -e .
127
+ ```
128
+
129
+ ## Usage
130
+
131
+ ### Basic Usage
132
+
133
+ ```bash
134
+ # Using the CLI
135
+ layoutir --input file.pdf --output ./out
136
+
137
+ # Or using Python directly
138
+ python -m layoutir.cli --input file.pdf --output ./out
139
+ ```
140
+
141
+ ### Advanced Options
142
+
143
+ ```bash
144
+ # Semantic chunking (default)
145
+ layoutir --input file.pdf --output ./out --chunk-strategy semantic
146
+
147
+ # Token-based chunking with custom size
148
+ layoutir --input file.pdf --output ./out \
149
+ --chunk-strategy token \
150
+ --chunk-size 1024 \
151
+ --chunk-overlap 128
152
+
153
+ # Enable GPU acceleration
154
+ layoutir --input file.pdf --output ./out --use-gpu
155
+
156
+ # Debug mode with structured logging
157
+ layoutir --input file.pdf --output ./out \
158
+ --log-level DEBUG \
159
+ --structured-logs
160
+ ```
161
+
162
+ ### Python API
163
+
164
+ ```python
165
+ from pathlib import Path
166
+ from layoutir import Pipeline
167
+ from layoutir.adapters import DoclingAdapter
168
+ from layoutir.chunking import SemanticSectionChunker
169
+
170
+ # Create pipeline
171
+ adapter = DoclingAdapter(use_gpu=True)
172
+ chunker = SemanticSectionChunker(max_heading_level=2)
173
+ pipeline = Pipeline(adapter=adapter, chunk_strategy=chunker)
174
+
175
+ # Process document
176
+ document = pipeline.process(
177
+ input_path=Path("document.pdf"),
178
+ output_dir=Path("./output")
179
+ )
180
+
181
+ # Access results
182
+ print(f"Extracted {len(document.blocks)} blocks")
183
+ print(f"Document ID: {document.document_id}")
184
+ ```
185
+
186
+ ## Project Structure
187
+
188
+ ```
189
+ src/layoutir/
190
+ ├── schema.py # Canonical IR schema (Pydantic)
191
+ ├── pipeline.py # Main orchestrator
192
+
193
+ ├── adapters/ # Input adapters
194
+ │ ├── base.py # Abstract interface
195
+ │ └── docling_adapter.py # PDF via Docling
196
+
197
+ ├── extraction/ # Raw element extraction
198
+ │ └── docling_extractor.py
199
+
200
+ ├── normalization/ # IR normalization
201
+ │ └── normalizer.py
202
+
203
+ ├── chunking/ # Chunking strategies
204
+ │ └── strategies.py
205
+
206
+ ├── exporters/ # Export backends
207
+ │ ├── markdown_exporter.py
208
+ │ ├── text_exporter.py
209
+ │ ├── parquet_exporter.py
210
+ │ └── asset_writer.py
211
+
212
+ └── utils/
213
+ ├── hashing.py # Deterministic ID generation
214
+ └── logging_config.py # Structured logging
215
+
216
+ ingest.py # CLI entrypoint
217
+ benchmark.py # Performance benchmark
218
+ test_pipeline.py # Integration test
219
+ ```
220
+
221
+ ## Design Constraints
222
+
223
+ ### ✅ What We DO
224
+ - Strict layer separation
225
+ - Deterministic processing
226
+ - Schema validation
227
+ - Pluggable strategies
228
+ - Observability/timing
229
+ - Efficient storage (Parquet)
230
+
231
+ ### ❌ What We DON'T DO
232
+ - Mix business logic into adapters
233
+ - Hardcode paths or configurations
234
+ - Use non-deterministic IDs (UUIDs)
235
+ - Combine IR and export logic
236
+ - Skip schema validation
237
+ - Load entire files into memory unnecessarily
238
+
239
+ ## Extensibility
240
+
241
+ ### Adding New Input Formats
242
+
243
+ 1. Implement the `InputAdapter` interface:
244
+ ```python
245
+ class DocxAdapter(InputAdapter):
246
+ def parse(self, file_path: Path) -> Any: ...
247
+ def supports_format(self, file_path: Path) -> bool: ...
248
+ def get_parser_version(self) -> str: ...
249
+ ```
250
+
251
+ 2. Implement the corresponding extractor
252
+ 3. Update the pipeline to use the new adapter
253
+
254
+ ### Adding New Chunk Strategies
255
+
256
+ ```python
257
+ class CustomChunker(ChunkStrategy):
258
+ def chunk(self, document: Document) -> List[Chunk]:
259
+ # Operate on IR blocks
260
+ ...
261
+ ```
262
+
263
+ ### Adding New Export Formats
264
+
265
+ ```python
266
+ class JsonExporter(Exporter):
267
+ def export(self, document: Document, output_dir: Path, chunks: List[Chunk]):
268
+ # Export from canonical IR
269
+ ...
270
+ ```
271
+
272
+ ## Performance
273
+
274
+ Designed to handle 200+ page PDFs efficiently:
275
+ - Streaming processing where possible
276
+ - Lazy loading of heavy dependencies
277
+ - GPU acceleration support
278
+ - Parallel export operations
279
+ - Efficient Parquet storage for tables
280
+
281
+ ## Observability
282
+
283
+ - Structured JSON logging
284
+ - Stage-level timing metrics
285
+ - Extraction statistics
286
+ - Deterministic output for debugging
287
+
288
+ ## Schema Versioning
289
+
290
+ Current schema version: `1.0.0`
291
+
292
+ Future schema changes will be tracked via semantic versioning:
293
+ - Major: Breaking changes to IR structure
294
+ - Minor: Backwards-compatible additions
295
+ - Patch: Bug fixes
296
+
297
+ ## Future Enhancements
298
+
299
+ - [ ] DOCX input adapter
300
+ - [ ] HTML input adapter
301
+ - [ ] Advanced layout-aware chunking
302
+ - [ ] Parallel page processing
303
+ - [ ] Incremental updates (only reprocess changed pages)
304
+ - [ ] Vector embeddings export
305
+ - [ ] OCR fallback for scanned PDFs
306
+
307
+ ## License
308
+
309
+ Released under the MIT License; see the `LICENSE` file in the project root.
310
+
311
+ ## Contributing
312
+
313
+ This is a research/prototype-phase project. See the main project README for contribution guidelines.
314
+ # layoutir
@@ -0,0 +1,282 @@
1
+ # Document IR - Production Document Ingestion Engine
2
+
3
+ **An IR-first, extensible document compiler for AI systems.**
4
+
5
+ This is NOT a PDF-to-Markdown script. It is a production-grade document ingestion and canonicalization engine designed with a compiler-like architecture: Input → IR → Backends.
6
+
7
+ ## Architecture
8
+
9
+ ### Design Philosophy
10
+
11
+ Think like a compiler engineer:
12
+ - **Input Layer**: Format-specific parsers (currently PDF via Docling)
13
+ - **AST/IR**: Canonical intermediate representation with strict schema
14
+ - **Backends**: Multiple export formats (Markdown, Text, Parquet)
15
+
16
+ ### Layer Separation (Non-Negotiable)
17
+
18
+ ```mermaid
19
+ flowchart TD
20
+ A[Input Adapter Layer<br/>Format-specific parsing only]
21
+ B[Extraction Layer<br/>Extract raw structural elements]
22
+ C[Normalization Layer<br/>Convert to canonical IR with hashing]
23
+ D[Canonical IR Layer<br/>Typed schema, stable IDs, relationships]
24
+ E[Export Layer<br/>Markdown, Text, Parquet, Assets]
25
+
26
+ A --> B
27
+ B --> C
28
+ C --> D
29
+ D --> E
30
+ ```
31
+
32
+ ## Key Features
33
+
34
+ ### ✅ Deterministic & Idempotent
35
+ - Hash-based stable IDs (document, block, table, image, chunk)
36
+ - Running the pipeline twice produces identical output
37
+ - No UUIDs, no randomness
38
+
39
+ ### ✅ Canonical IR Schema
40
+ ```python
41
+ Document
42
+ ├── document_id: str (hash-based)
43
+ ├── schema_version: str
44
+ ├── parser_version: str
45
+ ├── metadata: DocumentMetadata
46
+ ├── blocks: List[Block]
47
+ │ ├── block_id: str (deterministic)
48
+ │ ├── type: BlockType (heading, paragraph, table, image, etc.)
49
+ │ ├── content: str
50
+ │ ├── page_number: int
51
+ │ ├── bbox: BoundingBox
52
+ │ └── metadata: dict
53
+ └── relationships: List[Relationship]
54
+ ```
55
+
56
+ ### ✅ Pluggable Chunking
57
+ - `SemanticSectionChunker`: Section-based (headings)
58
+ - `TokenWindowChunker`: Fixed token windows with overlap
59
+ - `LayoutAwareChunker`: Layout-aware (stub)
60
+
61
+ All chunking operates on IR, not raw text.
62
+
63
+ ### ✅ Multiple Export Formats
64
+ - **Markdown**: Human-readable with formatting
65
+ - **Plain Text**: Simple text extraction
66
+ - **Parquet**: Efficient structured storage for tables/blocks
67
+ - **Assets**: Extracted images (PNG) and tables (CSV)
68
+
69
+ ### ✅ Structured Output
70
+ ```
71
+ /<document_id>/
72
+ manifest.json # Processing metadata
73
+ ir.json # Canonical IR
74
+ chunks.json # Chunk definitions
75
+ /assets/
76
+ /images/ # Extracted images
77
+ /tables/ # Tables as CSV
78
+ /exports/
79
+ /markdown/ # Markdown output
80
+ /text/ # Plain text output
81
+ /parquet/ # Parquet datasets
82
+ /logs/ # Processing logs
83
+ ```
84
+
85
+ ## Installation
86
+
87
+ ```bash
88
+ # Install from PyPI
89
+ pip install layoutir
90
+
91
+ # Or install from source
92
+ git clone https://github.com/RahulPatnaik/layoutir.git
93
+ cd layoutir
94
+ pip install -e .
95
+ ```
96
+
97
+ ## Usage
98
+
99
+ ### Basic Usage
100
+
101
+ ```bash
102
+ # Using the CLI
103
+ layoutir --input file.pdf --output ./out
104
+
105
+ # Or using Python directly
106
+ python -m layoutir.cli --input file.pdf --output ./out
107
+ ```
108
+
109
+ ### Advanced Options
110
+
111
+ ```bash
112
+ # Semantic chunking (default)
113
+ layoutir --input file.pdf --output ./out --chunk-strategy semantic
114
+
115
+ # Token-based chunking with custom size
116
+ layoutir --input file.pdf --output ./out \
117
+ --chunk-strategy token \
118
+ --chunk-size 1024 \
119
+ --chunk-overlap 128
120
+
121
+ # Enable GPU acceleration
122
+ layoutir --input file.pdf --output ./out --use-gpu
123
+
124
+ # Debug mode with structured logging
125
+ layoutir --input file.pdf --output ./out \
126
+ --log-level DEBUG \
127
+ --structured-logs
128
+ ```
129
+
130
+ ### Python API
131
+
132
+ ```python
133
+ from pathlib import Path
134
+ from layoutir import Pipeline
135
+ from layoutir.adapters import DoclingAdapter
136
+ from layoutir.chunking import SemanticSectionChunker
137
+
138
+ # Create pipeline
139
+ adapter = DoclingAdapter(use_gpu=True)
140
+ chunker = SemanticSectionChunker(max_heading_level=2)
141
+ pipeline = Pipeline(adapter=adapter, chunk_strategy=chunker)
142
+
143
+ # Process document
144
+ document = pipeline.process(
145
+ input_path=Path("document.pdf"),
146
+ output_dir=Path("./output")
147
+ )
148
+
149
+ # Access results
150
+ print(f"Extracted {len(document.blocks)} blocks")
151
+ print(f"Document ID: {document.document_id}")
152
+ ```
153
+
154
+ ## Project Structure
155
+
156
+ ```
157
+ src/layoutir/
158
+ ├── schema.py # Canonical IR schema (Pydantic)
159
+ ├── pipeline.py # Main orchestrator
160
+
161
+ ├── adapters/ # Input adapters
162
+ │ ├── base.py # Abstract interface
163
+ │ └── docling_adapter.py # PDF via Docling
164
+
165
+ ├── extraction/ # Raw element extraction
166
+ │ └── docling_extractor.py
167
+
168
+ ├── normalization/ # IR normalization
169
+ │ └── normalizer.py
170
+
171
+ ├── chunking/ # Chunking strategies
172
+ │ └── strategies.py
173
+
174
+ ├── exporters/ # Export backends
175
+ │ ├── markdown_exporter.py
176
+ │ ├── text_exporter.py
177
+ │ ├── parquet_exporter.py
178
+ │ └── asset_writer.py
179
+
180
+ └── utils/
181
+ ├── hashing.py # Deterministic ID generation
182
+ └── logging_config.py # Structured logging
183
+
184
+ ingest.py # CLI entrypoint
185
+ benchmark.py # Performance benchmark
186
+ test_pipeline.py # Integration test
187
+ ```
188
+
189
+ ## Design Constraints
190
+
191
+ ### ✅ What We DO
192
+ - Strict layer separation
193
+ - Deterministic processing
194
+ - Schema validation
195
+ - Pluggable strategies
196
+ - Observability/timing
197
+ - Efficient storage (Parquet)
198
+
199
+ ### ❌ What We DON'T DO
200
+ - Mix business logic into adapters
201
+ - Hardcode paths or configurations
202
+ - Use non-deterministic IDs (UUIDs)
203
+ - Combine IR and export logic
204
+ - Skip schema validation
205
+ - Load entire files into memory unnecessarily
206
+
207
+ ## Extensibility
208
+
209
+ ### Adding New Input Formats
210
+
211
+ 1. Implement the `InputAdapter` interface:
212
+ ```python
213
+ class DocxAdapter(InputAdapter):
214
+ def parse(self, file_path: Path) -> Any: ...
215
+ def supports_format(self, file_path: Path) -> bool: ...
216
+ def get_parser_version(self) -> str: ...
217
+ ```
218
+
219
+ 2. Implement the corresponding extractor
220
+ 3. Update the pipeline to use the new adapter
221
+
222
+ ### Adding New Chunk Strategies
223
+
224
+ ```python
225
+ class CustomChunker(ChunkStrategy):
226
+ def chunk(self, document: Document) -> List[Chunk]:
227
+ # Operate on IR blocks
228
+ ...
229
+ ```
230
+
231
+ ### Adding New Export Formats
232
+
233
+ ```python
234
+ class JsonExporter(Exporter):
235
+ def export(self, document: Document, output_dir: Path, chunks: List[Chunk]):
236
+ # Export from canonical IR
237
+ ...
238
+ ```
239
+
240
+ ## Performance
241
+
242
+ Designed to handle 200+ page PDFs efficiently:
243
+ - Streaming processing where possible
244
+ - Lazy loading of heavy dependencies
245
+ - GPU acceleration support
246
+ - Parallel export operations
247
+ - Efficient Parquet storage for tables
248
+
249
+ ## Observability
250
+
251
+ - Structured JSON logging
252
+ - Stage-level timing metrics
253
+ - Extraction statistics
254
+ - Deterministic output for debugging
255
+
256
+ ## Schema Versioning
257
+
258
+ Current schema version: `1.0.0`
259
+
260
+ Future schema changes will be tracked via semantic versioning:
261
+ - Major: Breaking changes to IR structure
262
+ - Minor: Backwards-compatible additions
263
+ - Patch: Bug fixes
264
+
265
+ ## Future Enhancements
266
+
267
+ - [ ] DOCX input adapter
268
+ - [ ] HTML input adapter
269
+ - [ ] Advanced layout-aware chunking
270
+ - [ ] Parallel page processing
271
+ - [ ] Incremental updates (only reprocess changed pages)
272
+ - [ ] Vector embeddings export
273
+ - [ ] OCR fallback for scanned PDFs
274
+
275
+ ## License
276
+
277
+ Released under the MIT License; see the `LICENSE` file in the project root.
278
+
279
+ ## Contributing
280
+
281
+ This is a research/prototype-phase project. See the main project README for contribution guidelines.
282
+ # layoutir
@@ -0,0 +1,72 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "layoutir"
7
+ version = "1.0.0"
8
+ description = "Production-grade Document Ingestion & Canonicalization Engine"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = "MIT"
12
+ authors = [
13
+ {name = "Rahul Patnaik", email = "rahulpatnaik@example.com"}
14
+ ]
15
+ maintainers = [
16
+ {name = "Rahul Patnaik", email = "rahulpatnaik@example.com"}
17
+ ]
18
+ keywords = ["pdf", "document", "parsing", "ir", "layout", "extraction", "chunking"]
19
+ classifiers = [
20
+ "Development Status :: 4 - Beta",
21
+ "Intended Audience :: Developers",
22
+ "Topic :: Software Development :: Libraries :: Python Modules",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ ]
27
+
28
+ dependencies = [
29
+ "pydantic>=2.0.0",
30
+ "docling>=1.0.0",
31
+ "torch>=2.0.0",
32
+ "pandas>=2.0.0",
33
+ "pyarrow>=10.0.0",
34
+ ]
35
+
36
+ [project.optional-dependencies]
37
+ dev = [
38
+ "pytest>=7.0",
39
+ "black>=23.0",
40
+ "ruff>=0.1.0",
41
+ "mypy>=1.0",
42
+ ]
43
+
44
+ [project.scripts]
45
+ layoutir = "layoutir.cli:main"
46
+
47
+ [project.urls]
48
+ Homepage = "https://github.com/RahulPatnaik/layoutir"
49
+ Documentation = "https://github.com/RahulPatnaik/layoutir/blob/main/README.md"
50
+ Repository = "https://github.com/RahulPatnaik/layoutir"
51
+
52
+ [tool.setuptools]
53
+ package-dir = {"" = "src"}
54
+
55
+ [tool.setuptools.packages.find]
56
+ where = ["src"]
57
+ include = ["layoutir*"]
58
+ exclude = ["tests*"]
59
+
60
+ [tool.black]
61
+ line-length = 100
62
+ target-version = ['py312']
63
+
64
+ [tool.ruff]
65
+ line-length = 100
66
+ target-version = "py312"
67
+
68
+ [tool.mypy]
69
+ python_version = "3.12"
70
+ warn_return_any = true
71
+ warn_unused_configs = true
72
+ disallow_untyped_defs = false
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+