document-extraction-tools 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of document-extraction-tools might be problematic. Click here for more details.

Files changed (47) hide show
  1. document_extraction_tools-0.1.0/PKG-INFO +533 -0
  2. document_extraction_tools-0.1.0/README.md +506 -0
  3. document_extraction_tools-0.1.0/pyproject.toml +115 -0
  4. document_extraction_tools-0.1.0/src/document_extraction_tools/__init__.py +0 -0
  5. document_extraction_tools-0.1.0/src/document_extraction_tools/base/__init__.py +27 -0
  6. document_extraction_tools-0.1.0/src/document_extraction_tools/base/converter/__init__.py +0 -0
  7. document_extraction_tools-0.1.0/src/document_extraction_tools/base/converter/base_converter.py +40 -0
  8. document_extraction_tools-0.1.0/src/document_extraction_tools/base/evaluator/__init__.py +0 -0
  9. document_extraction_tools-0.1.0/src/document_extraction_tools/base/evaluator/base_evaluator.py +40 -0
  10. document_extraction_tools-0.1.0/src/document_extraction_tools/base/exporter/__init__.py +0 -0
  11. document_extraction_tools-0.1.0/src/document_extraction_tools/base/exporter/base_evaluation_exporter.py +43 -0
  12. document_extraction_tools-0.1.0/src/document_extraction_tools/base/exporter/base_extraction_exporter.py +41 -0
  13. document_extraction_tools-0.1.0/src/document_extraction_tools/base/extractor/__init__.py +0 -0
  14. document_extraction_tools-0.1.0/src/document_extraction_tools/base/extractor/base_extractor.py +41 -0
  15. document_extraction_tools-0.1.0/src/document_extraction_tools/base/file_lister/__init__.py +0 -0
  16. document_extraction_tools-0.1.0/src/document_extraction_tools/base/file_lister/base_file_lister.py +37 -0
  17. document_extraction_tools-0.1.0/src/document_extraction_tools/base/reader/__init__.py +0 -0
  18. document_extraction_tools-0.1.0/src/document_extraction_tools/base/reader/base_reader.py +36 -0
  19. document_extraction_tools-0.1.0/src/document_extraction_tools/base/test_data_loader/__init__.py +0 -0
  20. document_extraction_tools-0.1.0/src/document_extraction_tools/base/test_data_loader/base_test_data_loader.py +44 -0
  21. document_extraction_tools-0.1.0/src/document_extraction_tools/config/__init__.py +51 -0
  22. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_converter_config.py +14 -0
  23. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_evaluation_exporter_config.py +14 -0
  24. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_evaluator_config.py +14 -0
  25. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_extraction_exporter_config.py +14 -0
  26. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_extractor_config.py +14 -0
  27. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_file_lister_config.py +14 -0
  28. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_reader_config.py +14 -0
  29. document_extraction_tools-0.1.0/src/document_extraction_tools/config/base_test_data_loader_config.py +14 -0
  30. document_extraction_tools-0.1.0/src/document_extraction_tools/config/config_loader.py +201 -0
  31. document_extraction_tools-0.1.0/src/document_extraction_tools/config/evaluation_orchestrator_config.py +20 -0
  32. document_extraction_tools-0.1.0/src/document_extraction_tools/config/evaluation_pipeline_config.py +32 -0
  33. document_extraction_tools-0.1.0/src/document_extraction_tools/config/extraction_orchestrator_config.py +20 -0
  34. document_extraction_tools-0.1.0/src/document_extraction_tools/config/extraction_pipeline_config.py +30 -0
  35. document_extraction_tools-0.1.0/src/document_extraction_tools/py.typed +0 -0
  36. document_extraction_tools-0.1.0/src/document_extraction_tools/runners/__init__.py +10 -0
  37. document_extraction_tools-0.1.0/src/document_extraction_tools/runners/evaluation/__init__.py +0 -0
  38. document_extraction_tools-0.1.0/src/document_extraction_tools/runners/evaluation/evaluation_orchestrator.py +260 -0
  39. document_extraction_tools-0.1.0/src/document_extraction_tools/runners/extraction/__init__.py +0 -0
  40. document_extraction_tools-0.1.0/src/document_extraction_tools/runners/extraction/extraction_orchestrator.py +202 -0
  41. document_extraction_tools-0.1.0/src/document_extraction_tools/types/__init__.py +20 -0
  42. document_extraction_tools-0.1.0/src/document_extraction_tools/types/document.py +79 -0
  43. document_extraction_tools-0.1.0/src/document_extraction_tools/types/document_bytes.py +27 -0
  44. document_extraction_tools-0.1.0/src/document_extraction_tools/types/evaluation_example.py +21 -0
  45. document_extraction_tools-0.1.0/src/document_extraction_tools/types/evaluation_result.py +16 -0
  46. document_extraction_tools-0.1.0/src/document_extraction_tools/types/path_identifier.py +16 -0
  47. document_extraction_tools-0.1.0/src/document_extraction_tools/types/schema.py +7 -0
@@ -0,0 +1,533 @@
1
+ Metadata-Version: 2.4
2
+ Name: document-extraction-tools
3
+ Version: 0.1.0
4
+ Summary: A modular, high-performance toolkit for extracting structured data from documents.
5
+ Keywords: document,extraction,pdf,ocr
6
+ Author: Ollie Kemp, Nikolas Moatsos
7
+ Author-email: Ollie Kemp <oliver.kemp@artefact.com>, Nikolas Moatsos <nikolas.moatsos@artefact.com>
8
+ License-Expression: MIT
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Requires-Dist: pydantic>=2.0.0
15
+ Requires-Dist: pyyaml>=6.0.3
16
+ Requires-Dist: numpy>=2.4.1
17
+ Requires-Dist: pillow>=12.1.0
18
+ Requires-Dist: pytest>=8.0,<9.0 ; extra == 'dev'
19
+ Requires-Dist: pytest-asyncio>=1.3.0 ; extra == 'dev'
20
+ Requires-Dist: pre-commit>=3.3,<4.0 ; extra == 'dev'
21
+ Requires-Python: >=3.12
22
+ Project-URL: Homepage, https://github.com/artefactory-uk/document-extraction-tools
23
+ Project-URL: Repository, https://github.com/artefactory-uk/document-extraction-tools
24
+ Project-URL: Issues, https://github.com/artefactory-uk/document-extraction-tools/issues
25
+ Provides-Extra: dev
26
+ Description-Content-Type: text/markdown
27
+
28
+ # document-extraction-tools
29
+
30
+ A modular, high-performance toolkit for building document extraction pipelines. The library provides clear interfaces for every pipeline stage, plus orchestrators that wire the stages together with async I/O and CPU-bound parallelism.
31
+
32
+ This repo is intentionally implementation-light: you plug in your own components (readers, converters, extractors, exporters, evaluators) for each specific document type or data source.
33
+
34
+ ## Table of Contents
35
+
36
+ - [document-extraction-tools](#document-extraction-tools)
37
+ - [Table of Contents](#table-of-contents)
38
+ - [Project layout](#project-layout)
39
+ - [What this library gives you](#what-this-library-gives-you)
40
+ - [Core concepts and components](#core-concepts-and-components)
41
+ - [Data models](#data-models)
42
+ - [Extraction pipeline](#extraction-pipeline)
43
+ - [Evaluation pipeline](#evaluation-pipeline)
44
+ - [Configuration](#configuration)
45
+ - [How to implement an extraction pipeline](#how-to-implement-an-extraction-pipeline)
46
+ - [1) Define your extraction schema](#1-define-your-extraction-schema)
47
+ - [2) Implement pipeline components](#2-implement-pipeline-components)
48
+ - [3) Create configuration models and YAML files](#3-create-configuration-models-and-yaml-files)
49
+ - [4) Load config and run the pipeline](#4-load-config-and-run-the-pipeline)
50
+ - [How to implement an evaluation pipeline](#how-to-implement-an-evaluation-pipeline)
51
+ - [1) Implement evaluation pipeline components](#1-implement-evaluation-pipeline-components)
52
+ - [2) Create configuration models and YAML files](#2-create-configuration-models-and-yaml-files)
53
+ - [3) Load config and run the pipeline](#3-load-config-and-run-the-pipeline)
54
+ - [Concurrency model](#concurrency-model)
55
+ - [Development](#development)
56
+ - [Releasing](#releasing)
57
+ - [Contributing](#contributing)
58
+
59
+ ## Project layout
60
+
61
+ ```bash
62
+ .
63
+ ├── src
64
+ │ └── document_extraction_tools
65
+ │ ├── base # abstract base classes you implement
66
+ │ │ ├── converter # conversion interface definitions
67
+ │ │ ├── evaluator # evaluation interface definitions
68
+ │ │ ├── exporter # export interface definitions
69
+ │ │ ├── extractor # extraction interface definitions
70
+ │ │ ├── file_lister # file discovery interface definitions
71
+ │ │ ├── reader # document read interface definitions
72
+ │ │ └── test_data_loader # evaluation dataset loader interfaces
73
+ │ ├── config # Pydantic configs + YAML loader helpers
74
+ │ ├── runners # orchestrators that run pipelines
75
+ │ │ ├── evaluation # evaluation pipeline orchestration
76
+ │ │ └── extraction # extraction pipeline orchestration
77
+ │ ├── types # shared models/types used across modules
78
+ │ └── py.typed
79
+ ├── tests
80
+ ├── pull_request_template.md
81
+ ├── pyproject.toml
82
+ ├── README.md
83
+ └── uv.lock
84
+ ```
85
+
86
+ ## What this library gives you
87
+
88
+ - A consistent set of **interfaces** for the entire document-extraction lifecycle.
89
+ - A **typed data model** for documents, pages, and extraction results.
90
+ - **Orchestrators** that run extraction and evaluation pipelines concurrently and safely.
91
+ - A **configuration system** (Pydantic + YAML) for repeatable pipelines.
92
+
93
+ ## Core concepts and components
94
+
95
+ ### Data models
96
+
97
+ - `PathIdentifier`: A uniform handle for file locations plus optional context.
98
+ - `DocumentBytes`: Raw bytes + MIME type + path identifier.
99
+ - `Document`: Parsed content (pages, text/image data, metadata).
100
+ - `ExtractionSchema`: Your Pydantic model (the target output).
101
+ - `EvaluationExample`: (path, ground truth) pair for evaluation runs.
102
+ - `EvaluationResult`: Name + result + description for evaluation metrics.
103
+
104
+ ### Extraction pipeline
105
+
106
+ 1. **FileLister** (`BaseFileLister`)
107
+ - Discovers input files and returns a list of `PathIdentifier` objects.
108
+
109
+ 2. **Reader** (`BaseReader`)
110
+ - Reads raw bytes from the source and returns `DocumentBytes`.
111
+
112
+ 3. **Converter** (`BaseConverter`)
113
+ - Converts raw bytes into a structured `Document` (pages, metadata, content type).
114
+
115
+ 4. **Extractor** (`BaseExtractor`)
116
+ - Asynchronously extracts structured data into a Pydantic schema (`ExtractionSchema`).
117
+
118
+ 5. **ExtractionExporter** (`BaseExtractionExporter`)
119
+ - Asynchronously persists extracted data to your desired destination (DB, files, API, etc.).
120
+
121
+ 6. **ExtractionOrchestrator**
122
+ - Runs the pipeline with a thread pool for CPU-bound steps (read/convert) and async
123
+ concurrency for I/O-bound steps (extract/export).
124
+
125
+ ### Evaluation pipeline
126
+
127
+ 1. **TestDataLoader** (`BaseTestDataLoader`)
128
+ - Loads evaluation examples (ground truth + file path) as `EvaluationExample`.
129
+
130
+ 2. **Evaluator** (`BaseEvaluator`)
131
+ - Computes a metric by comparing `true` vs. `pred` schemas.
132
+
133
+ 3. **EvaluationExporter** (`BaseEvaluationExporter`)
134
+ - Persists evaluation results.
135
+
136
+ 4. **EvaluationOrchestrator**
137
+ - Runs extraction + evaluation across examples with the same concurrency model
138
+ (thread pool + async I/O).
139
+
140
+ ### Configuration
141
+
142
+ Each component has a matching **base config class** (Pydantic model) that defines a
143
+ default YAML filename and acts as the parent for your own config fields. You’ll subclass
144
+ these to add settings specific to your implementation.
145
+
146
+ Extraction config base classes:
147
+
148
+ - `BaseFileListerConfig`
149
+ - `BaseReaderConfig`
150
+ - `BaseConverterConfig`
151
+ - `BaseExtractorConfig`
152
+ - `BaseExtractionExporterConfig`
153
+ - `ExtractionOrchestratorConfig` (you can use as-is; no need to subclass)
154
+
155
+ Evaluation specific config base classes:
156
+
157
+ - `BaseTestDataLoaderConfig`
158
+ - `BaseEvaluatorConfig`
159
+ - `BaseEvaluationExporterConfig`
160
+ - `EvaluationOrchestratorConfig` (you can use as-is; no need to subclass)
161
+
162
+ ## How to implement an extraction pipeline
163
+
164
+ For a full worked example including evaluation, please see [the document-extraction-examples](https://github.com/artefactory-uk/document-extraction-examples) repository. Below we outline the steps for a successful implementation.
165
+
166
+ ### 1) Define your extraction schema
167
+
168
+ Create a Pydantic model that represents the structured data you want out of each document.
169
+
170
+ Example implementation:
171
+
172
+ ```python
173
+ from pydantic import BaseModel, Field
174
+
175
+ class InvoiceSchema(BaseModel):
176
+ invoice_id: str = Field(..., description="Unique invoice identifier.")
177
+ vendor: str = Field(..., description="Vendor or issuer name.")
178
+ total: float = Field(..., description="Total invoice amount.")
179
+ ```
180
+
181
+ ### 2) Implement pipeline components
182
+
183
+ Subclass the base interfaces and implement the required methods.
184
+
185
+ Example implementations:
186
+
187
+ ```python
188
+ from document_extraction_tools.base import (
189
+ BaseFileLister,
190
+ BaseReader,
191
+ BaseConverter,
192
+ BaseExtractor,
193
+ BaseExtractionExporter,
194
+ )
195
+ from document_extraction_tools.types import Document, DocumentBytes, PathIdentifier
196
+ from document_extraction_tools.config import (
197
+ BaseFileListerConfig,
198
+ BaseReaderConfig,
199
+ BaseConverterConfig,
200
+ BaseExtractorConfig,
201
+ BaseExtractionExporterConfig,
202
+ )
203
+
204
+ class MyFileLister(BaseFileLister):
205
+ def __init__(self, config: BaseFileListerConfig) -> None:
206
+ super().__init__(config)
207
+
208
+ def list_files(self) -> list[PathIdentifier]:
209
+ # Discover and return file identifiers
210
+ ...
211
+
212
+
213
+ class MyReader(BaseReader):
214
+ def __init__(self, config: BaseReaderConfig) -> None:
215
+ super().__init__(config)
216
+
217
+ def read(self, path_identifier: PathIdentifier) -> DocumentBytes:
218
+ # Read file bytes from disk, object storage, etc.
219
+ ...
220
+
221
+
222
+ class MyConverter(BaseConverter):
223
+ def __init__(self, config: BaseConverterConfig) -> None:
224
+ super().__init__(config)
225
+
226
+ def convert(self, document_bytes: DocumentBytes) -> Document:
227
+ # Parse PDF, OCR, etc. and return a Document
228
+ ...
229
+
230
+
231
+ class MyExtractor(BaseExtractor):
232
+ def __init__(self, config: BaseExtractorConfig) -> None:
233
+ super().__init__(config)
234
+
235
+ async def extract(self, document: Document, schema: type[InvoiceSchema]) -> InvoiceSchema:
236
+ # Call LLM or rules-based system
237
+ ...
238
+
239
+
240
+ class MyExtractionExporter(BaseExtractionExporter):
241
+ def __init__(self, config: BaseExtractionExporterConfig) -> None:
242
+ super().__init__(config)
243
+
244
+ async def export(self, document: Document, data: InvoiceSchema) -> None:
245
+ # Persist data to DB, filesystem, etc.
246
+ ...
247
+ ```
248
+
249
+ ### 3) Create configuration models and YAML files
250
+
251
+ Each component has a base config class with a default filename (e.g. `extractor.yaml`).
252
+ Subclass the config models to add your own fields, then provide YAML files in
253
+ the directory you pass as `config_dir` to `load_config` (default is
254
+ `config/yaml/`).
255
+
256
+ Default filenames:
257
+
258
+ - `extraction_orchestrator.yaml`
259
+ - `file_lister.yaml`
260
+ - `reader.yaml`
261
+ - `converter.yaml`
262
+ - `extractor.yaml`
263
+ - `extraction_exporter.yaml`
264
+
265
+ Example config model:
266
+
267
+ ```python
268
+ from document_extraction_tools.config import BaseExtractorConfig
269
+
270
+ class MyExtractorConfig(BaseExtractorConfig):
271
+ model_name: str
272
+ ```
273
+
274
+ Example YAML (`config/yaml/extractor.yaml`):
275
+
276
+ ```yaml
277
+ # add fields your Extractor config defines
278
+ model_name: "gemini-3-flash-preview"
279
+ ```
280
+
281
+ ### 4) Load config and run the pipeline
282
+
283
+ Example usage:
284
+
285
+ ```python
286
+ import asyncio
+ from pathlib import Path
287
+ from document_extraction_tools.config import load_config
288
+ from document_extraction_tools.runners import ExtractionOrchestrator
289
+ from document_extraction_tools.config import ExtractionOrchestratorConfig
290
+
291
+ config = load_config(
292
+ lister_config_cls=MyFileListerConfig,
293
+ reader_config_cls=MyReaderConfig,
294
+ converter_config_cls=MyConverterConfig,
295
+ extractor_config_cls=MyExtractorConfig,
296
+ exporter_config_cls=MyExtractionExporterConfig,
297
+ orchestrator_config_cls=ExtractionOrchestratorConfig,
298
+ config_dir=Path("config/yaml"),
299
+ )
300
+
301
+ orchestrator = ExtractionOrchestrator.from_config(
302
+ config=config,
303
+ schema=InvoiceSchema,
304
+ reader_cls=MyReader,
305
+ converter_cls=MyConverter,
306
+ extractor_cls=MyExtractor,
307
+ exporter_cls=MyExtractionExporter,
308
+ )
309
+
310
+ file_lister = MyFileLister(config.file_lister)
311
+ file_paths = file_lister.list_files()
312
+
313
+ asyncio.run(orchestrator.run(file_paths))
314
+ ```
315
+
316
+ ## How to implement an evaluation pipeline
317
+
318
+ ### 1) Implement evaluation pipeline components
319
+
320
+ The evaluation pipeline reuses your reader/converter/extractor and adds three pieces:
321
+
322
+ 1. **TestDataLoader**: loads evaluation examples (file + ground truth)
323
+ 2. **Evaluator(s)**: compute metrics for each example
324
+ 3. **EvaluationExporter**: persist results
325
+
326
+ Example implementations:
327
+
328
+ ```python
329
+ from document_extraction_tools.base import (
330
+ BaseTestDataLoader,
331
+ BaseEvaluator,
332
+ BaseEvaluationExporter,
333
+ )
334
+ from document_extraction_tools.config import (
335
+ BaseTestDataLoaderConfig,
336
+ BaseEvaluatorConfig,
337
+ BaseEvaluationExporterConfig,
338
+ )
339
+ from document_extraction_tools.types import EvaluationExample, EvaluationResult, PathIdentifier
340
+
341
+
342
+ class MyTestDataLoader(BaseTestDataLoader[InvoiceSchema]):
343
+ def __init__(self, config: BaseTestDataLoaderConfig) -> None:
344
+ super().__init__(config)
345
+
346
+ def load_test_data(
347
+ self, path_identifier: PathIdentifier
348
+ ) -> list[EvaluationExample[InvoiceSchema]]:
349
+ # Load ground-truth + path pairs from disk/DB/etc.
350
+ ...
351
+
352
+
353
+ class MyEvaluator(BaseEvaluator[InvoiceSchema]):
354
+ def __init__(self, config: BaseEvaluatorConfig) -> None:
355
+ super().__init__(config)
356
+
357
+ def evaluate(
358
+ self, true: InvoiceSchema, pred: InvoiceSchema
359
+ ) -> EvaluationResult:
360
+ # Compare true vs pred and return a metric
361
+ ...
362
+
363
+
364
+ class MyEvaluationExporter(BaseEvaluationExporter):
365
+ def __init__(self, config: BaseEvaluationExporterConfig) -> None:
366
+ super().__init__(config)
367
+
368
+ async def export(
369
+ self, results: list[tuple[Document, list[EvaluationResult]]]
370
+ ) -> None:
371
+ # Persist evaluation results
372
+ ...
373
+ ```
374
+
375
+ ### 2) Create configuration models and YAML files
376
+
377
+ Implement your own config models by subclassing the base evaluation configs and adding any fields your components need.
378
+
379
+ Default YAML filenames for evaluation:
380
+
381
+ - `evaluation_orchestrator.yaml`
382
+ - `test_data_loader.yaml`
383
+ - `evaluator.yaml` (one top-level key per evaluator config class name)
384
+ - `reader.yaml`
385
+ - `converter.yaml`
386
+ - `extractor.yaml`
387
+ - `evaluation_exporter.yaml`
388
+
389
+ Warning: The top-level key in the YAML MUST match the evaluator configuration class name, and the evaluator configuration class name MUST be the name of the evaluator class with the suffix `Config`. For example:
390
+
391
+ ```python
392
+ class MyEvaluator(BaseEvaluator):
393
+ ...
394
+
395
+ class MyEvaluatorConfig(BaseEvaluatorConfig):
396
+ ...
397
+ ```
398
+
399
+ Example YAML (`config/yaml/evaluator.yaml`):
400
+
401
+ ```yaml
402
+ MyEvaluatorConfig:
403
+ # add fields your Evaluator config defines
404
+ threshold: 0.8
405
+ ```
406
+
407
+ ### 3) Load config and run the pipeline
408
+
409
+ Example usage:
410
+
411
+ ```python
412
+ import asyncio
+ from pathlib import Path
+ from document_extraction_tools.config import load_evaluation_config
413
+ from document_extraction_tools.runners import EvaluationOrchestrator
414
+ from document_extraction_tools.config import EvaluationOrchestratorConfig
415
+
416
+ config = load_evaluation_config(
417
+ test_data_loader_config_cls=MyTestDataLoaderConfig,
418
+ evaluator_config_classes=[MyEvaluatorConfig],
419
+ reader_config_cls=MyReaderConfig,
420
+ converter_config_cls=MyConverterConfig,
421
+ extractor_config_cls=MyExtractorConfig,
422
+ evaluation_exporter_config_cls=MyEvaluationExporterConfig,
423
+ orchestrator_config_cls=EvaluationOrchestratorConfig,
424
+ config_dir=Path("config/yaml"),
425
+ )
426
+
427
+ orchestrator = EvaluationOrchestrator.from_config(
428
+ config=config,
429
+ schema=InvoiceSchema,
430
+ reader_cls=MyReader,
431
+ converter_cls=MyConverter,
432
+ extractor_cls=MyExtractor,
433
+ test_data_loader_cls=MyTestDataLoader,
434
+ evaluator_classes=[MyEvaluator],
435
+ evaluation_exporter_cls=MyEvaluationExporter,
436
+ )
437
+
438
+ examples = MyTestDataLoader(config.test_data_loader).load_test_data(
439
+ PathIdentifier(path="/path/to/eval-set")
440
+ )
441
+
442
+ asyncio.run(orchestrator.run(examples))
443
+ ```
444
+
445
+ ## Concurrency model
446
+
447
+ - **Reader + Converter** run in a thread pool (CPU-bound work).
448
+ - **Extractor + Exporter** run concurrently in the event loop (I/O-bound work).
449
+ - Tuning options live in `extraction_orchestrator.yaml` and `evaluation_orchestrator.yaml`:
450
+ - `max_workers` (thread pool size)
451
+ - `max_concurrency` (async I/O semaphore limit)
452
+
453
+ ## Development
454
+
455
+ - Install dependencies: `uv sync`
456
+ - Run pre-commit: `uv run pre-commit run --all-files`
457
+ - Run tests: `uv run pytest`
458
+
459
+ ## Releasing
460
+
461
+ ### Test release (TestPyPI)
462
+
463
+ 1. Create a release branch and bump version:
464
+ ```bash
465
+ git checkout -b release/v0.2.0-rc1
466
+ uv version --bump rc
467
+ # Or manually: uv version 0.2.0-rc1
468
+ ```
469
+
470
+ 2. Commit and push the branch:
471
+ ```bash
472
+ VERSION=$(uv version --short)
473
+ git add pyproject.toml
474
+ git commit -m "Bump version to $VERSION"
475
+ git push -u origin release/v$VERSION
476
+ ```
477
+
478
+ 3. Create and merge a PR to main.
479
+
480
+ 4. Tag the merge commit and push:
481
+ ```bash
482
+ git checkout main && git pull
483
+ VERSION=$(uv version --short)
484
+ git tag "v$VERSION"
485
+ git push --tags
486
+ ```
487
+
488
+ 5. The `publish-test.yaml` workflow automatically publishes to TestPyPI.
489
+
490
+ 6. Verify installation:
491
+ ```bash
492
+ uv pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ document-extraction-tools
493
+ ```
494
+
495
+ ### Production release (PyPI)
496
+
497
+ 1. Create a release branch and bump version:
498
+ ```bash
499
+ git checkout -b release/v0.2.0
500
+ uv version --bump minor # or: major, minor, patch
501
+ ```
502
+
503
+ 2. Commit and push the branch:
504
+ ```bash
505
+ VERSION=$(uv version --short)
506
+ git add pyproject.toml
507
+ git commit -m "Bump version to $VERSION"
508
+ git push -u origin release/v$VERSION
509
+ ```
510
+
511
+ 3. Create and merge a PR to main.
512
+
513
+ 4. Tag the merge commit and create the release:
514
+ ```bash
515
+ git checkout main && git pull
516
+ VERSION=$(uv version --short)
517
+ git tag "v$VERSION"
518
+ git push --tags
519
+ gh release create "v$VERSION" --title "v$VERSION" --generate-notes
520
+ ```
521
+
522
+ 5. The `publish.yaml` workflow automatically builds, publishes to PyPI, and runs smoke tests.
523
+
524
+ ## Contributing
525
+
526
+ Contributions are welcome. Please:
527
+
528
+ - Report bugs or feature requests by opening an issue.
529
+ - Create a new branch using the following naming conventions: `feat/short-description`, `fix/short-description`, etc.
530
+ - Describe the change clearly in the PR description.
531
+ - Add or update tests in `tests/`.
532
+ - Run linting and tests before pushing: `uv run pre-commit run --all-files` and `uv run pytest`.
533
+ - If you open a PR, please notify the maintainers ([Ollie Kemp](https://github.com/ollie-artefact) or [Nikolas Moatsos](https://github.com/nmoatsos)).