extractor-api-lib 3.3.0__tar.gz → 3.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. extractor_api_lib-3.4.0/PKG-INFO +241 -0
  2. extractor_api_lib-3.4.0/README.md +180 -0
  3. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/pyproject.toml +14 -6
  4. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/api_endpoints/file_extractor.py +2 -0
  5. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/api_endpoints/source_extractor.py +2 -0
  6. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/apis/extractor_api_base.py +1 -0
  7. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/dependency_container.py +28 -5
  8. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/extractors/information_extractor.py +2 -1
  9. extractor_api_lib-3.4.0/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +152 -0
  10. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/confluence_extractor.py +1 -0
  11. extractor_api_lib-3.4.0/src/extractor_api_lib/impl/extractors/file_extractors/docling_extractor.py +233 -0
  12. extractor_api_lib-3.4.0/src/extractor_api_lib/impl/extractors/file_extractors/image_extractor.py +123 -0
  13. extractor_api_lib-3.4.0/src/extractor_api_lib/impl/extractors/file_extractors/markitdown_extractor.py +295 -0
  14. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +8 -6
  15. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +1 -0
  16. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/file_services/s3_service.py +2 -2
  17. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/types/extractor_types.py +2 -0
  18. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/types/file_type.py +7 -0
  19. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py +4 -3
  20. extractor_api_lib-3.3.0/PKG-INFO +0 -147
  21. extractor_api_lib-3.3.0/README.md +0 -94
  22. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +0 -80
  23. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/pdf_extractor_settings.py +0 -20
  24. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/api_endpoints/__init__.py +0 -0
  25. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/apis/__init__.py +0 -0
  26. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/apis/extractor_api.py +0 -0
  27. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/extractors/__init__.py +0 -0
  28. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/extractors/information_file_extractor.py +0 -0
  29. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/file_services/file_service.py +0 -0
  30. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/__init__.py +0 -0
  31. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/api_endpoints/__init__.py +0 -0
  32. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +0 -0
  33. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractor_api_impl.py +0 -0
  34. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/__init__.py +0 -0
  35. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py +0 -0
  36. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/file_extractors/epub_extractor.py +0 -0
  37. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +0 -0
  38. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +0 -0
  39. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/file_services/__init__.py +0 -0
  40. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/mapper/__init__.py +0 -0
  41. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +0 -0
  42. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +0 -0
  43. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/mapper/langchain_document2information_piece.py +0 -0
  44. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py +0 -0
  45. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/settings/__init__.py +0 -0
  46. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/settings/s3_settings.py +0 -0
  47. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/table_converter/__init__.py +0 -0
  48. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/table_converter/dataframe2markdown.py +0 -0
  49. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/types/__init__.py +0 -0
  50. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/types/content_type.py +0 -0
  51. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/utils/__init__.py +0 -0
  52. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/impl/utils/utils.py +0 -0
  53. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/main.py +0 -0
  54. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/mapper/__init__.py +0 -0
  55. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py +0 -0
  56. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/__init__.py +0 -0
  57. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/confluence_parameters.py +0 -0
  58. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/content_type.py +0 -0
  59. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/dataclasses/__init__.py +0 -0
  60. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/dataclasses/internal_information_piece.py +0 -0
  61. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/extra_models.py +0 -0
  62. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/extraction_parameters.py +0 -0
  63. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/extraction_request.py +0 -0
  64. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/information_piece.py +0 -0
  65. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/models/key_value_pair.py +0 -0
  66. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/table_converter/__init__.py +0 -0
  67. {extractor_api_lib-3.3.0 → extractor_api_lib-3.4.0}/src/extractor_api_lib/table_converter/dataframe_converter.py +0 -0
@@ -0,0 +1,241 @@
1
+ Metadata-Version: 2.4
2
+ Name: extractor-api-lib
3
+ Version: 3.4.0
4
+ Summary: Extracts the content of documents, websites, etc and maps it to a common format.
5
+ License: Apache-2.0
6
+ Author: STACKIT GmbH & Co. KG
7
+ Author-email: data-ai@stackit.cloud
8
+ Maintainer: Andreas Klos
9
+ Maintainer-email: andreas.klos@stackit.cloud
10
+ Requires-Python: >=3.13,<3.14
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: atlassian-python-api (>=4.0.3,<5.0.0)
15
+ Requires-Dist: boto3 (>=1.38.10,<2.0.0)
16
+ Requires-Dist: botocore (>=1.38.10,<2.0.0)
17
+ Requires-Dist: camelot-py[cv] (>=1.0.0,<2.0.0)
18
+ Requires-Dist: datasets (>=3.5.1,<4.0.0)
19
+ Requires-Dist: debugpy (>=1.8.14,<2.0.0)
20
+ Requires-Dist: dependency-injector (>=4.46.0,<5.0.0)
21
+ Requires-Dist: docling (==2.61.2)
22
+ Requires-Dist: docx2txt (>=0.9,<0.10)
23
+ Requires-Dist: fake-useragent (>=2.2.0,<3.0.0)
24
+ Requires-Dist: fastapi (>=0.118.0,<0.119.0)
25
+ Requires-Dist: fasttext (>=0.9.3,<0.10.0)
26
+ Requires-Dist: html5lib (>=1.1,<2.0)
27
+ Requires-Dist: langchain-community (>=0.3.23,<0.4.0)
28
+ Requires-Dist: langchain-core (==0.3.77)
29
+ Requires-Dist: lxml (>=5.4.0,<6.0.0)
30
+ Requires-Dist: markdownify (>=1.1.0,<2.0.0)
31
+ Requires-Dist: markitdown[all] (>=0.1.3,<0.2.0)
32
+ Requires-Dist: numpy (>=2.2.5,<3.0.0)
33
+ Requires-Dist: oauthlib (>=3.2.2,<4.0.0)
34
+ Requires-Dist: opencv-python-headless (==4.12.0.88)
35
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
36
+ Requires-Dist: partial (>=1.0,<2.0)
37
+ Requires-Dist: pdf2image (==1.17.0)
38
+ Requires-Dist: pdfplumber (==0.11.7)
39
+ Requires-Dist: pydantic-settings (>=2.9.1,<3.0.0)
40
+ Requires-Dist: pypandoc (>=1.6.2)
41
+ Requires-Dist: pypandoc-binary (>=1.15,<2.0)
42
+ Requires-Dist: pypandoc_binary (>=1.6.2)
43
+ Requires-Dist: pypdfium2 (==4.30.0)
44
+ Requires-Dist: pytesseract (>=0.3.10,<0.4.0)
45
+ Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
46
+ Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
47
+ Requires-Dist: requests-oauthlib (>=2.0.0,<3.0.0)
48
+ Requires-Dist: starlette (>=0.47.2,<0.49.0)
49
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
50
+ Requires-Dist: tesserocr (>=2.9.1,<3.0.0)
51
+ Requires-Dist: torch (==2.9.0+cpu)
52
+ Requires-Dist: torchvision (==0.24.0+cpu)
53
+ Requires-Dist: transformers (>=4.53.3,<5.0.0)
54
+ Requires-Dist: unstructured[docx,pptx] (==0.18.15)
55
+ Requires-Dist: uvicorn (>=0.37.0,<0.38.0)
56
+ Requires-Dist: wheel (>=0.45.1,<0.46.0)
57
+ Project-URL: Homepage, https://pypi.org/project/extractor-api-lib
58
+ Project-URL: Repository, https://github.com/stackitcloud/rag-template
59
+ Description-Content-Type: text/markdown
60
+
61
+ # extractor-api-lib
62
+
63
+ Content ingestion layer for the STACKIT RAG template. This library exposes a FastAPI extraction service that ingests raw documents (files or remote sources), extracts and converts (to internal representations) the information, and hands output to [`admin-api-lib`](../admin-api-lib/).
64
+
65
+ ## Responsibilities
66
+
67
+ - Receive binary uploads and remote source descriptors from the admin backend.
68
+ - Route each request through the appropriate extractor (file, sitemap, Confluence, etc.).
69
+ - Convert extracted fragments into the shared `InformationPiece` schema expected by downstream services.
70
+
71
+ ## Feature highlights
72
+
73
+ - **Layered extraction pipeline** – Docling, MarkItDown, and the custom extractors now cooperate with a deterministic fallback chain, so a failed run automatically cascades to the next extractor.
74
+ - **Expanded format coverage** – PDFs, Office documents, EPUB, XML, Markdown/AsciiDoc, CSV/TXT, raster images, Confluence spaces, and sitemap-driven websites.
75
+ - **Consistent output schema** – Information pieces are returned in a unified structure with content type (`TEXT`, `TABLE`, `IMAGE`) and metadata.
76
+ - **Swappable extractors** – Dependency-injector container makes it easy to add or replace file/source extractors, table converters, etc.
77
+ - **Production-grade plumbing** – Built-in S3-compatible file service, LangChain loaders with retry/backoff, optional PDF OCR, and throttling controls for web crawls.
78
+
79
+ ## File extractor pipeline
80
+
81
+ [`GeneralFileExtractor`](src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py) orchestrates file parsing. It resolves the file type from the extension, filters the extractors that declare matching `compatible_file_types`, reverses that filtered list, and then executes the extractors in sequence until one returns content or all have failed. Exceptions are logged and the next extractor takes over; only if every extractor either returns no content or raises an exception do we bubble up an error.
82
+
83
+ ### Default execution order
84
+
85
+ The dependency container wires extractors in the following list:
86
+
87
+ 1. `DoclingFileExtractor`
88
+ 2. `MarkitdownFileExtractor`
89
+ 3. `PDFExtractor`
90
+ 4. `EpubExtractor`
91
+ 5. `XMLExtractor`
92
+ 6. `MSDocsExtractor`
93
+ 7. `TesseractImageExtractor`
94
+
95
+ Because the orchestrator reverses the candidate list before the fallback loop, the priority for overlapping formats is the reverse of this wiring. For example, PDFs run through Docling first, then fall back to MarkItDown, and finally to the custom PDF extractor; DOCX/PPTX files follow Docling → MarkItDown → MSDocs; raster images go through Docling’s OCR pipeline before falling back to the Tesseract-only extractor.
96
+
97
+ ### Supported formats
98
+
99
+ | Format family | Extensions | Primary extractor | Fallbacks | Notes |
100
+ |--------------------------|----------------------------------------------------------|----------------------------|----------------------------------------------------------|-------|
101
+ | PDF | `.pdf` | Docling | MarkItDown → Custom PDF extractor | Docling performs OCR + table extraction; the PDF extractor keeps Camelot/pdfplumber heuristics as a last resort. |
102
+ | Microsoft Word | `.docx` | Docling | MarkItDown → MSDocs | MSDocs keeps unstructured-based table conversion for custom cases. |
103
+ | Microsoft PowerPoint | `.pptx` | Docling | MarkItDown → MSDocs | MarkItDown splits slides by `<!-- Slide number: N -->`. |
104
+ | Microsoft Excel | `.xlsx` | Docling | — | Tables returned as markdown; Docling infers sheet structure. |
105
+ | EPUB | `.epub` | MarkItDown | EPUB extractor | MarkItDown covers simple ebooks; the LangChain-based EPUB extractor preserves metadata when MarkItDown fails. |
106
+ | HTML | `.html` | Docling | MarkItDown | Docling keeps DOM-aware segmentation; MarkItDown is lighter-weight. |
107
+ | Markdown | `.md`, `.markdown`, `.mdx` | Docling | — | MarkItDown does not currently register for Markdown. |
108
+ | AsciiDoc | `.adoc`, `.asciidoc` | Docling | — | |
109
+ | CSV | `.csv` | Docling | MarkItDown | Both produce markdown tables; Docling preserves structured metadata. |
110
+ | Plain text | `.txt` | MarkItDown | — | |
111
+ | XML | `.xml` | XML extractor | — | Uses the unstructured XML partitioner. |
112
+ | Raster images | `.jpg`, `.jpeg`, `.png`, `.tiff`, `.tif`, `.bmp` | Docling (OCR) | Tesseract image extractor | Docling feeds Tesseract CLI OCR; the fallback enforces single-frame images via Pillow. |
113
+
114
+ Image coverage currently excludes animated GIF, WebP, HEIC, and SVG files. These extensions are ignored by the routing logic and will surface as “No extractor found” errors until an extractor declares support.
115
+
116
+ ### Source extractor pipeline
117
+
118
+ `GeneralSourceExtractor` wires Confluence and sitemap loaders behind a similar abstraction. Unlike files, source extractors are keyed by `ExtractionParameters.source_type` and the matching extractor is called directly (no fallback chain).
119
+
120
+ ## Configuring extractor order
121
+
122
+ The order lives in `DependencyContainer.file_extractors`. You can override it either by subclassing the container or by overriding the provider at runtime before wiring the FastAPI app. Example:
123
+
124
+ `container.py`
125
+
126
+ ```python
127
+ from dependency_injector.providers import List
128
+
129
+ from extractor_api_lib.dependency_container import DependencyContainer
130
+
131
+
132
+ class CustomExtractorContainer(DependencyContainer):
133
+ file_extractors = List(
134
+ DependencyContainer.docling_extractor,
135
+ DependencyContainer.markitdown_extractor,
136
+ DependencyContainer.ms_docs_extractor,
137
+ DependencyContainer.pdf_extractor,
138
+ DependencyContainer.image_extractor,
139
+ DependencyContainer.xml_extractor,
140
+ DependencyContainer.epub_extractor,
141
+ )
142
+ ```
143
+
144
+ `main.py`
145
+
146
+ ```python
147
+ from extractor_api_lib.main import app as perfect_extractor_app, register_dependency_container
148
+
149
+ from container import CustomExtractorContainer
150
+
151
+ register_dependency_container(CustomExtractorContainer())
152
+ ```
153
+
154
+ The last provider in the list becomes the first extractor tried for a matching file type. Keep shared singleton providers (file service, converters) in the parent class to avoid double instantiation.
155
+
156
+ ## Installation
157
+
158
+ ```bash
159
+ pip install extractor-api-lib
160
+ ```
161
+
162
+ Python 3.13 is required. OCR and computer-vision features expect system packages such as `ffmpeg`, `poppler-utils`, and `tesseract` (see `services/document-extractor/README.md` for the full list).
163
+
164
+ ## Module tour
165
+
166
+ - `dependency_container.py` – Central dependency-injector wiring. Override providers here to plug in custom extractors, endpoints etc.
167
+ - `api_endpoints/` & `impl/api_endpoints/` – Thin FastAPI endpoint abstractions and implementations for file and source (like confluence & sitemaps) extractors.
168
+ - `apis/` – Extractor API abstractions and implementations.
169
+ - `extractors/` & `impl/extractors/` – Format-specific logic (PDF, DOCX, PPTX, XML, EPUB, Confluence, sitemap) packaged behind the `InformationExtractor`/`InformationFileExtractor` interfaces.
170
+ - `mapper/` & `impl/mapper/` – Abstractions and implementations to map langchain documents, internal and external information piece representations to each other.
171
+ - `file_services/` – Default S3-compatible storage adapter; replace it if you store files elsewhere.
172
+ - `impl/settings/` – Configuration settings for dependency injection container components.
173
+ - `table_converter/` & `impl/table_converter/` – Abstractions and implementations to convert `pandas.DataFrame` to markdown and vice versa.
174
+ - `impl/types/` – Enums for content, extractor, and file types.
175
+ - `impl/utils/` – Helper functions for hashed datetime and sitemap crawling, header injection, and custom metadata parsing.
176
+
177
+ ## Endpoints provided
178
+
179
+ - `POST /extract_from_file` – Downloads the file from S3, extracts its contents, and returns normalized `InformationPiece` records.
180
+ - `POST /extract_from_source` – Pulls from remote sources (Confluence, sitemap) using credentials and further optional kwargs.
181
+
182
+ Both endpoints stream their results back to `admin-api-lib`, which takes care of enrichment and persistence.
183
+
184
+ ## How the file extraction endpoint works
185
+
186
+ 1. Download the file from S3.
187
+ 2. Derive the file type from the extension (normalizing common image/Markdown/AsciiDoc aliases).
188
+ 3. Select extractors that declare support for the resolved `FileType`.
189
+ 4. Run the extractors in priority order (highest priority first); stop at the first non-empty result or keep falling back if an extractor raises.
190
+ 5. Map the internal representation to the external schema and return the final output.
191
+
192
+ ## How the source extraction endpoint works
193
+
194
+ 1. Choose a suitable source extractor based on the source type
195
+ 2. Pull the source content using the provided credentials and parameters
196
+ 3. Extract the content from the source
197
+ 4. Map the internal representation to the external schema
198
+ 5. Return the final output
199
+
200
+ ## Configuration overview
201
+
202
+ One `pydantic-settings` model ships with this package:
203
+
204
+ - **S3 storage** (`S3Settings`) – configure the built-in file service with `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, `S3_ENDPOINT`, and `S3_BUCKET`.
205
+
206
+ Other extractors accept their parameters at runtime through the request payload (`ExtractionParameters`). For example, the admin backend forwards Confluence credentials, sitemap URLs, or custom headers when it calls `/extract_from_source`. This keeps the library stateless and makes it easy to plug in additional sources without redeploying.
207
+
208
+ The Helm chart exposes the environment variables mentioned above under `documentExtractor.envs.*` so production deployments remain declarative.
209
+
210
+ ## Typical usage
211
+
212
+ ```python
213
+ from extractor_api_lib.main import app as perfect_extractor_app
214
+ ```
215
+
216
+ `admin-api-lib` calls `/extract_from_file` and `/extract_from_source` to populate the ingestion pipeline.
217
+
218
+ ## Extending the library
219
+
220
+ 1. Implement `InformationFileExtractor` (for file-based inputs) or `InformationExtractor` (for remote sources).
221
+ 2. Add a provider to `DependencyContainer` (usually a `Singleton`) and wire dependencies such as the shared `FileService` or table converter.
222
+ 3. Insert the provider into `file_extractors` (or into the source extractor list) at the desired position so that the fallback order is correct.
223
+ 4. Update mappers or metadata handling if additional fields are required.
224
+ 5. Cover the happy path and a failure edge case with tests under `libs/extractor-api-lib/tests`, mocking external services (OCR, network, file I/O).
225
+
226
+ ## Advantages and caveats
227
+
228
+ - Docling-first prioritisation dramatically improves structured extraction (tables, headings) and adds OCR to formats that previously lacked it.
229
+ - Retaining MarkItDown and the custom PDF/MS extractors provides graceful degradation when Docling fails or produces empty output.
230
+ - Image support now goes through Docling’s OCR before falling back to pure Tesseract.
231
+ - The configuration still requires code changes; there is no environment-variable switch to reshuffle or disable extractors at runtime.
232
+ - Multi-frame images, animated/novel image formats, and office formats such as ODT/RTF remain unsupported.
233
+
234
+ ## Contributing
235
+
236
+ Ensure new endpoints or adapters remain thin and defer to [`rag-core-lib`](../rag-core-lib/) for shared logic. Run `poetry run pytest` and the configured linters before opening a PR. For further instructions see the [Contributing Guide](https://github.com/stackitcloud/rag-template/blob/main/CONTRIBUTING.md).
237
+
238
+ ## License
239
+
240
+ Licensed under the project license. See the root [`LICENSE`](https://github.com/stackitcloud/rag-template/blob/main/LICENSE) file.
241
+
@@ -0,0 +1,180 @@
1
+ # extractor-api-lib
2
+
3
+ Content ingestion layer for the STACKIT RAG template. This library exposes a FastAPI extraction service that ingests raw documents (files or remote sources), extracts and converts (to internal representations) the information, and hands output to [`admin-api-lib`](../admin-api-lib/).
4
+
5
+ ## Responsibilities
6
+
7
+ - Receive binary uploads and remote source descriptors from the admin backend.
8
+ - Route each request through the appropriate extractor (file, sitemap, Confluence, etc.).
9
+ - Convert extracted fragments into the shared `InformationPiece` schema expected by downstream services.
10
+
11
+ ## Feature highlights
12
+
13
+ - **Layered extraction pipeline** – Docling, MarkItDown, and the custom extractors now cooperate with a deterministic fallback chain, so a failed run automatically cascades to the next extractor.
14
+ - **Expanded format coverage** – PDFs, Office documents, EPUB, XML, Markdown/AsciiDoc, CSV/TXT, raster images, Confluence spaces, and sitemap-driven websites.
15
+ - **Consistent output schema** – Information pieces are returned in a unified structure with content type (`TEXT`, `TABLE`, `IMAGE`) and metadata.
16
+ - **Swappable extractors** – Dependency-injector container makes it easy to add or replace file/source extractors, table converters, etc.
17
+ - **Production-grade plumbing** – Built-in S3-compatible file service, LangChain loaders with retry/backoff, optional PDF OCR, and throttling controls for web crawls.
18
+
19
+ ## File extractor pipeline
20
+
21
+ [`GeneralFileExtractor`](src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py) orchestrates file parsing. It resolves the file type from the extension, filters the extractors that declare matching `compatible_file_types`, reverses that filtered list, and then executes the extractors in sequence until one returns content or all have failed. Exceptions are logged and the next extractor takes over; only if every extractor either returns no content or raises an exception do we bubble up an error.
22
+
23
+ ### Default execution order
24
+
25
+ The dependency container wires extractors in the following list:
26
+
27
+ 1. `DoclingFileExtractor`
28
+ 2. `MarkitdownFileExtractor`
29
+ 3. `PDFExtractor`
30
+ 4. `EpubExtractor`
31
+ 5. `XMLExtractor`
32
+ 6. `MSDocsExtractor`
33
+ 7. `TesseractImageExtractor`
34
+
35
+ Because the orchestrator reverses the candidate list before the fallback loop, the priority for overlapping formats is the reverse of this wiring. For example, PDFs run through Docling first, then fall back to MarkItDown, and finally to the custom PDF extractor; DOCX/PPTX files follow Docling → MarkItDown → MSDocs; raster images go through Docling’s OCR pipeline before falling back to the Tesseract-only extractor.
36
+
37
+ ### Supported formats
38
+
39
+ | Format family | Extensions | Primary extractor | Fallbacks | Notes |
40
+ |--------------------------|----------------------------------------------------------|----------------------------|----------------------------------------------------------|-------|
41
+ | PDF | `.pdf` | Docling | MarkItDown → Custom PDF extractor | Docling performs OCR + table extraction; the PDF extractor keeps Camelot/pdfplumber heuristics as a last resort. |
42
+ | Microsoft Word | `.docx` | Docling | MarkItDown → MSDocs | MSDocs keeps unstructured-based table conversion for custom cases. |
43
+ | Microsoft PowerPoint | `.pptx` | Docling | MarkItDown → MSDocs | MarkItDown splits slides by `<!-- Slide number: N -->`. |
44
+ | Microsoft Excel | `.xlsx` | Docling | — | Tables returned as markdown; Docling infers sheet structure. |
45
+ | EPUB | `.epub` | MarkItDown | EPUB extractor | MarkItDown covers simple ebooks; the LangChain-based EPUB extractor preserves metadata when MarkItDown fails. |
46
+ | HTML | `.html` | Docling | MarkItDown | Docling keeps DOM-aware segmentation; MarkItDown is lighter-weight. |
47
+ | Markdown | `.md`, `.markdown`, `.mdx` | Docling | — | MarkItDown does not currently register for Markdown. |
48
+ | AsciiDoc | `.adoc`, `.asciidoc` | Docling | — | |
49
+ | CSV | `.csv` | Docling | MarkItDown | Both produce markdown tables; Docling preserves structured metadata. |
50
+ | Plain text | `.txt` | MarkItDown | — | |
51
+ | XML | `.xml` | XML extractor | — | Uses the unstructured XML partitioner. |
52
+ | Raster images | `.jpg`, `.jpeg`, `.png`, `.tiff`, `.tif`, `.bmp` | Docling (OCR) | Tesseract image extractor | Docling feeds Tesseract CLI OCR; the fallback enforces single-frame images via Pillow. |
53
+
54
+ Image coverage currently excludes animated GIF, WebP, HEIC, and SVG files. These extensions are ignored by the routing logic and will surface as “No extractor found” errors until an extractor declares support.
55
+
56
+ ### Source extractor pipeline
57
+
58
+ `GeneralSourceExtractor` wires Confluence and sitemap loaders behind a similar abstraction. Unlike files, source extractors are keyed by `ExtractionParameters.source_type` and the matching extractor is called directly (no fallback chain).
59
+
60
+ ## Configuring extractor order
61
+
62
+ The order lives in `DependencyContainer.file_extractors`. You can override it either by subclassing the container or by overriding the provider at runtime before wiring the FastAPI app. Example:
63
+
64
+ `container.py`
65
+
66
+ ```python
67
+ from dependency_injector.providers import List
68
+
69
+ from extractor_api_lib.dependency_container import DependencyContainer
70
+
71
+
72
+ class CustomExtractorContainer(DependencyContainer):
73
+ file_extractors = List(
74
+ DependencyContainer.docling_extractor,
75
+ DependencyContainer.markitdown_extractor,
76
+ DependencyContainer.ms_docs_extractor,
77
+ DependencyContainer.pdf_extractor,
78
+ DependencyContainer.image_extractor,
79
+ DependencyContainer.xml_extractor,
80
+ DependencyContainer.epub_extractor,
81
+ )
82
+ ```
83
+
84
+ `main.py`
85
+
86
+ ```python
87
+ from extractor_api_lib.main import app as perfect_extractor_app, register_dependency_container
88
+
89
+ from container import CustomExtractorContainer
90
+
91
+ register_dependency_container(CustomExtractorContainer())
92
+ ```
93
+
94
+ The last provider in the list becomes the first extractor tried for a matching file type. Keep shared singleton providers (file service, converters) in the parent class to avoid double instantiation.
95
+
96
+ ## Installation
97
+
98
+ ```bash
99
+ pip install extractor-api-lib
100
+ ```
101
+
102
+ Python 3.13 is required. OCR and computer-vision features expect system packages such as `ffmpeg`, `poppler-utils`, and `tesseract` (see `services/document-extractor/README.md` for the full list).
103
+
104
+ ## Module tour
105
+
106
+ - `dependency_container.py` – Central dependency-injector wiring. Override providers here to plug in custom extractors, endpoints etc.
107
+ - `api_endpoints/` & `impl/api_endpoints/` – Thin FastAPI endpoint abstractions and implementations for file and source (like confluence & sitemaps) extractors.
108
+ - `apis/` – Extractor API abstractions and implementations.
109
+ - `extractors/` & `impl/extractors/` – Format-specific logic (PDF, DOCX, PPTX, XML, EPUB, Confluence, sitemap) packaged behind the `InformationExtractor`/`InformationFileExtractor` interfaces.
110
+ - `mapper/` & `impl/mapper/` – Abstractions and implementations to map langchain documents, internal and external information piece representations to each other.
111
+ - `file_services/` – Default S3-compatible storage adapter; replace it if you store files elsewhere.
112
+ - `impl/settings/` – Configuration settings for dependency injection container components.
113
+ - `table_converter/` & `impl/table_converter/` – Abstractions and implementations to convert `pandas.DataFrame` to markdown and vice versa.
114
+ - `impl/types/` – Enums for content, extractor, and file types.
115
+ - `impl/utils/` – Helper functions for hashed datetime and sitemap crawling, header injection, and custom metadata parsing.
116
+
117
+ ## Endpoints provided
118
+
119
+ - `POST /extract_from_file` – Downloads the file from S3, extracts its contents, and returns normalized `InformationPiece` records.
120
+ - `POST /extract_from_source` – Pulls from remote sources (Confluence, sitemap) using credentials and further optional kwargs.
121
+
122
+ Both endpoints stream their results back to `admin-api-lib`, which takes care of enrichment and persistence.
123
+
124
+ ## How the file extraction endpoint works
125
+
126
+ 1. Download the file from S3.
127
+ 2. Derive the file type from the extension (normalizing common image/Markdown/AsciiDoc aliases).
128
+ 3. Select extractors that declare support for the resolved `FileType`.
129
+ 4. Run the extractors in priority order (highest priority first); stop at the first non-empty result or keep falling back if an extractor raises.
130
+ 5. Map the internal representation to the external schema and return the final output.
131
+
132
+ ## How the source extraction endpoint works
133
+
134
+ 1. Choose a suitable source extractor based on the source type
135
+ 2. Pull the source content using the provided credentials and parameters
136
+ 3. Extract the content from the source
137
+ 4. Map the internal representation to the external schema
138
+ 5. Return the final output
139
+
140
+ ## Configuration overview
141
+
142
+ One `pydantic-settings` model ships with this package:
143
+
144
+ - **S3 storage** (`S3Settings`) – configure the built-in file service with `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, `S3_ENDPOINT`, and `S3_BUCKET`.
145
+
146
+ Other extractors accept their parameters at runtime through the request payload (`ExtractionParameters`). For example, the admin backend forwards Confluence credentials, sitemap URLs, or custom headers when it calls `/extract_from_source`. This keeps the library stateless and makes it easy to plug in additional sources without redeploying.
147
+
148
+ The Helm chart exposes the environment variables mentioned above under `documentExtractor.envs.*` so production deployments remain declarative.
149
+
150
+ ## Typical usage
151
+
152
+ ```python
153
+ from extractor_api_lib.main import app as perfect_extractor_app
154
+ ```
155
+
156
+ `admin-api-lib` calls `/extract_from_file` and `/extract_from_source` to populate the ingestion pipeline.
157
+
158
+ ## Extending the library
159
+
160
+ 1. Implement `InformationFileExtractor` (for file-based inputs) or `InformationExtractor` (for remote sources).
161
+ 2. Add a provider to `DependencyContainer` (usually a `Singleton`) and wire dependencies such as the shared `FileService` or table converter.
162
+ 3. Insert the provider into `file_extractors` (or into the source extractor list) at the desired position so that the fallback order is correct.
163
+ 4. Update mappers or metadata handling if additional fields are required.
164
+ 5. Cover the happy path and a failure edge case with tests under `libs/extractor-api-lib/tests`, mocking external services (OCR, network, file I/O).
165
+
166
+ ## Advantages and caveats
167
+
168
+ - Docling-first prioritisation dramatically improves structured extraction (tables, headings) and adds OCR to formats that previously lacked it.
169
+ - Retaining MarkItDown and the custom PDF/MS extractors provides graceful degradation when Docling fails or produces empty output.
170
+ - Image support now goes through Docling’s OCR before falling back to pure Tesseract.
171
+ - The configuration still requires code changes; there is no environment-variable switch to reshuffle or disable extractors at runtime.
172
+ - Multi-frame images, animated/novel image formats, and office formats such as ODT/RTF remain unsupported.
173
+
174
+ ## Contributing
175
+
176
+ Ensure new endpoints or adapters remain thin and defer to [`rag-core-lib`](../rag-core-lib/) for shared logic. Run `poetry run pytest` and the configured linters before opening a PR. For further instructions see the [Contributing Guide](https://github.com/stackitcloud/rag-template/blob/main/CONTRIBUTING.md).
177
+
178
+ ## License
179
+
180
+ Licensed under the project license. See the root [`LICENSE`](https://github.com/stackitcloud/rag-template/blob/main/LICENSE) file.
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "extractor-api-lib"
7
- version = "v3.3.0"
7
+ version = "v3.4.0"
8
8
  description = "Extracts the content of documents, websites, etc and maps it to a common format."
9
9
  authors = [
10
10
  "STACKIT GmbH & Co. KG <data-ai@stackit.cloud>",
@@ -43,6 +43,7 @@ per-file-ignores = """
43
43
  ./src/extractor_api_lib/container.py: CCE002,CCE001,
44
44
  ./src/extractor_api_lib/apis/extractor_api_base.py: WOT001,
45
45
  ./tests/*: S101,E501,
46
+ **/__init__.py: D107,
46
47
  """
47
48
 
48
49
  [tool.black]
@@ -76,7 +77,7 @@ known_local_folder = ["extractor_api_lib", "rag_core_lib"]
76
77
  max-line-length = 120
77
78
 
78
79
  [tool.poetry.dependencies]
79
- python = "^3.13"
80
+ python = ">=3.13,<3.14"
80
81
  wheel = "^0.45.1"
81
82
  botocore = "^1.38.10"
82
83
  fasttext = "^0.9.3"
@@ -91,7 +92,8 @@ python-multipart = "^0.0.20"
91
92
  oauthlib = "^3.2.2"
92
93
  requests-oauthlib = "^2.0.0"
93
94
  pdfplumber = "0.11.7"
94
- opencv-python = "4.12.0.88"
95
+ tesserocr = "^2.9.1"
96
+ opencv-python-headless = "4.12.0.88"
95
97
  pdf2image = "1.17.0"
96
98
  datasets = "^3.5.1"
97
99
  pandas = "^2.2.2"
@@ -112,6 +114,13 @@ fake-useragent = "^2.2.0"
112
114
  pypdfium2 = "4.30.0"
113
115
  pypandoc-binary = "^1.15"
114
116
  starlette = ">=0.47.2,<0.49.0"
117
+ markitdown = {version = "^0.1.3", extras = ["all"]}
118
+ docling = "2.61.2"
119
+ torch = { version = "2.9.0+cpu", source = "pytorch_cpu" }
120
+ torchvision = { version = "0.24.0+cpu", source = "pytorch_cpu" }
121
+ transformers = ">=4.53.3,<5.0.0"
122
+ pypandoc = ">=1.6.2"
123
+ pypandoc_binary = ">=1.6.2"
115
124
 
116
125
  [tool.poetry.group.dev.dependencies]
117
126
  pytest = "^8.3.5"
@@ -138,8 +147,8 @@ flake8-wot = "^0.2.0"
138
147
  flake8-function-order = "^0.0.5"
139
148
  flake8-tidy-imports = "^4.10.0"
140
149
  black = "^25.1.0"
141
- # flake8-logging-format = "^2024.24.12"
142
- # flake8-docstrings = "^1.7.0"
150
+ flake8-logging-format = "^2024.24.12"
151
+ flake8-docstrings = "^1.7.0"
143
152
 
144
153
  [tool.poetry.group.tests.dependencies]
145
154
  httpx = "^0.28.1"
@@ -149,4 +158,3 @@ log_cli = true
149
158
  log_cli_level = "DEBUG"
150
159
  pythonpath = "src"
151
160
  testpaths = "src/tests"
152
-
@@ -1,3 +1,5 @@
1
+ """Module for the abstract file extractor."""
2
+
1
3
  from abc import ABC, abstractmethod
2
4
  from extractor_api_lib.models.extraction_request import ExtractionRequest
3
5
  from extractor_api_lib.models.information_piece import InformationPiece
@@ -1,3 +1,5 @@
1
+ """Module for the abstract source extractor."""
2
+
1
3
  from abc import ABC, abstractmethod
2
4
 
3
5
  from extractor_api_lib.models.extraction_parameters import ExtractionParameters
@@ -22,6 +22,7 @@ class BaseExtractorApi:
22
22
  subclasses: ClassVar[Tuple] = ()
23
23
 
24
24
  def __init_subclass__(cls, **kwargs):
25
+ """Register the subclass."""
25
26
  super().__init_subclass__(**kwargs)
26
27
  BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,)
27
28
 
@@ -13,6 +13,15 @@ from extractor_api_lib.impl.extractors.confluence_extractor import ConfluenceExt
13
13
  from extractor_api_lib.impl.extractors.file_extractors.epub_extractor import (
14
14
  EpubExtractor,
15
15
  )
16
+ from extractor_api_lib.impl.extractors.file_extractors.docling_extractor import (
17
+ DoclingFileExtractor,
18
+ )
19
+ from extractor_api_lib.impl.extractors.file_extractors.image_extractor import (
20
+ TesseractImageExtractor,
21
+ )
22
+ from extractor_api_lib.impl.extractors.file_extractors.markitdown_extractor import (
23
+ MarkitdownFileExtractor,
24
+ )
16
25
  from extractor_api_lib.impl.extractors.file_extractors.ms_docs_extractor import (
17
26
  MSDocsExtractor,
18
27
  )
@@ -32,7 +41,6 @@ from extractor_api_lib.impl.mapper.langchain_document2information_piece import (
32
41
  from extractor_api_lib.impl.mapper.sitemap_document2information_piece import (
33
42
  SitemapLangchainDocument2InformationPiece,
34
43
  )
35
- from extractor_api_lib.impl.settings.pdf_extractor_settings import PDFExtractorSettings
36
44
  from extractor_api_lib.impl.settings.s3_settings import S3Settings
37
45
  from extractor_api_lib.impl.table_converter.dataframe2markdown import DataFrame2Markdown
38
46
  from extractor_api_lib.impl.utils.sitemap_extractor_utils import (
@@ -46,16 +54,18 @@ class DependencyContainer(DeclarativeContainer):
46
54
 
47
55
  # Settings
48
56
  settings_s3 = S3Settings()
49
- settings_pdf_extractor = PDFExtractorSettings()
50
57
 
51
58
  sitemap_parsing_function = Factory(lambda: custom_sitemap_parser_function)
52
59
  sitemap_meta_function = Factory(lambda: custom_sitemap_metadata_parser_function)
53
60
 
54
61
  database_converter = Singleton(DataFrame2Markdown)
55
62
  file_service = Singleton(S3Service, settings_s3)
56
- pdf_extractor = Singleton(PDFExtractor, file_service, settings_pdf_extractor, database_converter)
63
+ pdf_extractor = Singleton(PDFExtractor, file_service, database_converter)
57
64
  ms_docs_extractor = Singleton(MSDocsExtractor, file_service, database_converter)
58
65
  xml_extractor = Singleton(XMLExtractor, file_service)
66
+ markitdown_extractor = Singleton(MarkitdownFileExtractor, file_service)
67
+ docling_extractor = Singleton(DoclingFileExtractor, file_service)
68
+ image_extractor = Singleton(TesseractImageExtractor, file_service)
59
69
 
60
70
  intern2external = Singleton(Internal2ExternalInformationPiece)
61
71
  confluence_document2information_piece = Singleton(ConfluenceLangchainDocument2InformationPiece)
@@ -63,9 +73,22 @@ class DependencyContainer(DeclarativeContainer):
63
73
  sitemap_document2information_piece = Singleton(SitemapLangchainDocument2InformationPiece)
64
74
  epub_extractor = Singleton(EpubExtractor, file_service, langchain_document2information_piece)
65
75
 
66
- file_extractors = List(pdf_extractor, ms_docs_extractor, xml_extractor, epub_extractor)
76
+ file_extractors = List(
77
+ image_extractor,
78
+ ms_docs_extractor,
79
+ xml_extractor,
80
+ epub_extractor,
81
+ pdf_extractor,
82
+ markitdown_extractor,
83
+ docling_extractor,
84
+ )
67
85
 
68
- general_file_extractor = Singleton(GeneralFileExtractor, file_service, file_extractors, intern2external)
86
+ general_file_extractor = Singleton(
87
+ GeneralFileExtractor,
88
+ file_service=file_service,
89
+ available_extractors=file_extractors,
90
+ mapper=intern2external,
91
+ )
69
92
  confluence_extractor = Singleton(ConfluenceExtractor, mapper=confluence_document2information_piece)
70
93
 
71
94
  sitemap_extractor = Singleton(
@@ -13,7 +13,8 @@ class InformationExtractor(ABC):
13
13
 
14
14
  @property
15
15
  @abstractmethod
16
- def extractor_type(self) -> ExtractorTypes: ...
16
+ def extractor_type(self) -> ExtractorTypes:
17
+ """The type of the extractor."""
17
18
 
18
19
  @abstractmethod
19
20
  async def aextract_content(