extractor_api_lib-3.3.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- extractor_api_lib-3.3.0/PKG-INFO +147 -0
- extractor_api_lib-3.3.0/README.md +94 -0
- extractor_api_lib-3.3.0/pyproject.toml +152 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/file_extractor.py +23 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/source_extractor.py +27 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/apis/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/apis/extractor_api.py +105 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/apis/extractor_api_base.py +62 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/dependency_container.py +81 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/extractors/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/extractors/information_extractor.py +35 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/extractors/information_file_extractor.py +52 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/file_services/file_service.py +77 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +80 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +60 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractor_api_impl.py +61 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/confluence_extractor.py +92 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/epub_extractor.py +73 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +183 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +456 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +114 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +122 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/file_services/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/file_services/s3_service.py +129 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +39 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +47 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/langchain_document2information_piece.py +12 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py +45 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/pdf_extractor_settings.py +20 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/s3_settings.py +18 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/table_converter/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/table_converter/dataframe2markdown.py +80 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/content_type.py +13 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/extractor_types.py +10 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/file_type.py +14 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/utils/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py +51 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/impl/utils/utils.py +21 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/main.py +51 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/mapper/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py +63 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/confluence_parameters.py +144 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/content_type.py +41 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/dataclasses/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/dataclasses/internal_information_piece.py +14 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/extra_models.py +9 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/extraction_parameters.py +104 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/extraction_request.py +83 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/information_piece.py +105 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/models/key_value_pair.py +93 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/table_converter/__init__.py +0 -0
- extractor_api_lib-3.3.0/src/extractor_api_lib/table_converter/dataframe_converter.py +45 -0
extractor_api_lib-3.3.0/PKG-INFO
@@ -0,0 +1,147 @@
Metadata-Version: 2.3
Name: extractor-api-lib
Version: 3.3.0
Summary: Extracts the content of documents, websites, etc., and maps it to a common format.
License: Apache-2.0
Author: STACKIT GmbH & Co. KG
Author-email: data-ai@stackit.cloud
Maintainer: Andreas Klos
Maintainer-email: andreas.klos@stackit.cloud
Requires-Python: >=3.13,<4.0
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: atlassian-python-api (>=4.0.3,<5.0.0)
Requires-Dist: boto3 (>=1.38.10,<2.0.0)
Requires-Dist: botocore (>=1.38.10,<2.0.0)
Requires-Dist: camelot-py[cv] (>=1.0.0,<2.0.0)
Requires-Dist: datasets (>=3.5.1,<4.0.0)
Requires-Dist: debugpy (>=1.8.14,<2.0.0)
Requires-Dist: dependency-injector (>=4.46.0,<5.0.0)
Requires-Dist: docx2txt (>=0.9,<0.10)
Requires-Dist: fake-useragent (>=2.2.0,<3.0.0)
Requires-Dist: fastapi (>=0.118.0,<0.119.0)
Requires-Dist: fasttext (>=0.9.3,<0.10.0)
Requires-Dist: html5lib (>=1.1,<2.0)
Requires-Dist: langchain-community (>=0.3.23,<0.4.0)
Requires-Dist: langchain-core (==0.3.77)
Requires-Dist: lxml (>=5.4.0,<6.0.0)
Requires-Dist: markdownify (>=1.1.0,<2.0.0)
Requires-Dist: numpy (>=2.2.5,<3.0.0)
Requires-Dist: oauthlib (>=3.2.2,<4.0.0)
Requires-Dist: opencv-python (==4.12.0.88)
Requires-Dist: pandas (>=2.2.2,<3.0.0)
Requires-Dist: partial (>=1.0,<2.0)
Requires-Dist: pdf2image (==1.17.0)
Requires-Dist: pdfplumber (==0.11.7)
Requires-Dist: pydantic-settings (>=2.9.1,<3.0.0)
Requires-Dist: pypandoc-binary (>=1.15,<2.0)
Requires-Dist: pypdfium2 (==4.30.0)
Requires-Dist: pytesseract (>=0.3.10,<0.4.0)
Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
Requires-Dist: requests-oauthlib (>=2.0.0,<3.0.0)
Requires-Dist: starlette (>=0.47.2,<0.49.0)
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
Requires-Dist: unstructured[docx,pptx] (==0.18.15)
Requires-Dist: uvicorn (>=0.37.0,<0.38.0)
Requires-Dist: wheel (>=0.45.1,<0.46.0)
Project-URL: Homepage, https://pypi.org/project/extractor-api-lib
Project-URL: Repository, https://github.com/stackitcloud/rag-template
Description-Content-Type: text/markdown

# extractor-api-lib

Content ingestion layer for the STACKIT RAG template. This library exposes a FastAPI extraction service that ingests raw documents (files or remote sources), extracts their information, converts it to internal representations, and hands the output to [`admin-api-lib`](../admin-api-lib/).

## Responsibilities

- Receive binary uploads and remote source descriptors from the admin backend.
- Route each request through the appropriate extractor (file, sitemap, Confluence, etc.).
- Convert extracted fragments into the shared `InformationPiece` schema expected by downstream services.

## Feature highlights

- **Broad format coverage** – PDFs, DOCX, PPTX, XML/EPUB, Confluence spaces, and sitemap-driven websites.
- **Consistent output schema** – Information pieces are returned in a unified structure with a content type (`TEXT`, `TABLE`, `IMAGE`) and metadata.
- **Swappable extractors** – The dependency-injector container makes it easy to add or replace file/source extractors, table converters, and more.
- **Production-grade plumbing** – Built-in S3-compatible file service, LangChain loaders with retry/backoff, optional PDF OCR, and throttling controls for web crawls.

## Installation

```bash
pip install extractor-api-lib
```

Python 3.13 is required. OCR and computer-vision features expect system packages such as `ffmpeg`, `poppler-utils`, and `tesseract` (see `services/document-extractor/README.md` for the full list).

## Module tour

- `dependency_container.py` – Central dependency-injector wiring. Override providers here to plug in custom extractors, endpoints, etc.
- `api_endpoints/` & `impl/api_endpoints/` – Thin FastAPI endpoint abstractions and implementations for file and source extractors (e.g., Confluence and sitemaps).
- `apis/` – Extractor API abstractions and implementations.
- `extractors/` & `impl/extractors/` – Format-specific logic (PDF, DOCX, PPTX, XML, EPUB, Confluence, sitemap) packaged behind the `InformationExtractor`/`InformationFileExtractor` interfaces.
- `mapper/` & `impl/mapper/` – Abstractions and implementations that map between LangChain documents and the internal and external information-piece representations.
- `file_services/` – Default S3-compatible storage adapter; replace it if you store files elsewhere.
- `impl/settings/` – Configuration settings for dependency-injection container components.
- `table_converter/` & `impl/table_converter/` – Abstractions and implementations to convert `pandas.DataFrame` to Markdown and vice versa (see the sketch below).
- `impl/types/` – Enums for content, extractor, and file types.
- `impl/utils/` – Helper functions for hashed datetimes, sitemap crawling, header injection, and custom metadata parsing.
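
For intuition, the conversion performed by `dataframe2markdown.py` can be approximated with pandas' built-in Markdown rendering (backed by the `tabulate` dependency listed above). The shipped converter sits behind the abstraction in `table_converter/dataframe_converter.py`, and its exact API is not reproduced here:

```python
# Rough illustration only; the shipped converter lives in
# impl/table_converter/dataframe2markdown.py and may differ in detail.
import pandas as pd

table = pd.DataFrame({"quarter": ["Q1", "Q2"], "revenue": [1.2, 1.5]})

# pandas' to_markdown() uses tabulate under the hood, one of the
# dependencies declared by this package.
print(table.to_markdown(index=False))
```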

## Endpoints provided

- `POST /extract_from_file` – Downloads the file from S3, extracts its contents, and returns normalized `InformationPiece` records.
- `POST /extract_from_source` – Pulls from remote sources (Confluence, sitemap) using the provided credentials and additional optional kwargs.

Both endpoints stream their results back to `admin-api-lib`, which takes care of enrichment and persistence.
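
A minimal client sketch follows; the host, port, and payload field names are assumptions for illustration, and the authoritative schemas live in `models/extraction_request.py` and `models/information_piece.py`:

```python
# Hypothetical client call; the payload and response field names below are
# assumptions, not the confirmed schema.
import httpx

payload = {
    "path_on_s3": "uploads/report.pdf",  # assumed field name
    "document_name": "report.pdf",       # assumed field name
}

response = httpx.post("http://localhost:8080/extract_from_file", json=payload, timeout=120)
response.raise_for_status()
for piece in response.json():
    print(piece)  # one normalized InformationPiece record per entry
```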

## How the file extraction endpoint works

1. Download the file from S3
2. Choose a suitable file extractor based on the file extension (sketched below)
3. Extract the content from the file
4. Map the internal representation to the external schema
5. Return the final output
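
A simplified illustration of step 2; the real routing lives in `impl/api_endpoints/general_file_extractor.py` and may differ in detail:

```python
# Illustrative extension-based dispatch; the string values stand in for the
# extractor instances wired up in dependency_container.py.
from pathlib import Path

EXTRACTORS = {
    ".pdf": "PDF extractor",
    ".docx": "MS docs extractor",
    ".epub": "EPUB extractor",
    ".xml": "XML extractor",
}

def pick_extractor(filename: str) -> str:
    suffix = Path(filename).suffix.lower()
    try:
        return EXTRACTORS[suffix]
    except KeyError:
        raise ValueError(f"No extractor registered for {suffix!r}")
```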

## How the source extraction endpoint works

1. Choose a suitable source extractor based on the source type
2. Pull the source content using the provided credentials and parameters (see the payload sketch below)
3. Extract the content from the source
4. Map the internal representation to the external schema
5. Return the final output
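
The shape of `ExtractionParameters` is not reproduced in this excerpt, so the payload below is a hypothetical sketch; the `KeyValuePair` model in `models/key_value_pair.py` suggests that source-specific options travel as key-value lists:

```python
# Hypothetical request body for POST /extract_from_source; every field name
# here is an assumption, see models/extraction_parameters.py for the schema.
extraction_parameters = {
    "source_type": "confluence",      # assumed discriminator value
    "document_name": "my-space",      # assumed field name
    "kwargs": [
        {"key": "url", "value": "https://confluence.example.com"},
        {"key": "space_key", "value": "DOCS"},
        {"key": "token", "value": "<api-token>"},
    ],
}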

## Configuration overview

Two `pydantic-settings` models ship with this package:

- **S3 storage** (`S3Settings`) – configure the built-in file service with `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, `S3_ENDPOINT`, and `S3_BUCKET`.
- **PDF extraction** (`PDFExtractorSettings`) – adjust footer trimming or diagram export via `PDF_EXTRACTOR_FOOTER_HEIGHT` and `PDF_EXTRACTOR_DIAGRAMS_FOLDER_NAME`.

Other extractors accept their parameters at runtime through the request payload (`ExtractionParameters`). For example, the admin backend forwards Confluence credentials, sitemap URLs, or custom headers when it calls `/extract_from_source`. This keeps the library stateless and makes it easy to plug in additional sources without redeploying.

The Helm chart exposes the environment variables mentioned above under `documentExtractor.envs.*` so production deployments remain declarative.
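
As a rough re-creation (not the shipped class in `impl/settings/s3_settings.py`) of how such a settings model maps the environment variables above:

```python
# Illustrative re-creation: pydantic-settings reads each field from the
# environment using the configured prefix, e.g. S3_ENDPOINT -> endpoint.
from pydantic_settings import BaseSettings, SettingsConfigDict

class S3SettingsSketch(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="S3_")

    access_key_id: str = ""
    secret_access_key: str = ""
    endpoint: str = ""
    bucket: str = ""

# With the S3_* variables exported, instantiation picks them up automatically.
settings = S3SettingsSketch()
```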

## Typical usage

```python
from extractor_api_lib.main import app as perfect_extractor_app
```

`admin-api-lib` calls `/extract_from_file` and `/extract_from_source` to populate the ingestion pipeline.
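
For local experimentation you might serve the app directly with `uvicorn` (already a dependency); the host and port here are arbitrary:

```python
# Minimal sketch: serve the FastAPI app exposed by extractor_api_lib.main.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("extractor_api_lib.main:app", host="0.0.0.0", port=8080)
```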

## Extending the library

1. Implement `InformationFileExtractor` or `InformationExtractor` for your new format/source.
2. Register it in `dependency_container.py` (append to the `file_extractors` list or `source_extractors` dict); a sketch follows this list.
3. Update mapper or metadata handling if additional fields are required.
4. Add unit tests under `libs/extractor-api-lib/tests` using fixtures and fake storage providers.
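
A hedged sketch of steps 1–2, assuming a new Markdown extractor; the abstract method name and signature on `InformationFileExtractor` are placeholders, so check `extractors/information_file_extractor.py` for the real interface:

```python
# Hypothetical sketch: the method name and signature below are placeholders,
# not the confirmed InformationFileExtractor interface.
from extractor_api_lib.extractors.information_file_extractor import InformationFileExtractor


class MarkdownFileExtractor(InformationFileExtractor):
    """Illustrative extractor for *.md uploads."""

    async def aextract_content(self, file_path, name):  # placeholder signature
        # Read the file and return internal information pieces here.
        ...

# Registration idea (step 2): in dependency_container.py, add an instance of
# MarkdownFileExtractor to the `file_extractors` list provider so the general
# file extractor can route *.md files to it.
```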

## Contributing

Ensure new endpoints or adapters remain thin and defer to [`rag-core-lib`](../rag-core-lib/) for shared logic. Run `poetry run pytest` and the configured linters before opening a PR. For further instructions, see the [Contributing Guide](https://github.com/stackitcloud/rag-template/blob/main/CONTRIBUTING.md).

## License

Licensed under the project license. See the root [`LICENSE`](https://github.com/stackitcloud/rag-template/blob/main/LICENSE) file.

extractor_api_lib-3.3.0/README.md
@@ -0,0 +1,94 @@
(Content identical to the README embedded as the long description in PKG-INFO above.)
extractor_api_lib-3.3.0/pyproject.toml
@@ -0,0 +1,152 @@
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "extractor-api-lib"
version = "v3.3.0"
description = "Extracts the content of documents, websites, etc., and maps it to a common format."
authors = [
    "STACKIT GmbH & Co. KG <data-ai@stackit.cloud>",
]
maintainers = [
    "Andreas Klos <andreas.klos@stackit.cloud>",
]
packages = [{ include = "extractor_api_lib", from = "src" }]
readme = "README.md"
license = "Apache-2.0"
repository = "https://github.com/stackitcloud/rag-template"
homepage = "https://pypi.org/project/extractor-api-lib"

[[tool.poetry.source]]
name = "pytorch_cpu"
url = "https://download.pytorch.org/whl/cpu"
priority = "explicit"

[tool.flake8]
exclude = [".eggs", "./src/extractor_api_lib/models/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py", "tests/test_data/generate_test_pdfs.py"]
statistics = true
show-source = false
max-complexity = 10
max-annotations-complexity = 3
docstring-convention = 'numpy'
max-line-length = 120
ignore = ["E203", "W503", "E704"]
inline-quotes = '"'
docstring-quotes = '"""'
multiline-quotes = '"""'
dictionaries = ["en_US", "python", "technical", "pandas"]
ban-relative-imports = true
per-file-ignores = """
    ./src/extractor_api_lib/apis/extractor_api.py: B008,WOT001,
    ./src/extractor_api_lib/impl/extractor_api_impl.py: B008,
    ./src/extractor_api_lib/container.py: CCE002,CCE001,
    ./src/extractor_api_lib/apis/extractor_api_base.py: WOT001,
    ./tests/*: S101,E501,
"""

[tool.black]
line-length = 120
exclude = """
/(
    | .eggs
    | .git
    | .hg
    | .mypy_cache
    | .nox
    | .pants.d
    | .tox
    | .venv
    | _build
    | buck-out
    | build
    | dist
    | node_modules
    | venv
)/
"""

[tool.isort]
profile = "black"
skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv']
skip_gitignore = true
known_local_folder = ["extractor_api_lib", "rag_core_lib"]

[tool.pylint]
max-line-length = 120

[tool.poetry.dependencies]
python = "^3.13"
wheel = "^0.45.1"
botocore = "^1.38.10"
fasttext = "^0.9.3"
pytesseract = "^0.3.10"
fastapi = "^0.118.0"
uvicorn = "^0.37.0"
dependency-injector = "^4.46.0"
pydantic-settings = "^2.9.1"
boto3 = "^1.38.10"
debugpy = "^1.8.14"
python-multipart = "^0.0.20"
oauthlib = "^3.2.2"
requests-oauthlib = "^2.0.0"
pdfplumber = "0.11.7"
opencv-python = "4.12.0.88"
pdf2image = "1.17.0"
datasets = "^3.5.1"
pandas = "^2.2.2"
tabulate = "^0.9.0"
lxml = "^5.4.0"
partial = "^1.0"
pyyaml = "^6.0.2"
numpy = "^2.2.5"
docx2txt = "^0.9"
unstructured = {extras = ["docx", "pptx"], version = "0.18.15"}
html5lib = "^1.1"
langchain-community = "^0.3.23"
atlassian-python-api = "^4.0.3"
markdownify = "^1.1.0"
langchain-core = "0.3.77"
camelot-py = {extras = ["cv"], version = "^1.0.0"}
fake-useragent = "^2.2.0"
pypdfium2 = "4.30.0"
pypandoc-binary = "^1.15"
starlette = ">=0.47.2,<0.49.0"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"
pytest-asyncio = "^0.26.0"
coverage = "^7.8.0"
flake8 = "^7.2.0"
flake8-black = "^0.4.0"
flake8-pyproject = "^1.2.3"
flake8-quotes = "^3.4.0"
flake8-return = "^1.2.0"
flake8-annotations-complexity = "^0.1.0"
flake8-bandit = "^4.1.1"
flake8-bugbear = "^24.12.12"
flake8-builtins = "^2.5.0"
flake8-comprehensions = "^3.15.0"
flake8-eradicate = "^1.5.0"
flake8-expression-complexity = "^0.0.11"
flake8-pytest-style = "^2.1.0"
pep8-naming = "^0.15.1"
flake8-eol = "^0.0.8"
flake8-exceptions = "^0.0.1a0"
flake8-simplify = "^0.22.0"
flake8-wot = "^0.2.0"
flake8-function-order = "^0.0.5"
flake8-tidy-imports = "^4.10.0"
black = "^25.1.0"
# flake8-logging-format = "^2024.24.12"
# flake8-docstrings = "^1.7.0"

[tool.poetry.group.tests.dependencies]
httpx = "^0.28.1"

[tool.pytest.ini_options]
log_cli = true
log_cli_level = "DEBUG"
pythonpath = "src"
testpaths = "src/tests"

extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/__init__.py
File without changes

extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/file_extractor.py
@@ -0,0 +1,23 @@
from abc import ABC, abstractmethod

from extractor_api_lib.models.extraction_request import ExtractionRequest
from extractor_api_lib.models.information_piece import InformationPiece


class FileExtractor(ABC):
    """Abstract base class for the extract_from_file endpoint."""

    @abstractmethod
    async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]:
        """
        Extract information from a document, given by the extraction_request.

        Parameters
        ----------
        extraction_request : ExtractionRequest
            The request containing the details of the document to be processed for information extraction.

        Returns
        -------
        list[InformationPiece]
            A list of extracted information pieces from the document.
        """

extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/source_extractor.py
@@ -0,0 +1,27 @@
from abc import ABC, abstractmethod

from extractor_api_lib.models.extraction_parameters import ExtractionParameters
from extractor_api_lib.models.information_piece import InformationPiece


class SourceExtractor(ABC):
    """Abstract base class for the extract_from_source endpoint."""

    @abstractmethod
    async def aextract_information(
        self,
        extraction_parameters: ExtractionParameters,
    ) -> list[InformationPiece]:
        """
        Extract information from a source, using the given parameters.

        Parameters
        ----------
        extraction_parameters : ExtractionParameters
            The parameters used to extract information from the source.

        Returns
        -------
        list[InformationPiece]
            A list of extracted information pieces.
        """

extractor_api_lib-3.3.0/src/extractor_api_lib/apis/__init__.py
File without changes

extractor_api_lib-3.3.0/src/extractor_api_lib/apis/extractor_api.py
@@ -0,0 +1,105 @@
"""Module for the Extractor API."""

# coding: utf-8

from typing import Dict, List  # noqa: F401
import importlib
import pkgutil

from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi
import extractor_api_lib.impl

from fastapi import (  # noqa: F401
    APIRouter,
    Body,
    Cookie,
    Depends,
    Form,
    Header,
    HTTPException,
    Path,
    Query,
    Response,
    Security,
    status,
)

from extractor_api_lib.models.extra_models import TokenModel  # noqa: F401
from extractor_api_lib.models.extraction_parameters import ExtractionParameters
from extractor_api_lib.models.extraction_request import ExtractionRequest
from extractor_api_lib.models.information_piece import InformationPiece


router = APIRouter()
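
# Import every module under extractor_api_lib.impl so that concrete
# BaseExtractorApi subclasses register themselves (via __init_subclass__ in
# extractor_api_base.py) before the endpoints below dispatch to them.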
ns_pkg = extractor_api_lib.impl
for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."):
    importlib.import_module(name)


@router.post(
    "/extract_from_file",
    responses={
        200: {"model": List[InformationPiece], "description": "List of extracted information."},
        422: {"description": "Body is not a valid PDF."},
        500: {"description": "Something somewhere went terribly wrong."},
    },
    tags=["extractor"],
    response_model_by_alias=True,
)
async def extract_from_file_post(
    extraction_request: ExtractionRequest = Body(None, description=""),
) -> List[InformationPiece]:
    """
    Extract information from a file based on the provided extraction request.

    Parameters
    ----------
    extraction_request : ExtractionRequest
        The request object containing details about the extraction process.

    Returns
    -------
    List[InformationPiece]
        A list of extracted information pieces.
    """
    if not BaseExtractorApi.subclasses:
        raise HTTPException(status_code=500, detail="Not implemented")
    return await BaseExtractorApi.subclasses[0]().extract_from_file_post(extraction_request)


@router.post(
    "/extract_from_source",
    responses={
        200: {"model": List[InformationPiece], "description": "ok"},
        404: {"description": "not found"},
        422: {"description": "unprocessable entity"},
        500: {"description": "internal server error"},
    },
    tags=["extractor"],
    response_model_by_alias=True,
)
async def extract_from_source(
    extraction_parameters: ExtractionParameters = Body(None, description=""),
) -> List[InformationPiece]:
    """
    Extract information from a source based on the provided extraction parameters.

    Parameters
    ----------
    extraction_parameters : ExtractionParameters, optional
        The request object containing details about the extraction process.

    Returns
    -------
    List[InformationPiece]
        A list of extracted information pieces.

    Raises
    ------
    HTTPException
        If the extraction process fails or encounters an error.
    """
    if not BaseExtractorApi.subclasses:
        raise HTTPException(status_code=500, detail="Not implemented")
    return await BaseExtractorApi.subclasses[0]().extract_from_source(extraction_parameters)

extractor_api_lib-3.3.0/src/extractor_api_lib/apis/extractor_api_base.py
@@ -0,0 +1,62 @@
"""Module for the base ExtractorApi interface."""

# coding: utf-8

from typing import ClassVar, Dict, List, Tuple  # noqa: F401

from extractor_api_lib.models.extraction_parameters import ExtractionParameters
from extractor_api_lib.models.extraction_request import ExtractionRequest
from extractor_api_lib.models.information_piece import InformationPiece


class BaseExtractorApi:
    """
    The base ExtractorApi interface.

    Attributes
    ----------
    subclasses : ClassVar[Tuple]
        A tuple containing all subclasses of BaseExtractorApi.
    """

    subclasses: ClassVar[Tuple] = ()

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        BaseExtractorApi.subclasses = BaseExtractorApi.subclasses + (cls,)

    async def extract_from_file_post(
        self,
        extraction_request: ExtractionRequest,
    ) -> List[InformationPiece]:
        """
        Extract information from a file based on the provided extraction request.

        Parameters
        ----------
        extraction_request : ExtractionRequest
            The request object containing details about the extraction process.

        Returns
        -------
        List[InformationPiece]
            A list of extracted information pieces.
        """

    async def extract_from_source(
        self,
        extraction_parameters: ExtractionParameters,
    ) -> List[InformationPiece]:
        """
        Extract information from a source based on the provided extraction parameters.

        Parameters
        ----------
        extraction_parameters : ExtractionParameters
            The parameters required to access and extract information from the source.

        Returns
        -------
        List[InformationPiece]
            A list of extracted information pieces.
        """
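
The `__init_subclass__` hook above is the whole plugin mechanism: defining a subclass anywhere under `extractor_api_lib.impl` is enough for the router in `extractor_api.py` to find it. A minimal sketch:

```python
# Minimal sketch: any subclass defined in an imported module is picked up.
from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi


class MyExtractorApi(BaseExtractorApi):  # hypothetical implementation
    """Registered automatically via BaseExtractorApi.__init_subclass__."""


# The router dispatches requests to the first registered implementation.
assert MyExtractorApi in BaseExtractorApi.subclasses
```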