extractor-api-lib 3.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. extractor_api_lib-3.3.0/PKG-INFO +147 -0
  2. extractor_api_lib-3.3.0/README.md +94 -0
  3. extractor_api_lib-3.3.0/pyproject.toml +152 -0
  4. extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/__init__.py +0 -0
  5. extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/file_extractor.py +23 -0
  6. extractor_api_lib-3.3.0/src/extractor_api_lib/api_endpoints/source_extractor.py +27 -0
  7. extractor_api_lib-3.3.0/src/extractor_api_lib/apis/__init__.py +0 -0
  8. extractor_api_lib-3.3.0/src/extractor_api_lib/apis/extractor_api.py +105 -0
  9. extractor_api_lib-3.3.0/src/extractor_api_lib/apis/extractor_api_base.py +62 -0
  10. extractor_api_lib-3.3.0/src/extractor_api_lib/dependency_container.py +81 -0
  11. extractor_api_lib-3.3.0/src/extractor_api_lib/extractors/__init__.py +0 -0
  12. extractor_api_lib-3.3.0/src/extractor_api_lib/extractors/information_extractor.py +35 -0
  13. extractor_api_lib-3.3.0/src/extractor_api_lib/extractors/information_file_extractor.py +52 -0
  14. extractor_api_lib-3.3.0/src/extractor_api_lib/file_services/file_service.py +77 -0
  15. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/__init__.py +0 -0
  16. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/__init__.py +0 -0
  17. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/general_file_extractor.py +80 -0
  18. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/api_endpoints/general_source_extractor.py +60 -0
  19. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractor_api_impl.py +61 -0
  20. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/__init__.py +0 -0
  21. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/confluence_extractor.py +92 -0
  22. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/__init__.py +0 -0
  23. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/epub_extractor.py +73 -0
  24. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/ms_docs_extractor.py +183 -0
  25. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/pdf_extractor.py +456 -0
  26. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/file_extractors/xml_extractor.py +114 -0
  27. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/extractors/sitemap_extractor.py +122 -0
  28. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/file_services/__init__.py +0 -0
  29. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/file_services/s3_service.py +129 -0
  30. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/__init__.py +0 -0
  31. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/confluence_langchain_document2information_piece.py +39 -0
  32. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/internal2external_information_piece.py +47 -0
  33. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/langchain_document2information_piece.py +12 -0
  34. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/mapper/sitemap_document2information_piece.py +45 -0
  35. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/__init__.py +0 -0
  36. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/pdf_extractor_settings.py +20 -0
  37. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/settings/s3_settings.py +18 -0
  38. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/table_converter/__init__.py +0 -0
  39. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/table_converter/dataframe2markdown.py +80 -0
  40. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/__init__.py +0 -0
  41. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/content_type.py +13 -0
  42. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/extractor_types.py +10 -0
  43. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/types/file_type.py +14 -0
  44. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/utils/__init__.py +0 -0
  45. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/utils/sitemap_extractor_utils.py +51 -0
  46. extractor_api_lib-3.3.0/src/extractor_api_lib/impl/utils/utils.py +21 -0
  47. extractor_api_lib-3.3.0/src/extractor_api_lib/main.py +51 -0
  48. extractor_api_lib-3.3.0/src/extractor_api_lib/mapper/__init__.py +0 -0
  49. extractor_api_lib-3.3.0/src/extractor_api_lib/mapper/source_langchain_document2information_piece.py +63 -0
  50. extractor_api_lib-3.3.0/src/extractor_api_lib/models/__init__.py +0 -0
  51. extractor_api_lib-3.3.0/src/extractor_api_lib/models/confluence_parameters.py +144 -0
  52. extractor_api_lib-3.3.0/src/extractor_api_lib/models/content_type.py +41 -0
  53. extractor_api_lib-3.3.0/src/extractor_api_lib/models/dataclasses/__init__.py +0 -0
  54. extractor_api_lib-3.3.0/src/extractor_api_lib/models/dataclasses/internal_information_piece.py +14 -0
  55. extractor_api_lib-3.3.0/src/extractor_api_lib/models/extra_models.py +9 -0
  56. extractor_api_lib-3.3.0/src/extractor_api_lib/models/extraction_parameters.py +104 -0
  57. extractor_api_lib-3.3.0/src/extractor_api_lib/models/extraction_request.py +83 -0
  58. extractor_api_lib-3.3.0/src/extractor_api_lib/models/information_piece.py +105 -0
  59. extractor_api_lib-3.3.0/src/extractor_api_lib/models/key_value_pair.py +93 -0
  60. extractor_api_lib-3.3.0/src/extractor_api_lib/table_converter/__init__.py +0 -0
  61. extractor_api_lib-3.3.0/src/extractor_api_lib/table_converter/dataframe_converter.py +45 -0
@@ -0,0 +1,147 @@
1
+ Metadata-Version: 2.3
2
+ Name: extractor-api-lib
3
+ Version: 3.3.0
4
+ Summary: Extracts the content of documents, websites, etc and maps it to a common format.
5
+ License: Apache-2.0
6
+ Author: STACKIT GmbH & Co. KG
7
+ Author-email: data-ai@stackit.cloud
8
+ Maintainer: Andreas Klos
9
+ Maintainer-email: andreas.klos@stackit.cloud
10
+ Requires-Python: >=3.13,<4.0
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: atlassian-python-api (>=4.0.3,<5.0.0)
15
+ Requires-Dist: boto3 (>=1.38.10,<2.0.0)
16
+ Requires-Dist: botocore (>=1.38.10,<2.0.0)
17
+ Requires-Dist: camelot-py[cv] (>=1.0.0,<2.0.0)
18
+ Requires-Dist: datasets (>=3.5.1,<4.0.0)
19
+ Requires-Dist: debugpy (>=1.8.14,<2.0.0)
20
+ Requires-Dist: dependency-injector (>=4.46.0,<5.0.0)
21
+ Requires-Dist: docx2txt (>=0.9,<0.10)
22
+ Requires-Dist: fake-useragent (>=2.2.0,<3.0.0)
23
+ Requires-Dist: fastapi (>=0.118.0,<0.119.0)
24
+ Requires-Dist: fasttext (>=0.9.3,<0.10.0)
25
+ Requires-Dist: html5lib (>=1.1,<2.0)
26
+ Requires-Dist: langchain-community (>=0.3.23,<0.4.0)
27
+ Requires-Dist: langchain-core (==0.3.77)
28
+ Requires-Dist: lxml (>=5.4.0,<6.0.0)
29
+ Requires-Dist: markdownify (>=1.1.0,<2.0.0)
30
+ Requires-Dist: numpy (>=2.2.5,<3.0.0)
31
+ Requires-Dist: oauthlib (>=3.2.2,<4.0.0)
32
+ Requires-Dist: opencv-python (==4.12.0.88)
33
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
34
+ Requires-Dist: partial (>=1.0,<2.0)
35
+ Requires-Dist: pdf2image (==1.17.0)
36
+ Requires-Dist: pdfplumber (==0.11.7)
37
+ Requires-Dist: pydantic-settings (>=2.9.1,<3.0.0)
38
+ Requires-Dist: pypandoc-binary (>=1.15,<2.0)
39
+ Requires-Dist: pypdfium2 (==4.30.0)
40
+ Requires-Dist: pytesseract (>=0.3.10,<0.4.0)
41
+ Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
42
+ Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
43
+ Requires-Dist: requests-oauthlib (>=2.0.0,<3.0.0)
44
+ Requires-Dist: starlette (>=0.47.2,<0.49.0)
45
+ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
46
+ Requires-Dist: unstructured[docx,pptx] (==0.18.15)
47
+ Requires-Dist: uvicorn (>=0.37.0,<0.38.0)
48
+ Requires-Dist: wheel (>=0.45.1,<0.46.0)
49
+ Project-URL: Homepage, https://pypi.org/project/extractor-api-lib
50
+ Project-URL: Repository, https://github.com/stackitcloud/rag-template
51
+ Description-Content-Type: text/markdown
52
+
53
+ # extractor-api-lib
54
+
55
+ Content ingestion layer for the STACKIT RAG template. This library exposes a FastAPI extraction service that ingests raw documents (files or remote sources), extracts and converts (to internal representations) the information, and hands output to [`admin-api-lib`](../admin-api-lib/).
56
+
57
+ ## Responsibilities
58
+
59
+ - Receive binary uploads and remote source descriptors from the admin backend.
60
+ - Route each request through the appropriate extractor (file, sitemap, Confluence, etc.).
61
+ - Convert extracted fragments into the shared `InformationPiece` schema expected by downstream services.
62
+
63
+ ## Feature highlights
64
+
65
+ - **Broad format coverage** – PDFs, DOCX, PPTX, XML/EPUB, Confluence spaces, and sitemap-driven websites.
66
+ - **Consistent output schema** – Information pieces are returned in a unified structure with content type (`TEXT`, `TABLE`, `IMAGE`) and metadata.
67
+ - **Swappable extractors** – Dependency-injector container makes it easy to add or replace file/source extractors, table converters, etc.
68
+ - **Production-grade plumbing** – Built-in S3-compatible file service, LangChain loaders with retry/backoff, optional PDF OCR, and throttling controls for web crawls.
69
+
70
+ ## Installation
71
+
72
+ ```bash
73
+ pip install extractor-api-lib
74
+ ```
75
+
76
+ Python 3.13 is required. OCR and computer-vision features expect system packages such as `ffmpeg`, `poppler-utils`, and `tesseract` (see `services/document-extractor/README.md` for the full list).
77
+
78
+ ## Module tour
79
+
80
+ - `dependency_container.py` – Central dependency-injector wiring. Override providers here to plug in custom extractors, endpoints etc.
81
+ - `api_endpoints/` & `impl/api_endpoints/` – Thin FastAPI endpoint abstractions and implementations for file and source (like confluence & sitemaps) extractors.
82
+ - `apis/` – Extractor API abstractions and implementations.
83
+ - `extractors/` & `impl/extractors/` – Format-specific logic (PDF, DOCX, PPTX, XML, EPUB, Confluence, sitemap) packaged behind the `InformationExtractor`/`InformationFileExtractor` interfaces.
84
+ - `mapper/` & `impl/mapper/` – Abstractions and implementations to map langchain documents, internal and external information piece representations to each other.
85
+ - `file_services/` – Default S3-compatible storage adapter; replace it if you store files elsewhere.
86
+ - `impl/settings/` – Configuration settings for dependency injection container components.
87
+ - `table_converter/` & `impl/table_converter/` – Abstractions and implementations to convert `pandas.DataFrame` to markdown and vice versa.
88
+ - `impl/types/` – Enums for content-, extractor- and file types.
89
+ - `impl/utils/` – Helper functions for hashed datetime and sitemap crawling, header injection, and custom metadata parsing.
90
+
91
+ ## Endpoints provided
92
+
93
+ - `POST /extract_from_file` – Downloads the file from S3, extracts its contents, and returns normalized `InformationPiece` records.
94
+ - `POST /extract_from_source` – Pulls from remote sources (Confluence, sitemap) using credentials and further optional kwargs.
95
+
96
+ Both endpoints stream their results back to `admin-api-lib`, which takes care of enrichment and persistence.
97
+
98
+ ## How the file extraction endpoint works
99
+
100
+ 1. Download the file from S3
101
+ 2. Choose suitable file extractor based on the filename ending
102
+ 3. Extract the content from the file
103
+ 4. Map the internal representation to the external schema
104
+ 5. Return the final output
105
+
106
+ ## How the source extraction endpoint works
107
+
108
+ 1. Choose suitable source extractor based on the source type
109
+ 2. Pull the source content using the provided credentials and parameters
110
+ 3. Extract the content from the source
111
+ 4. Map the internal representation to the external schema
112
+ 5. Return the final output
113
+
114
+ ## Configuration overview
115
+
116
+ Two `pydantic-settings` models ship with this package:
117
+
118
+ - **S3 storage** (`S3Settings`) – configure the built-in file service with `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, `S3_ENDPOINT`, and `S3_BUCKET`.
119
+ - **PDF extraction** (`PDFExtractorSettings`) – adjust footer trimming or diagram export via `PDF_EXTRACTOR_FOOTER_HEIGHT` and `PDF_EXTRACTOR_DIAGRAMS_FOLDER_NAME`.
120
+
121
+ Other extractors accept their parameters at runtime through the request payload (`ExtractionParameters`). For example, the admin backend forwards Confluence credentials, sitemap URLs, or custom headers when it calls `/extract_from_source`. This keeps the library stateless and makes it easy to plug in additional sources without redeploying.
122
+
123
+ The Helm chart exposes the environment variables mentioned above under `documentExtractor.envs.*` so production deployments remain declarative.
124
+
125
+ ## Typical usage
126
+
127
+ ```python
128
+ from extractor_api_lib.main import app as perfect_extractor_app
129
+ ```
130
+
131
+ `admin-api-lib` calls `/extract_from_file` and `/extract_from_source` to populate the ingestion pipeline.
132
+
133
+ ## Extending the library
134
+
135
+ 1. Implement `InformationFileExtractor` or `InformationExtractor` for your new format/source.
136
+ 2. Register it in `dependency_container.py` (append to `file_extractors` list or `source_extractors` dict).
137
+ 3. Update mapper or metadata handling if additional fields are required.
138
+ 4. Add unit tests under `libs/extractor-api-lib/tests` using fixtures and fake storage providers.
139
+
140
+ ## Contributing
141
+
142
+ Ensure new endpoints or adapters remain thin and defer to [`rag-core-lib`](../rag-core-lib/) for shared logic. Run `poetry run pytest` and the configured linters before opening a PR. For further instructions see the [Contributing Guide](https://github.com/stackitcloud/rag-template/blob/main/CONTRIBUTING.md).
143
+
144
+ ## License
145
+
146
+ Licensed under the project license. See the root [`LICENSE`](https://github.com/stackitcloud/rag-template/blob/main/LICENSE) file.
147
+
@@ -0,0 +1,94 @@
1
+ # extractor-api-lib
2
+
3
+ Content ingestion layer for the STACKIT RAG template. This library exposes a FastAPI extraction service that ingests raw documents (files or remote sources), extracts and converts (to internal representations) the information, and hands output to [`admin-api-lib`](../admin-api-lib/).
4
+
5
+ ## Responsibilities
6
+
7
+ - Receive binary uploads and remote source descriptors from the admin backend.
8
+ - Route each request through the appropriate extractor (file, sitemap, Confluence, etc.).
9
+ - Convert extracted fragments into the shared `InformationPiece` schema expected by downstream services.
10
+
11
+ ## Feature highlights
12
+
13
+ - **Broad format coverage** – PDFs, DOCX, PPTX, XML/EPUB, Confluence spaces, and sitemap-driven websites.
14
+ - **Consistent output schema** – Information pieces are returned in a unified structure with content type (`TEXT`, `TABLE`, `IMAGE`) and metadata.
15
+ - **Swappable extractors** – Dependency-injector container makes it easy to add or replace file/source extractors, table converters, etc.
16
+ - **Production-grade plumbing** – Built-in S3-compatible file service, LangChain loaders with retry/backoff, optional PDF OCR, and throttling controls for web crawls.
17
+
18
+ ## Installation
19
+
20
+ ```bash
21
+ pip install extractor-api-lib
22
+ ```
23
+
24
+ Python 3.13 is required. OCR and computer-vision features expect system packages such as `ffmpeg`, `poppler-utils`, and `tesseract` (see `services/document-extractor/README.md` for the full list).
25
+
26
+ ## Module tour
27
+
28
+ - `dependency_container.py` – Central dependency-injector wiring. Override providers here to plug in custom extractors, endpoints etc.
29
+ - `api_endpoints/` & `impl/api_endpoints/` – Thin FastAPI endpoint abstractions and implementations for file and source (like confluence & sitemaps) extractors.
30
+ - `apis/` – Extractor API abstractions and implementations.
31
+ - `extractors/` & `impl/extractors/` – Format-specific logic (PDF, DOCX, PPTX, XML, EPUB, Confluence, sitemap) packaged behind the `InformationExtractor`/`InformationFileExtractor` interfaces.
32
+ - `mapper/` & `impl/mapper/` – Abstractions and implementations to map langchain documents, internal and external information piece representations to each other.
33
+ - `file_services/` – Default S3-compatible storage adapter; replace it if you store files elsewhere.
34
+ - `impl/settings/` – Configuration settings for dependency injection container components.
35
+ - `table_converter/` & `impl/table_converter/` – Abstractions and implementations to convert `pandas.DataFrame` to markdown and vice versa.
36
+ - `impl/types/` – Enums for content-, extractor- and file types.
37
+ - `impl/utils/` – Helper functions for hashed datetime and sitemap crawling, header injection, and custom metadata parsing.
38
+
39
+ ## Endpoints provided
40
+
41
+ - `POST /extract_from_file` – Downloads the file from S3, extracts its contents, and returns normalized `InformationPiece` records.
42
+ - `POST /extract_from_source` – Pulls from remote sources (Confluence, sitemap) using credentials and further optional kwargs.
43
+
44
+ Both endpoints stream their results back to `admin-api-lib`, which takes care of enrichment and persistence.
45
+
46
+ ## How the file extraction endpoint works
47
+
48
+ 1. Download the file from S3
49
+ 2. Choose suitable file extractor based on the filename ending
50
+ 3. Extract the content from the file
51
+ 4. Map the internal representation to the external schema
52
+ 5. Return the final output
53
+
54
+ ## How the source extraction endpoint works
55
+
56
+ 1. Choose suitable source extractor based on the source type
57
+ 2. Pull the source content using the provided credentials and parameters
58
+ 3. Extract the content from the source
59
+ 4. Map the internal representation to the external schema
60
+ 5. Return the final output
61
+
62
+ ## Configuration overview
63
+
64
+ Two `pydantic-settings` models ship with this package:
65
+
66
+ - **S3 storage** (`S3Settings`) – configure the built-in file service with `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, `S3_ENDPOINT`, and `S3_BUCKET`.
67
+ - **PDF extraction** (`PDFExtractorSettings`) – adjust footer trimming or diagram export via `PDF_EXTRACTOR_FOOTER_HEIGHT` and `PDF_EXTRACTOR_DIAGRAMS_FOLDER_NAME`.
68
+
69
+ Other extractors accept their parameters at runtime through the request payload (`ExtractionParameters`). For example, the admin backend forwards Confluence credentials, sitemap URLs, or custom headers when it calls `/extract_from_source`. This keeps the library stateless and makes it easy to plug in additional sources without redeploying.
70
+
71
+ The Helm chart exposes the environment variables mentioned above under `documentExtractor.envs.*` so production deployments remain declarative.
72
+
73
+ ## Typical usage
74
+
75
+ ```python
76
+ from extractor_api_lib.main import app as perfect_extractor_app
77
+ ```
78
+
79
+ `admin-api-lib` calls `/extract_from_file` and `/extract_from_source` to populate the ingestion pipeline.
80
+
81
+ ## Extending the library
82
+
83
+ 1. Implement `InformationFileExtractor` or `InformationExtractor` for your new format/source.
84
+ 2. Register it in `dependency_container.py` (append to `file_extractors` list or `source_extractors` dict).
85
+ 3. Update mapper or metadata handling if additional fields are required.
86
+ 4. Add unit tests under `libs/extractor-api-lib/tests` using fixtures and fake storage providers.
87
+
88
+ ## Contributing
89
+
90
+ Ensure new endpoints or adapters remain thin and defer to [`rag-core-lib`](../rag-core-lib/) for shared logic. Run `poetry run pytest` and the configured linters before opening a PR. For further instructions see the [Contributing Guide](https://github.com/stackitcloud/rag-template/blob/main/CONTRIBUTING.md).
91
+
92
+ ## License
93
+
94
+ Licensed under the project license. See the root [`LICENSE`](https://github.com/stackitcloud/rag-template/blob/main/LICENSE) file.
@@ -0,0 +1,152 @@
1
+ [build-system]
2
+ requires = ["poetry-core"]
3
+ build-backend = "poetry.core.masonry.api"
4
+
5
+ [tool.poetry]
6
+ name = "extractor-api-lib"
7
+ version = "v3.3.0"
8
+ description = "Extracts the content of documents, websites, etc and maps it to a common format."
9
+ authors = [
10
+ "STACKIT GmbH & Co. KG <data-ai@stackit.cloud>",
11
+ ]
12
+ maintainers = [
13
+ "Andreas Klos <andreas.klos@stackit.cloud>",
14
+ ]
15
+ packages = [{ include = "extractor_api_lib", from = "src" }]
16
+ readme = "README.md"
17
+ license = "Apache-2.0"
18
+ repository = "https://github.com/stackitcloud/rag-template"
19
+ homepage = "https://pypi.org/project/extractor-api-lib"
20
+
21
+ [[tool.poetry.source]]
22
+ name = "pytorch_cpu"
23
+ url = "https://download.pytorch.org/whl/cpu"
24
+ priority = "explicit"
25
+
26
+ [tool.flake8]
27
+ exclude = [".eggs", "./src/extractor_api_lib/models/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py", "tests/test_data/generate_test_pdfs.py"]
28
+ statistics = true
29
+ show-source = false
30
+ max-complexity = 10
31
+ max-annotations-complexity = 3
32
+ docstring-convention = 'numpy'
33
+ max-line-length = 120
34
+ ignore = ["E203", "W503", "E704"]
35
+ inline-quotes = '"'
36
+ docstring-quotes = '"""'
37
+ multiline-quotes = '"""'
38
+ dictionaries = ["en_US", "python", "technical", "pandas"]
39
+ ban-relative-imports = true
40
+ per-file-ignores = """
41
+ ./src/extractor_api_lib/apis/extractor_api.py: B008,WOT001,
42
+ ./src/extractor_api_lib/impl/extractor_api_impl.py: B008,
43
+ ./src/extractor_api_lib/container.py: CCE002,CCE001,
44
+ ./src/extractor_api_lib/apis/extractor_api_base.py: WOT001,
45
+ ./tests/*: S101,E501,
46
+ """
47
+
48
+ [tool.black]
49
+ line-length = 120
50
+ exclude = """
51
+ /(
52
+ | .eggs
53
+ | .git
54
+ | .hg
55
+ | .mypy_cache
56
+ | .nox
57
+ | .pants.d
58
+ | .tox
59
+ | .venv
60
+ | _build
61
+ | buck-out
62
+ | build
63
+ | dist
64
+ | node_modules
65
+ | venv
66
+ )/
67
+ """
68
+
69
+ [tool.isort]
70
+ profile = "black"
71
+ skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv']
72
+ skip_gitignore = true
73
+ known_local_folder = ["extractor_api_lib", "rag_core_lib"]
74
+
75
+ [tool.pylint]
76
+ max-line-length = 120
77
+
78
+ [tool.poetry.dependencies]
79
+ python = "^3.13"
80
+ wheel = "^0.45.1"
81
+ botocore = "^1.38.10"
82
+ fasttext = "^0.9.3"
83
+ pytesseract = "^0.3.10"
84
+ fastapi = "^0.118.0"
85
+ uvicorn = "^0.37.0"
86
+ dependency-injector = "^4.46.0"
87
+ pydantic-settings = "^2.9.1"
88
+ boto3 = "^1.38.10"
89
+ debugpy = "^1.8.14"
90
+ python-multipart = "^0.0.20"
91
+ oauthlib = "^3.2.2"
92
+ requests-oauthlib = "^2.0.0"
93
+ pdfplumber = "0.11.7"
94
+ opencv-python = "4.12.0.88"
95
+ pdf2image = "1.17.0"
96
+ datasets = "^3.5.1"
97
+ pandas = "^2.2.2"
98
+ tabulate = "^0.9.0"
99
+ lxml = "^5.4.0"
100
+ partial = "^1.0"
101
+ pyyaml = "^6.0.2"
102
+ numpy = "^2.2.5"
103
+ docx2txt = "^0.9"
104
+ unstructured = {extras = ["docx", "pptx"], version = "0.18.15"}
105
+ html5lib = "^1.1"
106
+ langchain-community = "^0.3.23"
107
+ atlassian-python-api = "^4.0.3"
108
+ markdownify = "^1.1.0"
109
+ langchain-core = "0.3.77"
110
+ camelot-py = {extras = ["cv"], version = "^1.0.0"}
111
+ fake-useragent = "^2.2.0"
112
+ pypdfium2 = "4.30.0"
113
+ pypandoc-binary = "^1.15"
114
+ starlette = ">=0.47.2,<0.49.0"
115
+
116
+ [tool.poetry.group.dev.dependencies]
117
+ pytest = "^8.3.5"
118
+ pytest-asyncio = "^0.26.0"
119
+ coverage = "^7.8.0"
120
+ flake8 = "^7.2.0"
121
+ flake8-black = "^0.4.0"
122
+ flake8-pyproject = "^1.2.3"
123
+ flake8-quotes = "^3.4.0"
124
+ flake8-return = "^1.2.0"
125
+ flake8-annotations-complexity = "^0.1.0"
126
+ flake8-bandit = "^4.1.1"
127
+ flake8-bugbear = "^24.12.12"
128
+ flake8-builtins = "^2.5.0"
129
+ flake8-comprehensions = "^3.15.0"
130
+ flake8-eradicate = "^1.5.0"
131
+ flake8-expression-complexity = "^0.0.11"
132
+ flake8-pytest-style = "^2.1.0"
133
+ pep8-naming = "^0.15.1"
134
+ flake8-eol = "^0.0.8"
135
+ flake8-exceptions = "^0.0.1a0"
136
+ flake8-simplify = "^0.22.0"
137
+ flake8-wot = "^0.2.0"
138
+ flake8-function-order = "^0.0.5"
139
+ flake8-tidy-imports = "^4.10.0"
140
+ black = "^25.1.0"
141
+ # flake8-logging-format = "^2024.24.12"
142
+ # flake8-docstrings = "^1.7.0"
143
+
144
+ [tool.poetry.group.tests.dependencies]
145
+ httpx = "^0.28.1"
146
+
147
+ [tool.pytest.ini_options]
148
+ log_cli = true
149
+ log_cli_level = "DEBUG"
150
+ pythonpath = "src"
151
+ testpaths = "src/tests"
152
+
@@ -0,0 +1,23 @@
1
+ from abc import ABC, abstractmethod
2
+ from extractor_api_lib.models.extraction_request import ExtractionRequest
3
+ from extractor_api_lib.models.information_piece import InformationPiece
4
+
5
+
6
class FileExtractor(ABC):
    """Abstract base class for the extract_from_file endpoint."""

    @abstractmethod
    async def aextract_information(self, extraction_request: ExtractionRequest) -> list[InformationPiece]:
        """
        Extract information of a document, given by the extraction_request.

        Concrete implementations (e.g. the general file extractor) are expected
        to fetch the referenced document, run the matching format-specific
        extractor, and return the normalized pieces.

        Parameters
        ----------
        extraction_request : ExtractionRequest
            The request containing the details of the document to be processed for information extraction.

        Returns
        -------
        list[InformationPiece]
            A list of extracted information pieces from the document.
        """
@@ -0,0 +1,27 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from extractor_api_lib.models.extraction_parameters import ExtractionParameters
4
+ from extractor_api_lib.models.information_piece import InformationPiece
5
+
6
+
7
class SourceExtractor(ABC):
    """Interface for the extract_from_source endpoint implementations."""

    @abstractmethod
    async def aextract_information(self, extraction_parameters: ExtractionParameters) -> list[InformationPiece]:
        """
        Pull and extract information from a remote source.

        Parameters
        ----------
        extraction_parameters : ExtractionParameters
            The parameters used to extract information from the source.

        Returns
        -------
        list[InformationPiece]
            A list of extracted information pieces.
        """
@@ -0,0 +1,105 @@
1
+ """Module for the Extractor API."""
2
+
3
+ # coding: utf-8
4
+
5
+ from typing import Dict, List # noqa: F401
6
+ import importlib
7
+ import pkgutil
8
+
9
+ from extractor_api_lib.apis.extractor_api_base import BaseExtractorApi
10
+ import extractor_api_lib.impl
11
+
12
+ from fastapi import ( # noqa: F401
13
+ APIRouter,
14
+ Body,
15
+ Cookie,
16
+ Depends,
17
+ Form,
18
+ Header,
19
+ HTTPException,
20
+ Path,
21
+ Query,
22
+ Response,
23
+ Security,
24
+ status,
25
+ )
26
+
27
+ from extractor_api_lib.models.extra_models import TokenModel # noqa: F401
28
+ from extractor_api_lib.models.extraction_parameters import ExtractionParameters
29
+ from extractor_api_lib.models.extraction_request import ExtractionRequest
30
+ from extractor_api_lib.models.information_piece import InformationPiece
31
+
32
+
33
router = APIRouter()

# Eagerly import every module of the extractor_api_lib.impl package so that
# the concrete BaseExtractorApi implementation defined there registers itself
# (BaseExtractorApi collects subclasses via __init_subclass__) before the
# routes below dispatch to BaseExtractorApi.subclasses[0].
ns_pkg = extractor_api_lib.impl
for _, name, _ in pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + "."):
    importlib.import_module(name)
38
+
39
+
40
@router.post(
    "/extract_from_file",
    responses={
        200: {"model": List[InformationPiece], "description": "List of extracted information."},
        422: {"description": "Body is not a valid PDF."},
        500: {"description": "Something somewhere went terribly wrong."},
    },
    tags=["extractor"],
    response_model_by_alias=True,
)
async def extract_from_file_post(
    extraction_request: ExtractionRequest = Body(None, description=""),
) -> List[InformationPiece]:
    """
    Extract information from a file based on the provided extraction request.

    Dispatches to the first registered BaseExtractorApi implementation.

    Parameters
    ----------
    extraction_request : ExtractionRequest
        The request object containing details about the extraction process.

    Returns
    -------
    List[InformationPiece]
        A list of extracted information pieces.
    """
    implementations = BaseExtractorApi.subclasses
    if not implementations:
        # No implementation module registered itself at import time.
        raise HTTPException(status_code=500, detail="Not implemented")
    handler = implementations[0]()
    return await handler.extract_from_file_post(extraction_request)
69
+
70
+
71
@router.post(
    "/extract_from_source",
    responses={
        200: {"model": List[InformationPiece], "description": "ok"},
        404: {"description": "not found"},
        422: {"description": "unprocessable entity"},
        500: {"description": "internal server error"},
    },
    tags=["extractor"],
    response_model_by_alias=True,
)
async def extract_from_source(
    extraction_parameters: ExtractionParameters = Body(None, description=""),
) -> List[InformationPiece]:
    """
    Extract information from a source based on the provided extraction parameters.

    Dispatches to the first registered BaseExtractorApi implementation.

    Parameters
    ----------
    extraction_parameters : ExtractionParameters, optional
        The request object containing details about the extraction process.

    Returns
    -------
    List[InformationPiece]
        A list of extracted information pieces.

    Raises
    ------
    HTTPException
        If the extraction process fails or encounters an error.
    """
    implementations = BaseExtractorApi.subclasses
    if not implementations:
        # No implementation module registered itself at import time.
        raise HTTPException(status_code=500, detail="Not implemented")
    handler = implementations[0]()
    return await handler.extract_from_source(extraction_parameters)
@@ -0,0 +1,62 @@
1
+ """Module for the base ExtractorApi interface."""
2
+
3
+ # coding: utf-8
4
+
5
+ from typing import ClassVar, Dict, List, Tuple # noqa: F401
6
+
7
+ from extractor_api_lib.models.extraction_parameters import ExtractionParameters
8
+ from extractor_api_lib.models.extraction_request import ExtractionRequest
9
+ from extractor_api_lib.models.information_piece import InformationPiece
10
+
11
+
12
class BaseExtractorApi:
    """
    The base ExtractorApi interface.

    Concrete implementations register themselves automatically: every subclass
    is appended to ``subclasses`` at class-creation time, and the API routes
    dispatch to the first registered implementation.

    Attributes
    ----------
    subclasses : ClassVar[Tuple]
        A tuple containing all subclasses of BaseExtractorApi.
    """

    subclasses: ClassVar[Tuple] = ()

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # Record the new subclass so the routes can find an implementation.
        BaseExtractorApi.subclasses = (*BaseExtractorApi.subclasses, cls)

    async def extract_from_file_post(self, extraction_request: ExtractionRequest) -> List[InformationPiece]:
        """
        Extract information from a file based on the provided extraction request.

        Parameters
        ----------
        extraction_request : ExtractionRequest
            The request object containing details about the extraction process.

        Returns
        -------
        List[InformationPiece]
            A list of extracted information pieces.
        """

    async def extract_from_source(self, extraction_parameters: ExtractionParameters) -> List[InformationPiece]:
        """
        Extract information from a source based on the provided extraction request.

        Parameters
        ----------
        extraction_parameters : ExtractionParameters
            The parameters required to access and extract information from the source.

        Returns
        -------
        List[InformationPiece]
            A list of extracted information pieces.
        """