admin-api-lib 3.3.0__tar.gz → 4.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/PKG-INFO +9 -6
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/README.md +2 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/pyproject.toml +21 -17
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/file_uploader.py +2 -1
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/source_uploader.py +1 -1
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/uploader_base.py +1 -3
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/apis/admin_api.py +2 -2
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/dependency_container.py +1 -1
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/api_endpoints/default_document_reference_retriever.py +3 -5
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/api_endpoints/default_file_uploader.py +12 -14
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/api_endpoints/default_source_uploader.py +14 -8
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/chunker/text_chunker.py +1 -1
- admin_api_lib-4.0.0/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +113 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/key_db/file_status_key_value_store.py +48 -2
- admin_api_lib-4.0.0/src/admin_api_lib/impl/settings/key_value_settings.py +50 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/summarizer/langchain_summarizer.py +49 -4
- admin_api_lib-3.3.0/src/admin_api_lib/impl/information_enhancer/page_summary_enhancer.py +0 -62
- admin_api_lib-3.3.0/src/admin_api_lib/impl/settings/key_value_settings.py +0 -26
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/document_deleter.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/document_reference_retriever.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/documents_status_retriever.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/apis/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/apis/admin_api_base.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/chunker/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/chunker/chunker.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/api/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/api/extractor_api.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/api_client.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/api_response.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/configuration.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/exceptions.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/models/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/models/content_type.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_parameters.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/models/extraction_request.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/models/information_piece.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/models/key_value_pair.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/rest.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/test_content_type.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_parameters.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extraction_request.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/test_extractor_api.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/test_information_piece.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/openapi_client/test/test_key_value_pair.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/file_services/file_service.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/admin_api.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/api_endpoints/default_document_deleter.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/api_endpoints/default_documents_status_retriever.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/chunker/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/chunker/chunker_type.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/chunker/semantic_text_chunker.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/file_services/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/file_services/s3_service.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/information_enhancer/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/information_enhancer/general_enhancer.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/information_enhancer/summary_enhancer.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/key_db/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/mapper/informationpiece2document.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/chunker_class_type_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/chunker_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/document_extractor_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/rag_api_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/s3_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/source_uploader_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/summarizer_settings.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/summarizer/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/information_enhancer/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/information_enhancer/information_enhancer.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/main.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/document_status.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/extra_models.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/http_validation_error.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/key_value_pair.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/status.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/validation_error.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/validation_error_loc_inner.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/prompt_templates/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/prompt_templates/summarize_prompt.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/api/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/api/rag_api.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/api_client.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/api_response.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/configuration.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/exceptions.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/chat_history.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/chat_history_message.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/chat_request.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/chat_response.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/chat_role.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/content_type.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/delete_request.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/information_piece.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/models/key_value_pair.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/openapi_client/rest.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/summarizer/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/summarizer/summarizer.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/utils/__init__.py +0 -0
- {admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/utils/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: admin-api-lib
|
|
3
|
-
Version: 3.3.0
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: The admin backend is responsible for the document management. This includes deletion, upload and returning the source document.
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: STACKIT GmbH & Co. KG
|
|
@@ -13,16 +13,17 @@ Classifier: Programming Language :: Python :: 3
|
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
14
14
|
Requires-Dist: boto3 (>=1.38.10,<2.0.0)
|
|
15
15
|
Requires-Dist: dependency-injector (>=4.46.0,<5.0.0)
|
|
16
|
-
Requires-Dist: fastapi (>=0.
|
|
17
|
-
Requires-Dist: langchain-experimental (>=0.
|
|
18
|
-
Requires-Dist: langfuse (
|
|
16
|
+
Requires-Dist: fastapi (>=0.121.2,<0.122.0)
|
|
17
|
+
Requires-Dist: langchain-experimental (>=0.4.0,<0.5.0)
|
|
18
|
+
Requires-Dist: langfuse (>=3.10.1,<4.0.0)
|
|
19
|
+
Requires-Dist: langgraph-checkpoint (>=3.0.0,<4.0.0)
|
|
19
20
|
Requires-Dist: nltk (>=3.9.2,<4.0.0)
|
|
20
21
|
Requires-Dist: python-dateutil (>=2.9.0.post0,<3.0.0)
|
|
21
22
|
Requires-Dist: python-multipart (>=0.0.20,<0.0.21)
|
|
22
23
|
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
|
|
23
|
-
Requires-Dist: rag-core-lib (
|
|
24
|
+
Requires-Dist: rag-core-lib (==4.0.0)
|
|
24
25
|
Requires-Dist: redis (>=6.0.0,<7.0.0)
|
|
25
|
-
Requires-Dist: starlette (>=0.
|
|
26
|
+
Requires-Dist: starlette (>=0.49.1)
|
|
26
27
|
Requires-Dist: tenacity (==9.1.2)
|
|
27
28
|
Requires-Dist: tqdm (>=4.67.1,<5.0.0)
|
|
28
29
|
Requires-Dist: uvicorn (>=0.37.0,<0.38.0)
|
|
@@ -99,6 +100,8 @@ All settings are powered by `pydantic-settings`, so you can use environment vari
|
|
|
99
100
|
- `SUMMARIZER_MAXIMUM_INPUT_SIZE`, `SUMMARIZER_MAXIMUM_CONCURRENCY`, `SUMMARIZER_MAX_RETRIES`, etc. – tune summariser limits and retry behaviour.
|
|
100
101
|
- `SOURCE_UPLOADER_TIMEOUT` – adjust how long non-file source ingestions wait before timing out.
|
|
101
102
|
- `USECASE_KEYVALUE_HOST` / `USECASE_KEYVALUE_PORT` – configure the KeyDB/Redis instance that persists document status.
|
|
103
|
+
- `USECASE_KEYVALUE_USERNAME` / `USECASE_KEYVALUE_PASSWORD` – optional credentials for authenticating against KeyDB/Redis.
|
|
104
|
+
- `USECASE_KEYVALUE_USE_SSL`, `USECASE_KEYVALUE_SSL_CERT_REQS`, `USECASE_KEYVALUE_SSL_CA_CERTS`, `USECASE_KEYVALUE_SSL_CERTFILE`, `USECASE_KEYVALUE_SSL_KEYFILE`, `USECASE_KEYVALUE_SSL_CHECK_HOSTNAME` – optional TLS settings for managed Redis deployments (e.g., STACKIT Redis or other SSL-only endpoints).
|
|
102
105
|
|
|
103
106
|
The Helm chart forwards these values through `adminBackend.envs.*`, keeping deployments declarative. Local development can rely on `.env` as described in the repository root README.
|
|
104
107
|
|
|
@@ -67,6 +67,8 @@ All settings are powered by `pydantic-settings`, so you can use environment vari
|
|
|
67
67
|
- `SUMMARIZER_MAXIMUM_INPUT_SIZE`, `SUMMARIZER_MAXIMUM_CONCURRENCY`, `SUMMARIZER_MAX_RETRIES`, etc. – tune summariser limits and retry behaviour.
|
|
68
68
|
- `SOURCE_UPLOADER_TIMEOUT` – adjust how long non-file source ingestions wait before timing out.
|
|
69
69
|
- `USECASE_KEYVALUE_HOST` / `USECASE_KEYVALUE_PORT` – configure the KeyDB/Redis instance that persists document status.
|
|
70
|
+
- `USECASE_KEYVALUE_USERNAME` / `USECASE_KEYVALUE_PASSWORD` – optional credentials for authenticating against KeyDB/Redis.
|
|
71
|
+
- `USECASE_KEYVALUE_USE_SSL`, `USECASE_KEYVALUE_SSL_CERT_REQS`, `USECASE_KEYVALUE_SSL_CA_CERTS`, `USECASE_KEYVALUE_SSL_CERTFILE`, `USECASE_KEYVALUE_SSL_KEYFILE`, `USECASE_KEYVALUE_SSL_CHECK_HOSTNAME` – optional TLS settings for managed Redis deployments (e.g., STACKIT Redis or other SSL-only endpoints).
|
|
70
72
|
|
|
71
73
|
The Helm chart forwards these values through `adminBackend.envs.*`, keeping deployments declarative. Local development can rely on `.env` as described in the repository root README.
|
|
72
74
|
|
|
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
|
|
4
4
|
|
|
5
5
|
[tool.poetry]
|
|
6
6
|
name = "admin-api-lib"
|
|
7
|
-
version = "3.3.0"
|
|
7
|
+
version = "4.0.0"
|
|
8
8
|
description = "The admin backend is responsible for the document management. This includes deletion, upload and returning the source document."
|
|
9
9
|
authors = [
|
|
10
10
|
"STACKIT GmbH & Co. KG <data-ai@stackit.cloud>",
|
|
@@ -18,6 +18,11 @@ license = "Apache-2.0"
|
|
|
18
18
|
repository = "https://github.com/stackitcloud/rag-template"
|
|
19
19
|
homepage = "https://pypi.org/project/admin-api-lib"
|
|
20
20
|
|
|
21
|
+
[[tool.poetry.source]]
|
|
22
|
+
name = "testpypi"
|
|
23
|
+
url = "https://test.pypi.org/simple/"
|
|
24
|
+
priority = "explicit"
|
|
25
|
+
|
|
21
26
|
[tool.flake8]
|
|
22
27
|
exclude= [".eggs", "./libs/*", "./src/admin_api_lib/models/*", "./src/admin_api_lib/rag_backend_client/*", "./src/admin_api_lib/extractor_api_client/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist", "**/__init__.py"]
|
|
23
28
|
statistics = true
|
|
@@ -74,10 +79,12 @@ known_local_folder = ["admin_api_lib", "rag_core_lib"]
|
|
|
74
79
|
[tool.pylint]
|
|
75
80
|
max-line-length = 120
|
|
76
81
|
|
|
77
|
-
[tool.poetry.group.
|
|
78
|
-
debugpy = "^1.8.14"
|
|
82
|
+
[tool.poetry.group.test.dependencies]
|
|
79
83
|
pytest = "^8.3.5"
|
|
84
|
+
pytest-asyncio = "^1.0.0"
|
|
80
85
|
coverage = "^7.8.0"
|
|
86
|
+
|
|
87
|
+
[tool.poetry.group.lint.dependencies]
|
|
81
88
|
flake8 = "^7.2.0"
|
|
82
89
|
flake8-black = "^0.4.0"
|
|
83
90
|
flake8-pyproject = "^1.2.3"
|
|
@@ -99,36 +106,33 @@ flake8-wot = "^0.2.0"
|
|
|
99
106
|
flake8-function-order = "^0.0.5"
|
|
100
107
|
flake8-tidy-imports = "^4.10.0"
|
|
101
108
|
black = "^25.1.0"
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
109
|
+
flake8-logging-format = "^2024.24.12"
|
|
110
|
+
flake8-docstrings = "^1.7.0"
|
|
111
|
+
|
|
112
|
+
[tool.poetry.group.dev.dependencies]
|
|
113
|
+
debugpy = "^1.8.14"
|
|
105
114
|
|
|
106
115
|
[tool.poetry.dependencies]
|
|
107
|
-
rag-core-lib = "
|
|
116
|
+
rag-core-lib = "==4.0.0"
|
|
108
117
|
python = "^3.13"
|
|
109
118
|
uvicorn = "^0.37.0"
|
|
110
|
-
fastapi = "^0.
|
|
119
|
+
fastapi = "^0.121.2"
|
|
111
120
|
dependency-injector = "^4.46.0"
|
|
112
121
|
python-dateutil = "^2.9.0.post0"
|
|
113
122
|
tenacity = "9.1.2"
|
|
114
123
|
boto3 = "^1.38.10"
|
|
115
124
|
tqdm = "^4.67.1"
|
|
116
|
-
langfuse = "3.
|
|
125
|
+
langfuse = "^3.10.1"
|
|
117
126
|
redis = "^6.0.0"
|
|
118
127
|
pyyaml = "^6.0.2"
|
|
119
128
|
python-multipart = "^0.0.20"
|
|
120
|
-
|
|
121
|
-
langchain-experimental = "^0.3.4"
|
|
129
|
+
langchain-experimental = "^0.4.0"
|
|
122
130
|
nltk = "^3.9.2"
|
|
131
|
+
starlette = ">=0.49.1"
|
|
132
|
+
langgraph-checkpoint = ">=3.0.0,<4.0.0"
|
|
123
133
|
|
|
124
134
|
[tool.pytest.ini_options]
|
|
125
135
|
log_cli = true
|
|
126
136
|
log_cli_level = "DEBUG"
|
|
127
137
|
pythonpath = "src"
|
|
128
138
|
testpaths = "src/tests"
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
[[tool.poetry.source]]
|
|
132
|
-
name = "testpypi"
|
|
133
|
-
url = "https://test.pypi.org/simple/"
|
|
134
|
-
priority = "supplemental"
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/file_uploader.py
RENAMED
|
@@ -8,6 +8,7 @@ from admin_api_lib.api_endpoints.uploader_base import UploaderBase
|
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class FileUploader(UploaderBase):
|
|
11
|
+
"""File uploader endpoint of the admin API."""
|
|
11
12
|
|
|
12
13
|
@abstractmethod
|
|
13
14
|
async def upload_file(
|
|
@@ -16,7 +17,7 @@ class FileUploader(UploaderBase):
|
|
|
16
17
|
file: UploadFile,
|
|
17
18
|
) -> None:
|
|
18
19
|
"""
|
|
19
|
-
|
|
20
|
+
Upload a source file for content extraction.
|
|
20
21
|
|
|
21
22
|
Parameters
|
|
22
23
|
----------
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/uploader_base.py
RENAMED
|
@@ -7,9 +7,7 @@ class UploaderBase:
|
|
|
7
7
|
"""Base class for uploader API endpoints."""
|
|
8
8
|
|
|
9
9
|
def __init__(self):
|
|
10
|
-
"""
|
|
11
|
-
Initialize the UploaderBase.
|
|
12
|
-
"""
|
|
10
|
+
"""Initialize the UploaderBase."""
|
|
13
11
|
self._background_threads = []
|
|
14
12
|
|
|
15
13
|
def _prune_background_threads(self) -> list[Thread]:
|
|
@@ -149,7 +149,7 @@ async def upload_file(
|
|
|
149
149
|
request: Request,
|
|
150
150
|
) -> None:
|
|
151
151
|
"""
|
|
152
|
-
|
|
152
|
+
Upload user selected sources.
|
|
153
153
|
|
|
154
154
|
Parameters
|
|
155
155
|
----------
|
|
@@ -181,7 +181,7 @@ async def upload_source(
|
|
|
181
181
|
key_value_pair: List[KeyValuePair] = Body(None, description="The key-value pairs for the source"),
|
|
182
182
|
) -> None:
|
|
183
183
|
"""
|
|
184
|
-
|
|
184
|
+
Upload user selected sources.
|
|
185
185
|
|
|
186
186
|
Parameters
|
|
187
187
|
----------
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
from admin_api_lib.impl.api_endpoints.default_file_uploader import DefaultFileUploader
|
|
4
4
|
from dependency_injector.containers import DeclarativeContainer
|
|
5
5
|
from dependency_injector.providers import Configuration, List, Selector, Singleton
|
|
6
|
-
from
|
|
6
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
7
7
|
from langchain_community.embeddings import OllamaEmbeddings
|
|
8
8
|
from langfuse import Langfuse
|
|
9
9
|
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
import io
|
|
4
4
|
import logging
|
|
5
|
-
import traceback
|
|
6
5
|
|
|
7
6
|
from fastapi import HTTPException, Response, status
|
|
8
7
|
|
|
@@ -54,10 +53,9 @@ class DefaultDocumentReferenceRetriever(DocumentReferenceRetriever):
|
|
|
54
53
|
self._file_service.download_file(identification, document_buffer)
|
|
55
54
|
logger.debug("DONE retrieving document with id: %s", identification)
|
|
56
55
|
document_data = document_buffer.getvalue()
|
|
57
|
-
except Exception
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
)
|
|
56
|
+
except Exception:
|
|
57
|
+
# Log full stack trace without embedding the exception object in the message (G200)
|
|
58
|
+
logger.exception("Error retrieving document with id: %s.", identification)
|
|
61
59
|
raise ValueError(f"Document with id '{identification}' not found.")
|
|
62
60
|
finally:
|
|
63
61
|
document_buffer.close()
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
+
"""Module for the default file uploader implementation."""
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
from pathlib import Path
|
|
3
|
-
import traceback
|
|
4
5
|
import urllib
|
|
5
6
|
import tempfile
|
|
6
7
|
import asyncio
|
|
@@ -78,7 +79,7 @@ class DefaultFileUploader(FileUploader):
|
|
|
78
79
|
file: UploadFile,
|
|
79
80
|
) -> None:
|
|
80
81
|
"""
|
|
81
|
-
|
|
82
|
+
Upload a source file for content extraction.
|
|
82
83
|
|
|
83
84
|
Parameters
|
|
84
85
|
----------
|
|
@@ -109,7 +110,7 @@ class DefaultFileUploader(FileUploader):
|
|
|
109
110
|
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
|
|
110
111
|
except Exception as e:
|
|
111
112
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
112
|
-
logger.
|
|
113
|
+
logger.exception("Error while uploading %s", source_name)
|
|
113
114
|
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
|
|
114
115
|
|
|
115
116
|
def _log_task_exception(self, task: asyncio.Task) -> None:
|
|
@@ -124,19 +125,16 @@ class DefaultFileUploader(FileUploader):
|
|
|
124
125
|
if task.done() and not task.cancelled():
|
|
125
126
|
try:
|
|
126
127
|
task.result() # This will raise the exception if one occurred
|
|
127
|
-
except Exception
|
|
128
|
-
logger.
|
|
129
|
-
logger.debug("Background task exception traceback: %s", traceback.format_exc())
|
|
128
|
+
except Exception:
|
|
129
|
+
logger.exception("Background task failed with exception.")
|
|
130
130
|
|
|
131
131
|
def _prune_background_tasks(self) -> None:
|
|
132
|
-
"""
|
|
133
|
-
Remove completed background tasks from the list.
|
|
134
|
-
"""
|
|
132
|
+
"""Remove completed background tasks from the list."""
|
|
135
133
|
self._background_tasks = [task for task in self._background_tasks if not task.done()]
|
|
136
134
|
|
|
137
135
|
def _check_if_already_in_processing(self, source_name: str) -> None:
|
|
138
136
|
"""
|
|
139
|
-
|
|
137
|
+
Check if the source is already in processing state.
|
|
140
138
|
|
|
141
139
|
Parameters
|
|
142
140
|
----------
|
|
@@ -196,9 +194,9 @@ class DefaultFileUploader(FileUploader):
|
|
|
196
194
|
await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
|
|
197
195
|
self._key_value_store.upsert(source_name, Status.READY)
|
|
198
196
|
logger.info("Source uploaded successfully: %s", source_name)
|
|
199
|
-
except Exception
|
|
197
|
+
except Exception:
|
|
200
198
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
201
|
-
logger.
|
|
199
|
+
logger.exception("Error while uploading %s", source_name)
|
|
202
200
|
|
|
203
201
|
def _add_file_url(self, file_name: str, base_url: str, chunked_documents: list[Document]):
|
|
204
202
|
document_url = f"{base_url.rstrip('/')}/document_reference/{urllib.parse.quote_plus(file_name)}"
|
|
@@ -229,6 +227,6 @@ class DefaultFileUploader(FileUploader):
|
|
|
229
227
|
|
|
230
228
|
self._file_service.upload_file(Path(temp_file_path), filename)
|
|
231
229
|
return filename
|
|
232
|
-
except Exception
|
|
233
|
-
logger.
|
|
230
|
+
except Exception:
|
|
231
|
+
logger.exception("Error during document saving")
|
|
234
232
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
"""Module for the default source uploader implementation."""
|
|
2
|
+
|
|
1
3
|
import logging
|
|
2
4
|
import asyncio
|
|
3
5
|
from threading import Thread
|
|
@@ -28,6 +30,7 @@ logger = logging.getLogger(__name__)
|
|
|
28
30
|
|
|
29
31
|
|
|
30
32
|
class DefaultSourceUploader(SourceUploader):
|
|
33
|
+
"""The DefaultSourceUploader is responsible for uploading source files for content extraction."""
|
|
31
34
|
|
|
32
35
|
def __init__(
|
|
33
36
|
self,
|
|
@@ -78,7 +81,7 @@ class DefaultSourceUploader(SourceUploader):
|
|
|
78
81
|
kwargs: list[KeyValuePair],
|
|
79
82
|
) -> None:
|
|
80
83
|
"""
|
|
81
|
-
|
|
84
|
+
Upload the parameters for source content extraction.
|
|
82
85
|
|
|
83
86
|
Parameters
|
|
84
87
|
----------
|
|
@@ -95,7 +98,6 @@ class DefaultSourceUploader(SourceUploader):
|
|
|
95
98
|
-------
|
|
96
99
|
None
|
|
97
100
|
"""
|
|
98
|
-
|
|
99
101
|
self._prune_background_threads()
|
|
100
102
|
|
|
101
103
|
source_name = f"{source_type}:{sanitize_document_name(name)}"
|
|
@@ -111,12 +113,12 @@ class DefaultSourceUploader(SourceUploader):
|
|
|
111
113
|
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(e))
|
|
112
114
|
except Exception as e:
|
|
113
115
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
114
|
-
logger.
|
|
116
|
+
logger.exception("Error while uploading %s", source_name)
|
|
115
117
|
raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
|
|
116
118
|
|
|
117
119
|
def _check_if_already_in_processing(self, source_name: str) -> None:
|
|
118
120
|
"""
|
|
119
|
-
|
|
121
|
+
Check if the source is already in processing state.
|
|
120
122
|
|
|
121
123
|
Parameters
|
|
122
124
|
----------
|
|
@@ -147,10 +149,14 @@ class DefaultSourceUploader(SourceUploader):
|
|
|
147
149
|
)
|
|
148
150
|
)
|
|
149
151
|
except asyncio.TimeoutError:
|
|
150
|
-
logger.error(
|
|
152
|
+
logger.error(
|
|
153
|
+
"Upload of %s timed out after %s seconds (increase SOURCE_UPLOADER_TIMEOUT to allow longer ingestions)",
|
|
154
|
+
source_name,
|
|
155
|
+
timeout,
|
|
156
|
+
)
|
|
151
157
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
152
158
|
except Exception:
|
|
153
|
-
logger.
|
|
159
|
+
logger.exception("Error while uploading %s", source_name)
|
|
154
160
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
155
161
|
finally:
|
|
156
162
|
loop.close()
|
|
@@ -197,6 +203,6 @@ class DefaultSourceUploader(SourceUploader):
|
|
|
197
203
|
await asyncio.to_thread(self._rag_api.upload_information_piece, rag_information_pieces)
|
|
198
204
|
self._key_value_store.upsert(source_name, Status.READY)
|
|
199
205
|
logger.info("Source uploaded successfully: %s", source_name)
|
|
200
|
-
except Exception
|
|
206
|
+
except Exception:
|
|
201
207
|
self._key_value_store.upsert(source_name, Status.ERROR)
|
|
202
|
-
logger.
|
|
208
|
+
logger.exception("Error while uploading %s", source_name)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Module containing the TextChunker class."""
|
|
2
2
|
|
|
3
|
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
4
3
|
from langchain_core.documents import Document
|
|
4
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
5
5
|
|
|
6
6
|
from admin_api_lib.chunker.chunker import Chunker
|
|
7
7
|
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Module for enhancing the summary of pages by grouping information by page and summarizing each page."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from hashlib import sha256
|
|
5
|
+
from typing import Optional
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from langchain_core.documents import Document
|
|
9
|
+
from langchain_core.runnables import RunnableConfig
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
|
|
12
|
+
from admin_api_lib.impl.information_enhancer.summary_enhancer import SummaryEnhancer
|
|
13
|
+
from rag_core_lib.impl.data_types.content_type import ContentType
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PageSummaryEnhancer(SummaryEnhancer):
    """
    Enhances the summary of pages by grouping information by page and summarizing each page.

    Attributes
    ----------
    BASE64_IMAGE_KEY : str
        Key used to identify base64 encoded images in metadata.
    DOCUMENT_URL_KEY : str
        Metadata key under which the URL of the originating document is looked up.
    DEFAULT_PAGE_NR : int
        Default page number used when no page metadata is available.
    """

    BASE64_IMAGE_KEY = "base64_image"
    DOCUMENT_URL_KEY = "document_url"
    DEFAULT_PAGE_NR = 1

    @staticmethod
    def _parse_max_concurrency(config: Optional[RunnableConfig]) -> int:
        """
        Read ``max_concurrency`` from the config and normalise it to an integer >= 1.

        Parameters
        ----------
        config : Optional[RunnableConfig]
            The runnable config; may be ``None`` or lack the ``max_concurrency`` entry.

        Returns
        -------
        int
            The configured concurrency clamped to at least 1, or 1 when the value is
            missing or cannot be converted to an integer.
        """
        if not config:
            return 1
        raw = config.get("max_concurrency")
        if raw is None:
            return 1
        try:
            return max(1, int(raw))
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers values such as float("inf"), which int() rejects.
            return 1

    def _group_key(self, piece: Document) -> tuple[Any, ...]:
        """
        Build the key that decides which pieces are summarized together.

        Parameters
        ----------
        piece : Document
            The piece whose ``document_url`` and ``page`` metadata are inspected.

        Returns
        -------
        tuple[Any, ...]
            ``("page_number", document_url, page)`` for usable page values,
            ``("document_url", document_url)`` when only a URL is usable, otherwise
            ``("page", page)``.
        """
        document_url = piece.metadata.get(self.DOCUMENT_URL_KEY)
        page = piece.metadata.get("page", self.DEFAULT_PAGE_NR)

        # For paged documents (PDF/docling/etc.) keep per-page summaries even if a shared document URL exists.
        if isinstance(page, int) or (isinstance(page, str) and page != "Unknown Title"):
            return ("page_number", document_url, page)

        # For sources like sitemaps/confluence, `page` can be a non-unique title (or missing),
        # so group by the page URL when available to ensure one summary per page.
        if document_url:
            return ("document_url", document_url)

        return ("page", page)

    async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document:
        """
        Summarize one group of pieces into a single summary document.

        Parameters
        ----------
        page_pieces : list[Document]
            Non-empty list of pieces belonging to the same group.
        config : Optional[RunnableConfig]
            Config forwarded to the summarizer.

        Returns
        -------
        Document
            A document holding the summary text. Its metadata is copied from the first
            piece (without the base64 image), gets a content-hash ``id``, links all
            pieces through ``related`` and is typed as a summary.
        """
        full_page_content = " ".join([piece.page_content for piece in page_pieces])
        summary = await self._summarizer.ainvoke(full_page_content, config)
        meta = {key: value for key, value in page_pieces[0].metadata.items() if key != self.BASE64_IMAGE_KEY}
        meta["id"] = sha256(str.encode(full_page_content)).hexdigest()
        # Tolerate pieces without a "related" entry instead of failing with a KeyError.
        related = list(meta.get("related") or [])
        related.extend(piece.metadata["id"] for piece in page_pieces)
        # dict.fromkeys de-duplicates while keeping a stable, reproducible order.
        meta["related"] = list(dict.fromkeys(related))
        meta["type"] = ContentType.SUMMARY.value

        return Document(metadata=meta, page_content=summary)

    async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
        """
        Group the given pieces and create one summary document per group.

        Parameters
        ----------
        information : list[Document]
            The pieces to summarize.
        config : Optional[RunnableConfig]
            Config forwarded to the summarizer; ``max_concurrency`` bounds parallelism.

        Returns
        -------
        list[Document]
            The summaries, in order of first appearance of each group.
        """
        grouped = self._group_information(information)
        max_concurrency = self._parse_max_concurrency(config)
        return await self._summarize_groups(grouped, config, max_concurrency=max_concurrency)

    def _group_information(self, information: list[Document]) -> list[list[Document]]:
        """
        Partition the pieces by their group key, keeping first-seen order of the groups.

        Parameters
        ----------
        information : list[Document]
            The pieces to partition.

        Returns
        -------
        list[list[Document]]
            One list of pieces per distinct group key.
        """
        # dicts keep insertion order, so the groups come out in first-seen order.
        groups: dict[tuple[Any, ...], list[Document]] = {}
        for info in information:
            groups.setdefault(self._group_key(info), []).append(info)
        return list(groups.values())

    async def _summarize_groups(
        self,
        grouped: list[list[Document]],
        config: Optional[RunnableConfig],
        *,
        max_concurrency: int,
    ) -> list[Document]:
        """
        Summarize every group, sequentially or with bounded concurrency.

        Parameters
        ----------
        grouped : list[list[Document]]
            The groups to summarize.
        config : Optional[RunnableConfig]
            Config forwarded to the summarizer.
        max_concurrency : int
            Number of groups summarized at the same time; 1 means strictly sequential.

        Returns
        -------
        list[Document]
            The summaries in the same order as ``grouped``.
        """
        if max_concurrency == 1:
            summaries: list[Document] = []
            for info_group in tqdm(grouped):
                summaries.append(await self._asummarize_page(info_group, config))
            return summaries

        semaphore = asyncio.Semaphore(max_concurrency)
        results: list[Document | None] = [None] * len(grouped)

        async def _run(idx: int, info_group: list[Document]) -> tuple[int, Document]:
            async with semaphore:
                return idx, await self._asummarize_page(info_group, config)

        tasks = [asyncio.create_task(_run(idx, info_group)) for idx, info_group in enumerate(grouped)]
        try:
            with tqdm(total=len(tasks)) as pbar:
                for finished in asyncio.as_completed(tasks):
                    # Results arrive in completion order; the index restores input order.
                    idx, summary = await finished
                    results[idx] = summary
                    pbar.update(1)
        finally:
            # If one summary failed (or this coroutine was cancelled), do not leave the
            # remaining tasks running in the background.
            unfinished = [task for task in tasks if not task.done()]
            for task in unfinished:
                task.cancel()
            if unfinished:
                await asyncio.gather(*unfinished, return_exceptions=True)

        return [summary for summary in results if summary is not None]
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
"""Module containing the FileStatusKeyValueStore class."""
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import ssl
|
|
5
|
+
from typing import Any
|
|
4
6
|
|
|
5
7
|
from redis import Redis
|
|
6
8
|
|
|
@@ -37,9 +39,53 @@ class FileStatusKeyValueStore:
|
|
|
37
39
|
Parameters
|
|
38
40
|
----------
|
|
39
41
|
settings : KeyValueSettings
|
|
40
|
-
The settings object containing the
|
|
42
|
+
The settings object containing the connection information for the Redis connection.
|
|
41
43
|
"""
|
|
42
|
-
|
|
44
|
+
redis_kwargs: dict[str, Any] = {
|
|
45
|
+
"host": settings.host,
|
|
46
|
+
"port": settings.port,
|
|
47
|
+
"decode_responses": True,
|
|
48
|
+
**self._build_ssl_kwargs(settings),
|
|
49
|
+
}
|
|
50
|
+
if settings.username:
|
|
51
|
+
redis_kwargs["username"] = settings.username
|
|
52
|
+
if settings.password:
|
|
53
|
+
redis_kwargs["password"] = settings.password
|
|
54
|
+
|
|
55
|
+
self._redis = Redis(**redis_kwargs)
|
|
56
|
+
|
|
57
|
+
@staticmethod
|
|
58
|
+
def _build_ssl_kwargs(settings: KeyValueSettings) -> dict[str, Any]:
|
|
59
|
+
"""Build Redis SSL settings from configuration, mapping string values to ssl constants."""
|
|
60
|
+
if not settings.use_ssl:
|
|
61
|
+
return {}
|
|
62
|
+
|
|
63
|
+
cert_reqs_map = {
|
|
64
|
+
"required": ssl.CERT_REQUIRED,
|
|
65
|
+
"optional": ssl.CERT_OPTIONAL,
|
|
66
|
+
"none": ssl.CERT_NONE,
|
|
67
|
+
"cert_required": ssl.CERT_REQUIRED,
|
|
68
|
+
"cert_optional": ssl.CERT_OPTIONAL,
|
|
69
|
+
"cert_none": ssl.CERT_NONE,
|
|
70
|
+
}
|
|
71
|
+
ssl_cert_reqs = None
|
|
72
|
+
if settings.ssl_cert_reqs:
|
|
73
|
+
ssl_cert_reqs = cert_reqs_map.get(settings.ssl_cert_reqs.lower(), settings.ssl_cert_reqs)
|
|
74
|
+
|
|
75
|
+
ssl_kwargs: dict[str, Any] = {
|
|
76
|
+
"ssl": settings.use_ssl,
|
|
77
|
+
"ssl_check_hostname": settings.ssl_check_hostname,
|
|
78
|
+
}
|
|
79
|
+
if ssl_cert_reqs is not None:
|
|
80
|
+
ssl_kwargs["ssl_cert_reqs"] = ssl_cert_reqs
|
|
81
|
+
if settings.ssl_ca_certs:
|
|
82
|
+
ssl_kwargs["ssl_ca_certs"] = settings.ssl_ca_certs
|
|
83
|
+
if settings.ssl_certfile:
|
|
84
|
+
ssl_kwargs["ssl_certfile"] = settings.ssl_certfile
|
|
85
|
+
if settings.ssl_keyfile:
|
|
86
|
+
ssl_kwargs["ssl_keyfile"] = settings.ssl_keyfile
|
|
87
|
+
|
|
88
|
+
return ssl_kwargs
|
|
43
89
|
|
|
44
90
|
@staticmethod
|
|
45
91
|
def _to_str(file_name: str, file_status: Status) -> str:
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Contains settings regarding the key value store."""
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
from pydantic_settings import BaseSettings
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class KeyValueSettings(BaseSettings):
    """
    Connection settings for the key value store.

    Attributes
    ----------
    host : str
        Hostname of the key value store (required).
    port : int
        Port number of the key value store (required).
    username : str | None
        Username for authentication, if any.
    password : str | None
        Password for authentication, if any.
    use_ssl : bool
        Whether the connection uses SSL/TLS. Disabled by default.
    ssl_cert_reqs : str | None
        Certificate requirement level (e.g., 'required', 'optional', 'none').
    ssl_ca_certs : str | None
        Path to the CA bundle file used to verify the server certificate.
    ssl_certfile : str | None
        Path to the client certificate file (for mutual TLS).
    ssl_keyfile : str | None
        Path to the client private key file (for mutual TLS).
    ssl_check_hostname : bool
        Whether the server hostname is checked against the certificate. Enabled by default.
    """

    class Config:
        """Config class for reading Fields from env."""

        env_prefix = "USECASE_KEYVALUE_"
        case_sensitive = False

    # Required connection target.
    host: str = Field()
    port: int = Field()

    # Optional credentials.
    username: str | None = None
    password: str | None = None

    # Optional TLS configuration.
    use_ssl: bool = False
    ssl_cert_reqs: str | None = None
    ssl_ca_certs: str | None = None
    ssl_certfile: str | None = None
    ssl_keyfile: str | None = None
    ssl_check_hostname: bool = True
|
|
@@ -4,9 +4,9 @@ import asyncio
|
|
|
4
4
|
import logging
|
|
5
5
|
from typing import Optional
|
|
6
6
|
|
|
7
|
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
8
7
|
from langchain_core.documents import Document
|
|
9
8
|
from langchain_core.runnables import Runnable, RunnableConfig, ensure_config
|
|
9
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
10
10
|
from openai import APIConnectionError, APIError, APITimeoutError, RateLimitError
|
|
11
11
|
|
|
12
12
|
from admin_api_lib.impl.settings.summarizer_settings import SummarizerSettings
|
|
@@ -44,6 +44,24 @@ class LangchainSummarizer(Summarizer):
|
|
|
44
44
|
self._semaphore = semaphore
|
|
45
45
|
self._retry_decorator_settings = create_retry_decorator_settings(summarizer_settings, retry_decorator_settings)
|
|
46
46
|
|
|
47
|
+
@staticmethod
|
|
48
|
+
def _parse_max_concurrency(config: RunnableConfig) -> Optional[int]:
|
|
49
|
+
"""Parse max concurrency from a RunnableConfig.
|
|
50
|
+
|
|
51
|
+
Returns
|
|
52
|
+
-------
|
|
53
|
+
Optional[int]
|
|
54
|
+
An integer >= 1 if configured and valid, otherwise None.
|
|
55
|
+
"""
|
|
56
|
+
max_concurrency = config.get("max_concurrency")
|
|
57
|
+
if max_concurrency is None:
|
|
58
|
+
return None
|
|
59
|
+
|
|
60
|
+
try:
|
|
61
|
+
return max(1, int(max_concurrency))
|
|
62
|
+
except (TypeError, ValueError):
|
|
63
|
+
return None
|
|
64
|
+
|
|
47
65
|
async def ainvoke(self, query: SummarizerInput, config: Optional[RunnableConfig] = None) -> SummarizerOutput:
|
|
48
66
|
"""
|
|
49
67
|
Asynchronously invokes the summarization process on the given query.
|
|
@@ -77,9 +95,8 @@ class LangchainSummarizer(Summarizer):
|
|
|
77
95
|
langchain_documents = self._chunker.split_documents([document])
|
|
78
96
|
logger.debug("Summarizing %d chunk(s)...", len(langchain_documents))
|
|
79
97
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
outputs = await asyncio.gather(*tasks)
|
|
98
|
+
max_concurrency = self._parse_max_concurrency(config)
|
|
99
|
+
outputs = await self._summarize_documents(langchain_documents, config, max_concurrency=max_concurrency)
|
|
83
100
|
|
|
84
101
|
if len(outputs) == 1:
|
|
85
102
|
return outputs[0]
|
|
@@ -93,6 +110,34 @@ class LangchainSummarizer(Summarizer):
|
|
|
93
110
|
)
|
|
94
111
|
return await self._summarize_chunk(merged, config)
|
|
95
112
|
|
|
113
|
+
async def _summarize_documents(
|
|
114
|
+
self,
|
|
115
|
+
documents: list[Document],
|
|
116
|
+
config: RunnableConfig,
|
|
117
|
+
*,
|
|
118
|
+
max_concurrency: Optional[int],
|
|
119
|
+
) -> list[SummarizerOutput]:
|
|
120
|
+
"""Summarize a set of already-chunked documents.
|
|
121
|
+
|
|
122
|
+
Notes
|
|
123
|
+
-----
|
|
124
|
+
This optionally limits task fan-out using a per-call semaphore (max_concurrency).
|
|
125
|
+
The actual LLM call concurrency is always bounded by the instance semaphore held
|
|
126
|
+
inside `_summarize_chunk`.
|
|
127
|
+
"""
|
|
128
|
+
if max_concurrency == 1:
|
|
129
|
+
return [await self._summarize_chunk(doc.page_content, config) for doc in documents]
|
|
130
|
+
|
|
131
|
+
limiter: asyncio.Semaphore | None = asyncio.Semaphore(max_concurrency) if max_concurrency is not None else None
|
|
132
|
+
|
|
133
|
+
async def _run(doc: Document) -> SummarizerOutput:
|
|
134
|
+
if limiter is None:
|
|
135
|
+
return await self._summarize_chunk(doc.page_content, config)
|
|
136
|
+
async with limiter:
|
|
137
|
+
return await self._summarize_chunk(doc.page_content, config)
|
|
138
|
+
|
|
139
|
+
return await asyncio.gather(*(_run(doc) for doc in documents))
|
|
140
|
+
|
|
96
141
|
def _create_chain(self) -> Runnable:
|
|
97
142
|
return self._langfuse_manager.get_base_prompt(self.__class__.__name__) | self._langfuse_manager.get_base_llm(
|
|
98
143
|
self.__class__.__name__
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
"""Module for enhancing the summary of pages by grouping information by page and summarizing each page."""
|
|
2
|
-
|
|
3
|
-
from asyncio import gather
|
|
4
|
-
from hashlib import sha256
|
|
5
|
-
from typing import Optional
|
|
6
|
-
|
|
7
|
-
from langchain_core.documents import Document
|
|
8
|
-
from langchain_core.runnables import RunnableConfig
|
|
9
|
-
from tqdm import tqdm
|
|
10
|
-
|
|
11
|
-
from admin_api_lib.impl.information_enhancer.summary_enhancer import SummaryEnhancer
|
|
12
|
-
from rag_core_lib.impl.data_types.content_type import ContentType
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class PageSummaryEnhancer(SummaryEnhancer):
    """
    Enhances the summary of pages by grouping information by page and summarizing each page.

    Attributes
    ----------
    BASE64_IMAGE_KEY : str
        Key used to identify base64 encoded images in metadata.
    DEFAULT_PAGE_NR : int
        Default page number used when no page metadata is available.
    """

    BASE64_IMAGE_KEY = "base64_image"
    DEFAULT_PAGE_NR = 1

    async def _asummarize_page(self, page_pieces: list[Document], config: Optional[RunnableConfig]) -> Document:
        """
        Summarize all pieces of one page into a single summary document.

        Parameters
        ----------
        page_pieces : list[Document]
            Non-empty list of pieces that share the same page.
        config : Optional[RunnableConfig]
            Config forwarded to the summarizer.

        Returns
        -------
        Document
            A document holding the summary text. Its metadata is copied from the first
            piece (without the base64 image), gets a content-hash ``id``, links all
            pieces through ``related`` and is typed as a summary.
        """
        full_page_content = " ".join([piece.page_content for piece in page_pieces])
        summary = await self._summarizer.ainvoke(full_page_content, config)
        meta = {key: value for key, value in page_pieces[0].metadata.items() if key != self.BASE64_IMAGE_KEY}
        meta["id"] = sha256(str.encode(full_page_content)).hexdigest()
        # Tolerate pieces without a "related" entry instead of failing with a KeyError.
        related = list(meta.get("related") or [])
        related.extend(piece.metadata["id"] for piece in page_pieces)
        # dict.fromkeys de-duplicates while keeping a stable, reproducible order.
        meta["related"] = list(dict.fromkeys(related))
        meta["type"] = ContentType.SUMMARY.value

        return Document(metadata=meta, page_content=summary)

    async def _acreate_summary(self, information: list[Document], config: Optional[RunnableConfig]) -> list[Document]:
        """
        Group the pieces by page and create one summary per sufficiently large page.

        Parameters
        ----------
        information : list[Document]
            The pieces to summarize.
        config : Optional[RunnableConfig]
            Config forwarded to the summarizer.

        Returns
        -------
        list[Document]
            The page summaries, in order of first appearance of each page. When chunker
            settings are present, pages whose joined content is shorter than
            ``max_size`` are skipped.
        """
        # Single pass; dicts keep insertion order, so pages stay in first-seen order.
        pages: dict[object, list[Document]] = {}
        for info in information:
            pages.setdefault(info.metadata.get("page", self.DEFAULT_PAGE_NR), []).append(info)

        grouped = []
        for group in pages.values():
            if (
                self._chunker_settings
                and len(" ".join([item.page_content for item in group])) < self._chunker_settings.max_size
            ):
                continue
            grouped.append(group)

        summary_tasks = [self._asummarize_page(info_group, config) for info_group in tqdm(grouped)]

        return await gather(*summary_tasks)
|
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
"""Contains settings regarding the key value store."""
|
|
2
|
-
|
|
3
|
-
from pydantic import Field
|
|
4
|
-
from pydantic_settings import BaseSettings
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class KeyValueSettings(BaseSettings):
    """
    Connection settings for the key value store.

    Attributes
    ----------
    host : str
        Hostname of the key value store (required).
    port : int
        Port number of the key value store (required).
    """

    class Config:
        """Config class for reading Fields from env."""

        env_prefix = "USECASE_KEYVALUE_"
        case_sensitive = False

    # Both values are required; the ellipsis marks a field without a default.
    host: str = Field(...)
    port: int = Field(...)
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/api_endpoints/document_deleter.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/extractor_api_client/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/chunker/semantic_text_chunker.py
RENAMED
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/file_services/__init__.py
RENAMED
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/file_services/s3_service.py
RENAMED
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/information_enhancer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/chunker_settings.py
RENAMED
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/rag_api_settings.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/impl/settings/summarizer_settings.py
RENAMED
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/information_enhancer/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/http_validation_error.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/models/validation_error_loc_inner.py
RENAMED
|
File without changes
|
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/prompt_templates/summarize_prompt.py
RENAMED
|
File without changes
|
{admin_api_lib-3.3.0 → admin_api_lib-4.0.0}/src/admin_api_lib/rag_backend_client/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|