trustgraph-base 2.2.15__tar.gz → 2.2.16__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/PKG-INFO +2 -1
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/pyproject.toml +1 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/library.py +3 -2
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/__init__.py +1 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/async_processor.py +5 -5
- trustgraph_base-2.2.16/trustgraph/base/chunking_service.py +104 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/consumer.py +52 -47
- trustgraph_base-2.2.16/trustgraph/base/librarian_client.py +246 -0
- trustgraph_base-2.2.16/trustgraph/base/pubsub.py +121 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/pulsar_backend.py +2 -110
- trustgraph_base-2.2.16/trustgraph/base/rabbitmq_backend.py +390 -0
- trustgraph_base-2.2.16/trustgraph/base/serialization.py +115 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/subscriber.py +1 -1
- trustgraph_base-2.2.16/trustgraph/base_version.py +1 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/base.py +2 -9
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/config_client.py +2 -6
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/library.py +6 -3
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph_base.egg-info/PKG-INFO +2 -1
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph_base.egg-info/SOURCES.txt +3 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph_base.egg-info/requires.txt +1 -0
- trustgraph_base-2.2.15/trustgraph/base/chunking_service.py +0 -264
- trustgraph_base-2.2.15/trustgraph/base/pubsub.py +0 -72
- trustgraph_base-2.2.15/trustgraph/base_version.py +0 -1
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/README.md +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/setup.cfg +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/api.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/async_bulk_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/async_flow.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/async_metrics.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/async_socket_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/bulk_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/collection.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/config.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/exceptions.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/explainability.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/flow.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/knowledge.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/metrics.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/socket_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/types.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/agent_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/agent_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/backend.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/cassandra_config.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/collection_config_handler.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/consumer_spec.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/document_embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/document_embeddings_query_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/document_embeddings_store_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/dynamic_tool_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/embeddings_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/flow.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/flow_processor.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/graph_embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/graph_embeddings_query_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/graph_embeddings_store_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/graph_rag_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/llm_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/logging.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/metrics.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/parameter_spec.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/producer.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/producer_spec.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/prompt_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/publisher.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/request_response_spec.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/row_embeddings_query_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/spec.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/structured_query_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/subscriber_spec.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/text_completion_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/tool_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/tool_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/tool_service_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/triples_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/triples_query_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/triples_store_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/agent_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/document_embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/document_rag_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/graph_embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/graph_rag_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/llm_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/prompt_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/row_embeddings_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/clients/triples_query_client.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/exceptions.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/knowledge/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/knowledge/defs.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/knowledge/document.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/knowledge/identifier.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/knowledge/organization.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/knowledge/publication.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/log_level.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/registry.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/agent.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/base.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/collection.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/config.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/diagnosis.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/document_loading.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/embeddings.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/embeddings_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/flow.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/knowledge.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/library.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/metadata.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/nlp_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/primitives.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/prompt.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/retrieval.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/rows_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/structured_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/text_completion.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/tool.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/messaging/translators/triples.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/objects/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/objects/field.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/objects/object.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/provenance/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/provenance/agent.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/provenance/namespaces.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/provenance/triples.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/provenance/uris.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/provenance/vocabulary.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/rdf.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/core/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/core/metadata.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/core/primitives.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/core/topic.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/document.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/embeddings.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/graph.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/knowledge.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/nlp.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/object.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/rows.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/knowledge/structured.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/__init__.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/agent.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/collection.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/config.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/diagnosis.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/flow.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/llm.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/lookup.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/nlp_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/prompt.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/retrieval.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/rows_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/storage.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/structured_query.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/schema/services/tool_service.py +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph_base.egg-info/dependency_links.txt +0 -0
- {trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph_base.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: trustgraph-base
|
|
3
|
-
Version: 2.2.15
|
|
3
|
+
Version: 2.2.16
|
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
5
|
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
|
6
6
|
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
|
@@ -12,5 +12,6 @@ Requires-Dist: pulsar-client
|
|
|
12
12
|
Requires-Dist: prometheus-client
|
|
13
13
|
Requires-Dist: requests
|
|
14
14
|
Requires-Dist: python-logging-loki
|
|
15
|
+
Requires-Dist: pika
|
|
15
16
|
|
|
16
17
|
See https://trustgraph.ai/
|
|
@@ -22,8 +22,9 @@ logger = logging.getLogger(__name__)
|
|
|
22
22
|
# Lower threshold provides progress feedback and resumability on slower connections
|
|
23
23
|
CHUNKED_UPLOAD_THRESHOLD = 2 * 1024 * 1024
|
|
24
24
|
|
|
25
|
-
# Default chunk size (
|
|
26
|
-
|
|
25
|
+
# Default chunk size (3MB - stays under broker message size limits
|
|
26
|
+
# after base64 encoding ~4MB)
|
|
27
|
+
DEFAULT_CHUNK_SIZE = 3 * 1024 * 1024
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
def to_value(x):
|
|
@@ -14,6 +14,7 @@ from . producer_spec import ProducerSpec
|
|
|
14
14
|
from . subscriber_spec import SubscriberSpec
|
|
15
15
|
from . request_response_spec import RequestResponseSpec
|
|
16
16
|
from . llm_service import LlmService, LlmResult, LlmChunk
|
|
17
|
+
from . librarian_client import LibrarianClient
|
|
17
18
|
from . chunking_service import ChunkingService
|
|
18
19
|
from . embeddings_service import EmbeddingsService
|
|
19
20
|
from . embeddings_client import EmbeddingsClientSpec
|
|
@@ -68,11 +68,12 @@ class AsyncProcessor:
|
|
|
68
68
|
processor = self.id, flow = None, name = "config",
|
|
69
69
|
)
|
|
70
70
|
|
|
71
|
-
# Subscribe to config queue
|
|
71
|
+
# Subscribe to config queue — exclusive so every processor
|
|
72
|
+
# gets its own copy of config pushes (broadcast pattern)
|
|
72
73
|
self.config_sub_task = Consumer(
|
|
73
74
|
|
|
74
75
|
taskgroup = self.taskgroup,
|
|
75
|
-
backend = self.pubsub_backend,
|
|
76
|
+
backend = self.pubsub_backend,
|
|
76
77
|
subscriber = config_subscriber_id,
|
|
77
78
|
flow = None,
|
|
78
79
|
|
|
@@ -83,9 +84,8 @@ class AsyncProcessor:
|
|
|
83
84
|
|
|
84
85
|
metrics = config_consumer_metrics,
|
|
85
86
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
start_of_messages = True
|
|
87
|
+
start_of_messages = True,
|
|
88
|
+
consumer_type = 'exclusive',
|
|
89
89
|
)
|
|
90
90
|
|
|
91
91
|
self.running = True
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base chunking service that provides parameter specification functionality
|
|
3
|
+
for chunk-size and chunk-overlap parameters, and librarian client for
|
|
4
|
+
fetching large document content.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import base64
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
from .flow_processor import FlowProcessor
|
|
12
|
+
from .parameter_spec import ParameterSpec
|
|
13
|
+
from .librarian_client import LibrarianClient
|
|
14
|
+
|
|
15
|
+
# Module logger
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ChunkingService(FlowProcessor):
|
|
20
|
+
"""Base service for chunking processors with parameter specification support"""
|
|
21
|
+
|
|
22
|
+
def __init__(self, **params):
|
|
23
|
+
|
|
24
|
+
id = params.get("id", "chunker")
|
|
25
|
+
|
|
26
|
+
# Call parent constructor
|
|
27
|
+
super(ChunkingService, self).__init__(**params)
|
|
28
|
+
|
|
29
|
+
# Register parameter specifications for chunk-size and chunk-overlap
|
|
30
|
+
self.register_specification(
|
|
31
|
+
ParameterSpec(name="chunk-size")
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
self.register_specification(
|
|
35
|
+
ParameterSpec(name="chunk-overlap")
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Librarian client
|
|
39
|
+
self.librarian = LibrarianClient(
|
|
40
|
+
id=id,
|
|
41
|
+
backend=self.pubsub,
|
|
42
|
+
taskgroup=self.taskgroup,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
logger.debug("ChunkingService initialized with parameter specifications")
|
|
46
|
+
|
|
47
|
+
async def start(self):
|
|
48
|
+
await super(ChunkingService, self).start()
|
|
49
|
+
await self.librarian.start()
|
|
50
|
+
|
|
51
|
+
async def get_document_text(self, doc):
|
|
52
|
+
"""
|
|
53
|
+
Get text content from a TextDocument, fetching from librarian if needed.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
doc: TextDocument with either inline text or document_id
|
|
57
|
+
|
|
58
|
+
Returns:
|
|
59
|
+
str: The document text content
|
|
60
|
+
"""
|
|
61
|
+
if doc.document_id and not doc.text:
|
|
62
|
+
logger.info(f"Fetching document {doc.document_id} from librarian...")
|
|
63
|
+
text = await self.librarian.fetch_document_text(
|
|
64
|
+
document_id=doc.document_id,
|
|
65
|
+
user=doc.metadata.user,
|
|
66
|
+
)
|
|
67
|
+
logger.info(f"Fetched {len(text)} characters from librarian")
|
|
68
|
+
return text
|
|
69
|
+
else:
|
|
70
|
+
return doc.text.decode("utf-8")
|
|
71
|
+
|
|
72
|
+
async def chunk_document(self, msg, consumer, flow, default_chunk_size, default_chunk_overlap):
|
|
73
|
+
"""
|
|
74
|
+
Extract chunk parameters from flow and return effective values
|
|
75
|
+
|
|
76
|
+
Args:
|
|
77
|
+
msg: The message being processed
|
|
78
|
+
consumer: The consumer instance
|
|
79
|
+
flow: The flow object containing parameters
|
|
80
|
+
default_chunk_size: Default chunk size if not configured
|
|
81
|
+
default_chunk_overlap: Default chunk overlap if not configured
|
|
82
|
+
|
|
83
|
+
Returns:
|
|
84
|
+
tuple: (chunk_size, chunk_overlap) effective values
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
chunk_size = default_chunk_size
|
|
88
|
+
chunk_overlap = default_chunk_overlap
|
|
89
|
+
|
|
90
|
+
try:
|
|
91
|
+
cs = flow.parameters.get("chunk-size")
|
|
92
|
+
if cs is not None:
|
|
93
|
+
chunk_size = int(cs)
|
|
94
|
+
except Exception as e:
|
|
95
|
+
logger.warning(f"Could not parse chunk-size parameter: {e}")
|
|
96
|
+
|
|
97
|
+
try:
|
|
98
|
+
co = flow.parameters.get("chunk-overlap")
|
|
99
|
+
if co is not None:
|
|
100
|
+
chunk_overlap = int(co)
|
|
101
|
+
except Exception as e:
|
|
102
|
+
logger.warning(f"Could not parse chunk-overlap parameter: {e}")
|
|
103
|
+
|
|
104
|
+
return chunk_size, chunk_overlap
|
|
@@ -32,6 +32,7 @@ class Consumer:
|
|
|
32
32
|
rate_limit_retry_time = 10, rate_limit_timeout = 7200,
|
|
33
33
|
reconnect_time = 5,
|
|
34
34
|
concurrency = 1, # Number of concurrent requests to handle
|
|
35
|
+
consumer_type = 'shared',
|
|
35
36
|
):
|
|
36
37
|
|
|
37
38
|
self.taskgroup = taskgroup
|
|
@@ -42,6 +43,8 @@ class Consumer:
|
|
|
42
43
|
self.schema = schema
|
|
43
44
|
self.handler = handler
|
|
44
45
|
|
|
46
|
+
self.consumer_type = consumer_type
|
|
47
|
+
|
|
45
48
|
self.rate_limit_retry_time = rate_limit_retry_time
|
|
46
49
|
self.rate_limit_timeout = rate_limit_timeout
|
|
47
50
|
|
|
@@ -93,33 +96,11 @@ class Consumer:
|
|
|
93
96
|
if self.metrics:
|
|
94
97
|
self.metrics.state("stopped")
|
|
95
98
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
if self.start_of_messages:
|
|
102
|
-
initial_pos = 'earliest'
|
|
103
|
-
else:
|
|
104
|
-
initial_pos = 'latest'
|
|
105
|
-
|
|
106
|
-
# Create consumer via backend
|
|
107
|
-
self.consumer = await asyncio.to_thread(
|
|
108
|
-
self.backend.create_consumer,
|
|
109
|
-
topic = self.topic,
|
|
110
|
-
subscription = self.subscriber,
|
|
111
|
-
schema = self.schema,
|
|
112
|
-
initial_position = initial_pos,
|
|
113
|
-
consumer_type = 'shared',
|
|
114
|
-
)
|
|
115
|
-
|
|
116
|
-
except Exception as e:
|
|
117
|
-
|
|
118
|
-
logger.error(f"Consumer subscription exception: {e}", exc_info=True)
|
|
119
|
-
await asyncio.sleep(self.reconnect_time)
|
|
120
|
-
continue
|
|
121
|
-
|
|
122
|
-
logger.info(f"Successfully subscribed to topic: {self.topic}")
|
|
99
|
+
# Determine initial position
|
|
100
|
+
if self.start_of_messages:
|
|
101
|
+
initial_pos = 'earliest'
|
|
102
|
+
else:
|
|
103
|
+
initial_pos = 'latest'
|
|
123
104
|
|
|
124
105
|
if self.metrics:
|
|
125
106
|
self.metrics.state("running")
|
|
@@ -128,14 +109,30 @@ class Consumer:
|
|
|
128
109
|
|
|
129
110
|
logger.info(f"Starting {self.concurrency} receiver threads")
|
|
130
111
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
112
|
+
# Create one backend consumer per concurrent task.
|
|
113
|
+
# Each gets its own connection — required for backends
|
|
114
|
+
# like RabbitMQ where connections are not thread-safe.
|
|
115
|
+
consumers = []
|
|
116
|
+
for i in range(self.concurrency):
|
|
117
|
+
try:
|
|
118
|
+
logger.info(f"Subscribing to topic: {self.topic} (worker {i})")
|
|
119
|
+
c = await asyncio.to_thread(
|
|
120
|
+
self.backend.create_consumer,
|
|
121
|
+
topic = self.topic,
|
|
122
|
+
subscription = self.subscriber,
|
|
123
|
+
schema = self.schema,
|
|
124
|
+
initial_position = initial_pos,
|
|
125
|
+
consumer_type = self.consumer_type,
|
|
138
126
|
)
|
|
127
|
+
consumers.append(c)
|
|
128
|
+
logger.info(f"Successfully subscribed to topic: {self.topic} (worker {i})")
|
|
129
|
+
except Exception as e:
|
|
130
|
+
logger.error(f"Consumer subscription exception (worker {i}): {e}", exc_info=True)
|
|
131
|
+
raise
|
|
132
|
+
|
|
133
|
+
async with asyncio.TaskGroup() as tg:
|
|
134
|
+
for c in consumers:
|
|
135
|
+
tg.create_task(self.consume_from_queue(c))
|
|
139
136
|
|
|
140
137
|
if self.metrics:
|
|
141
138
|
self.metrics.state("stopped")
|
|
@@ -143,23 +140,31 @@ class Consumer:
|
|
|
143
140
|
except Exception as e:
|
|
144
141
|
|
|
145
142
|
logger.error(f"Consumer loop exception: {e}", exc_info=True)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
143
|
+
for c in consumers:
|
|
144
|
+
try:
|
|
145
|
+
c.unsubscribe()
|
|
146
|
+
c.close()
|
|
147
|
+
except Exception:
|
|
148
|
+
pass
|
|
149
|
+
consumers = []
|
|
149
150
|
await asyncio.sleep(self.reconnect_time)
|
|
150
151
|
continue
|
|
151
152
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
153
|
+
finally:
|
|
154
|
+
for c in consumers:
|
|
155
|
+
try:
|
|
156
|
+
c.unsubscribe()
|
|
157
|
+
c.close()
|
|
158
|
+
except Exception:
|
|
159
|
+
pass
|
|
155
160
|
|
|
156
|
-
async def consume_from_queue(self):
|
|
161
|
+
async def consume_from_queue(self, consumer):
|
|
157
162
|
|
|
158
163
|
while self.running:
|
|
159
164
|
|
|
160
165
|
try:
|
|
161
166
|
msg = await asyncio.to_thread(
|
|
162
|
-
|
|
167
|
+
consumer.receive,
|
|
163
168
|
timeout_millis=2000
|
|
164
169
|
)
|
|
165
170
|
except Exception as e:
|
|
@@ -168,9 +173,9 @@ class Consumer:
|
|
|
168
173
|
continue
|
|
169
174
|
raise e
|
|
170
175
|
|
|
171
|
-
await self.handle_one_from_queue(msg)
|
|
176
|
+
await self.handle_one_from_queue(msg, consumer)
|
|
172
177
|
|
|
173
|
-
async def handle_one_from_queue(self, msg):
|
|
178
|
+
async def handle_one_from_queue(self, msg, consumer):
|
|
174
179
|
|
|
175
180
|
expiry = time.time() + self.rate_limit_timeout
|
|
176
181
|
|
|
@@ -183,7 +188,7 @@ class Consumer:
|
|
|
183
188
|
|
|
184
189
|
# Message failed to be processed, this causes it to
|
|
185
190
|
# be retried
|
|
186
|
-
|
|
191
|
+
consumer.negative_acknowledge(msg)
|
|
187
192
|
|
|
188
193
|
if self.metrics:
|
|
189
194
|
self.metrics.process("error")
|
|
@@ -206,7 +211,7 @@ class Consumer:
|
|
|
206
211
|
logger.debug("Message processed successfully")
|
|
207
212
|
|
|
208
213
|
# Acknowledge successful processing of the message
|
|
209
|
-
|
|
214
|
+
consumer.acknowledge(msg)
|
|
210
215
|
|
|
211
216
|
if self.metrics:
|
|
212
217
|
self.metrics.process("success")
|
|
@@ -233,7 +238,7 @@ class Consumer:
|
|
|
233
238
|
|
|
234
239
|
# Message failed to be processed, this causes it to
|
|
235
240
|
# be retried
|
|
236
|
-
|
|
241
|
+
consumer.negative_acknowledge(msg)
|
|
237
242
|
|
|
238
243
|
if self.metrics:
|
|
239
244
|
self.metrics.process("error")
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared librarian client for services that need to communicate
|
|
3
|
+
with the librarian via pub/sub.
|
|
4
|
+
|
|
5
|
+
Provides request-response and streaming operations over the message
|
|
6
|
+
broker, with proper support for large documents via stream-document.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
self.librarian = LibrarianClient(
|
|
10
|
+
id=id, backend=self.pubsub, taskgroup=self.taskgroup, **params
|
|
11
|
+
)
|
|
12
|
+
await self.librarian.start()
|
|
13
|
+
content = await self.librarian.fetch_document_content(doc_id, user)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import base64
|
|
18
|
+
import logging
|
|
19
|
+
import uuid
|
|
20
|
+
|
|
21
|
+
from .consumer import Consumer
|
|
22
|
+
from .producer import Producer
|
|
23
|
+
from .metrics import ConsumerMetrics, ProducerMetrics
|
|
24
|
+
|
|
25
|
+
from ..schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
|
|
26
|
+
from ..schema import librarian_request_queue, librarian_response_queue
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LibrarianClient:
|
|
32
|
+
"""Client for librarian request-response over the message broker."""
|
|
33
|
+
|
|
34
|
+
def __init__(self, id, backend, taskgroup, **params):
|
|
35
|
+
|
|
36
|
+
librarian_request_q = params.get(
|
|
37
|
+
"librarian_request_queue", librarian_request_queue,
|
|
38
|
+
)
|
|
39
|
+
librarian_response_q = params.get(
|
|
40
|
+
"librarian_response_queue", librarian_response_queue,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
librarian_request_metrics = ProducerMetrics(
|
|
44
|
+
processor=id, flow=None, name="librarian-request",
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
self._producer = Producer(
|
|
48
|
+
backend=backend,
|
|
49
|
+
topic=librarian_request_q,
|
|
50
|
+
schema=LibrarianRequest,
|
|
51
|
+
metrics=librarian_request_metrics,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
librarian_response_metrics = ConsumerMetrics(
|
|
55
|
+
processor=id, flow=None, name="librarian-response",
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
self._consumer = Consumer(
|
|
59
|
+
taskgroup=taskgroup,
|
|
60
|
+
backend=backend,
|
|
61
|
+
flow=None,
|
|
62
|
+
topic=librarian_response_q,
|
|
63
|
+
subscriber=f"{id}-librarian",
|
|
64
|
+
schema=LibrarianResponse,
|
|
65
|
+
handler=self._on_response,
|
|
66
|
+
metrics=librarian_response_metrics,
|
|
67
|
+
consumer_type='exclusive',
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Single-response requests: request_id -> asyncio.Future
|
|
71
|
+
self._pending = {}
|
|
72
|
+
# Streaming requests: request_id -> asyncio.Queue
|
|
73
|
+
self._streams = {}
|
|
74
|
+
|
|
75
|
+
async def start(self):
|
|
76
|
+
"""Start the librarian producer and consumer."""
|
|
77
|
+
await self._producer.start()
|
|
78
|
+
await self._consumer.start()
|
|
79
|
+
|
|
80
|
+
async def _on_response(self, msg, consumer, flow):
|
|
81
|
+
"""Route librarian responses to the right waiter."""
|
|
82
|
+
response = msg.value()
|
|
83
|
+
request_id = msg.properties().get("id")
|
|
84
|
+
|
|
85
|
+
if not request_id:
|
|
86
|
+
return
|
|
87
|
+
|
|
88
|
+
if request_id in self._pending:
|
|
89
|
+
future = self._pending.pop(request_id)
|
|
90
|
+
future.set_result(response)
|
|
91
|
+
elif request_id in self._streams:
|
|
92
|
+
await self._streams[request_id].put(response)
|
|
93
|
+
|
|
94
|
+
async def request(self, request, timeout=120):
|
|
95
|
+
"""Send a request to the librarian and wait for a single response."""
|
|
96
|
+
request_id = str(uuid.uuid4())
|
|
97
|
+
|
|
98
|
+
future = asyncio.get_event_loop().create_future()
|
|
99
|
+
self._pending[request_id] = future
|
|
100
|
+
|
|
101
|
+
try:
|
|
102
|
+
await self._producer.send(
|
|
103
|
+
request, properties={"id": request_id},
|
|
104
|
+
)
|
|
105
|
+
response = await asyncio.wait_for(future, timeout=timeout)
|
|
106
|
+
|
|
107
|
+
if response.error:
|
|
108
|
+
raise RuntimeError(
|
|
109
|
+
f"Librarian error: {response.error.type}: "
|
|
110
|
+
f"{response.error.message}"
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
return response
|
|
114
|
+
|
|
115
|
+
except asyncio.TimeoutError:
|
|
116
|
+
self._pending.pop(request_id, None)
|
|
117
|
+
raise RuntimeError("Timeout waiting for librarian response")
|
|
118
|
+
|
|
119
|
+
async def stream(self, request, timeout=120):
    """Send a request and gather every streamed response chunk.

    Chunks are read from a per-request queue until one reports a truthy
    ``is_final``. ``timeout`` bounds each individual wait for the next
    chunk, not the stream as a whole.

    Returns:
        List of response chunks in arrival order, final chunk included.

    Raises:
        RuntimeError: If a chunk carries a truthy ``error``, or if no
            chunk arrives within ``timeout``.
    """
    stream_id = str(uuid.uuid4())

    inbox = asyncio.Queue()
    self._streams[stream_id] = inbox

    try:
        await self._producer.send(request, properties={"id": stream_id})

        collected = []
        finished = False
        while not finished:
            item = await asyncio.wait_for(inbox.get(), timeout=timeout)

            failure = item.error
            if failure:
                raise RuntimeError(
                    f"Librarian error: {failure.type}: {failure.message}"
                )

            collected.append(item)
            finished = bool(item.is_final)

        return collected

    except asyncio.TimeoutError:
        raise RuntimeError("Timeout waiting for librarian stream")
    finally:
        # Runs on every exit path, so the queue is always deregistered.
        self._streams.pop(stream_id, None)
|
|
153
|
+
|
|
154
|
+
async def fetch_document_content(self, document_id, user, timeout=120):
    """Fetch document content using streaming.

    Issues a ``stream-document`` request and stitches the streamed
    chunks back into one payload.

    Args:
        document_id: Identifier of the document to fetch.
        user: User the request is made for.
        timeout: Passed through to ``stream``.

    Returns:
        The whole document as base64-encoded ``bytes``. Caller is
        responsible for decoding.
    """
    req = LibrarianRequest(
        operation="stream-document",
        document_id=document_id,
        user=user,
    )
    chunks = await self.stream(req, timeout=timeout)

    # Each chunk is base64-encoded on its own, so the encoded pieces
    # cannot simply be concatenated: decode each to raw bytes, join,
    # then re-encode once. Collecting into a list and joining avoids
    # the repeated copying of ``raw += ...`` on large documents.
    decoded_parts = []
    for chunk in chunks:
        payload = chunk.content
        if not payload:
            continue
        if not isinstance(payload, bytes):
            payload = payload.encode("utf-8")
        decoded_parts.append(base64.b64decode(payload))

    return base64.b64encode(b"".join(decoded_parts))
|
|
179
|
+
|
|
180
|
+
async def fetch_document_text(self, document_id, user, timeout=120):
    """Fetch a document and return its content as a UTF-8 decoded string."""
    encoded = await self.fetch_document_content(
        document_id, user, timeout=timeout,
    )
    # fetch_document_content hands back base64; undo that, then decode.
    raw_bytes = base64.b64decode(encoded)
    return raw_bytes.decode("utf-8")
|
|
186
|
+
|
|
187
|
+
async def fetch_document_metadata(self, document_id, user, timeout=120):
    """Ask the librarian for a document's metadata and return it.

    Returns the ``document_metadata`` attribute of the single response
    to a ``get-document-metadata`` request.
    """
    query = LibrarianRequest(
        operation="get-document-metadata",
        document_id=document_id,
        user=user,
    )
    reply = await self.request(query, timeout=timeout)
    return reply.document_metadata
|
|
196
|
+
|
|
197
|
+
async def save_child_document(self, doc_id, parent_id, user, content,
                              document_type="chunk", title=None,
                              kind="text/plain", timeout=120):
    """Store a child document under ``parent_id`` via the librarian.

    ``content`` may be ``str`` (encoded as UTF-8 first) or bytes; it is
    sent base64-encoded. The title falls back to ``doc_id`` when none is
    given. Returns ``doc_id``.
    """
    payload = content.encode("utf-8") if isinstance(content, str) else content

    metadata = DocumentMetadata(
        id=doc_id,
        user=user,
        kind=kind,
        title=title or doc_id,
        parent_id=parent_id,
        document_type=document_type,
    )

    # NOTE(review): unlike save_document, this request carries no
    # top-level document_id/user; confirm that is intended.
    encoded = base64.b64encode(payload).decode("utf-8")
    await self.request(
        LibrarianRequest(
            operation="add-child-document",
            document_metadata=metadata,
            content=encoded,
        ),
        timeout=timeout,
    )
    return doc_id
|
|
221
|
+
|
|
222
|
+
async def save_document(self, doc_id, user, content, title=None,
                        document_type="answer", kind="text/plain",
                        timeout=120):
    """Store a document via the librarian.

    ``content`` may be ``str`` (encoded as UTF-8 first) or bytes; it is
    sent base64-encoded. The title falls back to ``doc_id`` when none is
    given. Returns ``doc_id``.
    """
    payload = content.encode("utf-8") if isinstance(content, str) else content

    metadata = DocumentMetadata(
        id=doc_id,
        user=user,
        kind=kind,
        title=title or doc_id,
        document_type=document_type,
    )

    encoded = base64.b64encode(payload).decode("utf-8")
    await self.request(
        LibrarianRequest(
            operation="add-document",
            document_id=doc_id,
            document_metadata=metadata,
            content=encoded,
            user=user,
        ),
        timeout=timeout,
    )
    return doc_id
|