PyPI - trustgraph-base - Versions diffs - 2.2.15__tar.gz → 2.2.16__tar.gz - Mend

trustgraph-base 2.2.15__tar.gz → 2.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (164) hide show

{trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: trustgraph-base
-Version: 2.2.15
+Version: 2.2.16
 Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
 Author-email: "trustgraph.ai" <security@trustgraph.ai>
 Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
@@ -12,5 +12,6 @@ Requires-Dist: pulsar-client
 Requires-Dist: prometheus-client
 Requires-Dist: requests
 Requires-Dist: python-logging-loki
+Requires-Dist: pika
 See https://trustgraph.ai/

{trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/pyproject.toml RENAMED Viewed

@@ -14,6 +14,7 @@ dependencies = [
     "prometheus-client",
     "requests",
     "python-logging-loki",
+    "pika",
 ]
 classifiers = [
     "Programming Language :: Python :: 3",

{trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/api/library.py RENAMED Viewed

@@ -22,8 +22,9 @@ logger = logging.getLogger(__name__)
 # Lower threshold provides progress feedback and resumability on slower connections
 CHUNKED_UPLOAD_THRESHOLD = 2 * 1024 * 1024
-# Default chunk size (5MB - S3 multipart minimum)
-DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024
+# Default chunk size (3MB - stays under broker message size limits
+# after base64 encoding ~4MB)
+DEFAULT_CHUNK_SIZE = 3 * 1024 * 1024
 def to_value(x):

{trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/__init__.py RENAMED Viewed

@@ -14,6 +14,7 @@ from . producer_spec import ProducerSpec
 from . subscriber_spec import SubscriberSpec
 from . request_response_spec import RequestResponseSpec
 from . llm_service import LlmService, LlmResult, LlmChunk
+from . librarian_client import LibrarianClient
 from . chunking_service import ChunkingService
 from . embeddings_service import EmbeddingsService
 from . embeddings_client import EmbeddingsClientSpec

{trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/async_processor.py RENAMED Viewed

@@ -68,11 +68,12 @@ class AsyncProcessor:
             processor = self.id, flow = None, name = "config",
         )
-        # Subscribe to config queue
+        # Subscribe to config queue — exclusive so every processor
+        # gets its own copy of config pushes (broadcast pattern)
         self.config_sub_task = Consumer(
             taskgroup = self.taskgroup,
-            backend = self.pubsub_backend,  # Changed from client to backend
+            backend = self.pubsub_backend,
             subscriber = config_subscriber_id,
             flow = None,
@@ -83,9 +84,8 @@ class AsyncProcessor:
             metrics = config_consumer_metrics,
-            # This causes new subscriptions to view the entire history of
-            # configuration
-            start_of_messages = True
+            start_of_messages = True,
+            consumer_type = 'exclusive',
         )
         self.running = True

trustgraph_base-2.2.16/trustgraph/base/chunking_service.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""
+Base chunking service that provides parameter specification functionality
+for chunk-size and chunk-overlap parameters, and librarian client for
+fetching large document content.
+"""
+import asyncio
+import base64
+import logging
+from .flow_processor import FlowProcessor
+from .parameter_spec import ParameterSpec
+from .librarian_client import LibrarianClient
+# Module logger
+logger = logging.getLogger(__name__)
+class ChunkingService(FlowProcessor):
+    """Base service for chunking processors with parameter specification support"""
+    def __init__(self, **params):
+        id = params.get("id", "chunker")
+        # Call parent constructor
+        super(ChunkingService, self).__init__(**params)
+        # Register parameter specifications for chunk-size and chunk-overlap
+        self.register_specification(
+            ParameterSpec(name="chunk-size")
+        )
+        self.register_specification(
+            ParameterSpec(name="chunk-overlap")
+        )
+        # Librarian client
+        self.librarian = LibrarianClient(
+            id=id,
+            backend=self.pubsub,
+            taskgroup=self.taskgroup,
+        )
+        logger.debug("ChunkingService initialized with parameter specifications")
+    async def start(self):
+        await super(ChunkingService, self).start()
+        await self.librarian.start()
+    async def get_document_text(self, doc):
+        """
+        Get text content from a TextDocument, fetching from librarian if needed.
+        Args:
+            doc: TextDocument with either inline text or document_id
+        Returns:
+            str: The document text content
+        """
+        if doc.document_id and not doc.text:
+            logger.info(f"Fetching document {doc.document_id} from librarian...")
+            text = await self.librarian.fetch_document_text(
+                document_id=doc.document_id,
+                user=doc.metadata.user,
+            )
+            logger.info(f"Fetched {len(text)} characters from librarian")
+            return text
+        else:
+            return doc.text.decode("utf-8")
+    async def chunk_document(self, msg, consumer, flow, default_chunk_size, default_chunk_overlap):
+        """
+        Extract chunk parameters from flow and return effective values
+        Args:
+            msg: The message being processed
+            consumer: The consumer instance
+            flow: The flow object containing parameters
+            default_chunk_size: Default chunk size if not configured
+            default_chunk_overlap: Default chunk overlap if not configured
+        Returns:
+            tuple: (chunk_size, chunk_overlap) effective values
+        """
+        chunk_size = default_chunk_size
+        chunk_overlap = default_chunk_overlap
+        try:
+            cs = flow.parameters.get("chunk-size")
+            if cs is not None:
+                chunk_size = int(cs)
+        except Exception as e:
+            logger.warning(f"Could not parse chunk-size parameter: {e}")
+        try:
+            co = flow.parameters.get("chunk-overlap")
+            if co is not None:
+                chunk_overlap = int(co)
+        except Exception as e:
+            logger.warning(f"Could not parse chunk-overlap parameter: {e}")
+        return chunk_size, chunk_overlap

{trustgraph_base-2.2.15 → trustgraph_base-2.2.16}/trustgraph/base/consumer.py RENAMED Viewed

@@ -32,6 +32,7 @@ class Consumer:
             rate_limit_retry_time = 10, rate_limit_timeout = 7200,
             reconnect_time = 5,
             concurrency = 1, # Number of concurrent requests to handle
+            consumer_type = 'shared',
     ):
         self.taskgroup = taskgroup
@@ -42,6 +43,8 @@ class Consumer:
         self.schema = schema
         self.handler = handler
+        self.consumer_type = consumer_type
         self.rate_limit_retry_time = rate_limit_retry_time
         self.rate_limit_timeout = rate_limit_timeout
@@ -93,33 +96,11 @@ class Consumer:
             if self.metrics:
                 self.metrics.state("stopped")
-            try:
-                logger.info(f"Subscribing to topic: {self.topic}")
-                # Determine initial position
-                if self.start_of_messages:
-                    initial_pos = 'earliest'
-                else:
-                    initial_pos = 'latest'
-                # Create consumer via backend
-                self.consumer = await asyncio.to_thread(
-                    self.backend.create_consumer,
-                    topic = self.topic,
-                    subscription = self.subscriber,
-                    schema = self.schema,
-                    initial_position = initial_pos,
-                    consumer_type = 'shared',
-                )
-            except Exception as e:
-                logger.error(f"Consumer subscription exception: {e}", exc_info=True)
-                await asyncio.sleep(self.reconnect_time)
-                continue
-            logger.info(f"Successfully subscribed to topic: {self.topic}")
+            # Determine initial position
+            if self.start_of_messages:
+                initial_pos = 'earliest'
+            else:
+                initial_pos = 'latest'
             if self.metrics:
                 self.metrics.state("running")
@@ -128,14 +109,30 @@ class Consumer:
                 logger.info(f"Starting {self.concurrency} receiver threads")
-                async with asyncio.TaskGroup() as tg:
-                    tasks = []
-                    for i in range(0, self.concurrency):
-                        tasks.append(
-                            tg.create_task(self.consume_from_queue())
+                # Create one backend consumer per concurrent task.
+                # Each gets its own connection — required for backends
+                # like RabbitMQ where connections are not thread-safe.
+                consumers = []
+                for i in range(self.concurrency):
+                    try:
+                        logger.info(f"Subscribing to topic: {self.topic} (worker {i})")
+                        c = await asyncio.to_thread(
+                            self.backend.create_consumer,
+                            topic = self.topic,
+                            subscription = self.subscriber,
+                            schema = self.schema,
+                            initial_position = initial_pos,
+                            consumer_type = self.consumer_type,
                         )
+                        consumers.append(c)
+                        logger.info(f"Successfully subscribed to topic: {self.topic} (worker {i})")
+                    except Exception as e:
+                        logger.error(f"Consumer subscription exception (worker {i}): {e}", exc_info=True)
+                        raise
+                async with asyncio.TaskGroup() as tg:
+                    for c in consumers:
+                        tg.create_task(self.consume_from_queue(c))
                 if self.metrics:
                     self.metrics.state("stopped")
@@ -143,23 +140,31 @@ class Consumer:
             except Exception as e:
                 logger.error(f"Consumer loop exception: {e}", exc_info=True)
-                self.consumer.unsubscribe()
-                self.consumer.close()
-                self.consumer = None
+                for c in consumers:
+                    try:
+                        c.unsubscribe()
+                        c.close()
+                    except Exception:
+                        pass
+                consumers = []
                 await asyncio.sleep(self.reconnect_time)
                 continue
-        if self.consumer:
-            self.consumer.unsubscribe()
-            self.consumer.close()
+            finally:
+                for c in consumers:
+                    try:
+                        c.unsubscribe()
+                        c.close()
+                    except Exception:
+                        pass
-    async def consume_from_queue(self):
+    async def consume_from_queue(self, consumer):
         while self.running:
             try:
                 msg = await asyncio.to_thread(
-                    self.consumer.receive,
+                    consumer.receive,
                     timeout_millis=2000
                 )
             except Exception as e:
@@ -168,9 +173,9 @@ class Consumer:
                     continue
                 raise e
-            await self.handle_one_from_queue(msg)
+            await self.handle_one_from_queue(msg, consumer)
-    async def handle_one_from_queue(self, msg):
+    async def handle_one_from_queue(self, msg, consumer):
         expiry = time.time() + self.rate_limit_timeout
@@ -183,7 +188,7 @@ class Consumer:
                 # Message failed to be processed, this causes it to
                 # be retried
-                self.consumer.negative_acknowledge(msg)
+                consumer.negative_acknowledge(msg)
                 if self.metrics:
                     self.metrics.process("error")
@@ -206,7 +211,7 @@ class Consumer:
                 logger.debug("Message processed successfully")
                 # Acknowledge successful processing of the message
-                self.consumer.acknowledge(msg)
+                consumer.acknowledge(msg)
                 if self.metrics:
                     self.metrics.process("success")
@@ -233,7 +238,7 @@ class Consumer:
                 # Message failed to be processed, this causes it to
                 # be retried
-                self.consumer.negative_acknowledge(msg)
+                consumer.negative_acknowledge(msg)
                 if self.metrics:
                     self.metrics.process("error")

trustgraph_base-2.2.16/trustgraph/base/librarian_client.py ADDED Viewed

@@ -0,0 +1,246 @@
+"""
+Shared librarian client for services that need to communicate
+with the librarian via pub/sub.
+Provides request-response and streaming operations over the message
+broker, with proper support for large documents via stream-document.
+Usage:
+    self.librarian = LibrarianClient(
+        id=id, backend=self.pubsub, taskgroup=self.taskgroup, **params
+    )
+    await self.librarian.start()
+    content = await self.librarian.fetch_document_content(doc_id, user)
+"""
+import asyncio
+import base64
+import logging
+import uuid
+from .consumer import Consumer
+from .producer import Producer
+from .metrics import ConsumerMetrics, ProducerMetrics
+from ..schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
+from ..schema import librarian_request_queue, librarian_response_queue
+logger = logging.getLogger(__name__)
+class LibrarianClient:
+    """Client for librarian request-response over the message broker."""
+    def __init__(self, id, backend, taskgroup, **params):
+        librarian_request_q = params.get(
+            "librarian_request_queue", librarian_request_queue,
+        )
+        librarian_response_q = params.get(
+            "librarian_response_queue", librarian_response_queue,
+        )
+        librarian_request_metrics = ProducerMetrics(
+            processor=id, flow=None, name="librarian-request",
+        )
+        self._producer = Producer(
+            backend=backend,
+            topic=librarian_request_q,
+            schema=LibrarianRequest,
+            metrics=librarian_request_metrics,
+        )
+        librarian_response_metrics = ConsumerMetrics(
+            processor=id, flow=None, name="librarian-response",
+        )
+        self._consumer = Consumer(
+            taskgroup=taskgroup,
+            backend=backend,
+            flow=None,
+            topic=librarian_response_q,
+            subscriber=f"{id}-librarian",
+            schema=LibrarianResponse,
+            handler=self._on_response,
+            metrics=librarian_response_metrics,
+            consumer_type='exclusive',
+        )
+        # Single-response requests: request_id -> asyncio.Future
+        self._pending = {}
+        # Streaming requests: request_id -> asyncio.Queue
+        self._streams = {}
+    async def start(self):
+        """Start the librarian producer and consumer."""
+        await self._producer.start()
+        await self._consumer.start()
+    async def _on_response(self, msg, consumer, flow):
+        """Route librarian responses to the right waiter."""
+        response = msg.value()
+        request_id = msg.properties().get("id")
+        if not request_id:
+            return
+        if request_id in self._pending:
+            future = self._pending.pop(request_id)
+            future.set_result(response)
+        elif request_id in self._streams:
+            await self._streams[request_id].put(response)
+    async def request(self, request, timeout=120):
+        """Send a request to the librarian and wait for a single response."""
+        request_id = str(uuid.uuid4())
+        future = asyncio.get_event_loop().create_future()
+        self._pending[request_id] = future
+        try:
+            await self._producer.send(
+                request, properties={"id": request_id},
+            )
+            response = await asyncio.wait_for(future, timeout=timeout)
+            if response.error:
+                raise RuntimeError(
+                    f"Librarian error: {response.error.type}: "
+                    f"{response.error.message}"
+                )
+            return response
+        except asyncio.TimeoutError:
+            self._pending.pop(request_id, None)
+            raise RuntimeError("Timeout waiting for librarian response")
+    async def stream(self, request, timeout=120):
+        """Send a request and collect streamed response chunks."""
+        request_id = str(uuid.uuid4())
+        q = asyncio.Queue()
+        self._streams[request_id] = q
+        try:
+            await self._producer.send(
+                request, properties={"id": request_id},
+            )
+            chunks = []
+            while True:
+                response = await asyncio.wait_for(q.get(), timeout=timeout)
+                if response.error:
+                    raise RuntimeError(
+                        f"Librarian error: {response.error.type}: "
+                        f"{response.error.message}"
+                    )
+                chunks.append(response)
+                if response.is_final:
+                    break
+            return chunks
+        except asyncio.TimeoutError:
+            self._streams.pop(request_id, None)
+            raise RuntimeError("Timeout waiting for librarian stream")
+        finally:
+            self._streams.pop(request_id, None)
+    async def fetch_document_content(self, document_id, user, timeout=120):
+        """Fetch document content using streaming.
+        Returns base64-encoded content. Caller is responsible for decoding.
+        """
+        req = LibrarianRequest(
+            operation="stream-document",
+            document_id=document_id,
+            user=user,
+        )
+        chunks = await self.stream(req, timeout=timeout)
+        # Decode each chunk's base64 to raw bytes, concatenate,
+        # re-encode for the caller.
+        raw = b""
+        for chunk in chunks:
+            if chunk.content:
+                if isinstance(chunk.content, bytes):
+                    raw += base64.b64decode(chunk.content)
+                else:
+                    raw += base64.b64decode(
+                        chunk.content.encode("utf-8")
+                    )
+        return base64.b64encode(raw)
+    async def fetch_document_text(self, document_id, user, timeout=120):
+        """Fetch document content and decode as UTF-8 text."""
+        content = await self.fetch_document_content(
+            document_id, user, timeout=timeout,
+        )
+        return base64.b64decode(content).decode("utf-8")
+    async def fetch_document_metadata(self, document_id, user, timeout=120):
+        """Fetch document metadata from the librarian."""
+        req = LibrarianRequest(
+            operation="get-document-metadata",
+            document_id=document_id,
+            user=user,
+        )
+        response = await self.request(req, timeout=timeout)
+        return response.document_metadata
+    async def save_child_document(self, doc_id, parent_id, user, content,
+                                  document_type="chunk", title=None,
+                                  kind="text/plain", timeout=120):
+        """Save a child document to the librarian."""
+        if isinstance(content, str):
+            content = content.encode("utf-8")
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind=kind,
+            title=title or doc_id,
+            parent_id=parent_id,
+            document_type=document_type,
+        )
+        req = LibrarianRequest(
+            operation="add-child-document",
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content).decode("utf-8"),
+        )
+        await self.request(req, timeout=timeout)
+        return doc_id
+    async def save_document(self, doc_id, user, content, title=None,
+                            document_type="answer", kind="text/plain",
+                            timeout=120):
+        """Save a document to the librarian."""
+        if isinstance(content, str):
+            content = content.encode("utf-8")
+        doc_metadata = DocumentMetadata(
+            id=doc_id,
+            user=user,
+            kind=kind,
+            title=title or doc_id,
+            document_type=document_type,
+        )
+        req = LibrarianRequest(
+            operation="add-document",
+            document_id=doc_id,
+            document_metadata=doc_metadata,
+            content=base64.b64encode(content).decode("utf-8"),
+            user=user,
+        )
+        await self.request(req, timeout=timeout)
+        return doc_id

trustgraph-base 2.2.15__tar.gz → 2.2.16__tar.gz

trustgraph-base 2.2.15__tar.gz → 2.2.16__tar.gz