trustgraph-ocr 2.2.15__tar.gz → 2.2.16__tar.gz

This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: trustgraph-ocr
3
- Version: 2.2.15
3
+ Version: 2.2.16
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
6
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
@@ -7,19 +7,15 @@ Supports both inline document data and fetching from librarian via Pulsar
7
7
  for large documents.
8
8
  """
9
9
 
10
- import asyncio
11
10
  import base64
12
11
  import logging
13
- import uuid
14
12
  import pytesseract
15
13
  from pdf2image import convert_from_bytes
16
14
 
17
15
  from ... schema import Document, TextDocument, Metadata
18
- from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
19
16
  from ... schema import librarian_request_queue, librarian_response_queue
20
17
  from ... schema import Triples
21
- from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
22
- from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
18
+ from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient
23
19
 
24
20
  from ... provenance import (
25
21
  document_uri, page_uri as make_page_uri, derived_entity_triples,
@@ -72,173 +68,16 @@ class Processor(FlowProcessor):
72
68
  )
73
69
  )
74
70
 
75
- # Librarian client for fetching document content
76
- librarian_request_q = params.get(
77
- "librarian_request_queue", default_librarian_request_queue
71
+ # Librarian client
72
+ self.librarian = LibrarianClient(
73
+ id=id, backend=self.pubsub, taskgroup=self.taskgroup,
78
74
  )
79
- librarian_response_q = params.get(
80
- "librarian_response_queue", default_librarian_response_queue
81
- )
82
-
83
- librarian_request_metrics = ProducerMetrics(
84
- processor = id, flow = None, name = "librarian-request"
85
- )
86
-
87
- self.librarian_request_producer = Producer(
88
- backend = self.pubsub,
89
- topic = librarian_request_q,
90
- schema = LibrarianRequest,
91
- metrics = librarian_request_metrics,
92
- )
93
-
94
- librarian_response_metrics = ConsumerMetrics(
95
- processor = id, flow = None, name = "librarian-response"
96
- )
97
-
98
- self.librarian_response_consumer = Consumer(
99
- taskgroup = self.taskgroup,
100
- backend = self.pubsub,
101
- flow = None,
102
- topic = librarian_response_q,
103
- subscriber = f"{id}-librarian",
104
- schema = LibrarianResponse,
105
- handler = self.on_librarian_response,
106
- metrics = librarian_response_metrics,
107
- )
108
-
109
- # Pending librarian requests: request_id -> asyncio.Future
110
- self.pending_requests = {}
111
75
 
112
76
  logger.info("PDF OCR processor initialized")
113
77
 
114
78
  async def start(self):
115
79
  await super(Processor, self).start()
116
- await self.librarian_request_producer.start()
117
- await self.librarian_response_consumer.start()
118
-
119
- async def on_librarian_response(self, msg, consumer, flow):
120
- """Handle responses from the librarian service."""
121
- response = msg.value()
122
- request_id = msg.properties().get("id")
123
-
124
- if request_id and request_id in self.pending_requests:
125
- future = self.pending_requests.pop(request_id)
126
- future.set_result(response)
127
-
128
- async def fetch_document_metadata(self, document_id, user, timeout=120):
129
- """
130
- Fetch document metadata from librarian via Pulsar.
131
- """
132
- request_id = str(uuid.uuid4())
133
-
134
- request = LibrarianRequest(
135
- operation="get-document-metadata",
136
- document_id=document_id,
137
- user=user,
138
- )
139
-
140
- future = asyncio.get_event_loop().create_future()
141
- self.pending_requests[request_id] = future
142
-
143
- try:
144
- await self.librarian_request_producer.send(
145
- request, properties={"id": request_id}
146
- )
147
-
148
- response = await asyncio.wait_for(future, timeout=timeout)
149
-
150
- if response.error:
151
- raise RuntimeError(
152
- f"Librarian error: {response.error.type}: {response.error.message}"
153
- )
154
-
155
- return response.document_metadata
156
-
157
- except asyncio.TimeoutError:
158
- self.pending_requests.pop(request_id, None)
159
- raise RuntimeError(f"Timeout fetching metadata for {document_id}")
160
-
161
- async def fetch_document_content(self, document_id, user, timeout=120):
162
- """
163
- Fetch document content from librarian via Pulsar.
164
- """
165
- request_id = str(uuid.uuid4())
166
-
167
- request = LibrarianRequest(
168
- operation="get-document-content",
169
- document_id=document_id,
170
- user=user,
171
- )
172
-
173
- # Create future for response
174
- future = asyncio.get_event_loop().create_future()
175
- self.pending_requests[request_id] = future
176
-
177
- try:
178
- # Send request
179
- await self.librarian_request_producer.send(
180
- request, properties={"id": request_id}
181
- )
182
-
183
- # Wait for response
184
- response = await asyncio.wait_for(future, timeout=timeout)
185
-
186
- if response.error:
187
- raise RuntimeError(
188
- f"Librarian error: {response.error.type}: {response.error.message}"
189
- )
190
-
191
- return response.content
192
-
193
- except asyncio.TimeoutError:
194
- self.pending_requests.pop(request_id, None)
195
- raise RuntimeError(f"Timeout fetching document {document_id}")
196
-
197
- async def save_child_document(self, doc_id, parent_id, user, content,
198
- document_type="page", title=None, timeout=120):
199
- """
200
- Save a child document to the librarian.
201
- """
202
- request_id = str(uuid.uuid4())
203
-
204
- doc_metadata = DocumentMetadata(
205
- id=doc_id,
206
- user=user,
207
- kind="text/plain",
208
- title=title or doc_id,
209
- parent_id=parent_id,
210
- document_type=document_type,
211
- )
212
-
213
- request = LibrarianRequest(
214
- operation="add-child-document",
215
- document_metadata=doc_metadata,
216
- content=base64.b64encode(content).decode("utf-8"),
217
- )
218
-
219
- # Create future for response
220
- future = asyncio.get_event_loop().create_future()
221
- self.pending_requests[request_id] = future
222
-
223
- try:
224
- # Send request
225
- await self.librarian_request_producer.send(
226
- request, properties={"id": request_id}
227
- )
228
-
229
- # Wait for response
230
- response = await asyncio.wait_for(future, timeout=timeout)
231
-
232
- if response.error:
233
- raise RuntimeError(
234
- f"Librarian error saving child document: {response.error.type}: {response.error.message}"
235
- )
236
-
237
- return doc_id
238
-
239
- except asyncio.TimeoutError:
240
- self.pending_requests.pop(request_id, None)
241
- raise RuntimeError(f"Timeout saving child document {doc_id}")
80
+ await self.librarian.start()
242
81
 
243
82
  async def on_message(self, msg, consumer, flow):
244
83
 
@@ -250,7 +89,7 @@ class Processor(FlowProcessor):
250
89
 
251
90
  # Check MIME type if fetching from librarian
252
91
  if v.document_id:
253
- doc_meta = await self.fetch_document_metadata(
92
+ doc_meta = await self.librarian.fetch_document_metadata(
254
93
  document_id=v.document_id,
255
94
  user=v.metadata.user,
256
95
  )
@@ -265,7 +104,7 @@ class Processor(FlowProcessor):
265
104
  # Get PDF content - fetch from librarian or use inline data
266
105
  if v.document_id:
267
106
  logger.info(f"Fetching document {v.document_id} from librarian...")
268
- content = await self.fetch_document_content(
107
+ content = await self.librarian.fetch_document_content(
269
108
  document_id=v.document_id,
270
109
  user=v.metadata.user,
271
110
  )
@@ -299,7 +138,7 @@ class Processor(FlowProcessor):
299
138
  page_content = text.encode("utf-8")
300
139
 
301
140
  # Save page as child document in librarian
302
- await self.save_child_document(
141
+ await self.librarian.save_child_document(
303
142
  doc_id=page_doc_id,
304
143
  parent_id=source_doc_id,
305
144
  user=v.metadata.user,
@@ -0,0 +1 @@
1
+ __version__ = "2.2.16"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: trustgraph-ocr
3
- Version: 2.2.15
3
+ Version: 2.2.16
4
4
  Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
5
  Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
6
  Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
@@ -1 +0,0 @@
1
- __version__ = "2.2.15"