trustgraph-ocr 2.2.15__tar.gz → 2.2.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/PKG-INFO +1 -1
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph/decoding/ocr/pdf_decoder.py +8 -169
- trustgraph_ocr-2.2.17/trustgraph/ocr_version.py +1 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/PKG-INFO +1 -1
- trustgraph_ocr-2.2.15/trustgraph/ocr_version.py +0 -1
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/README.md +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/pyproject.toml +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/setup.cfg +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph/decoding/ocr/__init__.py +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph/decoding/ocr/__main__.py +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/SOURCES.txt +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/dependency_links.txt +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/entry_points.txt +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/requires.txt +0 -0
- {trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: trustgraph-ocr
|
|
3
|
-
Version: 2.2.15
|
|
3
|
+
Version: 2.2.17
|
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
5
|
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
|
6
6
|
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
|
@@ -7,19 +7,15 @@ Supports both inline document data and fetching from librarian via Pulsar
|
|
|
7
7
|
for large documents.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
-
import asyncio
|
|
11
10
|
import base64
|
|
12
11
|
import logging
|
|
13
|
-
import uuid
|
|
14
12
|
import pytesseract
|
|
15
13
|
from pdf2image import convert_from_bytes
|
|
16
14
|
|
|
17
15
|
from ... schema import Document, TextDocument, Metadata
|
|
18
|
-
from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
|
|
19
16
|
from ... schema import librarian_request_queue, librarian_response_queue
|
|
20
17
|
from ... schema import Triples
|
|
21
|
-
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
|
22
|
-
from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
|
|
18
|
+
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec, LibrarianClient
|
|
23
19
|
|
|
24
20
|
from ... provenance import (
|
|
25
21
|
document_uri, page_uri as make_page_uri, derived_entity_triples,
|
|
@@ -72,173 +68,16 @@ class Processor(FlowProcessor):
|
|
|
72
68
|
)
|
|
73
69
|
)
|
|
74
70
|
|
|
75
|
-
# Librarian client
|
|
76
|
-
|
|
77
|
-
|
|
71
|
+
# Librarian client
|
|
72
|
+
self.librarian = LibrarianClient(
|
|
73
|
+
id=id, backend=self.pubsub, taskgroup=self.taskgroup,
|
|
78
74
|
)
|
|
79
|
-
librarian_response_q = params.get(
|
|
80
|
-
"librarian_response_queue", default_librarian_response_queue
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
librarian_request_metrics = ProducerMetrics(
|
|
84
|
-
processor = id, flow = None, name = "librarian-request"
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
self.librarian_request_producer = Producer(
|
|
88
|
-
backend = self.pubsub,
|
|
89
|
-
topic = librarian_request_q,
|
|
90
|
-
schema = LibrarianRequest,
|
|
91
|
-
metrics = librarian_request_metrics,
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
librarian_response_metrics = ConsumerMetrics(
|
|
95
|
-
processor = id, flow = None, name = "librarian-response"
|
|
96
|
-
)
|
|
97
|
-
|
|
98
|
-
self.librarian_response_consumer = Consumer(
|
|
99
|
-
taskgroup = self.taskgroup,
|
|
100
|
-
backend = self.pubsub,
|
|
101
|
-
flow = None,
|
|
102
|
-
topic = librarian_response_q,
|
|
103
|
-
subscriber = f"{id}-librarian",
|
|
104
|
-
schema = LibrarianResponse,
|
|
105
|
-
handler = self.on_librarian_response,
|
|
106
|
-
metrics = librarian_response_metrics,
|
|
107
|
-
)
|
|
108
|
-
|
|
109
|
-
# Pending librarian requests: request_id -> asyncio.Future
|
|
110
|
-
self.pending_requests = {}
|
|
111
75
|
|
|
112
76
|
logger.info("PDF OCR processor initialized")
|
|
113
77
|
|
|
114
78
|
async def start(self):
|
|
115
79
|
await super(Processor, self).start()
|
|
116
|
-
await self.librarian_request_producer.start()
|
|
117
|
-
await self.librarian_response_consumer.start()
|
|
118
|
-
|
|
119
|
-
async def on_librarian_response(self, msg, consumer, flow):
|
|
120
|
-
"""Handle responses from the librarian service."""
|
|
121
|
-
response = msg.value()
|
|
122
|
-
request_id = msg.properties().get("id")
|
|
123
|
-
|
|
124
|
-
if request_id and request_id in self.pending_requests:
|
|
125
|
-
future = self.pending_requests.pop(request_id)
|
|
126
|
-
future.set_result(response)
|
|
127
|
-
|
|
128
|
-
async def fetch_document_metadata(self, document_id, user, timeout=120):
|
|
129
|
-
"""
|
|
130
|
-
Fetch document metadata from librarian via Pulsar.
|
|
131
|
-
"""
|
|
132
|
-
request_id = str(uuid.uuid4())
|
|
133
|
-
|
|
134
|
-
request = LibrarianRequest(
|
|
135
|
-
operation="get-document-metadata",
|
|
136
|
-
document_id=document_id,
|
|
137
|
-
user=user,
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
future = asyncio.get_event_loop().create_future()
|
|
141
|
-
self.pending_requests[request_id] = future
|
|
142
|
-
|
|
143
|
-
try:
|
|
144
|
-
await self.librarian_request_producer.send(
|
|
145
|
-
request, properties={"id": request_id}
|
|
146
|
-
)
|
|
147
|
-
|
|
148
|
-
response = await asyncio.wait_for(future, timeout=timeout)
|
|
149
|
-
|
|
150
|
-
if response.error:
|
|
151
|
-
raise RuntimeError(
|
|
152
|
-
f"Librarian error: {response.error.type}: {response.error.message}"
|
|
153
|
-
)
|
|
154
|
-
|
|
155
|
-
return response.document_metadata
|
|
156
|
-
|
|
157
|
-
except asyncio.TimeoutError:
|
|
158
|
-
self.pending_requests.pop(request_id, None)
|
|
159
|
-
raise RuntimeError(f"Timeout fetching metadata for {document_id}")
|
|
160
|
-
|
|
161
|
-
async def fetch_document_content(self, document_id, user, timeout=120):
|
|
162
|
-
"""
|
|
163
|
-
Fetch document content from librarian via Pulsar.
|
|
164
|
-
"""
|
|
165
|
-
request_id = str(uuid.uuid4())
|
|
166
|
-
|
|
167
|
-
request = LibrarianRequest(
|
|
168
|
-
operation="get-document-content",
|
|
169
|
-
document_id=document_id,
|
|
170
|
-
user=user,
|
|
171
|
-
)
|
|
172
|
-
|
|
173
|
-
# Create future for response
|
|
174
|
-
future = asyncio.get_event_loop().create_future()
|
|
175
|
-
self.pending_requests[request_id] = future
|
|
176
|
-
|
|
177
|
-
try:
|
|
178
|
-
# Send request
|
|
179
|
-
await self.librarian_request_producer.send(
|
|
180
|
-
request, properties={"id": request_id}
|
|
181
|
-
)
|
|
182
|
-
|
|
183
|
-
# Wait for response
|
|
184
|
-
response = await asyncio.wait_for(future, timeout=timeout)
|
|
185
|
-
|
|
186
|
-
if response.error:
|
|
187
|
-
raise RuntimeError(
|
|
188
|
-
f"Librarian error: {response.error.type}: {response.error.message}"
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
return response.content
|
|
192
|
-
|
|
193
|
-
except asyncio.TimeoutError:
|
|
194
|
-
self.pending_requests.pop(request_id, None)
|
|
195
|
-
raise RuntimeError(f"Timeout fetching document {document_id}")
|
|
196
|
-
|
|
197
|
-
async def save_child_document(self, doc_id, parent_id, user, content,
|
|
198
|
-
document_type="page", title=None, timeout=120):
|
|
199
|
-
"""
|
|
200
|
-
Save a child document to the librarian.
|
|
201
|
-
"""
|
|
202
|
-
request_id = str(uuid.uuid4())
|
|
203
|
-
|
|
204
|
-
doc_metadata = DocumentMetadata(
|
|
205
|
-
id=doc_id,
|
|
206
|
-
user=user,
|
|
207
|
-
kind="text/plain",
|
|
208
|
-
title=title or doc_id,
|
|
209
|
-
parent_id=parent_id,
|
|
210
|
-
document_type=document_type,
|
|
211
|
-
)
|
|
212
|
-
|
|
213
|
-
request = LibrarianRequest(
|
|
214
|
-
operation="add-child-document",
|
|
215
|
-
document_metadata=doc_metadata,
|
|
216
|
-
content=base64.b64encode(content).decode("utf-8"),
|
|
217
|
-
)
|
|
218
|
-
|
|
219
|
-
# Create future for response
|
|
220
|
-
future = asyncio.get_event_loop().create_future()
|
|
221
|
-
self.pending_requests[request_id] = future
|
|
222
|
-
|
|
223
|
-
try:
|
|
224
|
-
# Send request
|
|
225
|
-
await self.librarian_request_producer.send(
|
|
226
|
-
request, properties={"id": request_id}
|
|
227
|
-
)
|
|
228
|
-
|
|
229
|
-
# Wait for response
|
|
230
|
-
response = await asyncio.wait_for(future, timeout=timeout)
|
|
231
|
-
|
|
232
|
-
if response.error:
|
|
233
|
-
raise RuntimeError(
|
|
234
|
-
f"Librarian error saving child document: {response.error.type}: {response.error.message}"
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
return doc_id
|
|
238
|
-
|
|
239
|
-
except asyncio.TimeoutError:
|
|
240
|
-
self.pending_requests.pop(request_id, None)
|
|
241
|
-
raise RuntimeError(f"Timeout saving child document {doc_id}")
|
|
80
|
+
await self.librarian.start()
|
|
242
81
|
|
|
243
82
|
async def on_message(self, msg, consumer, flow):
|
|
244
83
|
|
|
@@ -250,7 +89,7 @@ class Processor(FlowProcessor):
|
|
|
250
89
|
|
|
251
90
|
# Check MIME type if fetching from librarian
|
|
252
91
|
if v.document_id:
|
|
253
|
-
doc_meta = await self.fetch_document_metadata(
|
|
92
|
+
doc_meta = await self.librarian.fetch_document_metadata(
|
|
254
93
|
document_id=v.document_id,
|
|
255
94
|
user=v.metadata.user,
|
|
256
95
|
)
|
|
@@ -265,7 +104,7 @@ class Processor(FlowProcessor):
|
|
|
265
104
|
# Get PDF content - fetch from librarian or use inline data
|
|
266
105
|
if v.document_id:
|
|
267
106
|
logger.info(f"Fetching document {v.document_id} from librarian...")
|
|
268
|
-
content = await self.fetch_document_content(
|
|
107
|
+
content = await self.librarian.fetch_document_content(
|
|
269
108
|
document_id=v.document_id,
|
|
270
109
|
user=v.metadata.user,
|
|
271
110
|
)
|
|
@@ -299,7 +138,7 @@ class Processor(FlowProcessor):
|
|
|
299
138
|
page_content = text.encode("utf-8")
|
|
300
139
|
|
|
301
140
|
# Save page as child document in librarian
|
|
302
|
-
await self.save_child_document(
|
|
141
|
+
await self.librarian.save_child_document(
|
|
303
142
|
doc_id=page_doc_id,
|
|
304
143
|
parent_id=source_doc_id,
|
|
305
144
|
user=v.metadata.user,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.2.17"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: trustgraph-ocr
|
|
3
|
-
Version: 2.2.15
|
|
3
|
+
Version: 2.2.17
|
|
4
4
|
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
5
|
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
|
6
6
|
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.2.15"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{trustgraph_ocr-2.2.15 → trustgraph_ocr-2.2.17}/trustgraph_ocr.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|