trustgraph-unstructured 2.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- trustgraph_unstructured-2.2.1/PKG-INFO +15 -0
- trustgraph_unstructured-2.2.1/pyproject.toml +34 -0
- trustgraph_unstructured-2.2.1/setup.cfg +4 -0
- trustgraph_unstructured-2.2.1/trustgraph/decoding/universal/__init__.py +2 -0
- trustgraph_unstructured-2.2.1/trustgraph/decoding/universal/__main__.py +6 -0
- trustgraph_unstructured-2.2.1/trustgraph/decoding/universal/processor.py +710 -0
- trustgraph_unstructured-2.2.1/trustgraph/decoding/universal/strategies.py +171 -0
- trustgraph_unstructured-2.2.1/trustgraph/unstructured_version.py +1 -0
- trustgraph_unstructured-2.2.1/trustgraph_unstructured.egg-info/PKG-INFO +15 -0
- trustgraph_unstructured-2.2.1/trustgraph_unstructured.egg-info/SOURCES.txt +12 -0
- trustgraph_unstructured-2.2.1/trustgraph_unstructured.egg-info/dependency_links.txt +1 -0
- trustgraph_unstructured-2.2.1/trustgraph_unstructured.egg-info/entry_points.txt +2 -0
- trustgraph_unstructured-2.2.1/trustgraph_unstructured.egg-info/requires.txt +5 -0
- trustgraph_unstructured-2.2.1/trustgraph_unstructured.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trustgraph-unstructured
|
|
3
|
+
Version: 2.2.1
|
|
4
|
+
Summary: TrustGraph provides a means to run a flexible pipeline of AI processing components.
|
|
5
|
+
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
|
6
|
+
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: trustgraph-base<2.3,>=2.2
|
|
12
|
+
Requires-Dist: pulsar-client
|
|
13
|
+
Requires-Dist: prometheus-client
|
|
14
|
+
Requires-Dist: python-magic
|
|
15
|
+
Requires-Dist: unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "trustgraph-unstructured"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
|
|
9
|
+
description = "TrustGraph provides a means to run a flexible pipeline of AI processing components."
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
dependencies = [
|
|
13
|
+
"trustgraph-base>=2.2,<2.3",
|
|
14
|
+
"pulsar-client",
|
|
15
|
+
"prometheus-client",
|
|
16
|
+
"python-magic",
|
|
17
|
+
"unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]",
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://github.com/trustgraph-ai/trustgraph"
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
universal-decoder = "trustgraph.decoding.universal:run"
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
include = ["trustgraph*"]
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.dynamic]
|
|
34
|
+
version = {attr = "trustgraph.unstructured_version.__version__"}
|
|
@@ -0,0 +1,710 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Universal document decoder powered by the unstructured library.
|
|
4
|
+
|
|
5
|
+
Accepts documents in any common format (PDF, DOCX, XLSX, HTML, Markdown,
|
|
6
|
+
plain text, PPTX, etc.) on input, outputs pages or sections as text
|
|
7
|
+
as separate output objects.
|
|
8
|
+
|
|
9
|
+
Supports both inline document data and fetching from librarian via Pulsar
|
|
10
|
+
for large documents. Fetches document metadata from the librarian to
|
|
11
|
+
determine mime type for format detection.
|
|
12
|
+
|
|
13
|
+
Tables are preserved as HTML markup for better downstream extraction.
|
|
14
|
+
Images are stored in the librarian but not sent to the text pipeline.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import base64
|
|
19
|
+
import logging
|
|
20
|
+
import magic
|
|
21
|
+
import tempfile
|
|
22
|
+
import os
|
|
23
|
+
import uuid
|
|
24
|
+
|
|
25
|
+
from unstructured.partition.auto import partition
|
|
26
|
+
|
|
27
|
+
from ... schema import Document, TextDocument, Metadata
|
|
28
|
+
from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
|
|
29
|
+
from ... schema import librarian_request_queue, librarian_response_queue
|
|
30
|
+
from ... schema import Triples
|
|
31
|
+
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
|
32
|
+
from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
|
|
33
|
+
|
|
34
|
+
from ... provenance import (
|
|
35
|
+
document_uri, page_uri as make_page_uri,
|
|
36
|
+
section_uri as make_section_uri, image_uri as make_image_uri,
|
|
37
|
+
derived_entity_triples, set_graph, GRAPH_SOURCE,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
from . strategies import get_strategy
|
|
41
|
+
|
|
42
|
+
# Component identification for provenance
|
|
43
|
+
COMPONENT_NAME = "universal-decoder"
|
|
44
|
+
COMPONENT_VERSION = "1.0.0"
|
|
45
|
+
|
|
46
|
+
# Module logger
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
default_ident = "document-decoder"
|
|
50
|
+
|
|
51
|
+
default_librarian_request_queue = librarian_request_queue
|
|
52
|
+
default_librarian_response_queue = librarian_response_queue
|
|
53
|
+
|
|
54
|
+
# Mime type to unstructured content_type mapping
|
|
55
|
+
# unstructured auto-detects most formats, but we pass the hint when available
|
|
56
|
+
MIME_EXTENSIONS = {
|
|
57
|
+
"application/pdf": ".pdf",
|
|
58
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
|
|
59
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
|
60
|
+
"application/vnd.ms-excel": ".xls",
|
|
61
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
|
|
62
|
+
"text/html": ".html",
|
|
63
|
+
"text/markdown": ".md",
|
|
64
|
+
"text/plain": ".txt",
|
|
65
|
+
"text/csv": ".csv",
|
|
66
|
+
"text/tab-separated-values": ".tsv",
|
|
67
|
+
"application/rtf": ".rtf",
|
|
68
|
+
"text/x-rst": ".rst",
|
|
69
|
+
"application/vnd.oasis.opendocument.text": ".odt",
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
# Formats that have natural page boundaries
|
|
73
|
+
PAGE_BASED_FORMATS = {
|
|
74
|
+
"application/pdf",
|
|
75
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
76
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
77
|
+
"application/vnd.ms-excel",
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def assemble_section_text(elements):
    """
    Assemble text from a list of unstructured elements.

    - Text elements: plain text, joined with double newlines
    - Table elements: HTML table markup from text_as_html, falling back
      to the element's plain text when no HTML rendering is available
    - Image elements: skipped (stored separately, not in text output)

    Args:
        elements: iterable of unstructured Element-like objects; each may
            carry `category`, `text` and `metadata.text_as_html` attributes.

    Returns:
        tuple: (assembled_text, element_types_set, table_count, image_count)
    """
    parts = []
    element_types = set()
    table_count = 0
    image_count = 0

    for el in elements:
        category = getattr(el, 'category', 'UncategorizedText')
        element_types.add(category)

        if category == 'Image':
            image_count += 1
            continue  # Images are NOT included in text output

        if category == 'Table':
            table_count += 1
            # Prefer the HTML representation for tables; it preserves
            # structure for downstream extraction.
            html = (
                getattr(el.metadata, 'text_as_html', None)
                if hasattr(el, 'metadata') else None
            )
            if html:
                parts.append(html)
                continue
            # No HTML available: fall through to the plain-text path below.

        # Plain text path (all non-table elements, and tables without HTML).
        text = getattr(el, 'text', '') or ''
        if text:
            parts.append(text)

    return '\n\n'.join(parts), element_types, table_count, image_count
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class Processor(FlowProcessor):
    """
    Universal document decoder flow processor.

    Consumes Document messages, partitions the raw document with the
    unstructured library, and emits one TextDocument per page/section
    downstream plus provenance Triples. Page/section/image content is
    stored in the librarian via a Pulsar request/response round trip.
    """

    def __init__(self, **params):
        """
        Configure the decoder from flow parameters.

        Recognised params (all optional): id, strategy, languages,
        section_strategy, section_element_count, section_max_size,
        section_within_pages, librarian_request_queue,
        librarian_response_queue (plus whatever FlowProcessor accepts).
        """

        # NOTE(review): "id" shadows the builtin; kept to match the
        # param name used throughout the framework.
        id = params.get("id", default_ident)

        # unstructured partition() strategy: auto / hi_res / fast
        self.partition_strategy = params.get("strategy", "auto")
        # OCR language hints, comma-separated string -> list
        self.languages = params.get("languages", "eng").split(",")
        self.section_strategy_name = params.get(
            "section_strategy", "whole-document"
        )
        self.section_element_count = params.get("section_element_count", 20)
        self.section_max_size = params.get("section_max_size", 4000)
        # NOTE(review): stored but not referenced elsewhere in this
        # module — presumably reserved for future use; confirm.
        self.section_within_pages = params.get("section_within_pages", False)

        # Resolve the section-grouping callable from the registry
        self.section_strategy = get_strategy(self.section_strategy_name)

        super(Processor, self).__init__(
            **params | {
                "id": id,
            }
        )

        self.register_specification(
            ConsumerSpec(
                name="input",
                schema=Document,
                handler=self.on_message,
            )
        )

        self.register_specification(
            ProducerSpec(
                name="output",
                schema=TextDocument,
            )
        )

        self.register_specification(
            ProducerSpec(
                name="triples",
                schema=Triples,
            )
        )

        # Librarian client for fetching/storing document content
        librarian_request_q = params.get(
            "librarian_request_queue", default_librarian_request_queue
        )
        librarian_response_q = params.get(
            "librarian_response_queue", default_librarian_response_queue
        )

        librarian_request_metrics = ProducerMetrics(
            processor=id, flow=None, name="librarian-request"
        )

        self.librarian_request_producer = Producer(
            backend=self.pubsub,
            topic=librarian_request_q,
            schema=LibrarianRequest,
            metrics=librarian_request_metrics,
        )

        librarian_response_metrics = ConsumerMetrics(
            processor=id, flow=None, name="librarian-response"
        )

        self.librarian_response_consumer = Consumer(
            taskgroup=self.taskgroup,
            backend=self.pubsub,
            flow=None,
            topic=librarian_response_q,
            subscriber=f"{id}-librarian",
            schema=LibrarianResponse,
            handler=self.on_librarian_response,
            metrics=librarian_response_metrics,
        )

        # Pending librarian requests: request_id -> asyncio.Future
        self.pending_requests = {}

        logger.info("Universal decoder initialized")

    async def start(self):
        """Start the base processor plus the librarian producer/consumer."""
        await super(Processor, self).start()
        await self.librarian_request_producer.start()
        await self.librarian_response_consumer.start()

    async def on_librarian_response(self, msg, consumer, flow):
        """Handle responses from the librarian service."""
        response = msg.value()
        # Correlation id set by _librarian_request on the outgoing message
        request_id = msg.properties().get("id")

        # Responses for unknown/timed-out requests are silently dropped.
        if request_id and request_id in self.pending_requests:
            future = self.pending_requests.pop(request_id)
            future.set_result(response)

    async def _librarian_request(self, request, timeout=120):
        """Send a request to the librarian and wait for response."""
        request_id = str(uuid.uuid4())

        future = asyncio.get_event_loop().create_future()
        self.pending_requests[request_id] = future

        try:
            await self.librarian_request_producer.send(
                request, properties={"id": request_id}
            )
            response = await asyncio.wait_for(future, timeout=timeout)

            # The librarian signals failures in-band via response.error
            if response.error:
                raise RuntimeError(
                    f"Librarian error: {response.error.type}: "
                    f"{response.error.message}"
                )

            return response

        except asyncio.TimeoutError:
            # Drop the pending entry so a late response is ignored
            self.pending_requests.pop(request_id, None)
            raise RuntimeError("Timeout waiting for librarian response")

    async def fetch_document_metadata(self, document_id, user):
        """Fetch document metadata from the librarian."""
        request = LibrarianRequest(
            operation="get-document-metadata",
            document_id=document_id,
            user=user,
        )
        response = await self._librarian_request(request)
        return response.document_metadata

    async def fetch_document_content(self, document_id, user):
        """Fetch document content from the librarian."""
        request = LibrarianRequest(
            operation="get-document-content",
            document_id=document_id,
            user=user,
        )
        response = await self._librarian_request(request)
        return response.content

    async def save_child_document(self, doc_id, parent_id, user, content,
                                  document_type="page", title=None,
                                  kind="text/plain"):
        """Save a child document to the librarian."""
        if isinstance(content, str):
            content = content.encode("utf-8")

        doc_metadata = DocumentMetadata(
            id=doc_id,
            user=user,
            kind=kind,
            title=title or doc_id,
            parent_id=parent_id,
            document_type=document_type,
        )

        # Content travels base64-encoded inside the request
        request = LibrarianRequest(
            operation="add-child-document",
            document_metadata=doc_metadata,
            content=base64.b64encode(content).decode("utf-8"),
        )

        await self._librarian_request(request)
        return doc_id

    def extract_elements(self, blob, mime_type=None):
        """
        Extract elements from a document using unstructured.

        Args:
            blob: Raw document bytes
            mime_type: Optional mime type hint

        Returns:
            List of unstructured Element objects
        """
        # Determine file extension for unstructured
        suffix = MIME_EXTENSIONS.get(mime_type, "") if mime_type else ""
        if not suffix:
            suffix = ".bin"

        # Write to a named temp file so partition() can sniff the type;
        # delete=False because we re-open it by name and unlink manually.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=suffix
        ) as fp:
            fp.write(blob)
            temp_path = fp.name

        try:
            kwargs = {
                "filename": temp_path,
                "strategy": self.partition_strategy,
                "languages": self.languages,
            }

            # For hi_res strategy, request image extraction
            if self.partition_strategy == "hi_res":
                kwargs["extract_image_block_to_payload"] = True

            elements = partition(**kwargs)

            logger.info(
                f"Extracted {len(elements)} elements "
                f"(strategy: {self.partition_strategy})"
            )

            return elements

        finally:
            # Best-effort cleanup of the temp file
            try:
                os.unlink(temp_path)
            except OSError:
                pass

    def group_by_page(self, elements):
        """
        Group elements by page number.

        Elements without page metadata are assigned to page 1.

        Returns list of (page_number, elements) tuples, sorted by page.
        """
        pages = {}

        for el in elements:
            page_num = getattr(
                el.metadata, 'page_number', None
            ) if hasattr(el, 'metadata') else None
            if page_num is None:
                page_num = 1
            if page_num not in pages:
                pages[page_num] = []
            pages[page_num].append(el)

        return sorted(pages.items())

    async def emit_section(self, elements, parent_doc_id, doc_uri_str,
                           metadata, flow, mime_type=None,
                           page_number=None, section_index=None):
        """
        Process a group of elements as a page or section.

        Assembles text, saves to librarian, emits provenance, sends
        TextDocument downstream. Returns the entity URI, or None when
        the assembled text is empty.
        """
        text, element_types, table_count, image_count = (
            assemble_section_text(elements)
        )

        if not text.strip():
            logger.debug("Skipping empty section")
            return None

        # A non-None page_number marks this group as a page
        is_page = page_number is not None
        char_length = len(text)

        if is_page:
            entity_uri = make_page_uri()
            label = f"Page {page_number}"
        else:
            entity_uri = make_section_uri()
            label = f"Section {section_index}" if section_index else "Section"

        # The entity URI doubles as the librarian document id
        doc_id = entity_uri
        page_content = text.encode("utf-8")

        # Save to librarian
        await self.save_child_document(
            doc_id=doc_id,
            parent_id=parent_doc_id,
            user=metadata.user,
            content=page_content,
            document_type="page" if is_page else "section",
            title=label,
        )

        # Emit provenance triples
        element_types_str = ",".join(sorted(element_types)) if element_types else None

        prov_triples = derived_entity_triples(
            entity_uri=entity_uri,
            parent_uri=doc_uri_str,
            component_name=COMPONENT_NAME,
            component_version=COMPONENT_VERSION,
            label=label,
            page_number=page_number,
            section=not is_page,
            char_length=char_length,
            mime_type=mime_type,
            element_types=element_types_str,
            table_count=table_count if table_count > 0 else None,
            image_count=image_count if image_count > 0 else None,
        )

        await flow("triples").send(Triples(
            metadata=Metadata(
                id=entity_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=set_graph(prov_triples, GRAPH_SOURCE),
        ))

        # Send TextDocument downstream (chunker will fetch from librarian)
        r = TextDocument(
            metadata=Metadata(
                id=entity_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            document_id=doc_id,
            # Empty body by design: the content lives in the librarian
            text=b"",
        )

        await flow("output").send(r)

        return entity_uri

    async def emit_image(self, element, parent_uri, parent_doc_id,
                         metadata, flow, mime_type=None, page_number=None):
        """
        Store an image element in the librarian with provenance.

        Images are stored but NOT sent downstream to the text pipeline.
        """
        img_uri = make_image_uri()

        # Get image data
        img_data = None
        if hasattr(element, 'metadata'):
            img_data = getattr(element.metadata, 'image_base64', None)

        if not img_data:
            # No image payload available, just record provenance
            logger.debug("Image element without payload, recording provenance only")
            img_content = b""
            img_kind = "image/unknown"
        else:
            if isinstance(img_data, str):
                img_content = base64.b64decode(img_data)
            else:
                img_content = img_data
            img_kind = "image/png"  # unstructured typically extracts as PNG

        # Save to librarian
        if img_content:
            await self.save_child_document(
                doc_id=img_uri,
                parent_id=parent_doc_id,
                user=metadata.user,
                content=img_content,
                document_type="image",
                title=f"Image from page {page_number}" if page_number else "Image",
                kind=img_kind,
            )

        # Emit provenance triples
        prov_triples = derived_entity_triples(
            entity_uri=img_uri,
            parent_uri=parent_uri,
            component_name=COMPONENT_NAME,
            component_version=COMPONENT_VERSION,
            label=f"Image from page {page_number}" if page_number else "Image",
            image=True,
            page_number=page_number,
            mime_type=mime_type,
        )

        await flow("triples").send(Triples(
            metadata=Metadata(
                id=img_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=set_graph(prov_triples, GRAPH_SOURCE),
        ))

    async def on_message(self, msg, consumer, flow):
        """
        Decode one incoming Document message.

        Resolves the document bytes (librarian fetch or inline base64),
        partitions them with unstructured, then emits pages or sections
        plus any extracted images.
        """

        logger.debug("Document message received")

        v = msg.value()

        logger.info(f"Decoding {v.metadata.id}...")

        # Determine content and mime type
        mime_type = None

        if v.document_id:
            # Librarian path: fetch metadata then content
            logger.info(
                f"Fetching document {v.document_id} from librarian..."
            )

            doc_meta = await self.fetch_document_metadata(
                document_id=v.document_id,
                user=v.metadata.user,
            )
            # Librarian "kind" carries the mime type
            mime_type = doc_meta.kind if doc_meta else None

            content = await self.fetch_document_content(
                document_id=v.document_id,
                user=v.metadata.user,
            )

            if isinstance(content, str):
                content = content.encode('utf-8')
            blob = base64.b64decode(content)

            logger.info(
                f"Fetched {len(blob)} bytes, mime: {mime_type}"
            )
        else:
            # Inline path: detect format from content
            blob = base64.b64decode(v.data)
            try:
                mime_type = magic.from_buffer(blob, mime=True)
                logger.info(f"Detected mime type: {mime_type}")
            except Exception as e:
                # Detection failure is non-fatal; unstructured can
                # still auto-detect from the file itself
                logger.warning(f"Could not detect mime type: {e}")

        # Get the source document ID
        source_doc_id = v.document_id or v.metadata.id
        doc_uri_str = document_uri(source_doc_id)

        # Extract elements using unstructured
        elements = self.extract_elements(blob, mime_type)

        if not elements:
            logger.warning("No elements extracted from document")
            return

        # Determine if this is a page-based format
        is_page_based = mime_type in PAGE_BASED_FORMATS if mime_type else False

        # Also check if elements actually have page numbers
        if not is_page_based:
            has_pages = any(
                getattr(el.metadata, 'page_number', None) is not None
                for el in elements
                if hasattr(el, 'metadata')
            )
            if has_pages:
                is_page_based = True

        if is_page_based:
            # Group by page
            page_groups = self.group_by_page(elements)

            for page_num, page_elements in page_groups:

                # Extract and store images separately
                image_elements = [
                    el for el in page_elements
                    if getattr(el, 'category', '') == 'Image'
                ]
                text_elements = [
                    el for el in page_elements
                    if getattr(el, 'category', '') != 'Image'
                ]

                # Emit the page as a text section
                page_uri_str = await self.emit_section(
                    text_elements, source_doc_id, doc_uri_str,
                    v.metadata, flow,
                    mime_type=mime_type, page_number=page_num,
                )

                # Store images (not sent to text pipeline); parent is
                # the page when it was emitted, the document otherwise
                for img_el in image_elements:
                    await self.emit_image(
                        img_el,
                        page_uri_str or doc_uri_str,
                        source_doc_id,
                        v.metadata, flow,
                        mime_type=mime_type, page_number=page_num,
                    )

        else:
            # Non-page format: use section strategy

            # Separate images from text elements
            image_elements = [
                el for el in elements
                if getattr(el, 'category', '') == 'Image'
            ]
            text_elements = [
                el for el in elements
                if getattr(el, 'category', '') != 'Image'
            ]

            # Apply section strategy to text elements
            strategy_kwargs = {
                'element_count': self.section_element_count,
                'max_size': self.section_max_size,
            }
            groups = self.section_strategy(text_elements, **strategy_kwargs)

            for idx, group in enumerate(groups):
                # Section indices are 1-based for labelling
                section_idx = idx + 1

                await self.emit_section(
                    group, source_doc_id, doc_uri_str,
                    v.metadata, flow,
                    mime_type=mime_type, section_index=section_idx,
                )

            # Store images (not sent to text pipeline)
            for img_el in image_elements:
                await self.emit_image(
                    img_el, doc_uri_str, source_doc_id,
                    v.metadata, flow,
                    mime_type=mime_type,
                )

        logger.info("Document decoding complete")

    @staticmethod
    def add_args(parser):
        """Register this processor's CLI arguments on *parser*."""

        FlowProcessor.add_args(parser)

        parser.add_argument(
            '--strategy',
            default='auto',
            choices=['auto', 'hi_res', 'fast'],
            help='Partitioning strategy (default: auto)',
        )

        parser.add_argument(
            '--languages',
            default='eng',
            help='Comma-separated OCR language codes (default: eng)',
        )

        parser.add_argument(
            '--section-strategy',
            default='whole-document',
            choices=[
                'whole-document', 'heading', 'element-type', 'count', 'size'
            ],
            help='Section grouping strategy for non-page formats '
                 '(default: whole-document)',
        )

        parser.add_argument(
            '--section-element-count',
            type=int,
            default=20,
            help='Elements per section for count strategy (default: 20)',
        )

        parser.add_argument(
            '--section-max-size',
            type=int,
            default=4000,
            help='Max chars per section for size strategy (default: 4000)',
        )

        parser.add_argument(
            '--section-within-pages',
            action='store_true',
            default=False,
            help='Apply section strategy within pages too (default: false)',
        )

        parser.add_argument(
            '--librarian-request-queue',
            default=default_librarian_request_queue,
            help=f'Librarian request queue '
                 f'(default: {default_librarian_request_queue})',
        )

        parser.add_argument(
            '--librarian-response-queue',
            default=default_librarian_response_queue,
            help=f'Librarian response queue '
                 f'(default: {default_librarian_response_queue})',
        )
|
|
706
|
+
|
|
707
|
+
|
|
708
|
+
def run():
    # Console-script entry point (wired up as "universal-decoder" in
    # pyproject's [project.scripts]); launches the processor with the
    # module docstring as its description.

    Processor.launch(default_ident, __doc__)
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
Section grouping strategies for the universal document decoder.
|
|
4
|
+
|
|
5
|
+
Each strategy takes a list of unstructured elements and returns a list
|
|
6
|
+
of element groups. Each group becomes one TextDocument output.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def group_whole_document(elements, **kwargs):
    """
    Return the full element list as one single group.

    All splitting is deferred to the downstream chunker; an empty
    input yields no groups at all.
    """
    return [elements] if elements else []
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def group_by_heading(elements, **kwargs):
    """
    Partition elements at 'Title' headings.

    A section is a heading followed by everything up to the next
    heading; any content preceding the first heading forms its own
    leading section. When the document has no headings at all, this
    behaves like the whole-document strategy.
    """
    if not elements:
        return []

    if all(getattr(el, 'category', '') != 'Title' for el in elements):
        logger.debug("No headings found, falling back to whole-document")
        return group_whole_document(elements)

    groups = []
    section = []

    for el in elements:
        if section and getattr(el, 'category', '') == 'Title':
            groups.append(section)
            section = []
        section.append(el)

    # elements is non-empty, so the trailing section always has content
    groups.append(section)

    return groups
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def group_by_element_type(elements, **kwargs):
    """
    Split wherever the stream switches between table and non-table
    content.

    Consecutive elements on the same side of that divide stay
    together, so every group is either all tables or all non-tables.
    """
    if not elements:
        return []

    groups = []
    run = [elements[0]]
    prev_table = getattr(elements[0], 'category', '') == 'Table'

    for el in elements[1:]:
        now_table = getattr(el, 'category', '') == 'Table'
        if now_table != prev_table:
            groups.append(run)
            run = []
        run.append(el)
        prev_table = now_table

    # The final run is never empty
    groups.append(run)

    return groups
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def group_by_count(elements, element_count=20, **kwargs):
    """
    Group a fixed number of elements per section.

    Args:
        elements: List of unstructured elements
        element_count: Number of elements per group (default: 20)

    Returns:
        List of element groups; the final group may be shorter than
        element_count.

    Raises:
        ValueError: If element_count is less than 1.
    """
    # Previously element_count=0 surfaced as an opaque
    # "range() arg 3 must not be zero", and negative values silently
    # produced an empty result; fail fast with a clear message instead.
    if element_count < 1:
        raise ValueError(
            f"element_count must be >= 1, got {element_count}"
        )

    if not elements:
        return []

    # Slice into fixed-size chunks at element boundaries.
    return [
        elements[i:i + element_count]
        for i in range(0, len(elements), element_count)
    ]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def group_by_size(elements, max_size=4000, **kwargs):
    """
    Accumulate elements until a character limit is reached.

    Respects element boundaries — never splits mid-element. If a
    single element exceeds the limit, it becomes its own section.

    Args:
        elements: List of unstructured elements
        max_size: Max characters per section (default: 4000)

    Returns:
        List of element groups, each within the character budget where
        possible.
    """
    if not elements:
        return []

    sections = []
    buffer = []
    buffered_chars = 0

    for element in elements:
        # Elements may carry a None text attribute; treat it as empty.
        length = len(getattr(element, 'text', '') or '')

        # Flush the buffer if adding this element would exceed the
        # budget — but only if the buffer is non-empty, so an oversized
        # element still lands in a section of its own.
        if buffer and buffered_chars + length > max_size:
            sections.append(buffer)
            buffer = []
            buffered_chars = 0

        buffer.append(element)
        buffered_chars += length

    if buffer:
        sections.append(buffer)

    return sections
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Strategy registry
# Maps strategy names (as passed to get_strategy) to grouping functions.
# Each function takes (elements, **kwargs) and returns a list of element
# groups. Order here determines the order of names in get_strategy's
# error message.
STRATEGIES = {
    'whole-document': group_whole_document,  # one section for the entire document
    'heading': group_by_heading,             # split at Title elements
    'element-type': group_by_element_type,   # split on text/table transitions
    'count': group_by_count,                 # fixed number of elements per section
    'size': group_by_size,                   # character budget per section
}
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def get_strategy(name):
    """
    Get a section grouping strategy by name.

    Args:
        name: Strategy name (whole-document, heading, element-type, count, size)

    Returns:
        Strategy function

    Raises:
        ValueError: If strategy name is not recognized
    """
    try:
        return STRATEGIES[name]
    except KeyError:
        # Suppress the KeyError context so callers see a clean ValueError,
        # matching the original membership-test formulation.
        raise ValueError(
            f"Unknown section strategy: {name}. "
            f"Available: {', '.join(STRATEGIES.keys())}"
        ) from None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version of the trustgraph-unstructured distribution — presumably kept in
# sync with pyproject.toml by the release process; verify before bumping.
__version__ = "2.2.1"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: trustgraph-unstructured
|
|
3
|
+
Version: 2.2.1
|
|
4
|
+
Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
|
|
5
|
+
Author-email: "trustgraph.ai" <security@trustgraph.ai>
|
|
6
|
+
Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Operating System :: OS Independent
|
|
9
|
+
Requires-Python: >=3.8
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: trustgraph-base<2.3,>=2.2
|
|
12
|
+
Requires-Dist: pulsar-client
|
|
13
|
+
Requires-Dist: prometheus-client
|
|
14
|
+
Requires-Dist: python-magic
|
|
15
|
+
Requires-Dist: unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
trustgraph/unstructured_version.py
|
|
3
|
+
trustgraph/decoding/universal/__init__.py
|
|
4
|
+
trustgraph/decoding/universal/__main__.py
|
|
5
|
+
trustgraph/decoding/universal/processor.py
|
|
6
|
+
trustgraph/decoding/universal/strategies.py
|
|
7
|
+
trustgraph_unstructured.egg-info/PKG-INFO
|
|
8
|
+
trustgraph_unstructured.egg-info/SOURCES.txt
|
|
9
|
+
trustgraph_unstructured.egg-info/dependency_links.txt
|
|
10
|
+
trustgraph_unstructured.egg-info/entry_points.txt
|
|
11
|
+
trustgraph_unstructured.egg-info/requires.txt
|
|
12
|
+
trustgraph_unstructured.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
trustgraph
|