trustgraph-unstructured 2.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: trustgraph-unstructured
3
+ Version: 2.2.1
4
+ Summary: TrustGraph provides a flexible means to run a pipeline of AI processing components.
5
+ Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
+ Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: trustgraph-base<2.3,>=2.2
12
+ Requires-Dist: pulsar-client
13
+ Requires-Dist: prometheus-client
14
+ Requires-Dist: python-magic
15
+ Requires-Dist: unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "trustgraph-unstructured"
7
+ dynamic = ["version"]
8
+ authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
9
+ description = "TrustGraph provides a flexible means to run a pipeline of AI processing components."
10
+ readme = "README.md"
11
+ requires-python = ">=3.8"
12
+ dependencies = [
13
+ "trustgraph-base>=2.2,<2.3",
14
+ "pulsar-client",
15
+ "prometheus-client",
16
+ "python-magic",
17
+ "unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]",
18
+ ]
19
+ classifiers = [
20
+ "Programming Language :: Python :: 3",
21
+ "Operating System :: OS Independent",
22
+ ]
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/trustgraph-ai/trustgraph"
26
+
27
+ [project.scripts]
28
+ universal-decoder = "trustgraph.decoding.universal:run"
29
+
30
+ [tool.setuptools.packages.find]
31
+ include = ["trustgraph*"]
32
+
33
+ [tool.setuptools.dynamic]
34
+ version = {attr = "trustgraph.unstructured_version.__version__"}
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,2 @@
1
+
2
+ from . processor import *
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from . processor import run
4
+
5
+ if __name__ == '__main__':
6
+ run()
@@ -0,0 +1,710 @@
1
+
2
+ """
3
+ Universal document decoder powered by the unstructured library.
4
+
5
+ Accepts documents in any common format (PDF, DOCX, XLSX, HTML, Markdown,
6
+ plain text, PPTX, etc.) on input, outputs pages or sections as text
7
+ as separate output objects.
8
+
9
+ Supports both inline document data and fetching from librarian via Pulsar
10
+ for large documents. Fetches document metadata from the librarian to
11
+ determine mime type for format detection.
12
+
13
+ Tables are preserved as HTML markup for better downstream extraction.
14
+ Images are stored in the librarian but not sent to the text pipeline.
15
+ """
16
+
17
+ import asyncio
18
+ import base64
19
+ import logging
20
+ import magic
21
+ import tempfile
22
+ import os
23
+ import uuid
24
+
25
+ from unstructured.partition.auto import partition
26
+
27
+ from ... schema import Document, TextDocument, Metadata
28
+ from ... schema import LibrarianRequest, LibrarianResponse, DocumentMetadata
29
+ from ... schema import librarian_request_queue, librarian_response_queue
30
+ from ... schema import Triples
31
+ from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
32
+ from ... base import Consumer, Producer, ConsumerMetrics, ProducerMetrics
33
+
34
+ from ... provenance import (
35
+ document_uri, page_uri as make_page_uri,
36
+ section_uri as make_section_uri, image_uri as make_image_uri,
37
+ derived_entity_triples, set_graph, GRAPH_SOURCE,
38
+ )
39
+
40
+ from . strategies import get_strategy
41
+
42
# Component identification for provenance
COMPONENT_NAME = "universal-decoder"
COMPONENT_VERSION = "1.0.0"

# Module logger
logger = logging.getLogger(__name__)

# Default processor identity, used when no "id" param is supplied
default_ident = "document-decoder"

# Default Pulsar queue names for the librarian client
default_librarian_request_queue = librarian_request_queue
default_librarian_response_queue = librarian_response_queue

# Mime type to file-extension mapping.
# unstructured auto-detects most formats, but a correct filename suffix
# on the temp file acts as a format hint when the mime type is known.
MIME_EXTENSIONS = {
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/vnd.ms-excel": ".xls",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
    "text/html": ".html",
    "text/markdown": ".md",
    "text/plain": ".txt",
    "text/csv": ".csv",
    "text/tab-separated-values": ".tsv",
    "application/rtf": ".rtf",
    "text/x-rst": ".rst",
    "application/vnd.oasis.opendocument.text": ".odt",
}

# Formats that have natural page boundaries; these are split per page
# rather than via the configurable section strategy
PAGE_BASED_FORMATS = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.ms-excel",
}
79
+
80
+
81
def assemble_section_text(elements):
    """
    Assemble text from a list of unstructured elements.

    - Text elements: plain text, joined with double newlines
    - Table elements: HTML table markup from text_as_html
    - Image elements: skipped (stored separately, not in text output)

    Returns:
        tuple: (assembled_text, element_types_set, table_count, image_count)
    """
    chunks = []
    seen_categories = set()
    n_tables = 0
    n_images = 0

    for element in elements:
        kind = getattr(element, 'category', 'UncategorizedText')
        seen_categories.add(kind)

        if kind == 'Image':
            # Images never contribute to the text output
            n_images += 1
            continue

        piece = None

        if kind == 'Table':
            n_tables += 1
            # Tables keep their HTML markup when available
            meta = getattr(element, 'metadata', None)
            if meta is not None:
                piece = getattr(meta, 'text_as_html', None)

        if not piece:
            # Plain text (also the fallback for tables without HTML)
            piece = getattr(element, 'text', '') or ''

        if piece:
            chunks.append(piece)

    return '\n\n'.join(chunks), seen_categories, n_tables, n_images
122
+
123
+
124
+ class Processor(FlowProcessor):
125
+
126
    def __init__(self, **params):
        """
        Initialise the universal decoder.

        Recognised params (all optional):
            id: processor identity (default: "document-decoder")
            strategy: unstructured partition strategy (auto/hi_res/fast)
            languages: comma-separated OCR language codes
            section_strategy: grouping strategy for non-page formats
            section_element_count: elements per section ("count" strategy)
            section_max_size: max chars per section ("size" strategy)
            section_within_pages: apply section strategy inside pages too
            librarian_request_queue / librarian_response_queue: Pulsar
                queues used by the librarian client
        """

        id = params.get("id", default_ident)

        # Partitioning / sectioning configuration
        self.partition_strategy = params.get("strategy", "auto")
        self.languages = params.get("languages", "eng").split(",")
        self.section_strategy_name = params.get(
            "section_strategy", "whole-document"
        )
        self.section_element_count = params.get("section_element_count", 20)
        self.section_max_size = params.get("section_max_size", 4000)
        self.section_within_pages = params.get("section_within_pages", False)

        # Resolve the strategy name to a callable; raises ValueError on an
        # unknown name, so bad config fails fast at construction time
        self.section_strategy = get_strategy(self.section_strategy_name)

        super(Processor, self).__init__(
            **params | {
                "id": id,
            }
        )

        # Incoming documents to decode
        self.register_specification(
            ConsumerSpec(
                name="input",
                schema=Document,
                handler=self.on_message,
            )
        )

        # Decoded page/section text for the downstream chunker
        self.register_specification(
            ProducerSpec(
                name="output",
                schema=TextDocument,
            )
        )

        # Provenance triples describing derived pages/sections/images
        self.register_specification(
            ProducerSpec(
                name="triples",
                schema=Triples,
            )
        )

        # Librarian client for fetching/storing document content
        librarian_request_q = params.get(
            "librarian_request_queue", default_librarian_request_queue
        )
        librarian_response_q = params.get(
            "librarian_response_queue", default_librarian_response_queue
        )

        librarian_request_metrics = ProducerMetrics(
            processor=id, flow=None, name="librarian-request"
        )

        self.librarian_request_producer = Producer(
            backend=self.pubsub,
            topic=librarian_request_q,
            schema=LibrarianRequest,
            metrics=librarian_request_metrics,
        )

        librarian_response_metrics = ConsumerMetrics(
            processor=id, flow=None, name="librarian-response"
        )

        self.librarian_response_consumer = Consumer(
            taskgroup=self.taskgroup,
            backend=self.pubsub,
            flow=None,
            topic=librarian_response_q,
            subscriber=f"{id}-librarian",
            schema=LibrarianResponse,
            handler=self.on_librarian_response,
            metrics=librarian_response_metrics,
        )

        # Pending librarian requests: request_id -> asyncio.Future,
        # resolved by on_librarian_response
        self.pending_requests = {}

        logger.info("Universal decoder initialized")
207
+
208
    async def start(self):
        """Start the processor, then its private librarian producer/consumer."""
        # Base class starts the registered flow consumers/producers first
        await super(Processor, self).start()
        await self.librarian_request_producer.start()
        await self.librarian_response_consumer.start()
212
+
213
+ async def on_librarian_response(self, msg, consumer, flow):
214
+ """Handle responses from the librarian service."""
215
+ response = msg.value()
216
+ request_id = msg.properties().get("id")
217
+
218
+ if request_id and request_id in self.pending_requests:
219
+ future = self.pending_requests.pop(request_id)
220
+ future.set_result(response)
221
+
222
+ async def _librarian_request(self, request, timeout=120):
223
+ """Send a request to the librarian and wait for response."""
224
+ request_id = str(uuid.uuid4())
225
+
226
+ future = asyncio.get_event_loop().create_future()
227
+ self.pending_requests[request_id] = future
228
+
229
+ try:
230
+ await self.librarian_request_producer.send(
231
+ request, properties={"id": request_id}
232
+ )
233
+ response = await asyncio.wait_for(future, timeout=timeout)
234
+
235
+ if response.error:
236
+ raise RuntimeError(
237
+ f"Librarian error: {response.error.type}: "
238
+ f"{response.error.message}"
239
+ )
240
+
241
+ return response
242
+
243
+ except asyncio.TimeoutError:
244
+ self.pending_requests.pop(request_id, None)
245
+ raise RuntimeError("Timeout waiting for librarian response")
246
+
247
+ async def fetch_document_metadata(self, document_id, user):
248
+ """Fetch document metadata from the librarian."""
249
+ request = LibrarianRequest(
250
+ operation="get-document-metadata",
251
+ document_id=document_id,
252
+ user=user,
253
+ )
254
+ response = await self._librarian_request(request)
255
+ return response.document_metadata
256
+
257
+ async def fetch_document_content(self, document_id, user):
258
+ """Fetch document content from the librarian."""
259
+ request = LibrarianRequest(
260
+ operation="get-document-content",
261
+ document_id=document_id,
262
+ user=user,
263
+ )
264
+ response = await self._librarian_request(request)
265
+ return response.content
266
+
267
+ async def save_child_document(self, doc_id, parent_id, user, content,
268
+ document_type="page", title=None,
269
+ kind="text/plain"):
270
+ """Save a child document to the librarian."""
271
+ if isinstance(content, str):
272
+ content = content.encode("utf-8")
273
+
274
+ doc_metadata = DocumentMetadata(
275
+ id=doc_id,
276
+ user=user,
277
+ kind=kind,
278
+ title=title or doc_id,
279
+ parent_id=parent_id,
280
+ document_type=document_type,
281
+ )
282
+
283
+ request = LibrarianRequest(
284
+ operation="add-child-document",
285
+ document_metadata=doc_metadata,
286
+ content=base64.b64encode(content).decode("utf-8"),
287
+ )
288
+
289
+ await self._librarian_request(request)
290
+ return doc_id
291
+
292
+ def extract_elements(self, blob, mime_type=None):
293
+ """
294
+ Extract elements from a document using unstructured.
295
+
296
+ Args:
297
+ blob: Raw document bytes
298
+ mime_type: Optional mime type hint
299
+
300
+ Returns:
301
+ List of unstructured Element objects
302
+ """
303
+ # Determine file extension for unstructured
304
+ suffix = MIME_EXTENSIONS.get(mime_type, "") if mime_type else ""
305
+ if not suffix:
306
+ suffix = ".bin"
307
+
308
+ with tempfile.NamedTemporaryFile(
309
+ delete=False, suffix=suffix
310
+ ) as fp:
311
+ fp.write(blob)
312
+ temp_path = fp.name
313
+
314
+ try:
315
+ kwargs = {
316
+ "filename": temp_path,
317
+ "strategy": self.partition_strategy,
318
+ "languages": self.languages,
319
+ }
320
+
321
+ # For hi_res strategy, request image extraction
322
+ if self.partition_strategy == "hi_res":
323
+ kwargs["extract_image_block_to_payload"] = True
324
+
325
+ elements = partition(**kwargs)
326
+
327
+ logger.info(
328
+ f"Extracted {len(elements)} elements "
329
+ f"(strategy: {self.partition_strategy})"
330
+ )
331
+
332
+ return elements
333
+
334
+ finally:
335
+ try:
336
+ os.unlink(temp_path)
337
+ except OSError:
338
+ pass
339
+
340
+ def group_by_page(self, elements):
341
+ """
342
+ Group elements by page number.
343
+
344
+ Returns list of (page_number, elements) tuples.
345
+ """
346
+ pages = {}
347
+
348
+ for el in elements:
349
+ page_num = getattr(
350
+ el.metadata, 'page_number', None
351
+ ) if hasattr(el, 'metadata') else None
352
+ if page_num is None:
353
+ page_num = 1
354
+ if page_num not in pages:
355
+ pages[page_num] = []
356
+ pages[page_num].append(el)
357
+
358
+ return sorted(pages.items())
359
+
360
    async def emit_section(self, elements, parent_doc_id, doc_uri_str,
                           metadata, flow, mime_type=None,
                           page_number=None, section_index=None):
        """
        Process a group of elements as a page or section.

        Assembles text, saves to librarian, emits provenance, sends
        TextDocument downstream.

        Args:
            elements: unstructured elements making up this page/section
            parent_doc_id: librarian ID of the source document
            doc_uri_str: provenance URI of the source document
            metadata: Metadata from the incoming message
            flow: flow accessor for the "output" and "triples" producers
            mime_type: source document mime type, if known
            page_number: set when emitting a page (page mode)
            section_index: 1-based index when emitting a section

        Returns:
            The new entity URI, or None if the section had no text.
        """
        text, element_types, table_count, image_count = (
            assemble_section_text(elements)
        )

        # Nothing useful to emit (e.g. a page containing only images)
        if not text.strip():
            logger.debug("Skipping empty section")
            return None

        is_page = page_number is not None
        char_length = len(text)

        if is_page:
            entity_uri = make_page_uri()
            label = f"Page {page_number}"
        else:
            entity_uri = make_section_uri()
            label = f"Section {section_index}" if section_index else "Section"

        # The entity URI doubles as the librarian document ID
        doc_id = entity_uri
        page_content = text.encode("utf-8")

        # Save to librarian
        await self.save_child_document(
            doc_id=doc_id,
            parent_id=parent_doc_id,
            user=metadata.user,
            content=page_content,
            document_type="page" if is_page else "section",
            title=label,
        )

        # Emit provenance triples describing the derived page/section
        element_types_str = ",".join(sorted(element_types)) if element_types else None

        prov_triples = derived_entity_triples(
            entity_uri=entity_uri,
            parent_uri=doc_uri_str,
            component_name=COMPONENT_NAME,
            component_version=COMPONENT_VERSION,
            label=label,
            page_number=page_number,
            section=not is_page,
            char_length=char_length,
            mime_type=mime_type,
            element_types=element_types_str,
            table_count=table_count if table_count > 0 else None,
            image_count=image_count if image_count > 0 else None,
        )

        await flow("triples").send(Triples(
            metadata=Metadata(
                id=entity_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=set_graph(prov_triples, GRAPH_SOURCE),
        ))

        # Send TextDocument downstream (chunker will fetch from librarian;
        # the text payload is deliberately empty in the message itself)
        r = TextDocument(
            metadata=Metadata(
                id=entity_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            document_id=doc_id,
            text=b"",
        )

        await flow("output").send(r)

        return entity_uri
443
+
444
    async def emit_image(self, element, parent_uri, parent_doc_id,
                         metadata, flow, mime_type=None, page_number=None):
        """
        Store an image element in the librarian with provenance.

        Images are stored but NOT sent downstream to the text pipeline.

        Args:
            element: the unstructured Image element
            parent_uri: provenance URI of the page/document it came from
            parent_doc_id: librarian ID of the source document
            metadata: Metadata from the incoming message
            flow: flow accessor for the "triples" producer
            mime_type: source document mime type, if known
            page_number: page the image appeared on, if known
        """
        img_uri = make_image_uri()

        # Get image data (only present when partitioning requested
        # extract_image_block_to_payload, i.e. the hi_res strategy)
        img_data = None
        if hasattr(element, 'metadata'):
            img_data = getattr(element.metadata, 'image_base64', None)

        if not img_data:
            # No image payload available, just record provenance
            logger.debug("Image element without payload, recording provenance only")
            img_content = b""
            img_kind = "image/unknown"
        else:
            if isinstance(img_data, str):
                img_content = base64.b64decode(img_data)
            else:
                img_content = img_data
            img_kind = "image/png"  # unstructured typically extracts as PNG

        # Save to librarian (skipped when there was no payload)
        if img_content:
            await self.save_child_document(
                doc_id=img_uri,
                parent_id=parent_doc_id,
                user=metadata.user,
                content=img_content,
                document_type="image",
                title=f"Image from page {page_number}" if page_number else "Image",
                kind=img_kind,
            )

        # Emit provenance triples (even when no payload was stored)
        prov_triples = derived_entity_triples(
            entity_uri=img_uri,
            parent_uri=parent_uri,
            component_name=COMPONENT_NAME,
            component_version=COMPONENT_VERSION,
            label=f"Image from page {page_number}" if page_number else "Image",
            image=True,
            page_number=page_number,
            mime_type=mime_type,
        )

        await flow("triples").send(Triples(
            metadata=Metadata(
                id=img_uri,
                root=metadata.root,
                user=metadata.user,
                collection=metadata.collection,
            ),
            triples=set_graph(prov_triples, GRAPH_SOURCE),
        ))
503
+
504
    async def on_message(self, msg, consumer, flow):
        """
        Decode one incoming Document message.

        Resolves the raw bytes (inline or via the librarian), partitions
        them with unstructured, then emits pages (page-based formats) or
        sections (other formats) plus any extracted images.
        """

        logger.debug("Document message received")

        v = msg.value()

        logger.info(f"Decoding {v.metadata.id}...")

        # Determine content and mime type
        mime_type = None

        if v.document_id:
            # Librarian path: fetch metadata then content
            logger.info(
                f"Fetching document {v.document_id} from librarian..."
            )

            doc_meta = await self.fetch_document_metadata(
                document_id=v.document_id,
                user=v.metadata.user,
            )
            mime_type = doc_meta.kind if doc_meta else None

            content = await self.fetch_document_content(
                document_id=v.document_id,
                user=v.metadata.user,
            )

            # Content arrives base64-encoded
            if isinstance(content, str):
                content = content.encode('utf-8')
            blob = base64.b64decode(content)

            logger.info(
                f"Fetched {len(blob)} bytes, mime: {mime_type}"
            )
        else:
            # Inline path: detect format from content
            blob = base64.b64decode(v.data)
            try:
                mime_type = magic.from_buffer(blob, mime=True)
                logger.info(f"Detected mime type: {mime_type}")
            except Exception as e:
                # Non-fatal: unstructured can still auto-detect from bytes
                logger.warning(f"Could not detect mime type: {e}")

        # Get the source document ID
        source_doc_id = v.document_id or v.metadata.id
        doc_uri_str = document_uri(source_doc_id)

        # Extract elements using unstructured
        elements = self.extract_elements(blob, mime_type)

        if not elements:
            logger.warning("No elements extracted from document")
            return

        # Determine if this is a page-based format
        is_page_based = mime_type in PAGE_BASED_FORMATS if mime_type else False

        # Also check if elements actually have page numbers
        if not is_page_based:
            has_pages = any(
                getattr(el.metadata, 'page_number', None) is not None
                for el in elements
                if hasattr(el, 'metadata')
            )
            if has_pages:
                is_page_based = True

        if is_page_based:
            # Group by page
            page_groups = self.group_by_page(elements)

            for page_num, page_elements in page_groups:

                # Extract and store images separately
                image_elements = [
                    el for el in page_elements
                    if getattr(el, 'category', '') == 'Image'
                ]
                text_elements = [
                    el for el in page_elements
                    if getattr(el, 'category', '') != 'Image'
                ]

                # Emit the page as a text section
                page_uri_str = await self.emit_section(
                    text_elements, source_doc_id, doc_uri_str,
                    v.metadata, flow,
                    mime_type=mime_type, page_number=page_num,
                )

                # Store images (not sent to text pipeline); fall back to
                # the document URI as parent when the page was empty
                for img_el in image_elements:
                    await self.emit_image(
                        img_el,
                        page_uri_str or doc_uri_str,
                        source_doc_id,
                        v.metadata, flow,
                        mime_type=mime_type, page_number=page_num,
                    )

        else:
            # Non-page format: use section strategy

            # Separate images from text elements
            image_elements = [
                el for el in elements
                if getattr(el, 'category', '') == 'Image'
            ]
            text_elements = [
                el for el in elements
                if getattr(el, 'category', '') != 'Image'
            ]

            # Apply section strategy to text elements
            strategy_kwargs = {
                'element_count': self.section_element_count,
                'max_size': self.section_max_size,
            }
            groups = self.section_strategy(text_elements, **strategy_kwargs)

            for idx, group in enumerate(groups):
                section_idx = idx + 1

                await self.emit_section(
                    group, source_doc_id, doc_uri_str,
                    v.metadata, flow,
                    mime_type=mime_type, section_index=section_idx,
                )

            # Store images (not sent to text pipeline)
            for img_el in image_elements:
                await self.emit_image(
                    img_el, doc_uri_str, source_doc_id,
                    v.metadata, flow,
                    mime_type=mime_type,
                )

        logger.info("Document decoding complete")
643
+
644
+ @staticmethod
645
+ def add_args(parser):
646
+
647
+ FlowProcessor.add_args(parser)
648
+
649
+ parser.add_argument(
650
+ '--strategy',
651
+ default='auto',
652
+ choices=['auto', 'hi_res', 'fast'],
653
+ help='Partitioning strategy (default: auto)',
654
+ )
655
+
656
+ parser.add_argument(
657
+ '--languages',
658
+ default='eng',
659
+ help='Comma-separated OCR language codes (default: eng)',
660
+ )
661
+
662
+ parser.add_argument(
663
+ '--section-strategy',
664
+ default='whole-document',
665
+ choices=[
666
+ 'whole-document', 'heading', 'element-type', 'count', 'size'
667
+ ],
668
+ help='Section grouping strategy for non-page formats '
669
+ '(default: whole-document)',
670
+ )
671
+
672
+ parser.add_argument(
673
+ '--section-element-count',
674
+ type=int,
675
+ default=20,
676
+ help='Elements per section for count strategy (default: 20)',
677
+ )
678
+
679
+ parser.add_argument(
680
+ '--section-max-size',
681
+ type=int,
682
+ default=4000,
683
+ help='Max chars per section for size strategy (default: 4000)',
684
+ )
685
+
686
+ parser.add_argument(
687
+ '--section-within-pages',
688
+ action='store_true',
689
+ default=False,
690
+ help='Apply section strategy within pages too (default: false)',
691
+ )
692
+
693
+ parser.add_argument(
694
+ '--librarian-request-queue',
695
+ default=default_librarian_request_queue,
696
+ help=f'Librarian request queue '
697
+ f'(default: {default_librarian_request_queue})',
698
+ )
699
+
700
+ parser.add_argument(
701
+ '--librarian-response-queue',
702
+ default=default_librarian_response_queue,
703
+ help=f'Librarian response queue '
704
+ f'(default: {default_librarian_response_queue})',
705
+ )
706
+
707
+
708
def run():
    """Console entry point: launch the universal decoder processor."""
    # __doc__ is the module docstring, used as the command help text
    Processor.launch(default_ident, __doc__)
@@ -0,0 +1,171 @@
1
+
2
+ """
3
+ Section grouping strategies for the universal document decoder.
4
+
5
+ Each strategy takes a list of unstructured elements and returns a list
6
+ of element groups. Each group becomes one TextDocument output.
7
+ """
8
+
9
+ import logging
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
def group_whole_document(elements, **kwargs):
    """
    Emit the entire document as a single section.

    The downstream chunker handles all splitting.
    """
    return [elements] if elements else []
23
+
24
+
25
def group_by_heading(elements, **kwargs):
    """
    Split at heading elements (Title category).

    Each section is a heading plus all content until the next heading.
    Falls back to whole-document if no headings are found.
    """
    if not elements:
        return []

    def is_heading(el):
        return getattr(el, 'category', '') == 'Title'

    # Without any headings there is nothing to split on
    if not any(is_heading(el) for el in elements):
        logger.debug("No headings found, falling back to whole-document")
        return group_whole_document(elements)

    sections = []
    section = []

    for el in elements:
        # A heading closes the previous section (unless it's the first)
        if is_heading(el) and section:
            sections.append(section)
            section = []
        section.append(el)

    if section:
        sections.append(section)

    return sections
57
+
58
+
59
def group_by_element_type(elements, **kwargs):
    """
    Split on transitions between narrative text and tables.

    Consecutive elements of the same broad category stay grouped.
    """
    groups = []
    run = []
    run_is_table = None

    for el in elements:
        el_is_table = getattr(el, 'category', '') == 'Table'

        # Category flipped: close out the current run
        if run and el_is_table != run_is_table:
            groups.append(run)
            run = []

        run.append(el)
        run_is_table = el_is_table

    if run:
        groups.append(run)

    return groups
87
+
88
+
89
def group_by_count(elements, element_count=20, **kwargs):
    """
    Group a fixed number of elements per section.

    Args:
        elements: List of unstructured elements
        element_count: Number of elements per group (default: 20)
    """
    if not elements:
        return []

    return [
        elements[start:start + element_count]
        for start in range(0, len(elements), element_count)
    ]
105
+
106
+
107
def group_by_size(elements, max_size=4000, **kwargs):
    """
    Accumulate elements until a character limit is reached.

    Respects element boundaries — never splits mid-element. If a
    single element exceeds the limit, it becomes its own section.

    Args:
        elements: List of unstructured elements
        max_size: Max characters per section (default: 4000)
    """
    sections = []
    bucket = []
    bucket_chars = 0

    for el in elements:
        length = len(getattr(el, 'text', '') or '')

        # Close the bucket when adding this element would overflow it
        if bucket and bucket_chars + length > max_size:
            sections.append(bucket)
            bucket = []
            bucket_chars = 0

        bucket.append(el)
        bucket_chars += length

    if bucket:
        sections.append(bucket)

    return sections
141
+
142
+
143
# Strategy registry
STRATEGIES = {
    'whole-document': group_whole_document,
    'heading': group_by_heading,
    'element-type': group_by_element_type,
    'count': group_by_count,
    'size': group_by_size,
}


def get_strategy(name):
    """
    Get a section grouping strategy by name.

    Args:
        name: Strategy name (whole-document, heading, element-type, count, size)

    Returns:
        Strategy function

    Raises:
        ValueError: If strategy name is not recognized
    """
    try:
        return STRATEGIES[name]
    except KeyError:
        raise ValueError(
            f"Unknown section strategy: {name}. "
            f"Available: {', '.join(STRATEGIES.keys())}"
        ) from None
@@ -0,0 +1 @@
1
+ __version__ = "2.2.1"
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.4
2
+ Name: trustgraph-unstructured
3
+ Version: 2.2.1
4
+ Summary: TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.
5
+ Author-email: "trustgraph.ai" <security@trustgraph.ai>
6
+ Project-URL: Homepage, https://github.com/trustgraph-ai/trustgraph
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.8
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: trustgraph-base<2.3,>=2.2
12
+ Requires-Dist: pulsar-client
13
+ Requires-Dist: prometheus-client
14
+ Requires-Dist: python-magic
15
+ Requires-Dist: unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]
@@ -0,0 +1,12 @@
1
+ pyproject.toml
2
+ trustgraph/unstructured_version.py
3
+ trustgraph/decoding/universal/__init__.py
4
+ trustgraph/decoding/universal/__main__.py
5
+ trustgraph/decoding/universal/processor.py
6
+ trustgraph/decoding/universal/strategies.py
7
+ trustgraph_unstructured.egg-info/PKG-INFO
8
+ trustgraph_unstructured.egg-info/SOURCES.txt
9
+ trustgraph_unstructured.egg-info/dependency_links.txt
10
+ trustgraph_unstructured.egg-info/entry_points.txt
11
+ trustgraph_unstructured.egg-info/requires.txt
12
+ trustgraph_unstructured.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ universal-decoder = trustgraph.decoding.universal:run
@@ -0,0 +1,5 @@
1
+ trustgraph-base<2.3,>=2.2
2
+ pulsar-client
3
+ prometheus-client
4
+ python-magic
5
+ unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]