flowllm 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81):
  1. flowllm/__init__.py +4 -3
  2. flowllm/app.py +1 -1
  3. flowllm/config/base.yaml +75 -0
  4. flowllm/config/fin_supply.yaml +39 -0
  5. flowllm/config/pydantic_config_parser.py +16 -1
  6. flowllm/context/__init__.py +2 -0
  7. flowllm/context/base_context.py +10 -20
  8. flowllm/context/flow_context.py +45 -2
  9. flowllm/context/service_context.py +69 -10
  10. flowllm/embedding_model/openai_compatible_embedding_model.py +1 -2
  11. flowllm/enumeration/chunk_enum.py +1 -0
  12. flowllm/flow/__init__.py +9 -0
  13. flowllm/flow/base_flow.py +44 -13
  14. flowllm/flow/expression/__init__.py +1 -0
  15. flowllm/flow/{parser → expression}/expression_parser.py +5 -2
  16. flowllm/flow/expression/expression_tool_flow.py +25 -0
  17. flowllm/flow/gallery/__init__.py +1 -8
  18. flowllm/flow/gallery/mock_tool_flow.py +46 -28
  19. flowllm/flow/tool_op_flow.py +97 -0
  20. flowllm/llm/base_llm.py +0 -2
  21. flowllm/op/__init__.py +3 -4
  22. flowllm/op/akshare/get_ak_a_code_op.py +1 -1
  23. flowllm/op/akshare/get_ak_a_info_op.py +1 -1
  24. flowllm/op/base_op.py +232 -16
  25. flowllm/op/base_tool_op.py +47 -0
  26. flowllm/op/gallery/__init__.py +0 -1
  27. flowllm/op/gallery/mock_op.py +13 -7
  28. flowllm/op/llm/__init__.py +3 -0
  29. flowllm/op/{agent/react_v2_op.py → llm/react_llm_op.py} +43 -24
  30. flowllm/op/llm/simple_llm_op.py +48 -0
  31. flowllm/op/llm/stream_llm_op.py +61 -0
  32. flowllm/op/mcp/__init__.py +2 -0
  33. flowllm/op/mcp/ant_op.py +42 -0
  34. flowllm/op/mcp/base_sse_mcp_op.py +28 -0
  35. flowllm/op/parallel_op.py +5 -1
  36. flowllm/op/search/__init__.py +1 -2
  37. flowllm/op/search/dashscope_search_op.py +73 -128
  38. flowllm/op/search/tavily_search_op.py +64 -82
  39. flowllm/op/sequential_op.py +4 -0
  40. flowllm/schema/flow_stream_chunk.py +11 -0
  41. flowllm/schema/service_config.py +8 -3
  42. flowllm/schema/tool_call.py +46 -1
  43. flowllm/service/__init__.py +0 -1
  44. flowllm/service/base_service.py +31 -14
  45. flowllm/service/http_service.py +45 -36
  46. flowllm/service/mcp_service.py +17 -23
  47. flowllm/storage/vector_store/__init__.py +1 -0
  48. flowllm/storage/vector_store/base_vector_store.py +99 -15
  49. flowllm/storage/vector_store/chroma_vector_store.py +250 -8
  50. flowllm/storage/vector_store/es_vector_store.py +288 -32
  51. flowllm/storage/vector_store/local_vector_store.py +206 -9
  52. flowllm/storage/vector_store/memory_vector_store.py +509 -0
  53. flowllm/utils/common_utils.py +54 -0
  54. flowllm/utils/miner_u_pdf_processor.py +726 -0
  55. {flowllm-0.1.3.dist-info → flowllm-0.1.5.dist-info}/METADATA +7 -6
  56. flowllm-0.1.5.dist-info/RECORD +98 -0
  57. flowllm/config/default.yaml +0 -77
  58. flowllm/config/empty.yaml +0 -37
  59. flowllm/flow/gallery/cmd_flow.py +0 -11
  60. flowllm/flow/gallery/code_tool_flow.py +0 -30
  61. flowllm/flow/gallery/dashscope_search_tool_flow.py +0 -34
  62. flowllm/flow/gallery/deepsearch_tool_flow.py +0 -39
  63. flowllm/flow/gallery/expression_tool_flow.py +0 -18
  64. flowllm/flow/gallery/tavily_search_tool_flow.py +0 -30
  65. flowllm/flow/gallery/terminate_tool_flow.py +0 -30
  66. flowllm/flow/parser/__init__.py +0 -0
  67. flowllm/op/agent/__init__.py +0 -1
  68. flowllm/op/agent/react_v1_op.py +0 -109
  69. flowllm/op/agent/react_v1_prompt.yaml +0 -54
  70. flowllm/op/base_ray_op.py +0 -313
  71. flowllm/op/code/__init__.py +0 -1
  72. flowllm/op/code/execute_code_op.py +0 -42
  73. flowllm/op/gallery/terminate_op.py +0 -29
  74. flowllm/op/search/dashscope_deep_research_op.py +0 -267
  75. flowllm/service/cmd_service.py +0 -15
  76. flowllm-0.1.3.dist-info/RECORD +0 -102
  77. /flowllm/op/{agent/react_v2_prompt.yaml → llm/react_llm_prompt.yaml} +0 -0
  78. {flowllm-0.1.3.dist-info → flowllm-0.1.5.dist-info}/WHEEL +0 -0
  79. {flowllm-0.1.3.dist-info → flowllm-0.1.5.dist-info}/entry_points.txt +0 -0
  80. {flowllm-0.1.3.dist-info → flowllm-0.1.5.dist-info}/licenses/LICENSE +0 -0
  81. {flowllm-0.1.3.dist-info → flowllm-0.1.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,726 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ MinerU PDF Processor
4
+
5
+ A comprehensive PDF processing utility that leverages MinerU for extracting structured content
6
+ from PDF documents. Returns both Markdown content and structured content lists for further processing.
7
+
8
+ This processor provides a high-level interface to MinerU's command-line tools, handling
9
+ file I/O, error management, and result parsing automatically.
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import os
15
+ import platform
16
+ import re
17
+ import subprocess
18
+ from pathlib import Path
19
+ from typing import Dict, List, Any, Tuple, Optional, Union
20
+
21
+
22
class MinerUPDFProcessor:
    """
    MinerU-based PDF processing engine.

    Wraps the ``mineru`` command-line tool: it builds the command, runs it in a
    subprocess, then reads back the generated Markdown file and the
    ``*_content_list.json`` file, converting relative image paths to absolute ones.

    Features:
        - MinerU installation check at construction time
        - Multiple parsing methods (auto, txt, ocr)
        - Optional OCR language hint
        - Structured content extraction with image path resolution
        - Statistics and simple quality heuristics on the extracted content

    Example:
        processor = MinerUPDFProcessor(log_level="INFO")
        content_list, markdown = processor.process_pdf("document.pdf")
    """

    # Installation hint appended to every "MinerU is missing" error message.
    _INSTALL_HINT = "pip install -U 'mineru[core]' or uv pip install -U 'mineru[core]'"

    # Content-list fields that may hold an image path relative to the output dir.
    _IMAGE_PATH_FIELDS = ("img_path", "table_img_path", "equation_img_path")

    # Backend MinerU uses when no "-b" option is given; only other values are forwarded.
    _DEFAULT_BACKEND = "pipeline"

    # Block types counted as formulas. "formula" is the historical name used by this
    # class; "equation" is presumably what MinerU emits - TODO confirm against the
    # MinerU output-format documentation.
    _FORMULA_TYPES = frozenset({"formula", "equation"})

    # Extensions save_results() produces; a trailing one on output_path is replaced.
    _RESULT_EXTENSIONS = (".md", ".json")

    def __init__(self, log_level: str = "INFO"):
        """
        Initialize the PDF processor with logging configuration.

        Args:
            log_level (str): Logging level name, e.g. "DEBUG", "INFO", "WARNING", "ERROR".

        Raises:
            ValueError: If ``log_level`` is not a known logging level name
            RuntimeError: If MinerU is not properly installed or accessible
        """
        level = getattr(logging, str(log_level).upper(), None)
        if not isinstance(level, int):
            # getattr() alone would raise an opaque AttributeError (or silently accept
            # a non-level attribute such as a function); fail with a clear message.
            raise ValueError(f"Unknown log level: {log_level!r}")

        # NOTE: basicConfig configures the *root* logger and is a no-op when the
        # application has already installed handlers.
        logging.basicConfig(
            level=level,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Validate MinerU installation before proceeding
        if not self.check_mineru_installation():
            raise RuntimeError(
                "MinerU is not properly installed. Please install using:\n"
                + self._INSTALL_HINT
            )

    @classmethod
    def create_with_defaults(cls, log_level: str = "INFO") -> "MinerUPDFProcessor":
        """
        Create a MinerUPDFProcessor instance with default settings.

        Args:
            log_level (str): Logging level (default: "INFO")

        Returns:
            MinerUPDFProcessor: Configured processor instance
        """
        return cls(log_level=log_level)

    @staticmethod
    def _subprocess_kwargs() -> Dict[str, Any]:
        """Return the ``subprocess.run`` keyword arguments shared by all MinerU calls."""
        kwargs: Dict[str, Any] = {
            "capture_output": True,
            "text": True,
            "check": True,
            "encoding": "utf-8",
            "errors": "ignore",
        }
        # Hide console window on Windows systems
        if platform.system() == "Windows":
            kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
        return kwargs

    def check_mineru_installation(self) -> bool:
        """
        Verify that MinerU is installed and runnable.

        Runs ``mineru --version`` and logs the reported version at DEBUG level.

        Returns:
            bool: True if the command ran successfully, False if it exited with a
            non-zero status or could not be started at all.
        """
        try:
            result = subprocess.run(["mineru", "--version"], **self._subprocess_kwargs())
        except (subprocess.CalledProcessError, OSError):
            # OSError covers FileNotFoundError as well as e.g. PermissionError when
            # the executable exists but cannot be launched.
            return False

        self.logger.debug(f"MinerU version detected: {result.stdout.strip()}")
        return True

    def _run_mineru_command(
            self,
            input_path: Union[str, Path],
            output_dir: Union[str, Path],
            method: str = "auto",
            lang: Optional[str] = None,
            backend: str = "pipeline",
            start_page: Optional[int] = None,
            end_page: Optional[int] = None,
            formula: bool = True,
            table: bool = True,
            device: Optional[str] = None,
            source: str = "modelscope",
            vlm_url: Optional[str] = None,
    ) -> None:
        """
        Execute the MinerU command-line tool with the specified parameters.

        Args:
            input_path (Union[str, Path]): Path to the input PDF file
            output_dir (Union[str, Path]): Directory path for output files
            method (str): Parsing method - "auto", "txt", or "ocr"
            lang (Optional[str]): Document language for OCR optimization (e.g., "en", "ch", "ja")
            backend (str): Processing backend. Forwarded via ``-b`` only when it differs
                from the default "pipeline", so the default invocation is unchanged.
            start_page (Optional[int]): Starting page number (0-based indexing)
            end_page (Optional[int]): Ending page number (0-based indexing)
            formula (bool): Enable mathematical formula parsing
            table (bool): Enable table structure parsing
            device (Optional[str]): Computing device for inference (e.g., "cuda", "cpu")
            source (str): Model source repository. Accepted for interface compatibility
                but currently NOT forwarded to MinerU.
            vlm_url (Optional[str]): VLM server URL (required for vlm-sglang-client backend)

        Raises:
            subprocess.CalledProcessError: If the MinerU command exits with an error
            RuntimeError: If the MinerU executable cannot be found
        """
        # Build base command with required parameters
        cmd = [
            "mineru",
            "-p", str(input_path),
            "-o", str(output_dir),
            "-m", method,
        ]

        # process_pdf() derives the output sub-directory from the backend, so a
        # non-default backend must actually reach MinerU; otherwise the pipeline
        # backend runs and its output is looked up in the wrong folder.
        if backend and backend != self._DEFAULT_BACKEND:
            cmd.extend(["-b", backend])

        # Add optional parameters if specified
        if lang:
            cmd.extend(["-l", lang])
        if start_page is not None:
            cmd.extend(["-s", str(start_page)])
        if end_page is not None:
            cmd.extend(["-e", str(end_page)])
        if not formula:
            cmd.extend(["-f", "false"])
        if not table:
            cmd.extend(["-t", "false"])
        if device:
            cmd.extend(["-d", device])
        if vlm_url:
            cmd.extend(["-u", vlm_url])

        self.logger.info(f"Executing MinerU command: {' '.join(cmd)}")
        try:
            result = subprocess.run(cmd, **self._subprocess_kwargs())
        except subprocess.CalledProcessError as e:
            self.logger.error(f"MinerU command execution failed: {e}")
            if e.stderr:
                self.logger.error(f"Error details: {e.stderr}")
            raise
        except FileNotFoundError as e:
            # Chain the original error so the missing executable stays visible.
            raise RuntimeError(
                "MinerU command not found. Please ensure MinerU 2.0 is properly installed:\n"
                + self._INSTALL_HINT
            ) from e

        self.logger.info("MinerU command executed successfully")
        if result.stdout:
            self.logger.debug(f"MinerU output: {result.stdout}")

    def _resolve_image_paths(self, content_list: List[Any], images_base_dir: Path) -> None:
        """Rewrite relative image paths in ``content_list`` in place as absolute paths."""
        self.logger.info(f"Resolving image paths relative to: {images_base_dir}")
        for item in content_list:
            if not isinstance(item, dict):
                continue
            for field_name in self._IMAGE_PATH_FIELDS:
                img_path = item.get(field_name)
                # Skip missing/empty values and anything that is not a path string.
                if not img_path or not isinstance(img_path, str):
                    continue
                if os.path.isabs(img_path):
                    continue
                item[field_name] = str((images_base_dir / img_path).resolve())
                self.logger.debug(f"Updated {field_name}: {img_path} -> {item[field_name]}")

    def _read_output_files(
            self,
            output_dir: Path,
            file_stem: str,
            method: str = "auto"
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Read and parse the files MinerU generated for one document.

        Two layouts are supported: files directly inside ``output_dir`` (flat), or
        inside ``output_dir/<file_stem>/<method>/`` (nested). When the nested
        directory exists but neither file is found under ``<method>``, the nested
        directory is searched recursively so that output written under a different
        method folder is still picked up.

        Read failures are logged as warnings and result in empty values rather than
        exceptions.

        Args:
            output_dir (Path): Directory containing the MinerU output files
            file_stem (str): Base filename without extension
            method (str): Parsing method used ("auto", "txt", "ocr", "vlm")

        Returns:
            Tuple[List[Dict[str, Any]], str]: A tuple containing:
                - content_list: Structured content list (image paths made absolute)
                - markdown_content: Raw Markdown text content
        """
        output_dir = Path(output_dir)
        md_name = f"{file_stem}.md"
        json_name = f"{file_stem}_content_list.json"

        # Flat layout by default.
        md_file = output_dir / md_name
        json_file = output_dir / json_name
        images_base_dir = output_dir

        # Nested layout: <output_dir>/<file_stem>/<method>/
        nested_root = output_dir / file_stem
        if nested_root.exists():
            images_base_dir = nested_root / method
            md_file = images_base_dir / md_name
            json_file = images_base_dir / json_name

            if not md_file.exists() and not json_file.exists():
                # The expected method folder holds nothing; look for the files
                # anywhere below the document folder before giving up.
                matches = sorted(nested_root.rglob(json_name)) or sorted(nested_root.rglob(md_name))
                if matches:
                    images_base_dir = matches[0].parent
                    md_file = images_base_dir / md_name
                    json_file = images_base_dir / json_name
                    self.logger.warning(
                        f"Output not found under method '{method}', using: {images_base_dir}")

        # Read Markdown content
        md_content = ""
        if md_file.exists():
            try:
                with open(md_file, "r", encoding="utf-8") as f:
                    md_content = f.read()
                self.logger.info(f"Successfully read Markdown file: {md_file}")
            except (OSError, ValueError) as e:  # ValueError covers UnicodeDecodeError
                self.logger.warning(f"Failed to read Markdown file {md_file}: {e}")
        else:
            self.logger.warning(f"Markdown file not found: {md_file}")

        # Read structured content list from JSON
        content_list: List[Dict[str, Any]] = []
        if json_file.exists():
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)

                if isinstance(loaded, list):
                    # Assign before resolving paths so a failure part-way through
                    # still returns the parsed blocks.
                    content_list = loaded
                    self._resolve_image_paths(content_list, images_base_dir)
                    self.logger.info(
                        f"Successfully read JSON file: {json_file}, containing {len(content_list)} content blocks")
                else:
                    self.logger.warning(
                        f"Failed to read JSON file {json_file}: expected a list, got {type(loaded).__name__}")
            except (OSError, ValueError) as e:  # ValueError covers JSON/Unicode decode errors
                self.logger.warning(f"Failed to read JSON file {json_file}: {e}")
        else:
            self.logger.warning(f"JSON file not found: {json_file}")

        return content_list, md_content

    def process_pdf(
            self,
            pdf_path: Union[str, Path],
            output_dir: Optional[Union[str, Path]] = None,
            method: str = "auto",
            lang: Optional[str] = None,
            backend: str = "pipeline",
            **kwargs
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Process a PDF file and extract structured content using MinerU.

        Validates the input path, runs MinerU, reads the generated files and logs a
        per-type block count.

        Args:
            pdf_path (Union[str, Path]): Path to the input PDF file
            output_dir (Optional[Union[str, Path]]): Output directory path.
                If not given, 'mineru_output' next to the PDF is used.
            method (str): Parsing method - "auto" (recommended), "txt", or "ocr"
            lang (Optional[str]): Document language for OCR optimization
                (e.g., "ch" for Chinese, "en" for English, "ja" for Japanese)
            backend (str): Processing backend - "pipeline", "vlm-transformers",
                "vlm-sglang-engine", or "vlm-sglang-client"
            **kwargs: Additional parameters passed to ``_run_mineru_command``
                (start_page, end_page, formula, table, device, source, vlm_url)

        Returns:
            Tuple[List[Dict[str, Any]], str]: A tuple containing:
                - content_list: Structured list of content blocks with metadata
                - markdown_content: Complete document in Markdown format

        Raises:
            FileNotFoundError: If the specified PDF file does not exist
            ValueError: If the file does not have a ``.pdf`` extension
            subprocess.CalledProcessError: If the MinerU command fails
            RuntimeError: If the MinerU executable cannot be found
        """
        # Convert to Path object and validate input
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
        if pdf_path.suffix.lower() != '.pdf':
            raise ValueError(f"File is not a PDF format: {pdf_path}")

        # Prepare output directory
        base_output_dir = Path(output_dir) if output_dir else pdf_path.parent / "mineru_output"
        base_output_dir.mkdir(parents=True, exist_ok=True)

        try:
            self.logger.info(f"Starting PDF processing: {pdf_path}")
            self._run_mineru_command(
                input_path=pdf_path,
                output_dir=base_output_dir,
                method=method,
                lang=lang,
                backend=backend,
                **kwargs
            )

            # The output sub-folder is named "vlm" for every vlm-* backend and after
            # the parsing method otherwise.
            backend_method = "vlm" if backend.startswith("vlm-") else method
            content_list, markdown_content = self._read_output_files(
                base_output_dir, pdf_path.stem, method=backend_method
            )

            content_stats = self.get_content_statistics(content_list)["content_types"]
            self.logger.info(f"PDF processing completed! Extracted {len(content_list)} content blocks")
            self.logger.info("Content type statistics:")
            for content_type, count in content_stats.items():
                self.logger.info(f"  - {content_type}: {count}")

            return content_list, markdown_content

        except Exception as e:
            # Boundary handler: log for the operator, then let the caller decide.
            self.logger.error(f"Error occurred during PDF processing: {str(e)}")
            raise

    @classmethod
    def _with_extension(cls, base_path: Path, extension: str) -> Path:
        """
        Return ``base_path`` with ``extension`` appended to its file name.

        A trailing ``.md``/``.json`` is replaced (so passing "out.md" still yields
        "out.md" and "out.json"); any other dot in the name is kept, so
        "my.report_processed" becomes "my.report_processed.md" instead of "my.md".
        """
        name = base_path.name
        if base_path.suffix.lower() in cls._RESULT_EXTENSIONS:
            name = base_path.stem
        return base_path.with_name(name + extension)

    def save_results(
            self,
            content_list: List[Dict[str, Any]],
            markdown_content: str,
            output_path: Union[str, Path],
            save_markdown: bool = True,
            save_json: bool = True,
            indent: int = 2
    ) -> Dict[str, Path]:
        """
        Save processing results to files.

        Writes ``<output_path>.md`` and/or ``<output_path>.json``. Empty Markdown or
        an empty content list is skipped rather than written as an empty file.

        Args:
            content_list (List[Dict[str, Any]]): Structured content list with metadata
            markdown_content (str): Complete document in Markdown format
            output_path (Union[str, Path]): Output file path without extension. Dots
                inside the name are preserved; only a trailing ".md"/".json" is replaced.
            save_markdown (bool): Whether to save Markdown file
            save_json (bool): Whether to save JSON file with structured content
            indent (int): JSON file indentation for readability

        Returns:
            Dict[str, Path]: Dictionary mapping file types to their saved paths
                Keys: 'markdown', 'json' (if respective files were saved)

        Raises:
            Exception: Any error raised while creating directories or writing files
                is logged and re-raised unchanged.
        """
        output_path = Path(output_path)
        saved_files: Dict[str, Path] = {}

        try:
            # Ensure output directory exists
            output_path.parent.mkdir(parents=True, exist_ok=True)

            if save_markdown and markdown_content:
                md_path = self._with_extension(output_path, '.md')
                with open(md_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)
                saved_files['markdown'] = md_path
                self.logger.info(f"Markdown file saved: {md_path}")

            if save_json and content_list:
                json_path = self._with_extension(output_path, '.json')
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(content_list, f, indent=indent, ensure_ascii=False)
                saved_files['json'] = json_path
                self.logger.info(f"JSON file saved: {json_path}")

            return saved_files

        except Exception as e:
            self.logger.error(f"Error occurred while saving files: {e}")
            raise

    @staticmethod
    def get_content_statistics(content_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate statistics about the processed content.

        Non-dict entries are counted in ``total_blocks`` but otherwise ignored.
        Missing or ``None`` values for "text" / "text_level" are treated as empty / 0.

        Args:
            content_list (List[Dict[str, Any]]): Structured content list from MinerU

        Returns:
            Dict[str, Any]: Dictionary containing:
                - total_blocks: Total number of entries in ``content_list``
                - content_types: Count of each value of the "type" field
                - text_stats: total_characters, total_words and title_levels
                  (count per positive "text_level") over "text" blocks
                - image_count: Number of "image" blocks
                - table_count: Number of "table" blocks
                - has_formulas: True if any "formula" or "equation" block exists
        """
        content_types: Dict[str, int] = {}
        title_levels: Dict[int, int] = {}
        text_stats: Dict[str, Any] = {
            "total_characters": 0,
            "total_words": 0,
            "title_levels": title_levels,
        }
        stats: Dict[str, Any] = {
            "total_blocks": len(content_list),
            "content_types": content_types,
            "text_stats": text_stats,
            "image_count": 0,
            "table_count": 0,
            "has_formulas": False
        }

        for item in content_list:
            if not isinstance(item, dict):
                continue

            content_type = item.get("type", "unknown")
            content_types[content_type] = content_types.get(content_type, 0) + 1

            if content_type == "text":
                # "text" may be present but None; never call len() on it directly.
                text = item.get("text") or ""
                if not isinstance(text, str):
                    text = str(text)
                text_stats["total_characters"] += len(text)
                text_stats["total_words"] += len(text.split())

                level = item.get("text_level") or 0
                if isinstance(level, int) and level > 0:
                    title_levels[level] = title_levels.get(level, 0) + 1
            elif content_type == "image":
                stats["image_count"] += 1
            elif content_type == "table":
                stats["table_count"] += 1
            elif content_type in MinerUPDFProcessor._FORMULA_TYPES:
                stats["has_formulas"] = True

        return stats

    def validate_output_quality(self, content_list: List[Dict[str, Any]], markdown_content: str) -> Dict[str, Any]:
        """
        Apply simple heuristics to judge the completeness of the processing output.

        Starts from a score of 100 and subtracts fixed penalties: 20 for fewer than
        5 blocks, 30 for no text blocks (or 10 when text blocks are under 30% of all
        blocks), and 15 when the Markdown is under 100 characters although more than
        10 blocks exist. A completely empty result scores 0 and is the only case
        reported as invalid.

        Args:
            content_list (List[Dict[str, Any]]): Structured content list
            markdown_content (str): Markdown content string

        Returns:
            Dict[str, Any]: Validation results containing:
                - is_valid: Overall validation status
                - warnings: List of warning messages
                - suggestions: List of improvement suggestions
                - quality_score: Numeric quality score (0-100)
        """
        markdown_text = (markdown_content or "").strip()
        warnings: List[str] = []
        suggestions: List[str] = []
        validation: Dict[str, Any] = {
            "is_valid": True,
            "warnings": warnings,
            "suggestions": suggestions,
            "quality_score": 100
        }

        # Nothing extracted at all: report and stop, the other checks are meaningless.
        if not content_list and not markdown_text:
            validation["is_valid"] = False
            warnings.append("No content was extracted from the PDF")
            validation["quality_score"] = 0
            return validation

        stats = self.get_content_statistics(content_list)
        total_blocks = stats["total_blocks"]
        text_blocks = stats["content_types"].get("text", 0)

        # Check content diversity
        if total_blocks < 5:
            warnings.append("Very few content blocks extracted - document may be complex or image-heavy")
            validation["quality_score"] -= 20

        # Check text content ratio. total_blocks is non-zero whenever text_blocks is,
        # so the division below cannot divide by zero.
        if text_blocks == 0:
            warnings.append("No text blocks found - consider using OCR method for image-based PDFs")
            validation["quality_score"] -= 30
        elif text_blocks / total_blocks < 0.3:
            suggestions.append("Low text content ratio - document may benefit from OCR processing")
            validation["quality_score"] -= 10

        # Check for images without processing
        if stats["image_count"] > 0 and text_blocks == 0:
            suggestions.append(
                "Images detected but no text extracted - consider using VLM backend for image analysis")

        # Check markdown length vs content blocks
        if len(markdown_text) < 100 and total_blocks > 10:
            warnings.append("Markdown content seems unusually short for the number of content blocks")
            validation["quality_score"] -= 15

        return validation
556
+
557
+
558
+ def chunk_pdf_content(content_list: List[Dict[str, Any]], max_length: int = 4000) -> List[str]:
559
+ """
560
+ Split MinerU-parsed content list into text chunks of specified length.
561
+
562
+ This utility function converts structured content from MinerU into manageable
563
+ text chunks suitable for downstream processing like embedding generation or
564
+ language model input.
565
+
566
+ Args:
567
+ content_list (List[Dict[str, Any]]): MinerU-parsed structured content list
568
+ max_length (int): Maximum character length per chunk (default: 4000)
569
+
570
+ Returns:
571
+ List[str]: List of text chunks, each prefixed with chunk metadata
572
+ including chunk number, total chunks, and character count
573
+ """
574
+
575
+ def extract_text(item: Dict[str, Any]) -> str:
576
+ """
577
+ Extract text content from a single content item.
578
+
579
+ Handles different content types (text, table, image) and formats them
580
+ appropriately for text-based processing.
581
+
582
+ Args:
583
+ item (Dict[str, Any]): Single content item from MinerU output
584
+
585
+ Returns:
586
+ str: Extracted and formatted text content
587
+ """
588
+ if item.get("type") == "text":
589
+ text = item.get("text", "").strip()
590
+ if not text:
591
+ return ""
592
+ # Add markdown header formatting for titles
593
+ level = item.get("text_level", 0)
594
+ if level > 0:
595
+ return f"{'#' * min(level, 6)} {text}"
596
+ return text
597
+
598
+ elif item.get("type") == "table":
599
+ parts = []
600
+ if item.get("table_caption"):
601
+ parts.append("Table: " + " | ".join(item["table_caption"]))
602
+ if item.get("table_body"):
603
+ # Simple HTML tag cleanup and formatting
604
+ table_text = re.sub(r'<[^>]+>', ' | ', item["table_body"])
605
+ table_text = re.sub(r'\s+', ' ', table_text).strip()
606
+ parts.append(table_text)
607
+ return "\n".join(parts) if parts else ""
608
+
609
+ elif item.get("type") == "image":
610
+ if item.get("image_caption"):
611
+ return "Image: " + " | ".join(item["image_caption"])
612
+ return ""
613
+
614
+ return ""
615
+
616
+ # Extract all text content from the structured list
617
+ all_text = ""
618
+ for item in content_list:
619
+ text = extract_text(item)
620
+ if text.strip():
621
+ all_text += text + "\n"
622
+
623
+ if not all_text.strip():
624
+ return []
625
+
626
+ # Split into chunks based on max_length
627
+ chunks = []
628
+ current_chunk = ""
629
+
630
+ for line in all_text.split('\n'):
631
+ # Check if adding this line would exceed max_length
632
+ if len(current_chunk) + len(line) + 1 > max_length and current_chunk:
633
+ chunks.append(current_chunk.strip())
634
+ current_chunk = line
635
+ else:
636
+ current_chunk += line + "\n" if current_chunk else line
637
+
638
+ # Add the final chunk if it contains content
639
+ if current_chunk.strip():
640
+ chunks.append(current_chunk.strip())
641
+
642
+ # Add chunk metadata headers
643
+ total_chunks = len(chunks)
644
+ marked_chunks = []
645
+ for i, chunk in enumerate(chunks):
646
+ header = f"=== CHUNK {i + 1}/{total_chunks} ({len(chunk)} characters) ===\n"
647
+ marked_chunks.append(header + chunk)
648
+
649
+ return marked_chunks
650
+
651
+
652
+ # Example usage and demonstration
653
if __name__ == "__main__":
    # Demonstration of the MinerUPDFProcessor workflow: process a PDF, print
    # statistics and quality checks, save the results and build text chunks.
    import sys


    def example_usage():
        """Run the end-to-end demo against 'example_document.pdf' in the working directory."""
        try:
            demo_processor = MinerUPDFProcessor.create_with_defaults(log_level="INFO")

            # Example PDF path (replace with actual PDF file)
            pdf_path = "example_document.pdf"
            source_pdf = Path(pdf_path)

            if not source_pdf.exists():
                for message in (
                        f"Example PDF file not found: {pdf_path}",
                        "Please provide a valid PDF file path to test the processor.",
                ):
                    print(message)
                return

            # Run the extraction; the language hint improves OCR results.
            print("Processing PDF with auto method...")
            blocks, markdown_text = demo_processor.process_pdf(
                pdf_path=pdf_path,
                method="auto",
                lang="en"
            )

            # Summarise what was extracted.
            summary = demo_processor.get_content_statistics(blocks)
            text_summary = summary['text_stats']
            print("Processing Statistics:")
            print(f"  Total blocks: {summary['total_blocks']}")
            print(f"  Content types: {summary['content_types']}")
            print(f"  Text characters: {text_summary['total_characters']}")
            print(f"  Text words: {text_summary['total_words']}")

            # Report the heuristic quality assessment.
            report = demo_processor.validate_output_quality(blocks, markdown_text)
            print(f"Quality Score: {report['quality_score']}/100")
            for label, key in (("Warnings:", 'warnings'), ("Suggestions:", 'suggestions')):
                if report[key]:
                    print(label, report[key])

            # Persist Markdown and JSON next to the working directory.
            written = demo_processor.save_results(
                content_list=blocks,
                markdown_content=markdown_text,
                output_path=source_pdf.stem + "_processed"
            )
            print(f"Results saved to: {written}")

            # Build chunks for downstream processing and preview the first one.
            text_chunks = chunk_pdf_content(blocks, max_length=2000)
            print(f"Created {len(text_chunks)} text chunks")

            if text_chunks:
                first_chunk = text_chunks[0]
                if len(first_chunk) > 200:
                    preview = first_chunk[:200] + "..."
                else:
                    preview = first_chunk
                print("First chunk preview:")
                print(preview)

        except Exception as e:
            print(f"Error during processing: {e}")
            sys.exit(1)


    # Run example if script is executed directly
    example_usage()