reme-ai 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reme_ai/__init__.py +1 -1
- reme_ai/config/default.yaml +16 -0
- reme_ai/constants/common_constants.py +0 -2
- reme_ai/constants/language_constants.py +1 -1
- reme_ai/enumeration/language_enum.py +14 -0
- reme_ai/summary/task/__init__.py +0 -1
- reme_ai/summary/task/trajectory_preprocess_op.py +2 -31
- reme_ai/utils/datetime_handler.py +1 -1
- reme_ai-0.1.4.dist-info/METADATA +649 -0
- {reme_ai-0.1.2.dist-info → reme_ai-0.1.4.dist-info}/RECORD +14 -16
- reme_ai/enumeration/language_constants.py +0 -215
- reme_ai/summary/task/pdf_preprocess_op_wrapper.py +0 -50
- reme_ai/utils/miner_u_pdf_processor.py +0 -726
- reme_ai-0.1.2.dist-info/METADATA +0 -215
- {reme_ai-0.1.2.dist-info → reme_ai-0.1.4.dist-info}/WHEEL +0 -0
- {reme_ai-0.1.2.dist-info → reme_ai-0.1.4.dist-info}/entry_points.txt +0 -0
- {reme_ai-0.1.2.dist-info → reme_ai-0.1.4.dist-info}/licenses/LICENSE +0 -0
- {reme_ai-0.1.2.dist-info → reme_ai-0.1.4.dist-info}/top_level.txt +0 -0
@@ -1,726 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
MinerU PDF Processor
|
4
|
-
|
5
|
-
A comprehensive PDF processing utility that leverages MinerU for extracting structured content
|
6
|
-
from PDF documents. Returns both Markdown content and structured content lists for further processing.
|
7
|
-
|
8
|
-
This processor provides a high-level interface to MinerU's command-line tools, handling
|
9
|
-
file I/O, error management, and result parsing automatically.
|
10
|
-
"""
|
11
|
-
|
12
|
-
import json
|
13
|
-
import logging
|
14
|
-
import os
|
15
|
-
import platform
|
16
|
-
import re
|
17
|
-
import subprocess
|
18
|
-
from pathlib import Path
|
19
|
-
from typing import Dict, List, Any, Tuple, Optional, Union
|
20
|
-
|
21
|
-
|
22
|
-
class MinerUPDFProcessor:
    """
    MinerU-based PDF processing engine.

    Wraps the ``mineru`` command-line tool: runs it on a PDF, then reads the
    Markdown file and the ``*_content_list.json`` file it produced and returns
    both to the caller.

    Features:
    - MinerU installation check at construction time
    - Multiple parsing methods (auto, txt, ocr)
    - Optional language hint for OCR
    - Relative image paths in the content list are rewritten to absolute paths
    - Helpers for saving results, computing statistics and sanity-checking output

    Example:
        processor = MinerUPDFProcessor(log_level="INFO")
        content_list, markdown = processor.process_pdf("document.pdf")
    """

    # Content-block keys whose values are image paths relative to the output directory.
    _IMAGE_PATH_FIELDS = ("img_path", "table_img_path", "equation_img_path")

    # Extensions that save_results() itself appends; stripped from output_path if present.
    _RESULT_EXTENSIONS = (".md", ".json")

    def __init__(self, log_level: str = "INFO"):
        """
        Initialize the PDF processor with logging configuration.

        Args:
            log_level (str): Logging level for the processor.
                Options: "DEBUG", "INFO", "WARNING", "ERROR"

        Raises:
            RuntimeError: If MinerU is not properly installed or accessible
        """
        # Configure the root logger (no-op if the application configured it already).
        logging.basicConfig(
            level=getattr(logging, log_level.upper()),
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Fail fast: nothing in this class works without the mineru executable.
        if not self.check_mineru_installation():
            raise RuntimeError(
                "MinerU is not properly installed. Please install using:\n"
                "pip install -U 'mineru[core]' or uv pip install -U 'mineru[core]'"
            )

    @classmethod
    def create_with_defaults(cls, log_level: str = "INFO") -> "MinerUPDFProcessor":
        """
        Create a MinerUPDFProcessor instance with default settings.

        Args:
            log_level (str): Logging level (default: "INFO")

        Returns:
            MinerUPDFProcessor: Configured processor instance
        """
        return cls(log_level=log_level)

    @staticmethod
    def _subprocess_kwargs() -> Dict[str, Any]:
        """Build the keyword arguments shared by every ``subprocess.run`` call."""
        kwargs: Dict[str, Any] = {
            "capture_output": True,
            "text": True,
            "check": True,
            "encoding": "utf-8",
            "errors": "ignore",
        }
        # Hide the console window that would otherwise pop up on Windows.
        if platform.system() == "Windows":
            kwargs["creationflags"] = subprocess.CREATE_NO_WINDOW
        return kwargs

    @classmethod
    def _with_extension(cls, output_path: Path, extension: str) -> Path:
        """
        Return ``output_path`` with ``extension`` appended to its file name.

        ``Path.with_suffix`` is deliberately not used: it treats everything
        after the last dot as a suffix, so a stem such as
        ``paper_v1.2_processed`` would be truncated to ``paper_v1``. Only an
        existing ``.md``/``.json`` suffix is replaced.
        """
        base = output_path
        if base.suffix.lower() in cls._RESULT_EXTENSIONS:
            base = base.with_suffix("")
        return base.with_name(base.name + extension)

    def check_mineru_installation(self) -> bool:
        """
        Verify that MinerU is installed and runnable.

        Runs ``mineru --version`` and reports whether it succeeded.

        Returns:
            bool: True if the command ran successfully, False if it exited
                with a non-zero status or could not be started at all
        """
        try:
            result = subprocess.run(["mineru", "--version"], **self._subprocess_kwargs())
        except (subprocess.CalledProcessError, OSError):
            # OSError covers FileNotFoundError as well as e.g. PermissionError
            # when the executable exists but cannot be launched.
            return False
        self.logger.debug("MinerU version detected: %s", result.stdout.strip())
        return True

    def _run_mineru_command(
        self,
        input_path: Union[str, Path],
        output_dir: Union[str, Path],
        method: str = "auto",
        lang: Optional[str] = None,
        backend: str = "pipeline",
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
        formula: bool = True,
        table: bool = True,
        device: Optional[str] = None,
        source: str = "modelscope",
        vlm_url: Optional[str] = None,
    ) -> None:
        """
        Execute the MinerU command-line tool with the given parameters.

        Args:
            input_path (Union[str, Path]): Path to the input PDF file
            output_dir (Union[str, Path]): Directory path for output files
            method (str): Parsing method - "auto", "txt", or "ocr"
            lang (Optional[str]): Document language for OCR (e.g., "en", "ch", "ja")
            backend (str): Accepted for interface compatibility; currently NOT
                forwarded to the command line
            start_page (Optional[int]): Starting page number (0-based indexing)
            end_page (Optional[int]): Ending page number (0-based indexing)
            formula (bool): Enable formula parsing (``-f false`` is passed when False)
            table (bool): Enable table parsing (``-t false`` is passed when False)
            device (Optional[str]): Computing device for inference (e.g., "cuda", "cpu")
            source (str): Accepted for interface compatibility; currently NOT
                forwarded to the command line
            vlm_url (Optional[str]): VLM server URL, passed with ``-u`` when set

        Raises:
            subprocess.CalledProcessError: If MinerU exits with a non-zero status
            RuntimeError: If the MinerU executable is not found
        """
        # Base command. "-b backend" and "--source source" are intentionally
        # omitted: they may not be available in all MinerU versions.
        cmd = [
            "mineru",
            "-p", str(input_path),
            "-o", str(output_dir),
            "-m", method,
        ]

        # Optional flags are only added when the caller supplied a value.
        if lang:
            cmd.extend(["-l", lang])
        if start_page is not None:
            cmd.extend(["-s", str(start_page)])
        if end_page is not None:
            cmd.extend(["-e", str(end_page)])
        if not formula:
            cmd.extend(["-f", "false"])
        if not table:
            cmd.extend(["-t", "false"])
        if device:
            cmd.extend(["-d", device])
        if vlm_url:
            cmd.extend(["-u", vlm_url])

        try:
            self.logger.info("Executing MinerU command: %s", " ".join(cmd))
            result = subprocess.run(cmd, **self._subprocess_kwargs())
        except subprocess.CalledProcessError as e:
            self.logger.error("MinerU command execution failed: %s", e)
            if e.stderr:
                self.logger.error("Error details: %s", e.stderr)
            raise
        except FileNotFoundError as e:
            # Chain the original error so the traceback shows what was missing.
            raise RuntimeError(
                "MinerU command not found. Please ensure MinerU 2.0 is properly installed:\n"
                "pip install -U 'mineru[core]' or uv pip install -U 'mineru[core]'"
            ) from e

        self.logger.info("MinerU command executed successfully")
        if result.stdout:
            self.logger.debug("MinerU output: %s", result.stdout)

    def _read_output_files(
        self,
        output_dir: Path,
        file_stem: str,
        method: str = "auto"
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Read and parse the files MinerU generated.

        Looks for ``<file_stem>.md`` and ``<file_stem>_content_list.json``
        directly in ``output_dir``; if ``output_dir/<file_stem>`` exists, looks
        in ``output_dir/<file_stem>/<method>`` instead. Relative image paths
        in the content list are rewritten to absolute paths. Read failures are
        logged as warnings and yield empty results rather than raising.

        Args:
            output_dir (Path): Directory containing the MinerU output files
            file_stem (str): Base filename without extension
            method (str): Parsing method used ("auto", "txt", "ocr", "vlm")

        Returns:
            Tuple[List[Dict[str, Any]], str]: A tuple containing:
                - content_list: Structured content list with metadata
                - markdown_content: Raw Markdown text content
        """
        output_dir = Path(output_dir)

        # Flat layout by default; nested layout when a per-document subdirectory exists.
        images_base_dir = output_dir
        if (output_dir / file_stem).exists():
            images_base_dir = output_dir / file_stem / method
        md_file = images_base_dir / f"{file_stem}.md"
        json_file = images_base_dir / f"{file_stem}_content_list.json"

        # Read Markdown content
        md_content = ""
        if md_file.exists():
            try:
                with open(md_file, "r", encoding="utf-8") as f:
                    md_content = f.read()
                self.logger.info("Successfully read Markdown file: %s", md_file)
            except Exception as e:  # best-effort: a bad file must not abort processing
                self.logger.warning("Failed to read Markdown file %s: %s", md_file, e)
        else:
            self.logger.warning("Markdown file not found: %s", md_file)

        # Read structured content list from JSON
        content_list: List[Dict[str, Any]] = []
        if json_file.exists():
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    loaded = json.load(f)

                if isinstance(loaded, list):
                    content_list = loaded
                else:
                    # Honour the declared return type instead of leaking a dict/str.
                    self.logger.warning(
                        "Unexpected JSON root type %s in %s; expected a list",
                        type(loaded).__name__, json_file)

                # Convert relative image paths to absolute paths for proper access
                self.logger.info("Resolving image paths relative to: %s", images_base_dir)
                for item in content_list:
                    if not isinstance(item, dict):
                        continue
                    for field_name in self._IMAGE_PATH_FIELDS:
                        img_path = item.get(field_name)
                        if img_path and not os.path.isabs(img_path):
                            item[field_name] = str((images_base_dir / img_path).resolve())
                            self.logger.debug(
                                "Updated %s: %s -> %s", field_name, img_path, item[field_name])

                self.logger.info(
                    "Successfully read JSON file: %s, containing %d content blocks",
                    json_file, len(content_list))

            except Exception as e:  # best-effort: a bad file must not abort processing
                self.logger.warning("Failed to read JSON file %s: %s", json_file, e)
        else:
            self.logger.warning("JSON file not found: %s", json_file)

        return content_list, md_content

    def process_pdf(
        self,
        pdf_path: Union[str, Path],
        output_dir: Optional[Union[str, Path]] = None,
        method: str = "auto",
        lang: Optional[str] = None,
        backend: str = "pipeline",
        **kwargs
    ) -> Tuple[List[Dict[str, Any]], str]:
        """
        Process a PDF file and extract structured content using MinerU.

        Validates the input path, runs MinerU, then reads back the generated
        Markdown and content-list files.

        Args:
            pdf_path (Union[str, Path]): Path to the input PDF file
            output_dir (Optional[Union[str, Path]]): Output directory path.
                If None, 'mineru_output' next to the PDF is used
            method (str): Parsing method - "auto" (recommended), "txt", or "ocr"
            lang (Optional[str]): Document language for OCR
                (e.g., "ch" for Chinese, "en" for English, "ja" for Japanese)
            backend (str): Processing backend - "pipeline", "vlm-transformers",
                "vlm-sglang-engine", or "vlm-sglang-client". Only used here to
                decide which output subdirectory to read ("vlm" for vlm-* backends)
            **kwargs: Extra keyword arguments forwarded to ``_run_mineru_command``
                (start_page, end_page, formula, table, device, source, vlm_url)

        Returns:
            Tuple[List[Dict[str, Any]], str]: A tuple containing:
                - content_list: Structured list of content blocks with metadata
                - markdown_content: Complete document in Markdown format

        Raises:
            FileNotFoundError: If the specified PDF file does not exist
            ValueError: If the file does not have a ``.pdf`` extension
            RuntimeError: If the MinerU executable cannot be found
            subprocess.CalledProcessError: If MinerU exits with an error
        """
        # Convert to Path object and validate input
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")

        if pdf_path.suffix.lower() != '.pdf':
            raise ValueError(f"File is not a PDF format: {pdf_path}")

        name_without_suffix = pdf_path.stem

        # Prepare output directory
        if output_dir:
            base_output_dir = Path(output_dir)
        else:
            base_output_dir = pdf_path.parent / "mineru_output"
        base_output_dir.mkdir(parents=True, exist_ok=True)

        try:
            self.logger.info("Starting PDF processing: %s", pdf_path)

            self._run_mineru_command(
                input_path=pdf_path,
                output_dir=base_output_dir,
                method=method,
                lang=lang,
                backend=backend,
                **kwargs
            )

            # VLM backends write their results under a "vlm" subdirectory.
            backend_method = "vlm" if backend.startswith("vlm-") else method

            content_list, markdown_content = self._read_output_files(
                base_output_dir, name_without_suffix, method=backend_method
            )

            # Log per-type block counts.
            content_stats = self.get_content_statistics(content_list)["content_types"]
            self.logger.info(
                "PDF processing completed! Extracted %d content blocks", len(content_list))
            self.logger.info("Content type statistics:")
            for content_type, count in content_stats.items():
                self.logger.info("  - %s: %s", content_type, count)

            return content_list, markdown_content

        except Exception as e:
            self.logger.error("Error occurred during PDF processing: %s", e)
            raise

    def save_results(
        self,
        content_list: List[Dict[str, Any]],
        markdown_content: str,
        output_path: Union[str, Path],
        save_markdown: bool = True,
        save_json: bool = True,
        indent: int = 2
    ) -> Dict[str, Path]:
        """
        Save processing results to files.

        Writes ``<output_path>.md`` and/or ``<output_path>.json``. Empty
        Markdown or an empty content list is skipped rather than written.

        Args:
            content_list (List[Dict[str, Any]]): Structured content list with metadata
            markdown_content (str): Complete document in Markdown format
            output_path (Union[str, Path]): Output file path without extension.
                Dots inside the file name are preserved; a trailing ``.md`` or
                ``.json`` is replaced rather than doubled
            save_markdown (bool): Whether to save the Markdown file
            save_json (bool): Whether to save the JSON file with structured content
            indent (int): JSON file indentation for readability

        Returns:
            Dict[str, Path]: Mapping of file type to saved path.
                Keys: 'markdown', 'json' (only for files that were written)

        Raises:
            Exception: Any error from directory creation or file writing is
                logged and re-raised
        """
        output_path = Path(output_path)
        saved_files: Dict[str, Path] = {}

        try:
            # Ensure output directory exists
            output_path.parent.mkdir(parents=True, exist_ok=True)

            if save_markdown and markdown_content:
                md_path = self._with_extension(output_path, '.md')
                with open(md_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)
                saved_files['markdown'] = md_path
                self.logger.info("Markdown file saved: %s", md_path)

            if save_json and content_list:
                json_path = self._with_extension(output_path, '.json')
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(content_list, f, indent=indent, ensure_ascii=False)
                saved_files['json'] = json_path
                self.logger.info("JSON file saved: %s", json_path)

            return saved_files

        except Exception as e:
            self.logger.error("Error occurred while saving files: %s", e)
            raise

    @staticmethod
    def get_content_statistics(content_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Generate statistics about the processed content.

        Non-dict entries are counted in ``total_blocks`` but otherwise ignored.
        Missing or ``None`` ``text`` / ``text_level`` values are treated as
        empty text and level 0.

        Args:
            content_list (List[Dict[str, Any]]): Structured content list from MinerU

        Returns:
            Dict[str, Any]: Dictionary containing:
                - total_blocks: Total number of entries in ``content_list``
                - content_types: Count of each ``type`` value
                - text_stats: ``total_characters``, ``total_words`` and
                  ``title_levels`` (count per ``text_level`` > 0) over text blocks
                - image_count: Number of image blocks
                - table_count: Number of table blocks
                - has_formulas: True if any formula/equation block is present
        """
        stats: Dict[str, Any] = {
            "total_blocks": len(content_list),
            "content_types": {},
            "text_stats": {"total_characters": 0, "total_words": 0, "title_levels": {}},
            "image_count": 0,
            "table_count": 0,
            "has_formulas": False
        }
        type_counts = stats["content_types"]
        text_stats = stats["text_stats"]

        for item in content_list:
            if not isinstance(item, dict):
                continue

            content_type = item.get("type", "unknown")
            type_counts[content_type] = type_counts.get(content_type, 0) + 1

            if content_type == "text":
                # The key may be present with an explicit null value.
                text = item.get("text") or ""
                text_stats["total_characters"] += len(text)
                text_stats["total_words"] += len(text.split())

                level = item.get("text_level") or 0
                if isinstance(level, int) and level > 0:
                    levels = text_stats["title_levels"]
                    levels[level] = levels.get(level, 0) + 1

            elif content_type == "image":
                stats["image_count"] += 1

            elif content_type == "table":
                stats["table_count"] += 1

            elif content_type in ("formula", "equation"):
                # "equation" matches the equation_img_path field handled in
                # _read_output_files; "formula" is kept for compatibility.
                # NOTE(review): confirm the exact type name against MinerU's output spec.
                stats["has_formulas"] = True

        return stats

    def validate_output_quality(self, content_list: List[Dict[str, Any]], markdown_content: str) -> Dict[str, Any]:
        """
        Run heuristic sanity checks on the processing output.

        Starts from a score of 100 and subtracts a fixed penalty for each
        heuristic that fires; the result is only marked invalid when nothing
        at all was extracted.

        Args:
            content_list (List[Dict[str, Any]]): Structured content list
            markdown_content (str): Markdown content string

        Returns:
            Dict[str, Any]: Validation results containing:
                - is_valid: False only when both inputs are empty
                - warnings: List of warning messages
                - suggestions: List of improvement suggestions
                - quality_score: Numeric quality score (0-100)
        """
        validation: Dict[str, Any] = {
            "is_valid": True,
            "warnings": [],
            "suggestions": [],
            "quality_score": 100
        }
        markdown_text = (markdown_content or "").strip()

        # Nothing extracted at all: report and stop, the other checks are meaningless.
        if not content_list and not markdown_text:
            validation["is_valid"] = False
            validation["warnings"].append("No content was extracted from the PDF")
            validation["quality_score"] = 0
            return validation

        stats = self.get_content_statistics(content_list)
        total_blocks = stats["total_blocks"]
        text_blocks = stats["content_types"].get("text", 0)

        # Check content diversity
        if total_blocks < 5:
            validation["warnings"].append("Very few content blocks extracted - document may be complex or image-heavy")
            validation["quality_score"] -= 20

        # Check text content ratio (total_blocks > 0 whenever text_blocks > 0)
        if text_blocks == 0:
            validation["warnings"].append("No text blocks found - consider using OCR method for image-based PDFs")
            validation["quality_score"] -= 30
        elif text_blocks / total_blocks < 0.3:
            validation["suggestions"].append("Low text content ratio - document may benefit from OCR processing")
            validation["quality_score"] -= 10

        # Check for images without processing
        if stats["image_count"] > 0 and text_blocks == 0:
            validation["suggestions"].append(
                "Images detected but no text extracted - consider using VLM backend for image analysis")

        # Check markdown length vs content blocks
        if len(markdown_text) < 100 and total_blocks > 10:
            validation["warnings"].append("Markdown content seems unusually short for the number of content blocks")
            validation["quality_score"] -= 15

        return validation
|
556
|
-
|
557
|
-
|
558
|
-
def chunk_pdf_content(content_list: List[Dict[str, Any]], max_length: int = 4000) -> List[str]:
    """
    Split a MinerU-parsed content list into text chunks of bounded length.

    Each supported block (text, table, image caption) is rendered to text, the
    result is split into lines, and lines are packed greedily into chunks.
    Lines inside a chunk are joined with a newline. A single line longer than
    ``max_length`` is never split, so such a chunk may exceed the limit.

    Args:
        content_list (List[Dict[str, Any]]): MinerU-parsed structured content list.
            Entries that are not dicts are ignored
        max_length (int): Maximum character length per chunk (default: 4000),
            not counting the metadata header

    Returns:
        List[str]: Text chunks, each prefixed with a header line of the form
            ``=== CHUNK i/total (N characters) ===``. Empty if no text was found
    """

    def join_caption(caption: Any) -> str:
        """Render a caption that is either one string or a list of strings."""
        if isinstance(caption, str):
            return caption
        return " | ".join(str(part) for part in caption)

    def extract_text(item: Dict[str, Any]) -> str:
        """
        Render one content item as plain text.

        Text blocks become their stripped text (with a Markdown ``#`` prefix
        for titles), tables become caption plus tag-stripped body, images
        become their caption. Anything else yields an empty string.
        """
        item_type = item.get("type")

        if item_type == "text":
            text = (item.get("text") or "").strip()
            if not text:
                return ""
            # text_level > 0 marks a heading; Markdown supports at most six levels.
            level = item.get("text_level") or 0
            if isinstance(level, int) and level > 0:
                return f"{'#' * min(level, 6)} {text}"
            return text

        if item_type == "table":
            parts = []
            if item.get("table_caption"):
                parts.append("Table: " + join_caption(item["table_caption"]))
            if item.get("table_body"):
                # Crude HTML cleanup: every tag becomes a cell separator.
                table_text = re.sub(r'<[^>]+>', ' | ', item["table_body"])
                table_text = re.sub(r'\s+', ' ', table_text).strip()
                parts.append(table_text)
            return "\n".join(parts)

        if item_type == "image" and item.get("image_caption"):
            return "Image: " + join_caption(item["image_caption"])

        return ""

    # Render every block and flatten to individual lines.
    lines: List[str] = []
    for item in content_list:
        if not isinstance(item, dict):
            continue
        text = extract_text(item)
        if text.strip():
            lines.extend(text.split("\n"))

    if not lines:
        return []

    # Greedy packing. Lines are kept in a list and joined once per chunk so
    # that every pair of adjacent lines is separated by exactly one newline.
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0

    for line in lines:
        # +1 accounts for the newline that joins this line to the previous one.
        added = len(line) + (1 if current else 0)
        if current and current_len + added > max_length:
            finished = "\n".join(current).strip()
            if finished:
                chunks.append(finished)
            current = [line]
            current_len = len(line)
        else:
            current.append(line)
            current_len += added

    tail = "\n".join(current).strip()
    if tail:
        chunks.append(tail)

    # Prefix each chunk with its position and size.
    total_chunks = len(chunks)
    return [
        f"=== CHUNK {index}/{total_chunks} ({len(chunk)} characters) ===\n{chunk}"
        for index, chunk in enumerate(chunks, start=1)
    ]
|
650
|
-
|
651
|
-
|
652
|
-
# Example usage and demonstration
|
653
|
-
if __name__ == "__main__":
    # Demonstration of the MinerUPDFProcessor workflow: process a sample PDF,
    # print statistics and validation results, save the output and chunk it.
    import sys


    def example_usage():
        """Run the end-to-end demo against ``example_document.pdf``."""
        sample_pdf = "example_document.pdf"

        try:
            # Constructing the processor also verifies that MinerU is installed.
            demo_processor = MinerUPDFProcessor.create_with_defaults(log_level="INFO")

            # Bail out early when the sample document is not available.
            if not Path(sample_pdf).exists():
                print(f"Example PDF file not found: {sample_pdf}")
                print("Please provide a valid PDF file path to test the processor.")
                return

            print("Processing PDF with auto method...")
            # The language hint is passed for better OCR results.
            blocks, markdown_text = demo_processor.process_pdf(
                pdf_path=sample_pdf, method="auto", lang="en"
            )

            # Summary statistics
            summary = demo_processor.get_content_statistics(blocks)
            text_summary = summary['text_stats']
            print("Processing Statistics:")
            print(f"  Total blocks: {summary['total_blocks']}")
            print(f"  Content types: {summary['content_types']}")
            print(f"  Text characters: {text_summary['total_characters']}")
            print(f"  Text words: {text_summary['total_words']}")

            # Heuristic quality report
            report = demo_processor.validate_output_quality(blocks, markdown_text)
            print(f"Quality Score: {report['quality_score']}/100")
            if report['warnings']:
                print("Warnings:", report['warnings'])
            if report['suggestions']:
                print("Suggestions:", report['suggestions'])

            # Persist Markdown and JSON next to the working directory
            written = demo_processor.save_results(
                content_list=blocks,
                markdown_content=markdown_text,
                output_path=Path(sample_pdf).stem + "_processed",
            )
            print(f"Results saved to: {written}")

            # Chunk the content for downstream consumers
            pieces = chunk_pdf_content(blocks, max_length=2000)
            print(f"Created {len(pieces)} text chunks")

            if pieces:
                print("First chunk preview:")
                first = pieces[0]
                if len(first) > 200:
                    print(first[:200] + "...")
                else:
                    print(first)

        except Exception as e:
            print(f"Error during processing: {e}")
            sys.exit(1)


    # Run example if script is executed directly
    example_usage()
|