opendataloader-pdf 0.0.0__py3-none-any.whl → 1.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,71 @@
1
+ """
2
+ Low-level JAR runner for opendataloader-pdf.
3
+ """
4
+ import locale
5
+ import subprocess
6
+ import sys
7
+ import importlib.resources as resources
8
+ from typing import List
9
+
10
+ # The consistent name of the JAR file bundled with the package
11
+ _JAR_NAME = "opendataloader-pdf-cli.jar"
12
+
13
+
14
def run_jar(args: List[str], quiet: bool = False) -> str:
    """Run the bundled opendataloader-pdf JAR with the given arguments.

    The JAR is located inside the ``opendataloader_pdf`` package and executed
    as ``java -jar <jar> *args``. Text is decoded with the locale's preferred
    encoding.

    Args:
        args: Command-line arguments appended after the JAR path.
        quiet: If True, stdout and stderr are captured separately and nothing
            is echoed while the process runs. If False, stderr is merged into
            stdout and every line is written to ``sys.stdout`` as it arrives.

    Returns:
        In quiet mode, the captured stdout of the process. In streaming mode,
        the combined stdout/stderr text that was echoed.

    Raises:
        FileNotFoundError: If the process cannot be started because an
            executable is missing; reported to stderr as a missing ``java``
            command before being re-raised.
        subprocess.CalledProcessError: If the process exits with a non-zero
            return code; details are printed to stderr before re-raising.
    """
    try:
        # Access the embedded JAR inside the package
        jar_ref = resources.files("opendataloader_pdf").joinpath("jar", _JAR_NAME)
        with resources.as_file(jar_ref) as jar_path:
            command = ["java", "-jar", str(jar_path), *args]
            encoding = locale.getpreferredencoding(False)

            if quiet:
                # Quiet mode -> capture all output
                result = subprocess.run(
                    command,
                    capture_output=True,
                    text=True,
                    check=True,
                    encoding=encoding,
                )
                return result.stdout

            # Streaming mode -> live output
            with subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding=encoding,
            ) as process:
                output_lines: List[str] = []
                for line in process.stdout:
                    sys.stdout.write(line)
                    output_lines.append(line)

                return_code = process.wait()
                captured_output = "".join(output_lines)

                if return_code:
                    # Popen does not raise on failure; mirror run(check=True).
                    raise subprocess.CalledProcessError(
                        return_code, command, output=captured_output
                    )
                return captured_output

    except FileNotFoundError:
        print(
            "Error: 'java' command not found. Please ensure Java is installed and in your system's PATH.",
            file=sys.stderr,
        )
        raise

    except subprocess.CalledProcessError as error:
        print("Error running opendataloader-pdf CLI.", file=sys.stderr)
        print(f"Return code: {error.returncode}", file=sys.stderr)
        # CalledProcessError.stdout is an alias of .output, so it is printed
        # once; printing both would dump the same text twice.
        if error.output:
            print(f"Output: {error.output}", file=sys.stderr)
        if error.stderr:
            print(f"Stderr: {error.stderr}", file=sys.stderr)
        raise
@@ -1,126 +1,117 @@
1
+ import argparse
1
2
  import subprocess
2
3
  import sys
3
- import importlib_resources
4
- from pathlib import Path
4
+ import warnings
5
+ from typing import List, Optional
5
6
 
6
- # The consistent name of the JAR file bundled with the package
7
- _JAR_NAME = "opendataloader-pdf-cli.jar"
7
+ from .cli_options_generated import add_options_to_parser
8
+ from .convert_generated import convert
9
+ from .runner import run_jar
8
10
 
11
+ # Re-export for backward compatibility
12
+ __all__ = ["convert", "run", "run_jar", "main"]
9
13
 
14
+
15
+ # Deprecated: use `convert()` instead. This function will be removed in a future version.
10
16
  def run(
11
17
  input_path: str,
12
- output_folder: str,
13
- password: str = None,
14
- to_markdown: bool = False,
15
- to_annotated_pdf: bool = False,
18
+ output_folder: Optional[str] = None,
19
+ password: Optional[str] = None,
20
+ replace_invalid_chars: Optional[str] = None,
21
+ generate_markdown: bool = False,
22
+ generate_html: bool = False,
23
+ generate_annotated_pdf: bool = False,
16
24
  keep_line_breaks: bool = False,
17
- find_hidden_text: bool = False,
25
+ content_safety_off: Optional[str] = None,
18
26
  html_in_markdown: bool = False,
19
27
  add_image_to_markdown: bool = False,
28
+ no_json: bool = False,
20
29
  debug: bool = False,
30
+ use_struct_tree: bool = False,
21
31
  ):
22
32
  """
23
33
  Runs the opendataloader-pdf with the given arguments.
24
34
 
35
+ .. deprecated::
36
+ Use :func:`convert` instead. This function will be removed in a future version.
37
+
25
38
  Args:
26
39
  input_path: Path to the input PDF file or folder.
27
40
  output_folder: Path to the output folder. Defaults to the input folder.
28
41
  password: Password for the PDF file.
29
- to_markdown: If True, generates a Markdown output file.
30
- to_annotated_pdf: If True, generates an annotated PDF output file.
42
+ replace_invalid_chars: Replacement character for invalid or unrecognized characters (e.g., U+FFFD, \\u0000).
43
+ generate_markdown: If True, generates a Markdown output file.
44
+ generate_html: If True, generates an HTML output file.
45
+ generate_annotated_pdf: If True, generates an annotated PDF output file.
31
46
  keep_line_breaks: If True, keeps line breaks in the output.
32
- find_hidden_text: If True, finds hidden text in the PDF.
33
47
  html_in_markdown: If True, uses HTML in the Markdown output.
34
48
  add_image_to_markdown: If True, adds images to the Markdown output.
49
+ no_json: If True, disables the JSON output.
35
50
  debug: If True, prints all messages from the CLI to the console during execution.
36
-
37
- Returns:
38
- The stdout from the CLI tool if successful.
51
+ use_struct_tree: If True, enables processing of the PDF structure tree (disabled by default).
39
52
 
40
53
  Raises:
41
54
  FileNotFoundError: If the 'java' command is not found or input_path is invalid.
42
55
  subprocess.CalledProcessError: If the CLI tool returns a non-zero exit code.
43
56
  """
44
- if not Path(input_path).exists():
45
- raise FileNotFoundError(f"Input file or folder not found: {input_path}")
46
-
47
- args = []
48
- if output_folder:
49
- args.extend(["--folder", output_folder])
50
- if password:
51
- args.extend(["--password", password])
52
- if to_markdown:
53
- args.append("--markdown")
54
- if to_annotated_pdf:
55
- args.append("--pdf")
56
- if keep_line_breaks:
57
- args.append("--keeplinebreaks")
58
- if find_hidden_text:
59
- args.append("--findhiddentext")
60
- if html_in_markdown:
61
- args.append("--htmlinmarkdown")
62
- if add_image_to_markdown:
63
- args.append("--addimagetomarkdown")
64
-
65
- args.append(input_path)
57
+ warnings.warn(
58
+ "run() is deprecated and will be removed in a future version. Use convert() instead.",
59
+ DeprecationWarning,
60
+ stacklevel=2,
61
+ )
62
+
63
+ # Build format list based on legacy boolean options
64
+ formats: List[str] = []
65
+ if not no_json:
66
+ formats.append("json")
67
+ if generate_markdown:
68
+ if add_image_to_markdown:
69
+ formats.append("markdown-with-images")
70
+ elif html_in_markdown:
71
+ formats.append("markdown-with-html")
72
+ else:
73
+ formats.append("markdown")
74
+ if generate_html:
75
+ formats.append("html")
76
+ if generate_annotated_pdf:
77
+ formats.append("pdf")
78
+
79
+ convert(
80
+ input_path=input_path,
81
+ output_dir=output_folder,
82
+ password=password,
83
+ replace_invalid_chars=replace_invalid_chars,
84
+ keep_line_breaks=keep_line_breaks,
85
+ content_safety_off=content_safety_off,
86
+ use_struct_tree=use_struct_tree,
87
+ format=formats if formats else None,
88
+ quiet=not debug,
89
+ )
90
+
91
+
92
+ def main(argv=None) -> int:
93
+ """CLI entry point for running the wrapper from the command line."""
94
+ parser = argparse.ArgumentParser(
95
+ description="Run the opendataloader-pdf CLI using the bundled JAR."
96
+ )
97
+ parser.add_argument(
98
+ "input_path", nargs="+", help="Path to the input PDF file or directory."
99
+ )
100
+
101
+ # Register CLI options from auto-generated module
102
+ add_options_to_parser(parser)
103
+
104
+ args = parser.parse_args(argv)
66
105
 
67
106
  try:
68
- # Find the JAR file within the package
69
- jar_ref = importlib_resources.files("opendataloader_pdf").joinpath(
70
- "jar", _JAR_NAME
71
- )
72
- with importlib_resources.as_file(jar_ref) as jar_path:
73
- command = ["java", "-jar", str(jar_path)] + args
74
-
75
- if debug:
76
- print(f"Running command: {' '.join(command)}", file=sys.stderr)
77
- process = subprocess.Popen(
78
- command,
79
- stdout=subprocess.PIPE,
80
- stderr=subprocess.STDOUT,
81
- text=True,
82
- encoding="utf-8",
83
- )
84
-
85
- output_lines = []
86
- for line in iter(process.stdout.readline, ""):
87
- sys.stdout.write(line)
88
- output_lines.append(line)
89
-
90
- process.stdout.close()
91
- return_code = process.wait()
92
- captured_output = "".join(output_lines)
93
-
94
- if return_code:
95
- # Manually raise error with the combined output
96
- raise subprocess.CalledProcessError(
97
- return_code, command, output=captured_output
98
- )
99
- return captured_output
100
- else:
101
- result = subprocess.run(
102
- command,
103
- capture_output=True,
104
- text=True,
105
- check=True,
106
- encoding="utf-8",
107
- )
108
- return result.stdout
109
-
110
- except FileNotFoundError:
111
- print(
112
- "Error: 'java' command not found. Please ensure Java is installed and in your system's PATH.",
113
- file=sys.stderr,
114
- )
115
- raise
116
-
117
- except subprocess.CalledProcessError as e:
118
- print("Error running opendataloader-pdf CLI.", file=sys.stderr)
119
- print(f"Return code: {e.returncode}", file=sys.stderr)
120
- if e.output:
121
- print(f"Output: {e.output}", file=sys.stderr)
122
- elif e.stderr:
123
- print(f"Stderr: {e.stderr}", file=sys.stderr)
124
- if e.stdout:
125
- print(f"Stdout: {e.stdout}", file=sys.stderr)
126
- raise e
107
+ convert(**vars(args))
108
+ return 0
109
+ except FileNotFoundError as err:
110
+ print(err, file=sys.stderr)
111
+ return 1
112
+ except subprocess.CalledProcessError as err:
113
+ return err.returncode or 1
114
+
115
+
116
+ if __name__ == "__main__":
117
+ sys.exit(main())
@@ -0,0 +1,361 @@
1
+ Metadata-Version: 2.4
2
+ Name: opendataloader-pdf
3
+ Version: 1.8.2
4
+ Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
+ Project-URL: Homepage, https://github.com/opendataloader-project/opendataloader-pdf
6
+ Author-email: opendataloader-project <open.dataloader@hancom.com>
7
+ License-Expression: MPL-2.0
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3
10
+ Requires-Python: >=3.10
11
+ Provides-Extra: hybrid
12
+ Requires-Dist: docling[easyocr]>=2.0.0; extra == 'hybrid'
13
+ Requires-Dist: fastapi>=0.100.0; extra == 'hybrid'
14
+ Requires-Dist: python-multipart>=0.0.6; extra == 'hybrid'
15
+ Requires-Dist: uvicorn>=0.20.0; extra == 'hybrid'
16
+ Description-Content-Type: text/markdown
17
+
18
+ # OpenDataLoader PDF
19
+
20
+ **PDF Parsing for RAG** — Convert to Markdown & JSON, Fast, Local, No GPU
21
+
22
+ [![License](https://img.shields.io/pypi/l/opendataloader-pdf.svg)](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
23
+ [![PyPI version](https://img.shields.io/pypi/v/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
24
+ [![npm version](https://img.shields.io/npm/v/@opendataloader/pdf.svg)](https://www.npmjs.com/package/@opendataloader/pdf)
25
+ [![Maven Central](https://img.shields.io/maven-central/v/org.opendataloader/opendataloader-pdf-core.svg)](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
26
+ [![GHCR Version](https://ghcr-badge.egpl.dev/opendataloader-project/opendataloader-pdf-cli/latest_tag?trim=major&label=docker)](https://github.com/opendataloader-project/opendataloader-pdf/pkgs/container/opendataloader-pdf-cli)
27
+ [![Java](https://img.shields.io/badge/Java-11%2B-blue.svg)](https://github.com/opendataloader-project/opendataloader-pdf#java)
28
+
29
+ Convert PDFs into **LLM-ready Markdown and JSON** with accurate reading order, table extraction, and bounding boxes — all running locally on your machine.
30
+
31
+ **Why developers choose OpenDataLoader:**
32
+ - **Deterministic** — Same input always produces same output (no LLM hallucinations)
33
+ - **Fast** — Process 100+ pages per second on CPU
34
+ - **Private** — 100% local, zero data transmission
35
+ - **Accurate** — Bounding boxes for every element, correct multi-column reading order
36
+
37
+ ```bash
38
+ pip install -U opendataloader-pdf
39
+ ```
40
+
41
+ ```python
42
+ import opendataloader_pdf
43
+
44
+ # PDF to Markdown for RAG
45
+ opendataloader_pdf.convert(
46
+ input_path="document.pdf",
47
+ output_dir="output/",
48
+ format="markdown,json"
49
+ )
50
+ ```
51
+
52
+ <br/>
53
+
54
+ ## Why OpenDataLoader?
55
+
56
+ Building RAG pipelines? You've probably hit these problems:
57
+
58
+ | Problem | How We Solve It |
59
+ |---------|-----------------|
60
+ | **Multi-column text reads left-to-right incorrectly** | XY-Cut++ algorithm preserves correct reading order |
61
+ | **Tables lose structure** | Border + cluster detection keeps rows/columns intact |
62
+ | **Headers/footers pollute context** | Auto-filtered before output |
63
+ | **No coordinates for citations** | Bounding box for every element |
64
+ | **Cloud APIs = privacy concerns** | 100% local, no data leaves your machine |
65
+ | **GPU required** | Pure CPU, rule-based — runs anywhere |
66
+
67
+ <br/>
68
+
69
+ ## Key Features
70
+
71
+ ### For RAG & LLM Pipelines
72
+
73
+ - **Structured Output** — JSON with semantic types (heading, paragraph, table, list, caption)
74
+ - **Bounding Boxes** — Every element includes `[x1, y1, x2, y2]` coordinates for citations
75
+ - **Reading Order** — XY-Cut++ algorithm handles multi-column layouts correctly
76
+ - **Noise Filtering** — Headers, footers, hidden text, watermarks auto-removed
77
+ - **LangChain Integration** — [Official document loader](https://python.langchain.com/docs/integrations/document_loaders/opendataloader_pdf/)
78
+
79
+ ### Performance & Privacy
80
+
81
+ - **No GPU** — Fast, rule-based heuristics
82
+ - **Local-First** — Your documents never leave your machine
83
+ - **High Throughput** — Process thousands of PDFs efficiently
84
+ - **Multi-Language SDK** — Python, Node.js, Java, Docker
85
+
86
+ ### Document Understanding
87
+
88
+ - **Tables** — Detects borders, handles merged cells
89
+ - **Lists** — Numbered, bulleted, nested
90
+ - **Headings** — Auto-detects hierarchy levels
91
+ - **Images** — Extracts with captions linked
92
+ - **Tagged PDF Support** — Uses native PDF structure when available
93
+ - **AI Safety** — Auto-filters prompt injection content
94
+
95
+ <br/>
96
+
97
+ ## Output Formats
98
+
99
+ | Format | Use Case |
100
+ |--------|----------|
101
+ | **JSON** | Structured data with bounding boxes, semantic types |
102
+ | **Markdown** | Clean text for LLM context, RAG chunks |
103
+ | **HTML** | Web display with styling |
104
+ | **Annotated PDF** | Visual debugging — see detected structures ([sample](https://opendataloader.org/demo/samples/01030000000000?view1=annot&view2=json)) |
105
+
106
+ <br/>
107
+
108
+ ## JSON Output Example
109
+
110
+ ```json
111
+ {
112
+ "type": "heading",
113
+ "id": 42,
114
+ "level": "Title",
115
+ "page number": 1,
116
+ "bounding box": [72.0, 700.0, 540.0, 730.0],
117
+ "heading level": 1,
118
+ "font": "Helvetica-Bold",
119
+ "font size": 24.0,
120
+ "text color": "[0.0]",
121
+ "content": "Introduction"
122
+ }
123
+ ```
124
+
125
+ | Field | Description |
126
+ |-------|-------------|
127
+ | `type` | Element type: heading, paragraph, table, list, image, caption |
128
+ | `id` | Unique identifier for cross-referencing |
129
+ | `page number` | 1-indexed page reference |
130
+ | `bounding box` | `[left, bottom, right, top]` in PDF points |
131
+ | `heading level` | Heading depth (1+) |
132
+ | `font`, `font size` | Typography info |
133
+ | `content` | Extracted text |
134
+
135
+ [Full JSON Schema →](https://opendataloader.org/docs/json-schema)
136
+
137
+ <br/>
138
+
139
+ ## Quick Start
140
+
141
+ - [Python](https://opendataloader.org/docs/quick-start-python)
142
+ - [Node.js / TypeScript](https://opendataloader.org/docs/quick-start-nodejs)
143
+ - [Docker](https://opendataloader.org/docs/quick-start-docker)
144
+ - [Java](https://opendataloader.org/docs/quick-start-java)
145
+
146
+ <br/>
147
+
148
+ ## Advanced Options
149
+
150
+ ```python
151
+ opendataloader_pdf.convert(
152
+ input_path="document.pdf",
153
+ output_dir="output/",
154
+ format="json,markdown,pdf",
155
+
156
+ # Image output mode: "off", "embedded" (Base64), or "external" (default)
157
+ image_output="embedded",
158
+
159
+ # Image format: "png" or "jpeg"
160
+ image_format="jpeg",
161
+
162
+ # Tagged PDF
163
+ use_struct_tree=True, # Use native PDF structure
164
+ )
165
+ ```
166
+
167
+ [Full CLI Options Reference →](https://opendataloader.org/docs/cli-options-reference)
168
+
169
+ <br/>
170
+
171
+ ## AI Safety
172
+
173
+ PDFs can contain hidden prompt injection attacks. OpenDataLoader automatically filters:
174
+
175
+ - Hidden text (transparent, zero-size)
176
+ - Off-page content
177
+ - Suspicious invisible layers
178
+
179
+ This is **enabled by default**. [Learn more →](https://opendataloader.org/docs/ai-safety)
180
+
181
+ <br/>
182
+
183
+ ## Tagged PDF Support
184
+
185
+ **Why it matters:** The [European Accessibility Act (EAA)](https://commission.europa.eu/strategy-and-policy/policies/justice-and-fundamental-rights/disability/union-equality-strategy-rights-persons-disabilities-2021-2030/european-accessibility-act_en) took effect June 28, 2025, requiring accessible digital documents across the EU. This means more PDFs will be properly tagged with semantic structure.
186
+
187
+ **OpenDataLoader leverages this:**
188
+
189
+ - When a PDF has structure tags, we extract the **exact layout** the author intended
190
+ - Headings, lists, tables, reading order — all preserved from the source
191
+ - No guessing, no heuristics needed — **pixel-perfect semantic extraction**
192
+
193
+ ```python
194
+ opendataloader_pdf.convert(
195
+ input_path="accessible_document.pdf",
196
+ use_struct_tree=True # Use native PDF structure tags
197
+ )
198
+ ```
199
+
200
+ Most PDF parsers ignore structure tags entirely. We're one of the few that fully support them.
201
+
202
+ [Learn more about Tagged PDF →](https://opendataloader.org/docs/tagged-pdf)
203
+
204
+ <br/>
205
+
206
+ ## Hybrid Mode
207
+
208
+ For documents with complex tables or OCR needs, enable hybrid mode to route challenging pages to an AI backend while keeping simple pages fast and local.
209
+
210
+ **Results**: Table accuracy jumps from 0.49 → 0.93 (+90%) with an acceptable speed trade-off (0.05 → 0.48 s/page in the benchmarks below).
211
+
212
+ ```bash
213
+ pip install -U "opendataloader-pdf[hybrid]"
214
+ ```
215
+
216
+ Terminal 1: Start the backend server
217
+
218
+ ```bash
219
+ opendataloader-pdf-hybrid --port 5002
220
+ ```
221
+
222
+ Terminal 2: Process PDFs with hybrid mode
223
+
224
+ ```bash
225
+ opendataloader-pdf --hybrid docling-fast input.pdf
226
+ ```
227
+
228
+ Or use in Python:
229
+
230
+ ```python
231
+ opendataloader_pdf.convert(
232
+ input_path="complex_tables.pdf",
233
+ output_dir="output/",
234
+ hybrid="docling-fast" # Routes complex pages to AI backend
235
+ )
236
+ ```
237
+
238
+ - **Local-first**: Simple pages processed locally, complex pages routed to backend
239
+ - **Fallback**: If the backend is unavailable, processing gracefully falls back to local mode
240
+ - **Privacy**: Run the backend locally in Docker for fully on-premises processing
241
+
242
+ [Hybrid Mode Guide →](https://opendataloader.org/docs/hybrid-mode)
243
+
244
+ <br/>
245
+
246
+ ## LangChain Integration
247
+
248
+ OpenDataLoader PDF has an official LangChain integration for seamless RAG pipeline development.
249
+
250
+ ```bash
251
+ pip install -U langchain-opendataloader-pdf
252
+ ```
253
+
254
+ ```python
255
+ from langchain_opendataloader_pdf import OpenDataLoaderPDFLoader
256
+
257
+ loader = OpenDataLoaderPDFLoader(
258
+ file_path=["document.pdf"],
259
+ format="text"
260
+ )
261
+ documents = loader.load()
262
+
263
+ # Use with any LangChain pipeline
264
+ for doc in documents:
265
+ print(doc.page_content[:100])
266
+ ```
267
+
268
+ - [LangChain Documentation](https://python.langchain.com/docs/integrations/document_loaders/opendataloader_pdf/)
269
+ - [GitHub Repository](https://github.com/opendataloader-project/langchain-opendataloader-pdf)
270
+ - [PyPI Package](https://pypi.org/project/langchain-opendataloader-pdf/)
271
+
272
+ <br/>
273
+
274
+ ## Benchmarks
275
+
276
+ We continuously benchmark against real-world documents.
277
+
278
+ [View full benchmark results →](https://github.com/opendataloader-project/opendataloader-bench)
279
+
280
+ ### Quick Comparison
281
+
282
+ | Engine | Overall | Reading Order | Table | Heading | Speed (s/page) |
283
+ |-----------------------------|----------|---------------|----------|----------|----------------|
284
+ | **opendataloader** | 0.68 | 0.91 | 0.49 | 0.65 | **0.05** |
285
+ | **opendataloader [hybrid]** | **0.88** | **0.93** | **0.93** | 0.78 | 0.48 |
286
+ | docling | 0.86 | 0.90 | 0.89 | **0.80** | 0.73 |
287
+ | marker | 0.83 | 0.89 | 0.81 | **0.80** | 53.93 |
288
+ | mineru | 0.82 | 0.86 | 0.87 | 0.74 | 5.96 |
289
+ | pymupdf4llm | 0.57 | 0.89 | 0.40 | 0.41 | 0.09 |
290
+ | markitdown | 0.29 | 0.88 | 0.00 | 0.00 | **0.04** |
291
+
292
+ > Scores are normalized to [0, 1]. Higher is better for accuracy metrics; lower is better for speed. **Bold** indicates best performance.
293
+
294
+ ### Visual Comparison
295
+
296
+ [![Benchmark](https://github.com/opendataloader-project/opendataloader-bench/raw/refs/heads/main/charts/benchmark.png)](https://github.com/opendataloader-project/opendataloader-bench)
297
+
298
+
299
+ <br/>
300
+
301
+ ## Roadmap
302
+
303
+ See our [upcoming features and priorities →](https://opendataloader.org/docs/upcoming-roadmap)
304
+
305
+ <br/>
306
+
307
+ ## Documentation
308
+
309
+ - [Quick Start Guide](https://opendataloader.org/docs/quick-start-python)
310
+ - [JSON Schema Reference](https://opendataloader.org/docs/json-schema)
311
+ - [CLI Options](https://opendataloader.org/docs/cli-options-reference)
312
+ - [Tagged PDF Support](https://opendataloader.org/docs/tagged-pdf)
313
+ - [AI Safety Features](https://opendataloader.org/docs/ai-safety)
314
+
315
+ <br/>
316
+
317
+ ## Frequently Asked Questions
318
+
319
+ ### What is the best PDF parser for RAG?
320
+
321
+ For RAG pipelines, you need a parser that preserves document structure, maintains correct reading order, and provides element coordinates for citations. OpenDataLoader is designed specifically for this use case — it outputs structured JSON with bounding boxes, handles multi-column layouts correctly with XY-Cut++, and runs locally without GPU requirements.
322
+
323
+ ### How do I extract tables from PDF for LLM?
324
+
325
+ OpenDataLoader detects tables using both border analysis and text clustering, preserving row/column structure in the output. Tables are exported as structured data in JSON or as formatted Markdown tables, ready for LLM consumption.
326
+
327
+ ### Can I use this without sending data to the cloud?
328
+
329
+ Yes. OpenDataLoader runs 100% locally on your machine. No API calls, no data transmission — your documents never leave your environment. This makes it ideal for sensitive documents in legal, healthcare, and financial industries.
330
+
331
+ ### What makes OpenDataLoader unique?
332
+
333
+ OpenDataLoader takes a different approach from many PDF parsers:
334
+
335
+ - **Rule-based extraction** — Deterministic output without GPU requirements
336
+ - **Bounding boxes for all elements** — Essential for citation systems
337
+ - **XY-Cut++ reading order** — Handles multi-column layouts correctly
338
+ - **Built-in AI safety filters** — Protects against prompt injection
339
+ - **Native Tagged PDF support** — Leverages accessibility metadata
340
+
341
+ This means: consistent output (same input = same output), no GPU required, faster processing, and no model hallucinations.
342
+
343
+ ### How do I get better accuracy for complex tables?
344
+
345
+ Install the hybrid extra with `pip install -U "opendataloader-pdf[hybrid]"` and enable hybrid mode (`--hybrid` on the CLI or `hybrid=` in Python). This routes pages with complex tables to an AI backend (like docling-serve) while keeping simple pages fast and local. Table accuracy improves from 0.49 to 0.93 — matching or exceeding dedicated AI parsers while remaining faster and more cost-effective.
346
+
347
+ <br/>
348
+
349
+ ## Contributing
350
+
351
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
352
+
353
+ <br/>
354
+
355
+ ## License
356
+
357
+ [Mozilla Public License 2.0](LICENSE)
358
+
359
+ ---
360
+
361
+ **Found this useful?** Give us a star to help others discover OpenDataLoader.
@@ -1,7 +1,12 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
- opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
- opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
4
- opendataloader_pdf/wrapper.py,sha256=ahbxo7YYunsNt3w66eXn_oa0XKy1LLfoA5GQcSuMvvM,4407
2
+ opendataloader_pdf/NOTICE.md,sha256=KKi_EHvEr5PHlWqtQ-IF-S0luNdVUjdVGkWmXtgDr3c,567
3
+ opendataloader_pdf/__init__.py,sha256=fdiSf1klK66Nr_gAeKXC86YVMPu3f4wt9e6WDcy4Hio,84
4
+ opendataloader_pdf/__main__.py,sha256=lmla4yz3SaYBfRJXOXnwO_8ID31-Ja20aQmomiz1eEc,84
5
+ opendataloader_pdf/cli_options_generated.py,sha256=207MSvhFzLFrh99lLLvR6DcLC32IOH8IB_F7bc_rOHc,7537
6
+ opendataloader_pdf/convert_generated.py,sha256=0A6_wIatkQpHvbGVmT5COGS6n-cn1C7KFUxdb7zffoQ,5869
7
+ opendataloader_pdf/hybrid_server.py,sha256=p_wV_Wi-waEwZpa__5-gBfsRhLfdLF6vzKQcbOpD3cE,8445
8
+ opendataloader_pdf/runner.py,sha256=hJPSGZyZ2UcULh-ZpIwN7Sh2sjk_Q_3cJsRMC-tnTO8,2494
9
+ opendataloader_pdf/wrapper.py,sha256=2a47-EhuJ_na-U5URNCK9DB2EomzV3qClslRvwgsPy8,4020
5
10
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
6
11
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
7
12
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -13,8 +18,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
13
18
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
14
19
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
15
20
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
16
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=jUVcuDXxNBFweOQbusLebCX1AZ3l3ssns4nJtSWsRTI,22116373
17
- opendataloader_pdf-0.0.0.dist-info/METADATA,sha256=y-T7UA9NodfcETSd2tX5FJdXMNArmpgXZG22wINIHRo,3382
18
- opendataloader_pdf-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- opendataloader_pdf-0.0.0.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
- opendataloader_pdf-0.0.0.dist-info/RECORD,,
21
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=BIiShJd5yfr7fqXi_f5bJWA5L5P4JgsoyKerC_3siuo,23819110
22
+ opendataloader_pdf-1.8.2.dist-info/METADATA,sha256=wWrKyJ52avv_xa6swj4k6SGN_gxyk4dMNq6JmcqCOvw,13418
23
+ opendataloader_pdf-1.8.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
24
+ opendataloader_pdf-1.8.2.dist-info/entry_points.txt,sha256=svIY_pCftFm2_nca5m0RVtPtdpQ_uj5KTEuZS5TR0DQ,137
25
+ opendataloader_pdf-1.8.2.dist-info/RECORD,,