opendataloader-pdf 0.0.15__py3-none-any.whl → 0.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opendataloader-pdf might be problematic. Click here for more details.

@@ -12,11 +12,12 @@ def run(
12
12
  input_path: str,
13
13
  output_folder: str = None,
14
14
  password: str = None,
15
+ replace_invalid_chars: str = None,
15
16
  generate_markdown: bool = False,
16
17
  generate_html: bool = False,
17
18
  generate_annotated_pdf: bool = False,
18
19
  keep_line_breaks: bool = False,
19
- find_hidden_text: bool = False,
20
+ content_safety_off: str = None,
20
21
  html_in_markdown: bool = False,
21
22
  add_image_to_markdown: bool = False,
22
23
  debug: bool = False,
@@ -28,6 +29,7 @@ def run(
28
29
  input_path: Path to the input PDF file or folder.
29
30
  output_folder: Path to the output folder. Defaults to the input folder.
30
31
  password: Password for the PDF file.
32
+ replace_invalid_chars: Character to replace invalid or unrecognized characters (e.g., �, \u0000) with.
31
33
  generate_markdown: If True, generates a Markdown output file.
32
34
  generate_html: If True, generates an HTML output file.
33
35
  generate_annotated_pdf: If True, generates an annotated PDF output file.
@@ -49,9 +51,11 @@ def run(
49
51
 
50
52
  args = []
51
53
  if output_folder:
52
- args.extend(["--folder", output_folder])
54
+ args.extend(["--output-dir", output_folder])
53
55
  if password:
54
56
  args.extend(["--password", password])
57
+ if replace_invalid_chars:
58
+ args.extend(["--replace-invalid-chars", replace_invalid_chars])
55
59
  if generate_markdown:
56
60
  args.append("--markdown")
57
61
  if generate_html:
@@ -59,13 +63,13 @@ def run(
59
63
  if generate_annotated_pdf:
60
64
  args.append("--pdf")
61
65
  if keep_line_breaks:
62
- args.append("--keeplinebreaks")
63
- if find_hidden_text:
64
- args.append("--findhiddentext")
66
+ args.append("--keep-line-breaks")
67
+ if content_safety_off:
68
+ args.append(["--content-safety-off", content_safety_off])
65
69
  if html_in_markdown:
66
- args.append("--htmlinmarkdown")
70
+ args.append("--markdown-with-html")
67
71
  if add_image_to_markdown:
68
- args.append("--addimagetomarkdown")
72
+ args.append("--markdown-with-images")
69
73
 
70
74
  args.append(input_path)
71
75
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opendataloader-pdf
3
- Version: 0.0.15
3
+ Version: 0.0.16
4
4
  Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
5
  Home-page: https://github.com/opendataloader-project/opendataloader-pdf
6
6
  Author: opendataloader-project
@@ -25,11 +25,12 @@ Dynamic: summary
25
25
  # OpenDataLoader PDF
26
26
 
27
27
  ![Pre-release](https://img.shields.io/badge/Pre--release-FFA500&logo=github)
28
- [![License](https://img.shields.io/pypi/l/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
28
+ [![License](https://img.shields.io/pypi/l/opendataloader-pdf.svg)](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/LICENSE)
29
29
  ![Java](https://img.shields.io/badge/Java-11+-blue.svg)
30
30
  ![Python](https://img.shields.io/badge/Python-3.8+-blue.svg)
31
- [![Maven Central](https://img.shields.io/maven-central/v/io.github.opendataloader-project/opendataloader-pdf-core.svg)](https://search.maven.org/artifact/io.github.opendataloader-project/opendataloader-pdf-core)
31
+ [![Maven Central](https://img.shields.io/maven-central/v/org.opendataloader/opendataloader-pdf-core.svg)](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core)
32
32
  [![PyPI version](https://img.shields.io/pypi/v/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
33
+ [![npm version](https://img.shields.io/npm/v/@opendataloader/pdf.svg)](https://www.npmjs.com/package/@opendataloader/pdf)
33
34
  [![GHCR Version](https://ghcr-badge.egpl.dev/opendataloader-project/opendataloader-pdf-cli/latest_tag?trim=major&label=docker-image)](https://github.com/opendataloader-project/opendataloader-pdf/pkgs/container/opendataloader-pdf-cli)
34
35
  [![Coverage](https://codecov.io/gh/opendataloader-project/opendataloader-pdf/branch/main/graph/badge.svg)](https://app.codecov.io/gh/opendataloader-project/opendataloader-pdf)
35
36
  [![CLA assistant](https://cla-assistant.io/readme/badge/opendataloader-project/opendataloader-pdf)](https://cla-assistant.io/opendataloader-project/opendataloader-pdf)
@@ -50,10 +51,9 @@ AI-safety is enabled by default and automatically filters likely prompt-injectio
50
51
 
51
52
  - 🧾 **Rich, Structured Output** — JSON, Markdown or Html
52
53
  - 🧩 **Layout Reconstruction** — Headings, Lists, Tables, Images, Reading Order
53
- - 🔒 **Local-First Privacy** — Runs fully on your machine
54
54
  - ⚡ **Fast & Lightweight** — Rule-Based Heuristic, High-Throughput, No GPU
55
- - 🛡️ **AI-Safety** — Auto-Filters likely prompt-injection content
56
- - 👐 **Open-Source** — Free for commercial use
55
+ - 🔒 **Local-First Privacy** — Runs fully on your machine
56
+ - 🛡️ **AI-Safety** — Auto-Filters likely prompt-injection content - [Learn more about AI-Safety](https://github.com/opendataloader-project/opendataloader-pdf/blob/main/docs/AI_SAFETY.md)
57
57
  - 🖍️ **Annotated PDF Visualization** — See detected structures overlaid on the original
58
58
 
59
59
  [Download Annotated PDF Sample](https://raw.githubusercontent.com/opendataloader-project/opendataloader-pdf/main/resources/1901.03003_annotated.pdf)
@@ -95,8 +95,8 @@ pip install -U opendataloader-pdf
95
95
  import opendataloader_pdf
96
96
 
97
97
  opendataloader_pdf.run(
98
- input_path="path/to/document.pdf",
99
- output_folder="path/to/output",
98
+ input_path="path-to-document.pdf",
99
+ output_folder="path-to-output",
100
100
  generate_markdown=True,
101
101
  generate_html=True,
102
102
  generate_annotated_pdf=True,
@@ -107,36 +107,115 @@ opendataloader_pdf.run(
107
107
 
108
108
  The main function to process PDFs.
109
109
 
110
- | Parameter | Type | Required | Default | Description |
111
- | ----------------------- | ------ | -------- | ------------ | --------------------------------------------------------------- |
112
- | `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
113
- | `output_folder` | `str` | No | input folder | Path to the output folder. |
114
- | `password` | `str` | No | `None` | Password for the PDF file. |
115
- | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
116
- | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
117
- | `generate_annotated_pdf`| `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
118
- | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
119
- | `find_hidden_text` | `bool` | No | `False` | If `True`, finds hidden text in the PDF. |
120
- | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
121
- | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
122
- | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
110
+ | Parameter | Type | Required | Default | Description |
111
+ |--------------------------| ------ | -------- |--------------|-------------------------------------------------------------------------------------------------------------------------------------|
112
+ | `input_path` | `str` | ✅ Yes | — | Path to the input PDF file or folder. |
113
+ | `output_folder` | `str` | No | input folder | Path to the output folder. |
114
+ | `password` | `str` | No | `None` | Password for the PDF file. |
115
+ | `replace_invalid_chars` | `str` | No | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000) |
116
+ | `content_safety_off` | `str` | No | `None` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page. |
117
+ | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
118
+ | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
119
+ | `generate_annotated_pdf` | `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
120
+ | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
121
+ | `html_in_markdown` | `bool` | No | `False` | If `True`, uses HTML in the Markdown output. |
122
+ | `add_image_to_markdown` | `bool` | No | `False` | If `True`, adds images to the Markdown output. |
123
+ | `debug` | `bool` | No | `False` | If `True`, prints CLI messages to the console during execution. |
124
+
125
+ <br/>
126
+
127
+ ## Node.js / NPM
128
+
129
+ **Note:** This package is a wrapper around a Java CLI and is intended for use in a Node.js backend environment. It cannot be used in a browser-based frontend.
130
+
131
+ ### Prerequisites
132
+
133
+ - Java 11 or higher must be installed and available in your system's PATH.
134
+
135
+ ### Installation
136
+
137
+ ```sh
138
+ npm install @opendataloader/pdf
139
+ ```
140
+
141
+ ### Usage
142
+
143
+ - `inputPath` can be either the path to a single document or the path to a folder.
144
+ - If you don’t specify an `outputFolder`, the output data will be saved in the same directory as the input document.
145
+
146
+ ```typescript
147
+ import { run } from '@opendataloader/pdf';
148
+
149
+ async function main() {
150
+ try {
151
+ const output = await run('path-to-document.pdf', {
152
+ outputFolder: 'path-to-output',
153
+ generateMarkdown: true,
154
+ generateHtml: true,
155
+ generateAnnotatedPdf: true,
156
+ debug: true,
157
+ });
158
+ console.log('PDF processing complete.', output);
159
+ } catch (error) {
160
+ console.error('Error processing PDF:', error);
161
+ }
162
+ }
163
+
164
+ main();
165
+ ```
166
+
167
+ ### Function: run()
168
+
169
+ `run(inputPath: string, options?: RunOptions): Promise<string>`
170
+
171
+ The main function to process PDFs.
172
+
173
+ **Parameters**
174
+
175
+ | Parameter | Type | Required | Description |
176
+ | ----------- | -------- | -------- | ------------------------------------- |
177
+ | `inputPath` | `string` | ✅ Yes | Path to the input PDF file or folder. |
178
+ | `options` | `RunOptions` | No | Configuration options for the run. |
179
+
180
+ **RunOptions**
181
+
182
+ | Property | Type | Default | Description |
183
+ | ----------------------- | --------- | ------------- | --------------------------------------------------------------------------- |
184
+ | `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
185
+ | `password` | `string` | `undefined` | Password for the PDF file. |
186
+ | `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., �, \u0000). |
187
+ | `content_safety_off` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page. |
188
+ | `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
189
+ | `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
190
+ | `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
191
+ | `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
192
+ | `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
193
+ | `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
194
+ | `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
123
195
 
124
196
  <br/>
125
197
 
126
198
  ## Java
127
199
 
200
+ For various example templates, including Gradle and Maven, please refer to https://github.com/opendataloader-project/opendataloader-pdf/tree/main/examples/java.
201
+
128
202
  ### Dependency
129
203
 
130
204
  To include OpenDataLoader PDF in your Maven project, add the dependency below to your `pom.xml` file.
131
205
 
132
- Check for the latest version on [Maven Central](https://search.maven.org/artifact/io.github.opendataloader-project/opendataloader-pdf-core).
206
+ Check for the latest version on [Maven Central](https://search.maven.org/artifact/org.opendataloader/opendataloader-pdf-core).
133
207
 
134
208
  ```xml
135
- <dependency>
136
- <groupId>io.github.opendataloader-project</groupId>
137
- <artifactId>opendataloader-pdf-core</artifactId>
138
- <version>0.0.12</version>
139
- </dependency>
209
+ <project>
210
+ <!-- other configurations... -->
211
+
212
+ <dependencies>
213
+ <dependency>
214
+ <groupId>org.opendataloader</groupId>
215
+ <artifactId>opendataloader-pdf-core</artifactId>
216
+ <version>0.0.15</version>
217
+ </dependency>
218
+ </dependencies>
140
219
 
141
220
  <repositories>
142
221
  <repository>
@@ -158,6 +237,9 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
158
237
  <url>https://artifactory.openpreservation.org/artifactory/vera-dev</url>
159
238
  </pluginRepository>
160
239
  </pluginRepositories>
240
+
241
+ <!-- other configurations... -->
242
+ </project>
161
243
  ```
162
244
 
163
245
 
@@ -166,51 +248,22 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
166
248
  To integrate the layout recognition API into Java code, follow the sample code below.
167
249
 
168
250
  ```java
169
- import com.hancom.opendataloader.pdf.api.Config;
170
- import com.hancom.opendataloader.pdf.api.OpenDataLoaderPDF;
251
+ import org.opendataloader.pdf.api.Config;
252
+ import org.opendataloader.pdf.api.OpenDataLoaderPDF;
171
253
 
172
254
  import java.io.IOException;
173
255
 
174
256
  public class Sample {
175
257
 
176
258
  public static void main(String[] args) {
177
- //create default config
178
259
  Config config = new Config();
179
-
180
- //set output folder relative to the input PDF
181
- //if the output folder is not set, the current folder of the input PDF is used
182
- config.setOutputFolder("output");
183
-
184
- //generating pdf output file
260
+ config.setOutputFolder("path/to/output");
185
261
  config.setGeneratePDF(true);
186
-
187
- //set password of input pdf file
188
- config.setPassword("password");
189
-
190
- //generate markdown output file
191
262
  config.setGenerateMarkdown(true);
192
-
193
- //generate html output file
194
263
  config.setGenerateHtml(true);
195
264
 
196
- //enable html in markdown output file
197
- config.setUseHTMLInMarkdown(true);
198
-
199
- //add images to markdown output file
200
- config.setAddImageToMarkdown(true);
201
-
202
- //disable json output file
203
- config.setGenerateJSON(false);
204
-
205
- //keep line breaks
206
- config.setKeepLineBreaks(true);
207
-
208
- //find hidden text
209
- config.setFindHiddenText(true);
210
-
211
265
  try {
212
- //process pdf file
213
- OpenDataLoaderPDF.processFile("input.pdf", config);
266
+ OpenDataLoaderPDF.processFile("path/to/document.pdf", config);
214
267
  } catch (Exception exception) {
215
268
  //exception during processing
216
269
  }
@@ -220,7 +273,7 @@ public class Sample {
220
273
 
221
274
  ### API Documentation
222
275
 
223
- The full API documentation is available at [javadoc](https://javadoc.io/doc/io.github.opendataloader-project/opendataloader-pdf-core/latest/)
276
+ The full API documentation is available at [javadoc](https://javadoc.io/doc/org.opendataloader/opendataloader-pdf-core/latest/)
224
277
 
225
278
  <br/>
226
279
 
@@ -267,25 +320,27 @@ Additionally, annotated PDF with recognized structures, Markdown and Html are ge
267
320
 
268
321
  By default, all line breaks and hyphenation characters are removed, and the Markdown output neither includes images nor uses HTML.
269
322
 
270
- The option `--keeplinebreaks` to preserve the original line breaks text content in JSON and Markdown output.
271
-
272
- The option `--htmlinmarkdown` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
273
- The option `--addimagetomarkdown` enables inclusion of image references into the output Markdown.
323
+ The option `--keep-line-breaks` preserves the original line breaks of the text content in JSON and Markdown output.
324
+ The option `--content-safety-off` disables one or more content safety filters. Accepts a comma-separated list of filter names.
325
+ The option `--markdown-with-html` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
326
+ The option `--markdown-with-images` enables inclusion of image references into the output Markdown.
327
+ The option `--replace-invalid-chars` replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character.
274
328
  The images are extracted from PDF as individual files and stored in a subfolder next to the Markdown output.
275
329
 
276
330
  #### Available options:
277
331
 
278
332
  ```
279
333
  Options:
280
- -f,--folder <arg> Specify output folder (default the folder of the input PDF)
281
- -klb,--keeplinebreaks Keep line breaks
282
- -ht,--findhiddentext Find hidden text
283
- -htmlmd,--htmlinmarkdown Use html in markdown
284
- -im,--addimagetomarkdown Add images to markdown
285
- -markdown,--markdown Generates markdown output
286
- -html,--html Generates html output
287
- -p,--password <arg> Specifies password
288
- -pdf,--pdf Generates pdf output
334
+ -o,--output-dir <arg> Specifies the output directory for generated files
335
+ --keep-line-breaks Preserves original line breaks in the extracted text
336
+ --content-safety-off <arg> Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page
337
+ --markdown-with-html Sets the data extraction output format to Markdown with rendering complex elements like tables as HTML for better structure
338
+ --markdown-with-images Sets the data extraction output format to Markdown with extracting images from the PDF and includes them as links
339
+ --markdown Sets the data extraction output format to Markdown
340
+ --html Sets the data extraction output format to HTML
341
+ -p,--password <arg> Specifies the password for an encrypted PDF
342
+ --pdf Generates a new PDF file where the extracted layout data is visualized as annotations
343
+ --replace-invalid-chars <arg> Replaces invalid or unrecognized characters (e.g., �, \u0000) with the specified character
289
344
  ```
290
345
 
291
346
  ### Schema of the JSON output
@@ -1,7 +1,7 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
2
  opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
3
  opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
4
- opendataloader_pdf/wrapper.py,sha256=YuCPVrqZdoA6kg-_MiXYo9KvIkmRIY_QxDqem8Sd8V0,4666
4
+ opendataloader_pdf/wrapper.py,sha256=bPy-wNmQfJpmCg9dVx9uNTrGfW446GdGNrlJnt0cosA,4960
5
5
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
6
6
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
7
7
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
13
13
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
14
14
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
15
15
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
16
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=GCTahEYOHGxpId3ce3pbkB4C2CVf2VbHMY78WjvzIk4,22126046
17
- opendataloader_pdf-0.0.15.dist-info/METADATA,sha256=7J_lFR5yzyXMywass6JZaUh6GSt3UG4nBInfMlS_c5c,18727
18
- opendataloader_pdf-0.0.15.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- opendataloader_pdf-0.0.15.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
- opendataloader_pdf-0.0.15.dist-info/RECORD,,
16
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=DI0_vONCuUqmvKnVwkzUcRoA4HSv4B8EWqs27vb8u2w,22126375
17
+ opendataloader_pdf-0.0.16.dist-info/METADATA,sha256=FjpkSNX7uz8YdehHMeZenaWi7ZVQKgJnJ-4RXAR_ITI,23689
18
+ opendataloader_pdf-0.0.16.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ opendataloader_pdf-0.0.16.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
+ opendataloader_pdf-0.0.16.dist-info/RECORD,,