opendataloader-pdf 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of opendataloader-pdf might be problematic. Click here for more details.

@@ -13,6 +13,7 @@ def run(
13
13
  output_folder: str = None,
14
14
  password: str = None,
15
15
  generate_markdown: bool = False,
16
+ generate_html: bool = False,
16
17
  generate_annotated_pdf: bool = False,
17
18
  keep_line_breaks: bool = False,
18
19
  find_hidden_text: bool = False,
@@ -28,6 +29,7 @@ def run(
28
29
  output_folder: Path to the output folder. Defaults to the input folder.
29
30
  password: Password for the PDF file.
30
31
  generate_markdown: If True, generates a Markdown output file.
32
+ generate_html: If True, generates an HTML output file.
31
33
  generate_annotated_pdf: If True, generates an annotated PDF output file.
32
34
  keep_line_breaks: If True, keeps line breaks in the output.
33
35
  find_hidden_text: If True, finds hidden text in the PDF.
@@ -52,6 +54,8 @@ def run(
52
54
  args.extend(["--password", password])
53
55
  if generate_markdown:
54
56
  args.append("--markdown")
57
+ if generate_html:
58
+ args.append("--html")
55
59
  if generate_annotated_pdf:
56
60
  args.append("--pdf")
57
61
  if keep_line_breaks:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: opendataloader-pdf
3
- Version: 0.0.10
3
+ Version: 0.0.11
4
4
  Summary: A Python wrapper for the opendataloader-pdf Java CLI.
5
5
  Home-page: https://github.com/opendataloader-project/opendataloader-pdf
6
6
  Author: opendataloader-project
@@ -24,10 +24,12 @@ Dynamic: summary
24
24
 
25
25
  ![Pre-release](https://img.shields.io/badge/Pre--release-FFA500&logo=github)
26
26
  [![License](https://img.shields.io/pypi/l/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
27
+ ![Java](https://img.shields.io/badge/Java-11+-blue.svg)
28
+ ![Python](https://img.shields.io/badge/Python-3.8+-blue.svg)
27
29
  [![Maven Central](https://img.shields.io/maven-central/v/io.github.opendataloader-project/opendataloader-pdf-core.svg)](https://search.maven.org/artifact/io.github.opendataloader-project/opendataloader-pdf-core)
28
30
  [![PyPI version](https://img.shields.io/pypi/v/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
29
- [![Python Version](https://img.shields.io/pypi/pyversions/opendataloader-pdf.svg)](https://pypi.org/project/opendataloader-pdf/)
30
31
  [![GHCR Version](https://ghcr-badge.egpl.dev/opendataloader-project/opendataloader-pdf-cli/latest_tag?trim=major&label=docker-image)](https://github.com/opendataloader-project/opendataloader-pdf/pkgs/container/opendataloader-pdf-cli)
32
+ [![Coverage](https://codecov.io/gh/opendataloader-project/opendataloader-pdf/branch/main/graph/badge.svg)](https://app.codecov.io/gh/opendataloader-project/opendataloader-pdf)
31
33
  [![CLA assistant](https://cla-assistant.io/readme/badge/opendataloader-project/opendataloader-pdf)](https://cla-assistant.io/opendataloader-project/opendataloader-pdf)
32
34
 
33
35
  <br/>
@@ -92,7 +94,8 @@ opendataloader_pdf.run(
92
94
  input_path="path/to/document.pdf",
93
95
  output_folder="path/to/output",
94
96
  generate_markdown=True,
95
- generate_annotated_pdf=True
97
+ generate_html=True,
98
+ generate_annotated_pdf=True,
96
99
  )
97
100
  ```
98
101
 
@@ -106,6 +109,7 @@ The main function to process PDFs.
106
109
  | `output_folder` | `str` | No | input folder | Path to the output folder. |
107
110
  | `password` | `str` | No | `None` | Password for the PDF file. |
108
111
  | `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
112
+ | `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
109
113
  | `generate_annotated_pdf`| `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
110
114
  | `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
111
115
  | `find_hidden_text` | `bool` | No | `False` | If `True`, finds hidden text in the PDF. |
@@ -180,6 +184,9 @@ public class Sample {
180
184
  //generate markdown output file
181
185
  config.setGenerateMarkdown(true);
182
186
 
187
+ //generate html output file
188
+ config.setGenerateHtml(true);
189
+
183
190
  //enable html in markdown output file
184
191
  config.setUseHTMLInMarkdown(true);
185
192
 
@@ -218,7 +225,7 @@ curl -L -o 1901.03003.pdf https://arxiv.org/pdf/1901.03003
218
225
  Run opendataloader-pdf in Docker container
219
226
 
220
227
  ```
221
- docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --pdf
228
+ docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --html --pdf
222
229
  ```
223
230
 
224
231
  <br/>
@@ -242,17 +249,17 @@ java/opendataloader-pdf-cli/target
242
249
  ### CLI usage
243
250
 
244
251
  ```sh
245
- java -jar ... [options] <INPUT FILE OR FOLDER>
252
+ java -jar opendataloader-pdf-cli-<VERSION>.jar [options] <INPUT FILE OR FOLDER>
246
253
  ```
247
254
 
248
255
  This generates a JSON file with layout recognition results in the specified output folder.
249
- Additionally, annotated PDF with recognized structures and Markdown file are generated if options `--pdf` and `--markdown` are specified.
256
+ Additionally, an annotated PDF with recognized structures, a Markdown file, and an HTML file are generated if the options `--pdf`, `--markdown`, and `--html` are specified.
250
257
 
251
258
  By default, all line breaks and hyphenation characters are removed, and the Markdown output does not include any images or use any HTML.
252
259
 
253
260
  The option `--keeplinebreaks` preserves the original line breaks of the text content in JSON and Markdown output.
254
261
 
255
- The option `--html`` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
262
+ The option `--htmlinmarkdown` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
256
263
  The option `--addimagetomarkdown` enables inclusion of image references into the output Markdown.
257
264
  The images are extracted from PDF as individual files and stored in a subfolder next to the Markdown output.
258
265
 
@@ -263,9 +270,10 @@ Options:
263
270
  -f,--folder <arg> Specify output folder (default the folder of the input PDF)
264
271
  -klb,--keeplinebreaks Keep line breaks
265
272
  -ht,--findhiddentext Find hidden text
266
- -html,--htmlinmarkdown Use html in markdown
273
+ -htmlmd,--htmlinmarkdown Use html in markdown
267
274
  -im,--addimagetomarkdown Add images to markdown
268
275
  -markdown,--markdown Generates markdown output
276
+ -html,--html Generates html output
269
277
  -p,--password <arg> Specifies password
270
278
  -pdf,--pdf Generates pdf output
271
279
  ```
@@ -1,7 +1,7 @@
1
1
  opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
2
2
  opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
3
3
  opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
4
- opendataloader_pdf/wrapper.py,sha256=DGwzBVy1DyNxUFPLxi8Mzwb68u3fo0k0B5YEBufy0vI,4518
4
+ opendataloader_pdf/wrapper.py,sha256=YuCPVrqZdoA6kg-_MiXYo9KvIkmRIY_QxDqem8Sd8V0,4666
5
5
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
6
6
  opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
7
7
  opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
13
13
  opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
14
14
  opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
15
15
  opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
16
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=Qp9qnNbptrsdrL2UJn8bw-WStRqfI9EGYd883EtDZfE,22114700
17
- opendataloader_pdf-0.0.10.dist-info/METADATA,sha256=ESHbbQmEr8L5VqKwpll0u7h4EiiG7YTnYiD10p7Z7h0,17626
18
- opendataloader_pdf-0.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- opendataloader_pdf-0.0.10.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
- opendataloader_pdf-0.0.10.dist-info/RECORD,,
16
+ opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=sM8mynVYq8p4gqQwXxI3sBf-jQukTtFp_0ts1VAMSpM,22122871
17
+ opendataloader_pdf-0.0.11.dist-info/METADATA,sha256=fKVr3VPqq2vmV6RKE2WvztIbAFxs-g6fAvHMO15QQ1k,18128
18
+ opendataloader_pdf-0.0.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ opendataloader_pdf-0.0.11.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
20
+ opendataloader_pdf-0.0.11.dist-info/RECORD,,