opendataloader-pdf 0.0.10__py3-none-any.whl → 0.0.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- opendataloader_pdf/wrapper.py +4 -0
- {opendataloader_pdf-0.0.10.dist-info → opendataloader_pdf-0.0.12.dist-info}/METADATA +26 -12
- {opendataloader_pdf-0.0.10.dist-info → opendataloader_pdf-0.0.12.dist-info}/RECORD +6 -6
- {opendataloader_pdf-0.0.10.dist-info → opendataloader_pdf-0.0.12.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-0.0.10.dist-info → opendataloader_pdf-0.0.12.dist-info}/top_level.txt +0 -0
|
Binary file
|
opendataloader_pdf/wrapper.py
CHANGED
|
@@ -13,6 +13,7 @@ def run(
|
|
|
13
13
|
output_folder: str = None,
|
|
14
14
|
password: str = None,
|
|
15
15
|
generate_markdown: bool = False,
|
|
16
|
+
generate_html: bool = False,
|
|
16
17
|
generate_annotated_pdf: bool = False,
|
|
17
18
|
keep_line_breaks: bool = False,
|
|
18
19
|
find_hidden_text: bool = False,
|
|
@@ -28,6 +29,7 @@ def run(
|
|
|
28
29
|
output_folder: Path to the output folder. Defaults to the input folder.
|
|
29
30
|
password: Password for the PDF file.
|
|
30
31
|
generate_markdown: If True, generates a Markdown output file.
|
|
32
|
+
generate_html: If True, generates an HTML output file.
|
|
31
33
|
generate_annotated_pdf: If True, generates an annotated PDF output file.
|
|
32
34
|
keep_line_breaks: If True, keeps line breaks in the output.
|
|
33
35
|
find_hidden_text: If True, finds hidden text in the PDF.
|
|
@@ -52,6 +54,8 @@ def run(
|
|
|
52
54
|
args.extend(["--password", password])
|
|
53
55
|
if generate_markdown:
|
|
54
56
|
args.append("--markdown")
|
|
57
|
+
if generate_html:
|
|
58
|
+
args.append("--html")
|
|
55
59
|
if generate_annotated_pdf:
|
|
56
60
|
args.append("--pdf")
|
|
57
61
|
if keep_line_breaks:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 0.0.10
|
|
3
|
+
Version: 0.0.12
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -24,10 +24,12 @@ Dynamic: summary
|
|
|
24
24
|
|
|
25
25
|

|
|
26
26
|
[](https://pypi.org/project/opendataloader-pdf/)
|
|
27
|
+

|
|
28
|
+

|
|
27
29
|
[](https://search.maven.org/artifact/io.github.opendataloader-project/opendataloader-pdf-core)
|
|
28
30
|
[](https://pypi.org/project/opendataloader-pdf/)
|
|
29
|
-
[](https://pypi.org/project/opendataloader-pdf/)
|
|
30
31
|
[](https://github.com/opendataloader-project/opendataloader-pdf/pkgs/container/opendataloader-pdf-cli)
|
|
32
|
+
[](https://app.codecov.io/gh/opendataloader-project/opendataloader-pdf)
|
|
31
33
|
[](https://cla-assistant.io/opendataloader-project/opendataloader-pdf)
|
|
32
34
|
|
|
33
35
|
<br/>
|
|
@@ -92,7 +94,8 @@ opendataloader_pdf.run(
|
|
|
92
94
|
input_path="path/to/document.pdf",
|
|
93
95
|
output_folder="path/to/output",
|
|
94
96
|
generate_markdown=True,
|
|
95
|
-
|
|
97
|
+
generate_html=True,
|
|
98
|
+
generate_annotated_pdf=True,
|
|
96
99
|
)
|
|
97
100
|
```
|
|
98
101
|
|
|
@@ -106,6 +109,7 @@ The main function to process PDFs.
|
|
|
106
109
|
| `output_folder` | `str` | No | input folder | Path to the output folder. |
|
|
107
110
|
| `password` | `str` | No | `None` | Password for the PDF file. |
|
|
108
111
|
| `generate_markdown` | `bool` | No | `False` | If `True`, generates a Markdown output file. |
|
|
112
|
+
| `generate_html` | `bool` | No | `False` | If `True`, generates an HTML output file. |
|
|
109
113
|
| `generate_annotated_pdf`| `bool` | No | `False` | If `True`, generates an annotated PDF output file. |
|
|
110
114
|
| `keep_line_breaks` | `bool` | No | `False` | If `True`, keeps line breaks in the output. |
|
|
111
115
|
| `find_hidden_text` | `bool` | No | `False` | If `True`, finds hidden text in the PDF. |
|
|
@@ -121,11 +125,13 @@ The main function to process PDFs.
|
|
|
121
125
|
|
|
122
126
|
To include OpenDataLoader PDF in your Maven project, add the dependency below to your `pom.xml` file.
|
|
123
127
|
|
|
128
|
+
Check for the latest version on [Maven Central](https://search.maven.org/artifact/io.github.opendataloader-project/opendataloader-pdf-core).
|
|
129
|
+
|
|
124
130
|
```xml
|
|
125
131
|
<dependency>
|
|
126
132
|
<groupId>io.github.opendataloader-project</groupId>
|
|
127
133
|
<artifactId>opendataloader-pdf-core</artifactId>
|
|
128
|
-
<version>0.0.
|
|
134
|
+
<version>0.0.12</version>
|
|
129
135
|
</dependency>
|
|
130
136
|
|
|
131
137
|
<repositories>
|
|
@@ -156,8 +162,8 @@ To include OpenDataLoader PDF in your Maven project, add the dependency below to
|
|
|
156
162
|
To integrate the layout recognition API into Java code, follow the sample code below.
|
|
157
163
|
|
|
158
164
|
```java
|
|
159
|
-
import com.hancom.opendataloader.pdf.
|
|
160
|
-
import com.hancom.opendataloader.pdf.
|
|
165
|
+
import com.hancom.opendataloader.pdf.api.Config;
|
|
166
|
+
import com.hancom.opendataloader.pdf.api.OpenDataLoaderPDF;
|
|
161
167
|
|
|
162
168
|
import java.io.IOException;
|
|
163
169
|
|
|
@@ -180,6 +186,9 @@ public class Sample {
|
|
|
180
186
|
//generate markdown output file
|
|
181
187
|
config.setGenerateMarkdown(true);
|
|
182
188
|
|
|
189
|
+
//generate html output file
|
|
190
|
+
config.setGenerateHtml(true);
|
|
191
|
+
|
|
183
192
|
//enable html in markdown output file
|
|
184
193
|
config.setUseHTMLInMarkdown(true);
|
|
185
194
|
|
|
@@ -197,7 +206,7 @@ public class Sample {
|
|
|
197
206
|
|
|
198
207
|
try {
|
|
199
208
|
//process pdf file
|
|
200
|
-
|
|
209
|
+
OpenDataLoaderPDF.processFile("input.pdf", config);
|
|
201
210
|
} catch (Exception exception) {
|
|
202
211
|
//exception during processing
|
|
203
212
|
}
|
|
@@ -205,6 +214,10 @@ public class Sample {
|
|
|
205
214
|
}
|
|
206
215
|
```
|
|
207
216
|
|
|
217
|
+
### API Documentation
|
|
218
|
+
|
|
219
|
+
The full API documentation is available at [javadoc](https://javadoc.io/doc/io.github.opendataloader-project/opendataloader-pdf-core/latest/).
|
|
220
|
+
|
|
208
221
|
<br/>
|
|
209
222
|
|
|
210
223
|
## Docker
|
|
@@ -218,7 +231,7 @@ curl -L -o 1901.03003.pdf https://arxiv.org/pdf/1901.03003
|
|
|
218
231
|
Run opendataloader-pdf in a Docker container
|
|
219
232
|
|
|
220
233
|
```
|
|
221
|
-
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --pdf
|
|
234
|
+
docker run --rm -v "$PWD":/work ghcr.io/opendataloader-project/opendataloader-pdf-cli:latest /work/1901.03003.pdf --markdown --html --pdf
|
|
222
235
|
```
|
|
223
236
|
|
|
224
237
|
<br/>
|
|
@@ -242,17 +255,17 @@ java/opendataloader-pdf-cli/target
|
|
|
242
255
|
### CLI usage
|
|
243
256
|
|
|
244
257
|
```sh
|
|
245
|
-
java -jar
|
|
258
|
+
java -jar opendataloader-pdf-cli-<VERSION>.jar [options] <INPUT FILE OR FOLDER>
|
|
246
259
|
```
|
|
247
260
|
|
|
248
261
|
This generates a JSON file with layout recognition results in the specified output folder.
|
|
249
|
-
Additionally, annotated PDF with recognized structures and
|
|
262
|
+
Additionally, an annotated PDF with recognized structures, Markdown, and HTML are generated if the options `--pdf`, `--markdown`, and `--html` are specified.
|
|
250
263
|
|
|
251
264
|
By default, all line breaks and hyphenation characters are removed, and the Markdown does not include any images or use any HTML.
|
|
252
265
|
|
|
253
266
|
Use the option `--keeplinebreaks` to preserve the original line breaks of the text content in JSON and Markdown output.
|
|
254
267
|
|
|
255
|
-
The option `--
|
|
268
|
+
The option `--htmlinmarkdown` enables use of HTML in Markdown, which may improve Markdown preview in processors that support HTML tags.
|
|
256
269
|
The option `--addimagetomarkdown` enables inclusion of image references into the output Markdown.
|
|
257
270
|
The images are extracted from PDF as individual files and stored in a subfolder next to the Markdown output.
|
|
258
271
|
|
|
@@ -263,9 +276,10 @@ Options:
|
|
|
263
276
|
-f,--folder <arg> Specify output folder (default the folder of the input PDF)
|
|
264
277
|
-klb,--keeplinebreaks Keep line breaks
|
|
265
278
|
-ht,--findhiddentext Find hidden text
|
|
266
|
-
-
|
|
279
|
+
-htmlmd,--htmlinmarkdown Use html in markdown
|
|
267
280
|
-im,--addimagetomarkdown Add images to markdown
|
|
268
281
|
-markdown,--markdown Generates markdown output
|
|
282
|
+
-html,--html Generates html output
|
|
269
283
|
-p,--password <arg> Specifies password
|
|
270
284
|
-pdf,--pdf Generates pdf output
|
|
271
285
|
```
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
opendataloader_pdf/LICENSE,sha256=rxdbnZbuk8IaA2FS4bkFsLlTBNSujCySHHYJEAuo334,15921
|
|
2
2
|
opendataloader_pdf/NOTICE.md,sha256=Uxc6sEbVz2hfsDinzzSNMtmsjx9HsQUod0yy0cswUwg,562
|
|
3
3
|
opendataloader_pdf/__init__.py,sha256=T5RV-dcgjNCm8klNy_EH-IgOeodcPg6Yc34HHXtuAmQ,44
|
|
4
|
-
opendataloader_pdf/wrapper.py,sha256=
|
|
4
|
+
opendataloader_pdf/wrapper.py,sha256=YuCPVrqZdoA6kg-_MiXYo9KvIkmRIY_QxDqem8Sd8V0,4666
|
|
5
5
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_LICENSES.md,sha256=QRYYiXFS2zBDGdmWRo_SrRfGhrdRBwhiRo1SdUKfrQo,11235
|
|
6
6
|
opendataloader_pdf/THIRD_PARTY/THIRD_PARTY_NOTICES.md,sha256=pB2ZitFM1u0x3rIDpMHsLxOe4OFNCZRqkzeR-bfpFzE,8911
|
|
7
7
|
opendataloader_pdf/THIRD_PARTY/licenses/Apache-2.0.txt,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
@@ -13,8 +13,8 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
13
13
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
16
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
17
|
-
opendataloader_pdf-0.0.
|
|
18
|
-
opendataloader_pdf-0.0.
|
|
19
|
-
opendataloader_pdf-0.0.
|
|
20
|
-
opendataloader_pdf-0.0.
|
|
16
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=LaNv3QC6mCax37Gb3CWvIPcGcZGzrNAf2gNEfsUzkCc,22123626
|
|
17
|
+
opendataloader_pdf-0.0.12.dist-info/METADATA,sha256=UboAjGI04oWHIlIQVfhrQXZBVjz4O6jgb_hJfT4EZIE,18429
|
|
18
|
+
opendataloader_pdf-0.0.12.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
opendataloader_pdf-0.0.12.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
20
|
+
opendataloader_pdf-0.0.12.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|