opendataloader-pdf 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opendataloader-pdf might be problematic. Click here for more details.
- opendataloader_pdf/jar/opendataloader-pdf-cli.jar +0 -0
- {opendataloader_pdf-1.1.1.dist-info → opendataloader_pdf-1.1.3.dist-info}/METADATA +45 -46
- {opendataloader_pdf-1.1.1.dist-info → opendataloader_pdf-1.1.3.dist-info}/RECORD +6 -6
- {opendataloader_pdf-1.1.1.dist-info → opendataloader_pdf-1.1.3.dist-info}/WHEEL +0 -0
- {opendataloader_pdf-1.1.1.dist-info → opendataloader_pdf-1.1.3.dist-info}/entry_points.txt +0 -0
- {opendataloader_pdf-1.1.1.dist-info → opendataloader_pdf-1.1.3.dist-info}/top_level.txt +0 -0
|
Binary file
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: opendataloader-pdf
|
|
3
|
-
Version: 1.1.1
|
|
3
|
+
Version: 1.1.3
|
|
4
4
|
Summary: A Python wrapper for the opendataloader-pdf Java CLI.
|
|
5
5
|
Home-page: https://github.com/opendataloader-project/opendataloader-pdf
|
|
6
6
|
Author: opendataloader-project
|
|
@@ -152,22 +152,18 @@ npm install @opendataloader/pdf
|
|
|
152
152
|
|
|
153
153
|
### Usage
|
|
154
154
|
|
|
155
|
-
|
|
156
|
-
- If you don’t specify an `outputFolder`, the output data will be saved in the same directory as the input document.
|
|
155
|
+
`inputPath` can be either the path to a single document or the path to a folder.
|
|
157
156
|
|
|
158
157
|
```typescript
|
|
159
|
-
import {
|
|
158
|
+
import { convert } from '@opendataloader/pdf';
|
|
160
159
|
|
|
161
160
|
async function main() {
|
|
162
161
|
try {
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
generateHtml: true,
|
|
167
|
-
generateAnnotatedPdf: true,
|
|
168
|
-
debug: true,
|
|
162
|
+
await convert(['path/to/document.pdf', 'path/to/folder'], {
|
|
163
|
+
outputDir: 'path/to/output',
|
|
164
|
+
format: ['json', 'html', 'pdf', 'markdown'],
|
|
169
165
|
});
|
|
170
|
-
console.log('
|
|
166
|
+
console.log('convert() complete');
|
|
171
167
|
} catch (error) {
|
|
172
168
|
console.error('Error processing PDF:', error);
|
|
173
169
|
}
|
|
@@ -175,60 +171,63 @@ async function main() {
|
|
|
175
171
|
|
|
176
172
|
main();
|
|
177
173
|
```
|
|
174
|
+
### Function: convert()
|
|
175
|
+
|
|
176
|
+
`convert(inputPaths: string[], options?: ConvertOptions): Promise<string>`
|
|
177
|
+
|
|
178
|
+
Multi-input helper matching the Python wrapper.
|
|
179
|
+
|
|
180
|
+
| Property | Type | Default | Description |
|
|
181
|
+
| --------------------------------| ---------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
|
182
|
+
| `inputPaths` | `string[]` | — | One or more file paths or directories to process. |
|
|
183
|
+
| `options.outputDir` | `string` | `undefined` | Directory where outputs are written. |
|
|
184
|
+
| `options.password` | `string` | `undefined` | Password for encrypted PDFs. |
|
|
185
|
+
| `options.format` | `string[]` | `undefined` | Output formats (any combination of `json`, `text`, `html`, `pdf`, `markdown`, `markdown-with-html`, `markdown-with-images`). |
|
|
186
|
+
| `options.quiet` | `boolean` | `false` | Suppress CLI logging output and prevent streaming. |
|
|
187
|
+
| `options.contentSafetyOff` | `string[]` | `undefined` | Disable one or more content safety filters (`all`, `hidden-text`, `off-page`, `tiny`, `hidden-ocg`). |
|
|
188
|
+
| `options.keepLineBreaks` | `boolean` | `false` | Preserve line breaks in text output. |
|
|
189
|
+
| `options.replaceInvalidChars` | `string` | `undefined` | Replacement character for invalid or unrecognized characters. |
|
|
190
|
+
|
|
191
|
+
### Function: run()
|
|
192
|
+
|
|
193
|
+
Deprecated.
|
|
178
194
|
|
|
179
|
-
|
|
195
|
+
### CLI
|
|
180
196
|
|
|
181
197
|
```bash
|
|
182
|
-
npx @opendataloader/pdf path/to/document.pdf -o path/to/output
|
|
198
|
+
npx @opendataloader/pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
|
|
183
199
|
```
|
|
184
200
|
|
|
185
|
-
|
|
201
|
+
Or install globally:
|
|
186
202
|
|
|
187
203
|
```bash
|
|
188
204
|
npm install -g @opendataloader/pdf
|
|
189
205
|
```
|
|
190
206
|
|
|
191
|
-
|
|
207
|
+
Then run:
|
|
192
208
|
|
|
193
209
|
```bash
|
|
194
|
-
opendataloader-pdf path/to/document.pdf -o path/to/output
|
|
210
|
+
opendataloader-pdf path/to/document.pdf path/to/folder -o path/to/output -f json html pdf markdown
|
|
195
211
|
```
|
|
196
212
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
`run(inputPath: string, options?: RunOptions): Promise<string>`
|
|
213
|
+
#### Available options
|
|
200
214
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
| Property | Type | Default | Description |
|
|
213
|
-
| ----------------------- | --------- | ------------- |-------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
214
|
-
| `outputFolder` | `string` | `undefined` | Path to the output folder. If not set, output is saved next to the input. |
|
|
215
|
-
| `password` | `string` | `undefined` | Password for the PDF file. |
|
|
216
|
-
| `replaceInvalidChars` | `string` | `" "` | Character to replace invalid or unrecognized characters (e.g., , \u0000). |
|
|
217
|
-
| `contentSafetyOff` | `string` | `undefined` | Disables one or more content safety filters. Accepts a comma-separated list of filter names. Arguments: all, hidden-text, off-page, tiny, hidden-ocg. |
|
|
218
|
-
| `generateMarkdown` | `boolean` | `false` | If `true`, generates a Markdown output file. |
|
|
219
|
-
| `generateHtml` | `boolean` | `false` | If `true`, generates an HTML output file. |
|
|
220
|
-
| `generateAnnotatedPdf` | `boolean` | `false` | If `true`, generates an annotated PDF output file. |
|
|
221
|
-
| `keepLineBreaks` | `boolean` | `false` | If `true`, keeps line breaks in the output. |
|
|
222
|
-
| `htmlInMarkdown` | `boolean` | `false` | If `true`, uses HTML in the Markdown output. |
|
|
223
|
-
| `addImageToMarkdown` | `boolean` | `false` | If `true`, adds images to the Markdown output. |
|
|
224
|
-
| `noJson` | `boolean` | `false` | If `true`, disables the JSON output. |
|
|
225
|
-
| `debug` | `boolean` | `false` | If `true`, prints CLI messages to the console during execution. |
|
|
215
|
+
```
|
|
216
|
+
-o, --output-dir <path> Directory where outputs are written
|
|
217
|
+
-p, --password <password> Password for encrypted PDFs
|
|
218
|
+
-f, --format <value...> Output formats to generate (json, text, html, pdf, markdown, markdown-with-html, markdown-with-images)
|
|
219
|
+
-q, --quiet Suppress CLI logging output
|
|
220
|
+
--content-safety-off <mode...> Disable one or more content safety filters (all, hidden-text, off-page, tiny, hidden-ocg)
|
|
221
|
+
--keep-line-breaks Preserve line breaks in text output
|
|
222
|
+
--replace-invalid-chars <c> Replacement character for invalid or unrecognized characters
|
|
223
|
+
-h, --help Show usage information
|
|
224
|
+
```
|
|
226
225
|
|
|
227
226
|
<br/>
|
|
228
227
|
|
|
229
228
|
## Java
|
|
230
229
|
|
|
231
|
-
For various example templates, including Gradle and Maven, please refer to https://github.com/opendataloader-project/opendataloader-pdf
|
|
230
|
+
For various example templates, including Gradle and Maven, please refer to [Examples](https://github.com/opendataloader-project/opendataloader-pdf-examples).
|
|
232
231
|
|
|
233
232
|
### Dependency
|
|
234
233
|
|
|
@@ -244,7 +243,7 @@ Check for the latest version on [Maven Central](https://search.maven.org/artifac
|
|
|
244
243
|
<dependency>
|
|
245
244
|
<groupId>org.opendataloader</groupId>
|
|
246
245
|
<artifactId>opendataloader-pdf-core</artifactId>
|
|
247
|
-
<version>1.
|
|
246
|
+
<version>1.1.2</version>
|
|
248
247
|
</dependency>
|
|
249
248
|
</dependencies>
|
|
250
249
|
|
|
@@ -14,9 +14,9 @@ opendataloader_pdf/THIRD_PARTY/licenses/LICENSE-JJ2000.txt,sha256=itSesIy3XiNWgJ
|
|
|
14
14
|
opendataloader_pdf/THIRD_PARTY/licenses/MIT.txt,sha256=JPCdbR3BU0uO_KypOd3sGWnKwlVHGq4l0pmrjoGtop8,1078
|
|
15
15
|
opendataloader_pdf/THIRD_PARTY/licenses/MPL-2.0.txt,sha256=CGF6Fx5WV7DJmRZJ8_6w6JEt2N9bu4p6zDo18fTHHRw,15818
|
|
16
16
|
opendataloader_pdf/THIRD_PARTY/licenses/Plexus Classworlds License.txt,sha256=ZQuKXwVz4FeC34ApB20vYg8kPTwgIUKRzEk5ew74-hU,1937
|
|
17
|
-
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=
|
|
18
|
-
opendataloader_pdf-1.1.
|
|
19
|
-
opendataloader_pdf-1.1.
|
|
20
|
-
opendataloader_pdf-1.1.
|
|
21
|
-
opendataloader_pdf-1.1.
|
|
22
|
-
opendataloader_pdf-1.1.
|
|
17
|
+
opendataloader_pdf/jar/opendataloader-pdf-cli.jar,sha256=lO7KQMXlaldURR_RoOC5nZ_kiVNbjDPxlBHZp-ro0Rg,20540216
|
|
18
|
+
opendataloader_pdf-1.1.3.dist-info/METADATA,sha256=eeGkaZt33Flu_X-wWu9EjFE2XOsxc7oqQxIM7_I8jIc,24562
|
|
19
|
+
opendataloader_pdf-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
+
opendataloader_pdf-1.1.3.dist-info/entry_points.txt,sha256=Tupa9pVNF6nXD9sqzCLI8PCHbSu0jKkL3SYyTkQy0dc,71
|
|
21
|
+
opendataloader_pdf-1.1.3.dist-info/top_level.txt,sha256=xee0qFQd6HPfS50E2NLICGuR6cq9C9At5SJ81yv5HkY,19
|
|
22
|
+
opendataloader_pdf-1.1.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|