pdfmuse-haystack 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Rust
|
|
2
|
+
/target/
|
|
3
|
+
**/*.rs.bk
|
|
4
|
+
|
|
5
|
+
# Node
|
|
6
|
+
node_modules/
|
|
7
|
+
dist/
|
|
8
|
+
pkg/
|
|
9
|
+
crates/pdfmuse-wasm/pkg/
|
|
10
|
+
*.node
|
|
11
|
+
*.tsbuildinfo
|
|
12
|
+
# generated by napi/tsc in the node binding
|
|
13
|
+
bindings/node/index.js
|
|
14
|
+
bindings/node/index.d.ts
|
|
15
|
+
bindings/node/native.js
|
|
16
|
+
bindings/node/native.d.ts
|
|
17
|
+
|
|
18
|
+
# Python
|
|
19
|
+
__pycache__/
|
|
20
|
+
*.py[cod]
|
|
21
|
+
.venv/
|
|
22
|
+
venv/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
build/
|
|
25
|
+
*.whl
|
|
26
|
+
.pytest_cache/
|
|
27
|
+
|
|
28
|
+
# Native artifacts
|
|
29
|
+
*.so
|
|
30
|
+
*.dylib
|
|
31
|
+
*.pyd
|
|
32
|
+
|
|
33
|
+
# OS / editor
|
|
34
|
+
.DS_Store
|
|
35
|
+
*.swp
|
|
36
|
+
.idea/
|
|
37
|
+
.vscode/
|
|
38
|
+
|
|
39
|
+
# Note: Cargo.lock IS committed (this workspace ships a binary, pdfmuse-cli).
|
|
40
|
+
|
|
41
|
+
bindings/node/package-lock.json
|
|
42
|
+
bindings/node/npm/
|
|
43
|
+
bindings/node/artifacts/
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfmuse-haystack
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Haystack converter for pdfmuse — deterministic PDF/DOCX parsing for RAG.
|
|
5
|
+
Project-URL: Homepage, https://github.com/casperkwok/pdfmuse
|
|
6
|
+
Project-URL: Repository, https://github.com/casperkwok/pdfmuse
|
|
7
|
+
Author: Casper Kwok
|
|
8
|
+
License-Expression: MIT OR Apache-2.0
|
|
9
|
+
Keywords: converter,docx,haystack,llm,parser,pdf,pdfmuse,rag
|
|
10
|
+
Requires-Python: >=3.9
|
|
11
|
+
Requires-Dist: haystack-ai>=2.0
|
|
12
|
+
Requires-Dist: pdfmuse>=0.1.6
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# pdfmuse-haystack
|
|
16
|
+
|
|
17
|
+
[Haystack](https://haystack.deepset.ai) converter for [**pdfmuse**](https://github.com/casperkwok/pdfmuse) —
|
|
18
|
+
a **deterministic** PDF/DOCX parser for RAG. Same file in → same Document out.
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install pdfmuse-haystack
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from pdfmuse_haystack import PdfmuseConverter
|
|
28
|
+
|
|
29
|
+
converter = PdfmuseConverter(mode="markdown") # or "text" (default)
|
|
30
|
+
docs = converter.run(sources=["report.pdf"])["documents"]
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
In a pipeline:
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from haystack import Pipeline
|
|
37
|
+
from pdfmuse_haystack import PdfmuseConverter
|
|
38
|
+
|
|
39
|
+
pipe = Pipeline()
|
|
40
|
+
pipe.add_component("converter", PdfmuseConverter(mode="text"))
|
|
41
|
+
# ... connect to a splitter / embedder / writer ...
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Modes
|
|
45
|
+
|
|
46
|
+
- `"text"` *(default)* — plain reading-order text (fast path).
|
|
47
|
+
- `"markdown"` — structured Markdown (headings + tables).
|
|
48
|
+
|
|
49
|
+
Extracts text with exact coordinates, tables and structure; no probabilistic models
|
|
50
|
+
in the core path, so your index is reproducible run-to-run. Scanned/image-only pages
|
|
51
|
+
surface a `NeedsOcr` warning (OCR is a pluggable backend, kept out of the core).
|
|
52
|
+
|
|
53
|
+
MIT OR Apache-2.0 · part of the [pdfmuse](https://github.com/casperkwok/pdfmuse) project.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# pdfmuse-haystack
|
|
2
|
+
|
|
3
|
+
[Haystack](https://haystack.deepset.ai) converter for [**pdfmuse**](https://github.com/casperkwok/pdfmuse) —
|
|
4
|
+
a **deterministic** PDF/DOCX parser for RAG. Same file in → same Document out.
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
pip install pdfmuse-haystack
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
## Usage
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from pdfmuse_haystack import PdfmuseConverter
|
|
14
|
+
|
|
15
|
+
converter = PdfmuseConverter(mode="markdown") # or "text" (default)
|
|
16
|
+
docs = converter.run(sources=["report.pdf"])["documents"]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
In a pipeline:
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from haystack import Pipeline
|
|
23
|
+
from pdfmuse_haystack import PdfmuseConverter
|
|
24
|
+
|
|
25
|
+
pipe = Pipeline()
|
|
26
|
+
pipe.add_component("converter", PdfmuseConverter(mode="text"))
|
|
27
|
+
# ... connect to a splitter / embedder / writer ...
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Modes
|
|
31
|
+
|
|
32
|
+
- `"text"` *(default)* — plain reading-order text (fast path).
|
|
33
|
+
- `"markdown"` — structured Markdown (headings + tables).
|
|
34
|
+
|
|
35
|
+
Extracts text with exact coordinates, tables and structure; no probabilistic models
|
|
36
|
+
in the core path, so your index is reproducible run-to-run. Scanned/image-only pages
|
|
37
|
+
surface a `NeedsOcr` warning (OCR is a pluggable backend, kept out of the core).
|
|
38
|
+
|
|
39
|
+
MIT OR Apache-2.0 · part of the [pdfmuse](https://github.com/casperkwok/pdfmuse) project.
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Haystack converter for **pdfmuse** — a deterministic PDF/DOCX parser.
|
|
2
|
+
|
|
3
|
+
from pdfmuse_haystack import PdfmuseConverter
|
|
4
|
+
|
|
5
|
+
converter = PdfmuseConverter(mode="markdown")
|
|
6
|
+
docs = converter.run(sources=["report.pdf"])["documents"]
|
|
7
|
+
|
|
8
|
+
Deterministic: the same file always yields the same Document, so your index is
|
|
9
|
+
reproducible run-to-run. Scanned/image-only pages surface a NeedsOcr warning.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Dict, List, Optional, Union
|
|
15
|
+
|
|
16
|
+
from haystack import Document, component, default_from_dict, default_to_dict
|
|
17
|
+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
|
|
18
|
+
from haystack.dataclasses import ByteStream
|
|
19
|
+
|
|
20
|
+
__all__ = ["PdfmuseConverter"]
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _to_str(data: bytes, mode: str) -> str:
    """Render raw document bytes as a string using pdfmuse.

    Args:
        data: Raw bytes of the document to parse.
        mode: ``"markdown"`` selects ``pdfmuse.to_markdown``; every other
            value falls through to ``pdfmuse.to_text``.

    Returns:
        Whatever the selected pdfmuse function returns for ``data``.
    """
    # pdfmuse is imported at call time rather than at module import time.
    import pdfmuse

    render = pdfmuse.to_markdown if mode == "markdown" else pdfmuse.to_text
    return render(data)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@component
class PdfmuseConverter:
    """Convert PDF/DOCX files to Haystack ``Document``s via pdfmuse.

    Sources that cannot be read or parsed are skipped (with a logged warning)
    rather than aborting the whole batch, so the returned list may be shorter
    than the list of sources.

    Args:
        mode: ``"text"`` (plain reading-order text, default) or ``"markdown"``
            (headings + tables).

    Raises:
        ValueError: If ``mode`` is neither ``"text"`` nor ``"markdown"``.
    """

    def __init__(self, mode: str = "text") -> None:
        if mode not in ("text", "markdown"):
            raise ValueError("mode must be 'text' or 'markdown'")
        self.mode = mode

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this component, recording ``mode`` as its init parameter."""
        return default_to_dict(self, mode=self.mode)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PdfmuseConverter":
        """Rebuild a component from a dictionary produced by ``to_dict``."""
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """Convert each source into a ``Document``.

        Args:
            sources: File paths or ``ByteStream`` objects to convert.
            meta: Extra metadata for the produced documents, normalized with
                ``normalize_metadata`` against the number of sources
                (presumably a single dict applies to every source and a list
                is matched one-to-one -- confirm against the Haystack docs).
                On key collisions these values override the metadata carried
                by the source's ``ByteStream``.

        Returns:
            A dictionary with a single ``"documents"`` key holding the
            documents that were converted successfully, in source order.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        logger = logging.getLogger(__name__)

        documents: List[Document] = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))
        for source, md in zip(sources, meta_list):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as exc:
                # Best-effort: one unreadable source must not fail the batch,
                # but the caller needs to know it was dropped.
                logger.warning(
                    "Could not read %s. Skipping it. Error: %s", source, exc
                )
                continue
            try:
                content = _to_str(bytestream.data, self.mode)
            except Exception as exc:
                logger.warning(
                    "Could not convert %s with pdfmuse. Skipping it. Error: %s",
                    source,
                    exc,
                )
                continue
            # Caller-supplied metadata wins over metadata from the bytestream.
            merged = {**bytestream.meta, **md}
            documents.append(Document(content=content, meta=merged))
        return {"documents": documents}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pdfmuse-haystack"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Haystack converter for pdfmuse — deterministic PDF/DOCX parsing for RAG."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT OR Apache-2.0"
|
|
12
|
+
authors = [{ name = "Casper Kwok" }]
|
|
13
|
+
keywords = ["haystack", "pdf", "docx", "rag", "llm", "parser", "pdfmuse", "converter"]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"pdfmuse>=0.1.6",
|
|
16
|
+
"haystack-ai>=2.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://github.com/casperkwok/pdfmuse"
|
|
21
|
+
Repository = "https://github.com/casperkwok/pdfmuse"
|
|
22
|
+
|
|
23
|
+
[tool.hatch.build.targets.wheel]
|
|
24
|
+
packages = ["pdfmuse_haystack"]
|