pdfmuse-haystack 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ # Rust
2
+ /target/
3
+ **/*.rs.bk
4
+
5
+ # Node
6
+ node_modules/
7
+ dist/
8
+ pkg/
9
+ crates/pdfmuse-wasm/pkg/
10
+ *.node
11
+ *.tsbuildinfo
12
+ # generated by napi/tsc in the node binding
13
+ bindings/node/index.js
14
+ bindings/node/index.d.ts
15
+ bindings/node/native.js
16
+ bindings/node/native.d.ts
17
+
18
+ # Python
19
+ __pycache__/
20
+ *.py[cod]
21
+ .venv/
22
+ venv/
23
+ *.egg-info/
24
+ build/
25
+ *.whl
26
+ .pytest_cache/
27
+
28
+ # Native artifacts
29
+ *.so
30
+ *.dylib
31
+ *.pyd
32
+
33
+ # OS / editor
34
+ .DS_Store
35
+ *.swp
36
+ .idea/
37
+ .vscode/
38
+
39
+ # Note: Cargo.lock IS committed (this workspace ships a binary, pdfmuse-cli).
40
+
41
+ bindings/node/package-lock.json
42
+ bindings/node/npm/
43
+ bindings/node/artifacts/
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfmuse-haystack
3
+ Version: 0.1.0
4
+ Summary: Haystack converter for pdfmuse — deterministic PDF/DOCX parsing for RAG.
5
+ Project-URL: Homepage, https://github.com/casperkwok/pdfmuse
6
+ Project-URL: Repository, https://github.com/casperkwok/pdfmuse
7
+ Author: Casper Kwok
8
+ License-Expression: MIT OR Apache-2.0
9
+ Keywords: converter,docx,haystack,llm,parser,pdf,pdfmuse,rag
10
+ Requires-Python: >=3.9
11
+ Requires-Dist: haystack-ai>=2.0
12
+ Requires-Dist: pdfmuse>=0.1.6
13
+ Description-Content-Type: text/markdown
14
+
15
+ # pdfmuse-haystack
16
+
17
+ [Haystack](https://haystack.deepset.ai) converter for [**pdfmuse**](https://github.com/casperkwok/pdfmuse) —
18
+ a **deterministic** PDF/DOCX parser for RAG. Same file in → same Document out.
19
+
20
+ ```bash
21
+ pip install pdfmuse-haystack
22
+ ```
23
+
24
+ ## Usage
25
+
26
+ ```python
27
+ from pdfmuse_haystack import PdfmuseConverter
28
+
29
+ converter = PdfmuseConverter(mode="markdown") # or "text" (default)
30
+ docs = converter.run(sources=["report.pdf"])["documents"]
31
+ ```
32
+
33
+ In a pipeline:
34
+
35
+ ```python
36
+ from haystack import Pipeline
37
+ from pdfmuse_haystack import PdfmuseConverter
38
+
39
+ pipe = Pipeline()
40
+ pipe.add_component("converter", PdfmuseConverter(mode="text"))
41
+ # ... connect to a splitter / embedder / writer ...
42
+ ```
43
+
44
+ ## Modes
45
+
46
+ - `"text"` *(default)* — plain reading-order text (fast path).
47
+ - `"markdown"` — structured Markdown (headings + tables).
48
+
49
+ Extracts text with exact coordinates, tables and structure; no probabilistic models
50
+ in the core path, so your index is reproducible run-to-run. Scanned/image-only pages
51
+ surface a `NeedsOcr` warning (OCR is a pluggable backend, kept out of the core).
52
+
53
+ MIT OR Apache-2.0 · part of the [pdfmuse](https://github.com/casperkwok/pdfmuse) project.
@@ -0,0 +1,39 @@
1
+ # pdfmuse-haystack
2
+
3
+ [Haystack](https://haystack.deepset.ai) converter for [**pdfmuse**](https://github.com/casperkwok/pdfmuse) —
4
+ a **deterministic** PDF/DOCX parser for RAG. Same file in → same Document out.
5
+
6
+ ```bash
7
+ pip install pdfmuse-haystack
8
+ ```
9
+
10
+ ## Usage
11
+
12
+ ```python
13
+ from pdfmuse_haystack import PdfmuseConverter
14
+
15
+ converter = PdfmuseConverter(mode="markdown") # or "text" (default)
16
+ docs = converter.run(sources=["report.pdf"])["documents"]
17
+ ```
18
+
19
+ In a pipeline:
20
+
21
+ ```python
22
+ from haystack import Pipeline
23
+ from pdfmuse_haystack import PdfmuseConverter
24
+
25
+ pipe = Pipeline()
26
+ pipe.add_component("converter", PdfmuseConverter(mode="text"))
27
+ # ... connect to a splitter / embedder / writer ...
28
+ ```
29
+
30
+ ## Modes
31
+
32
+ - `"text"` *(default)* — plain reading-order text (fast path).
33
+ - `"markdown"` — structured Markdown (headings + tables).
34
+
35
+ Extracts text with exact coordinates, tables and structure; no probabilistic models
36
+ in the core path, so your index is reproducible run-to-run. Scanned/image-only pages
37
+ surface a `NeedsOcr` warning (OCR is a pluggable backend, kept out of the core).
38
+
39
+ MIT OR Apache-2.0 · part of the [pdfmuse](https://github.com/casperkwok/pdfmuse) project.
@@ -0,0 +1,72 @@
1
+ """Haystack converter for **pdfmuse** — a deterministic PDF/DOCX parser.
2
+
3
+ from pdfmuse_haystack import PdfmuseConverter
4
+
5
+ converter = PdfmuseConverter(mode="markdown")
6
+ docs = converter.run(sources=["report.pdf"])["documents"]
7
+
8
+ Deterministic: the same file always yields the same Document, so your index is
9
+ reproducible run-to-run. Scanned/image-only pages surface a NeedsOcr warning.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ from pathlib import Path
14
+ from typing import Any, Dict, List, Optional, Union
15
+
16
+ from haystack import Document, component, default_from_dict, default_to_dict
17
+ from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
18
+ from haystack.dataclasses import ByteStream
19
+
20
+ __all__ = ["PdfmuseConverter"]
21
+ __version__ = "0.1.0"
22
+
23
+
24
def _to_str(data: bytes, mode: str) -> str:
    """Render raw document bytes to a string with pdfmuse.

    Args:
        data: Raw bytes of the document to parse.
        mode: ``"markdown"`` selects ``pdfmuse.to_markdown``; any other value
            falls through to ``pdfmuse.to_text``.

    Returns:
        Whatever the selected pdfmuse function returns for ``data``.
    """
    # Imported at call time rather than at module import time.
    import pdfmuse

    render = pdfmuse.to_markdown if mode == "markdown" else pdfmuse.to_text
    return render(data)
30
+
31
+
32
@component
class PdfmuseConverter:
    """Convert PDF/DOCX files to Haystack ``Document``s via pdfmuse.

    Args:
        mode: ``"text"`` (plain reading-order text, default) or ``"markdown"``
            (headings + tables).

    Raises:
        ValueError: If ``mode`` is neither ``"text"`` nor ``"markdown"``.
    """

    def __init__(self, mode: str = "text") -> None:
        # Reject unknown modes up front so a typo fails at construction time
        # instead of silently falling back to plain text during conversion.
        if mode not in ("text", "markdown"):
            raise ValueError("mode must be 'text' or 'markdown'")
        self.mode = mode

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this component, recording ``mode`` as its only init parameter."""
        return default_to_dict(self, mode=self.mode)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "PdfmuseConverter":
        """Rebuild a component from a dictionary produced by :meth:`to_dict`."""
        return default_from_dict(cls, data)

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """Convert each source into a ``Document``.

        Conversion is best-effort: a source that cannot be read or parsed is
        skipped, and a warning naming the source and the error is logged so
        the omission is visible to the caller.

        Args:
            sources: File paths or ``ByteStream`` objects to convert.
            meta: Extra metadata for the produced documents, passed through
                ``normalize_metadata`` to obtain one dict per source.

        Returns:
            ``{"documents": [...]}`` with one ``Document`` per successfully
            converted source, in input order.
        """
        # Function-scope import, matching the deferred-import style already
        # used by this module for pdfmuse.
        import logging

        logger = logging.getLogger(__name__)

        documents: List[Document] = []
        meta_list = normalize_metadata(meta, sources_count=len(sources))
        for source, md in zip(sources, meta_list):
            # Short label for log messages; avoids dumping a ByteStream's
            # payload into the log.
            if isinstance(source, ByteStream):
                label = source.meta.get("file_path", "<ByteStream>")
            else:
                label = source

            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as exc:
                logger.warning("Could not read %s. Skipping it. Error: %s", label, exc)
                continue

            try:
                content = _to_str(bytestream.data, self.mode)
            except Exception as exc:
                logger.warning(
                    "Could not convert %s with pdfmuse. Skipping it. Error: %s",
                    label,
                    exc,
                )
                continue

            # Caller-supplied metadata overrides the bytestream's own keys.
            merged = {**bytestream.meta, **md}
            documents.append(Document(content=content, meta=merged))
        return {"documents": documents}
@@ -0,0 +1,24 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pdfmuse-haystack"
7
+ version = "0.1.0"
8
+ description = "Haystack converter for pdfmuse — deterministic PDF/DOCX parsing for RAG."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT OR Apache-2.0"
12
+ authors = [{ name = "Casper Kwok" }]
13
+ keywords = ["haystack", "pdf", "docx", "rag", "llm", "parser", "pdfmuse", "converter"]
14
+ dependencies = [
15
+ "pdfmuse>=0.1.6",
16
+ "haystack-ai>=2.0",
17
+ ]
18
+
19
+ [project.urls]
20
+ Homepage = "https://github.com/casperkwok/pdfmuse"
21
+ Repository = "https://github.com/casperkwok/pdfmuse"
22
+
23
+ [tool.hatch.build.targets.wheel]
24
+ packages = ["pdfmuse_haystack"]