obsidian-import 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .pixi/
7
+ .pytest_cache/
8
+ .hypothesis/
9
+ .ruff_cache/
10
+ .coverage
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Matthias Christenson
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,175 @@
1
+ Metadata-Version: 2.4
2
+ Name: obsidian-import
3
+ Version: 0.1.0
4
+ Summary: Extract files (PDF, DOCX, PPTX, XLSX) into Obsidian-flavored Markdown
5
+ Project-URL: Documentation, https://neuralsignal.github.io/obsidian-import/
6
+ Project-URL: Repository, https://github.com/neuralsignal/obsidian-import
7
+ Author: Matthias Christenson
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: docx,extraction,import,markdown,obsidian,pdf,pptx,xlsx
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: End Users/Desktop
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Office/Business
18
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: click<9,>=8.0
21
+ Requires-Dist: defusedxml<1,>=0.7
22
+ Requires-Dist: openpyxl<4,>=3.1
23
+ Requires-Dist: pdfplumber<1,>=0.11
24
+ Requires-Dist: pypdf<6,>=5.0
25
+ Requires-Dist: python-pptx<2,>=1.0
26
+ Requires-Dist: pyyaml<7,>=6.0
27
+ Provides-Extra: dev
28
+ Requires-Dist: hypothesis<7,>=6.0; extra == 'dev'
29
+ Requires-Dist: pytest-cov<6,>=5.0; extra == 'dev'
30
+ Requires-Dist: pytest<9,>=8.0; extra == 'dev'
31
+ Provides-Extra: docling
32
+ Requires-Dist: docling>=2.0; extra == 'docling'
33
+ Provides-Extra: markitdown
34
+ Requires-Dist: markitdown[all]>=0.1; extra == 'markitdown'
35
+ Description-Content-Type: text/markdown
36
+
37
+ # obsidian-import
38
+
39
+ Extract files (PDF, DOCX, PPTX, XLSX) into Obsidian-flavored Markdown.
40
+
41
+ The mirror of [obsidian-export](https://github.com/neuralsignal/obsidian-export): where obsidian-export converts Obsidian notes to PDF/DOCX, obsidian-import converts external documents into Obsidian-ready markdown with YAML frontmatter.
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install obsidian-import
47
+ ```
48
+
49
+ With optional backends:
50
+
51
+ ```bash
52
+ pip install "obsidian-import[markitdown]"  # fallback for HTML, CSV, etc.
53
+ pip install "obsidian-import[docling]"     # high-quality ML-based extraction
54
+ ```
55
+
56
+ ## Quick Start
57
+
58
+ ### Single file
59
+
60
+ ```bash
61
+ obsidian-import convert report.pdf --output vault/imports/report.md
62
+ ```
63
+
64
+ ### Batch extraction
65
+
66
+ ```bash
67
+ obsidian-import batch --config config.yaml
68
+ ```
69
+
70
+ ### Check backend availability
71
+
72
+ ```bash
73
+ obsidian-import doctor
74
+ ```
75
+
76
+ ## Python API
77
+
78
+ ```python
79
+ from pathlib import Path
80
+ from obsidian_import import extract_file, discover_files
81
+ from obsidian_import.config import load_config
82
+ from obsidian_import.output import format_output
83
+
84
+ config = load_config(Path("config.yaml"))
85
+
86
+ # Single file
87
+ doc = extract_file(Path("report.pdf"), config)
88
+ markdown = format_output(doc, config.output)
89
+
90
+ # Batch discovery
91
+ for file in discover_files(config):
92
+ print(f"{file.extension} {file.size_bytes:,} bytes {file.path}")
93
+ ```
94
+
95
+ ## Configuration
96
+
97
+ Create a `config.yaml`:
98
+
99
+ ```yaml
100
+ input:
101
+ directories:
102
+ - path: /path/to/documents
103
+ extensions: [".pdf", ".docx", ".pptx", ".xlsx"]
104
+ exclude: ["*.tmp", "~$*"]
105
+
106
+ output:
107
+ directory: ./extracted
108
+ frontmatter: true
109
+ metadata_fields:
110
+ - title
111
+ - source
112
+ - original_path
113
+ - file_type
114
+ - extracted_at
115
+ - page_count
116
+
117
+ backends:
118
+ pdf: native # pdfplumber + pypdf
119
+ docx: native # defusedxml
120
+ pptx: native # python-pptx
121
+ xlsx: native # openpyxl
122
+ default: native # fallback for unknown extensions
123
+
124
+ extraction:
125
+ timeout_seconds: 120
126
+ max_file_size_mb: 100
127
+ xlsx_max_rows_per_sheet: 500
128
+ ```
129
+
130
+ ## Backend Selection
131
+
132
+ | Backend | Extensions | Dependencies | Quality |
133
+ |---------|-----------|--------------|---------|
134
+ | `native` | .pdf, .docx, .pptx, .xlsx | Core (included) | Good for text-heavy documents |
135
+ | `markitdown` | Any | `[markitdown]` extra | Good fallback for HTML, CSV, etc. |
136
+ | `docling` | Any | `[docling]` extra | Best for complex layouts, tables |
137
+
138
+ ## CLI Reference
139
+
140
+ | Command | Description |
141
+ |---------|-------------|
142
+ | `obsidian-import convert <path>` | Extract a single file |
143
+ | `obsidian-import discover --config <yaml>` | List matching files |
144
+ | `obsidian-import batch --config <yaml>` | Extract all discovered files |
145
+ | `obsidian-import doctor` | Check backend availability |
146
+
147
+ ## Output Format
148
+
149
+ Extracted files are written as Obsidian-flavored markdown with YAML frontmatter:
150
+
151
+ ```markdown
152
+ ---
153
+ title: Annual Report
154
+ source: obsidian-import
155
+ original_path: /documents/report.pdf
156
+ file_type: pdf
157
+ extracted_at: 2026-03-09T10:30:00Z
158
+ page_count: 12
159
+ ---
160
+
161
+ # Annual Report
162
+
163
+ ## Page 1
164
+
165
+ Content extracted from the first page...
166
+ ```
167
+
168
+ ## Related Packages
169
+
170
+ - [obsidian-export](https://github.com/neuralsignal/obsidian-export) — Convert Obsidian notes to PDF/DOCX
171
+ - [agentic-brain](https://github.com/neuralsignal/agentic-brain) — Agentic knowledge management (consumes both packages)
172
+
173
+ ## License
174
+
175
+ MIT
@@ -0,0 +1,139 @@
1
+ # obsidian-import
2
+
3
+ Extract files (PDF, DOCX, PPTX, XLSX) into Obsidian-flavored Markdown.
4
+
5
+ The mirror of [obsidian-export](https://github.com/neuralsignal/obsidian-export): where obsidian-export converts Obsidian notes to PDF/DOCX, obsidian-import converts external documents into Obsidian-ready markdown with YAML frontmatter.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install obsidian-import
11
+ ```
12
+
13
+ With optional backends:
14
+
15
+ ```bash
16
+ pip install "obsidian-import[markitdown]"  # fallback for HTML, CSV, etc.
17
+ pip install "obsidian-import[docling]"     # high-quality ML-based extraction
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ### Single file
23
+
24
+ ```bash
25
+ obsidian-import convert report.pdf --output vault/imports/report.md
26
+ ```
27
+
28
+ ### Batch extraction
29
+
30
+ ```bash
31
+ obsidian-import batch --config config.yaml
32
+ ```
33
+
34
+ ### Check backend availability
35
+
36
+ ```bash
37
+ obsidian-import doctor
38
+ ```
39
+
40
+ ## Python API
41
+
42
+ ```python
43
+ from pathlib import Path
44
+ from obsidian_import import extract_file, discover_files
45
+ from obsidian_import.config import load_config
46
+ from obsidian_import.output import format_output
47
+
48
+ config = load_config(Path("config.yaml"))
49
+
50
+ # Single file
51
+ doc = extract_file(Path("report.pdf"), config)
52
+ markdown = format_output(doc, config.output)
53
+
54
+ # Batch discovery
55
+ for file in discover_files(config):
56
+ print(f"{file.extension} {file.size_bytes:,} bytes {file.path}")
57
+ ```
58
+
59
+ ## Configuration
60
+
61
+ Create a `config.yaml`:
62
+
63
+ ```yaml
64
+ input:
65
+ directories:
66
+ - path: /path/to/documents
67
+ extensions: [".pdf", ".docx", ".pptx", ".xlsx"]
68
+ exclude: ["*.tmp", "~$*"]
69
+
70
+ output:
71
+ directory: ./extracted
72
+ frontmatter: true
73
+ metadata_fields:
74
+ - title
75
+ - source
76
+ - original_path
77
+ - file_type
78
+ - extracted_at
79
+ - page_count
80
+
81
+ backends:
82
+ pdf: native # pdfplumber + pypdf
83
+ docx: native # defusedxml
84
+ pptx: native # python-pptx
85
+ xlsx: native # openpyxl
86
+ default: native # fallback for unknown extensions
87
+
88
+ extraction:
89
+ timeout_seconds: 120
90
+ max_file_size_mb: 100
91
+ xlsx_max_rows_per_sheet: 500
92
+ ```
93
+
94
+ ## Backend Selection
95
+
96
+ | Backend | Extensions | Dependencies | Quality |
97
+ |---------|-----------|--------------|---------|
98
+ | `native` | .pdf, .docx, .pptx, .xlsx | Core (included) | Good for text-heavy documents |
99
+ | `markitdown` | Any | `[markitdown]` extra | Good fallback for HTML, CSV, etc. |
100
+ | `docling` | Any | `[docling]` extra | Best for complex layouts, tables |
101
+
102
+ ## CLI Reference
103
+
104
+ | Command | Description |
105
+ |---------|-------------|
106
+ | `obsidian-import convert <path>` | Extract a single file |
107
+ | `obsidian-import discover --config <yaml>` | List matching files |
108
+ | `obsidian-import batch --config <yaml>` | Extract all discovered files |
109
+ | `obsidian-import doctor` | Check backend availability |
110
+
111
+ ## Output Format
112
+
113
+ Extracted files are written as Obsidian-flavored markdown with YAML frontmatter:
114
+
115
+ ```markdown
116
+ ---
117
+ title: Annual Report
118
+ source: obsidian-import
119
+ original_path: /documents/report.pdf
120
+ file_type: pdf
121
+ extracted_at: 2026-03-09T10:30:00Z
122
+ page_count: 12
123
+ ---
124
+
125
+ # Annual Report
126
+
127
+ ## Page 1
128
+
129
+ Content extracted from the first page...
130
+ ```
131
+
132
+ ## Related Packages
133
+
134
+ - [obsidian-export](https://github.com/neuralsignal/obsidian-export) — Convert Obsidian notes to PDF/DOCX
135
+ - [agentic-brain](https://github.com/neuralsignal/agentic-brain) — Agentic knowledge management (consumes both packages)
136
+
137
+ ## License
138
+
139
+ MIT
@@ -0,0 +1,81 @@
1
+ """obsidian-import: Extract files into Obsidian-flavored Markdown.
2
+
3
+ Public API:
4
+ extract_file(path, config) -> ExtractedDocument
5
+ extract_text(path, config) -> str
6
+ discover_files(config) -> Iterator[DiscoveredFile]
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Iterator
12
+ from pathlib import Path
13
+
14
+ from obsidian_import.config import ImportConfig
15
+ from obsidian_import.discovery import DiscoveredFile
16
+ from obsidian_import.discovery import discover_files as _discover_files
17
+ from obsidian_import.output import ExtractedDocument
18
+ from obsidian_import.registry import extract_with_backend
19
+
20
+
21
def extract_file(path: Path, config: ImportConfig) -> ExtractedDocument:
    """Extract ``path`` and wrap the resulting markdown with basic metadata.

    The extraction is delegated to ``extract_with_backend``, which receives
    ``config.backends`` and the configured timeout. ``.xlsx`` inputs also get
    the configured per-sheet row limit.

    Returns:
        An ``ExtractedDocument`` whose title is the file stem, whose file type
        is the lower-cased extension without the leading dot, and whose page
        count comes from ``_estimate_page_count``.
    """
    suffix = path.suffix.lower()
    limits = config.extraction

    # Only spreadsheets take an extra option; every other extension gets none.
    backend_options: dict[str, object] = (
        {"max_rows_per_sheet": limits.xlsx_max_rows_per_sheet} if suffix == ".xlsx" else {}
    )

    body = extract_with_backend(
        path,
        backends=config.backends,
        timeout_seconds=limits.timeout_seconds,
        **backend_options,
    )

    return ExtractedDocument(
        source_path=path,
        markdown=body,
        title=path.stem,
        file_type=suffix.lstrip("."),
        page_count=_estimate_page_count(body, suffix),
    )
49
+
50
+
51
def discover_files(config: ImportConfig) -> Iterator[DiscoveredFile]:
    """Forward ``config`` to the discovery module and hand back its iterator."""
    matches = _discover_files(config)
    return matches
54
+
55
+
56
def extract_text(path: Path, config: ImportConfig) -> str:
    """Return the backend's markdown for ``path`` as a bare string.

    Unlike ``extract_file`` the result is not wrapped in an
    ``ExtractedDocument``; the string from ``extract_with_backend`` is returned
    as-is.
    """
    settings = config.extraction
    options: dict[str, object] = {}

    # The row limit is only passed along for spreadsheets.
    if path.suffix.lower() == ".xlsx":
        options.update(max_rows_per_sheet=settings.xlsx_max_rows_per_sheet)

    text = extract_with_backend(
        path,
        backends=config.backends,
        timeout_seconds=settings.timeout_seconds,
        **options,
    )
    return text
68
+
69
+
70
+ def _estimate_page_count(markdown: str, extension: str) -> int | None:
71
+ """Estimate page count from extracted markdown.
72
+
73
+ For PDFs, count '## Page N' headings. For other formats, return None.
74
+ """
75
+ if extension == ".pdf":
76
+ count = 0
77
+ for line in markdown.splitlines():
78
+ if line.startswith("## Page "):
79
+ count += 1
80
+ return count if count > 0 else None
81
+ return None
@@ -0,0 +1 @@
1
+ """Backend modules for obsidian-import extractors."""
@@ -0,0 +1,31 @@
1
+ """High-quality document extraction using docling.
2
+
3
+ Requires the [docling] extra: pip install obsidian-import[docling]
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+
10
+ from obsidian_import.exceptions import BackendNotAvailableError
11
+ from obsidian_import.timeout import run_with_timeout
12
+
13
+
14
def extract(path: Path, timeout_seconds: int) -> str:
    """Convert ``path`` to markdown with docling.

    The conversion is handed to ``run_with_timeout`` together with
    ``timeout_seconds``.

    Raises:
        BackendNotAvailableError: if docling cannot be imported.
    """
    try:
        from docling.document_converter import DocumentConverter  # noqa: F811
    except ImportError as exc:
        message = "docling is not installed. Install with: pip install obsidian-import[docling]"
        raise BackendNotAvailableError(message) from exc

    def _do_extract() -> str:
        markdown = DocumentConverter().convert(str(path)).document.export_to_markdown()
        # Empty or whitespace-only output is replaced by a visible placeholder.
        if markdown and markdown.strip():
            return markdown
        return f"*No text content extracted from `{path.name}`.*"

    return run_with_timeout(_do_extract, timeout_seconds, "docling", path)
@@ -0,0 +1,31 @@
1
+ """Fallback extractor using markitdown for unrecognized formats.
2
+
3
+ Requires the [markitdown] extra: pip install obsidian-import[markitdown]
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+
10
+ from obsidian_import.exceptions import BackendNotAvailableError
11
+ from obsidian_import.timeout import run_with_timeout
12
+
13
+
14
def extract(path: Path, timeout_seconds: int) -> str:
    """Convert ``path`` to markdown with markitdown.

    The conversion is handed to ``run_with_timeout`` together with
    ``timeout_seconds``.

    Raises:
        BackendNotAvailableError: if markitdown cannot be imported.
    """
    try:
        from markitdown import MarkItDown  # noqa: F811
    except ImportError as exc:
        message = "markitdown is not installed. Install with: pip install obsidian-import[markitdown]"
        raise BackendNotAvailableError(message) from exc

    def _do_extract() -> str:
        markdown = MarkItDown().convert(str(path)).text_content
        # Empty or whitespace-only output is replaced by a visible placeholder.
        if markdown and markdown.strip():
            return markdown
        return f"*No text content extracted from `{path.name}`.*"

    return run_with_timeout(_do_extract, timeout_seconds, "markitdown", path)
@@ -0,0 +1,130 @@
1
+ """DOCX text extraction using defusedxml + zipfile.
2
+
3
+ Opens the DOCX as a ZIP archive, parses word/document.xml to extract
4
+ text with structure preservation (headings, paragraphs, tables).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import zipfile
10
+ from pathlib import Path
11
+ from xml.etree.ElementTree import Element
12
+
13
+ from obsidian_import.exceptions import ExtractionError
14
+ from obsidian_import.timeout import run_with_timeout
15
+
16
+ _NS = {
17
+ "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
18
+ "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
19
+ }
20
+
21
+
22
def extract(path: Path, timeout_seconds: int) -> str:
    """Convert the DOCX at ``path`` to markdown via ``run_with_timeout``."""

    def _job() -> str:
        return _extract_docx(path)

    return run_with_timeout(_job, timeout_seconds, "DOCX", path)
25
+
26
+
27
def _extract_docx(path: Path) -> str:
    """Parse ``word/document.xml`` from the archive and render it as markdown.

    The output starts with a level-1 heading built from the file stem,
    followed by one block per non-empty top-level paragraph or table of the
    document body, separated by blank lines.

    Raises:
        ExtractionError: if ``path`` is not a ZIP archive or the archive has
            no ``word/document.xml`` member.
    """
    from defusedxml.ElementTree import fromstring

    archive = str(path)
    if not zipfile.is_zipfile(archive):
        raise ExtractionError(f"Not a valid DOCX (ZIP) file: {path}")

    with zipfile.ZipFile(archive, "r") as zf:
        if "word/document.xml" not in zf.namelist():
            raise ExtractionError(f"No word/document.xml found in: {path}")
        root = fromstring(zf.read("word/document.xml"))

    body = root.find(f"{{{_NS['w']}}}body")
    if body is None:
        return f"# {path.stem}\n\n*No body content found.*"

    # Map the local tag name of each supported body child to its renderer.
    renderers = {"p": _extract_paragraph, "tbl": _extract_table}

    blocks: list[str] = [f"# {path.stem}"]
    for child in body:
        render = renderers.get(_local_name(child))
        if render is None:
            continue
        rendered = render(child)
        if rendered:
            blocks.append(rendered)

    return "\n\n".join(blocks)
60
+
61
+
62
+ def _local_name(element: Element) -> str:
63
+ """Get the local name of an XML element (strip namespace)."""
64
+ tag = element.tag
65
+ if "}" in tag:
66
+ return tag.split("}", 1)[1]
67
+ return tag
68
+
69
+
70
def _extract_paragraph(para: Element) -> str:
    """Render a ``w:p`` element as a single markdown block.

    The text of every ``w:t`` inside the paragraph's ``w:r`` runs is
    concatenated and stripped. If the paragraph style id is ``Heading<N>``,
    the text is emitted as an ATX heading one level below ``N``, because level
    1 is used for the document title in ``_extract_docx``. The level is capped
    at 6: Markdown has no deeper heading, so more ``#`` characters would be
    rendered as literal text.

    Returns:
        The markdown for the paragraph, or ``""`` when it has no text.
    """
    max_markdown_level = 6
    w = _NS["w"]

    heading_level = 0
    ppr = para.find(f"{{{w}}}pPr")
    if ppr is not None:
        pstyle = ppr.find(f"{{{w}}}pStyle")
        if pstyle is not None:
            style_val = pstyle.get(f"{{{w}}}val", "")
            if style_val.startswith("Heading"):
                try:
                    heading_level = int(style_val.replace("Heading", ""))
                except ValueError:
                    # Style ids such as "HeadingChar" are not numbered headings.
                    heading_level = 0

    texts: list[str] = []
    for run in para.iter(f"{{{w}}}r"):
        for t in run.iter(f"{{{w}}}t"):
            if t.text:
                texts.append(t.text)

    text = "".join(texts).strip()
    if not text:
        return ""

    if heading_level > 0:
        # Heading6..Heading9 would otherwise yield 7-10 '#' characters.
        level = min(heading_level + 1, max_markdown_level)
        return f"{'#' * level} {text}"

    return text
98
+
99
+
100
def _extract_table(tbl: Element) -> str:
    """Render a ``w:tbl`` element as a pipe-delimited markdown table.

    Each ``w:tr`` becomes a row and each ``w:tc`` a cell. The paragraphs of a
    cell are joined with single spaces and newlines are flattened. Literal
    ``|`` characters in cell text are backslash-escaped so they are not read
    as column separators. Short rows are padded with empty cells to the widest
    row, and the first row is used as the header.

    Returns:
        The markdown table, or ``""`` when the table has no rows.
    """
    w = _NS["w"]
    rows: list[list[str]] = []

    for tr in tbl.iter(f"{{{w}}}tr"):
        cells: list[str] = []
        for tc in tr.iter(f"{{{w}}}tc"):
            cell_texts: list[str] = []
            for p in tc.iter(f"{{{w}}}p"):
                p_text = _extract_paragraph(p)
                if p_text:
                    cell_texts.append(p_text)
            cell = " ".join(cell_texts).replace("\n", " ").strip()
            # An unescaped pipe would split this cell into extra columns.
            cells.append(cell.replace("|", "\\|"))
        if cells:
            rows.append(cells)

    if not rows:
        return ""

    max_cols = max(len(row) for row in rows)
    for row in rows:
        row.extend([""] * (max_cols - len(row)))

    header, *data_rows = rows
    md = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * max_cols) + " |",
    ]
    md.extend("| " + " | ".join(row) + " |" for row in data_rows)

    return "\n".join(md)
@@ -0,0 +1,74 @@
1
+ """PDF text extraction using pdfplumber + pypdf.
2
+
3
+ Extracts text with layout preservation, tables as markdown, and form field metadata.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+
10
+ from obsidian_import.timeout import run_with_timeout
11
+
12
+
13
def extract(path: Path, timeout_seconds: int) -> str:
    """Convert the PDF at ``path`` to markdown via ``run_with_timeout``."""

    def _job() -> str:
        return _extract_pdf(path)

    return run_with_timeout(_job, timeout_seconds, "PDF", path)
16
+
17
+
18
def _extract_pdf(path: Path) -> str:
    """Render a PDF as markdown: title, metadata, form fields, then pages.

    pypdf supplies the document metadata and form fields; pdfplumber supplies
    the tables and text of each page. The title is the metadata title when
    present, otherwise the file stem. Each page with content is emitted under
    a ``## Page N`` heading, tables first and page text after. Pages that
    yield neither a table nor text are omitted.
    """
    import pdfplumber
    from pypdf import PdfReader

    sections: list[str] = []

    reader = PdfReader(str(path))
    meta = reader.metadata
    title = path.stem
    if meta:
        title = meta.title or path.stem
        if meta.author:
            sections.append(f"**Author:** {meta.author}")
        if meta.creation_date:
            sections.append(f"**Created:** {meta.creation_date}")

    # The title is known only after reading the metadata but must come first.
    sections.insert(0, f"# {title}")

    fields = reader.get_fields()
    if fields:
        field_lines = ["", "## Form Fields", ""]
        for name, field in fields.items():
            field_type = field.get("/FT", "unknown")
            value = field.get("/V", "")
            field_lines.append(f"- **{name}** ({field_type}): {value}")
        sections.append("\n".join(field_lines))

    with pdfplumber.open(str(path)) as pdf:
        for number, page in enumerate(pdf.pages, 1):
            page_sections: list[str] = [f"\n## Page {number}\n"]

            for table in page.extract_tables() or []:
                # Skip tables with no rows or an empty header row.
                if not table or not table[0]:
                    continue
                page_sections.append(_table_to_markdown(table))

            text = page.extract_text()
            if text:
                page_sections.append(text.strip())

            # Only the heading is present when the page produced no content.
            if len(page_sections) > 1:
                sections.append("\n".join(page_sections))

    return "\n\n".join(sections)


def _table_to_markdown(table: list[list[str | None]]) -> str:
    """Render extracted table rows as a markdown table; row 0 is the header.

    Header and body cells are normalised the same way: ``None`` becomes an
    empty string, surrounding whitespace is stripped, embedded newlines become
    spaces and literal ``|`` characters are escaped. Body rows are padded or
    truncated to the header width.
    """

    def _cell_text(cell: object) -> str:
        return str(cell or "").strip().replace("\n", " ").replace("|", "\\|")

    headers = [_cell_text(cell) for cell in table[0]]
    width = len(headers)

    lines = [
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    for row in table[1:]:
        cells = [_cell_text(cell) for cell in row]
        cells.extend([""] * (width - len(cells)))
        lines.append("| " + " | ".join(cells[:width]) + " |")

    return "\n".join(lines)