obsidian-import 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- obsidian_import-0.1.0/.gitignore +10 -0
- obsidian_import-0.1.0/LICENSE +21 -0
- obsidian_import-0.1.0/PKG-INFO +175 -0
- obsidian_import-0.1.0/README.md +139 -0
- obsidian_import-0.1.0/obsidian_import/__init__.py +81 -0
- obsidian_import-0.1.0/obsidian_import/backends/__init__.py +1 -0
- obsidian_import-0.1.0/obsidian_import/backends/docling.py +31 -0
- obsidian_import-0.1.0/obsidian_import/backends/markitdown.py +31 -0
- obsidian_import-0.1.0/obsidian_import/backends/native_docx.py +130 -0
- obsidian_import-0.1.0/obsidian_import/backends/native_pdf.py +74 -0
- obsidian_import-0.1.0/obsidian_import/backends/native_pptx.py +102 -0
- obsidian_import-0.1.0/obsidian_import/backends/native_xlsx.py +63 -0
- obsidian_import-0.1.0/obsidian_import/cli.py +135 -0
- obsidian_import-0.1.0/obsidian_import/config.py +147 -0
- obsidian_import-0.1.0/obsidian_import/defaults/default.yaml +34 -0
- obsidian_import-0.1.0/obsidian_import/discovery.py +66 -0
- obsidian_import-0.1.0/obsidian_import/exceptions.py +25 -0
- obsidian_import-0.1.0/obsidian_import/output.py +78 -0
- obsidian_import-0.1.0/obsidian_import/registry.py +94 -0
- obsidian_import-0.1.0/obsidian_import/timeout.py +48 -0
- obsidian_import-0.1.0/pyproject.toml +63 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Matthias Christenson
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: obsidian-import
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract files (PDF, DOCX, PPTX, XLSX) into Obsidian-flavored Markdown
|
|
5
|
+
Project-URL: Documentation, https://neuralsignal.github.io/obsidian-import/
|
|
6
|
+
Project-URL: Repository, https://github.com/neuralsignal/obsidian-import
|
|
7
|
+
Author: Matthias Christenson
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: docx,extraction,import,markdown,obsidian,pdf,pptx,xlsx
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Office/Business
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: click<9,>=8.0
|
|
21
|
+
Requires-Dist: defusedxml<1,>=0.7
|
|
22
|
+
Requires-Dist: openpyxl<4,>=3.1
|
|
23
|
+
Requires-Dist: pdfplumber<1,>=0.11
|
|
24
|
+
Requires-Dist: pypdf<6,>=5.0
|
|
25
|
+
Requires-Dist: python-pptx<2,>=1.0
|
|
26
|
+
Requires-Dist: pyyaml<7,>=6.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: hypothesis<7,>=6.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov<6,>=5.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest<9,>=8.0; extra == 'dev'
|
|
31
|
+
Provides-Extra: docling
|
|
32
|
+
Requires-Dist: docling>=2.0; extra == 'docling'
|
|
33
|
+
Provides-Extra: markitdown
|
|
34
|
+
Requires-Dist: markitdown[all]>=0.1; extra == 'markitdown'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# obsidian-import
|
|
38
|
+
|
|
39
|
+
Extract files (PDF, DOCX, PPTX, XLSX) into Obsidian-flavored Markdown.
|
|
40
|
+
|
|
41
|
+
The mirror of [obsidian-export](https://github.com/neuralsignal/obsidian-export): where obsidian-export converts Obsidian notes to PDF/DOCX, obsidian-import converts external documents into Obsidian-ready markdown with YAML frontmatter.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install obsidian-import
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
With optional backends:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install obsidian-import[markitdown] # fallback for HTML, CSV, etc.
|
|
53
|
+
pip install obsidian-import[docling] # high-quality ML-based extraction
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
### Single file
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
obsidian-import convert report.pdf --output vault/imports/report.md
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Batch extraction
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
obsidian-import batch --config config.yaml
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Check backend availability
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
obsidian-import doctor
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Python API
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from pathlib import Path
|
|
80
|
+
from obsidian_import import extract_file, discover_files
|
|
81
|
+
from obsidian_import.config import load_config
|
|
82
|
+
from obsidian_import.output import format_output
|
|
83
|
+
|
|
84
|
+
config = load_config(Path("config.yaml"))
|
|
85
|
+
|
|
86
|
+
# Single file
|
|
87
|
+
doc = extract_file(Path("report.pdf"), config)
|
|
88
|
+
markdown = format_output(doc, config.output)
|
|
89
|
+
|
|
90
|
+
# Batch discovery
|
|
91
|
+
for file in discover_files(config):
|
|
92
|
+
print(f"{file.extension} {file.size_bytes:,} bytes {file.path}")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Configuration
|
|
96
|
+
|
|
97
|
+
Create a `config.yaml`:
|
|
98
|
+
|
|
99
|
+
```yaml
|
|
100
|
+
input:
|
|
101
|
+
directories:
|
|
102
|
+
- path: /path/to/documents
|
|
103
|
+
extensions: [".pdf", ".docx", ".pptx", ".xlsx"]
|
|
104
|
+
exclude: ["*.tmp", "~$*"]
|
|
105
|
+
|
|
106
|
+
output:
|
|
107
|
+
directory: ./extracted
|
|
108
|
+
frontmatter: true
|
|
109
|
+
metadata_fields:
|
|
110
|
+
- title
|
|
111
|
+
- source
|
|
112
|
+
- original_path
|
|
113
|
+
- file_type
|
|
114
|
+
- extracted_at
|
|
115
|
+
- page_count
|
|
116
|
+
|
|
117
|
+
backends:
|
|
118
|
+
pdf: native # pdfplumber + pypdf
|
|
119
|
+
docx: native # defusedxml
|
|
120
|
+
pptx: native # python-pptx
|
|
121
|
+
xlsx: native # openpyxl
|
|
122
|
+
default: native # fallback for unknown extensions
|
|
123
|
+
|
|
124
|
+
extraction:
|
|
125
|
+
timeout_seconds: 120
|
|
126
|
+
max_file_size_mb: 100
|
|
127
|
+
xlsx_max_rows_per_sheet: 500
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Backend Selection
|
|
131
|
+
|
|
132
|
+
| Backend | Extensions | Dependencies | Quality |
|
|
133
|
+
|---------|-----------|--------------|---------|
|
|
134
|
+
| `native` | .pdf, .docx, .pptx, .xlsx | Core (included) | Good for text-heavy documents |
|
|
135
|
+
| `markitdown` | Any | `[markitdown]` extra | Good fallback for HTML, CSV, etc. |
|
|
136
|
+
| `docling` | Any | `[docling]` extra | Best for complex layouts, tables |
|
|
137
|
+
|
|
138
|
+
## CLI Reference
|
|
139
|
+
|
|
140
|
+
| Command | Description |
|
|
141
|
+
|---------|-------------|
|
|
142
|
+
| `obsidian-import convert <path>` | Extract a single file |
|
|
143
|
+
| `obsidian-import discover --config <yaml>` | List matching files |
|
|
144
|
+
| `obsidian-import batch --config <yaml>` | Extract all discovered files |
|
|
145
|
+
| `obsidian-import doctor` | Check backend availability |
|
|
146
|
+
|
|
147
|
+
## Output Format
|
|
148
|
+
|
|
149
|
+
Extracted files are written as Obsidian-flavored markdown with YAML frontmatter:
|
|
150
|
+
|
|
151
|
+
```markdown
|
|
152
|
+
---
|
|
153
|
+
title: Annual Report
|
|
154
|
+
source: obsidian-import
|
|
155
|
+
original_path: /documents/report.pdf
|
|
156
|
+
file_type: pdf
|
|
157
|
+
extracted_at: 2026-03-09T10:30:00Z
|
|
158
|
+
page_count: 12
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
# Annual Report
|
|
162
|
+
|
|
163
|
+
## Page 1
|
|
164
|
+
|
|
165
|
+
Content extracted from the first page...
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## Related Packages
|
|
169
|
+
|
|
170
|
+
- [obsidian-export](https://github.com/neuralsignal/obsidian-export) — Convert Obsidian notes to PDF/DOCX
|
|
171
|
+
- [agentic-brain](https://github.com/neuralsignal/agentic-brain) — Agentic knowledge management (consumes both packages)
|
|
172
|
+
|
|
173
|
+
## License
|
|
174
|
+
|
|
175
|
+
MIT
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# obsidian-import
|
|
2
|
+
|
|
3
|
+
Extract files (PDF, DOCX, PPTX, XLSX) into Obsidian-flavored Markdown.
|
|
4
|
+
|
|
5
|
+
The mirror of [obsidian-export](https://github.com/neuralsignal/obsidian-export): where obsidian-export converts Obsidian notes to PDF/DOCX, obsidian-import converts external documents into Obsidian-ready markdown with YAML frontmatter.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install obsidian-import
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
With optional backends:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install "obsidian-import[markitdown]"  # fallback for HTML, CSV, etc.
|
|
17
|
+
pip install "obsidian-import[docling]"     # high-quality ML-based extraction
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
### Single file
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
obsidian-import convert report.pdf --output vault/imports/report.md
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Batch extraction
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
obsidian-import batch --config config.yaml
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Check backend availability
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
obsidian-import doctor
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Python API
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from pathlib import Path
|
|
44
|
+
from obsidian_import import extract_file, discover_files
|
|
45
|
+
from obsidian_import.config import load_config
|
|
46
|
+
from obsidian_import.output import format_output
|
|
47
|
+
|
|
48
|
+
config = load_config(Path("config.yaml"))
|
|
49
|
+
|
|
50
|
+
# Single file
|
|
51
|
+
doc = extract_file(Path("report.pdf"), config)
|
|
52
|
+
markdown = format_output(doc, config.output)
|
|
53
|
+
|
|
54
|
+
# Batch discovery
|
|
55
|
+
for file in discover_files(config):
|
|
56
|
+
print(f"{file.extension} {file.size_bytes:,} bytes {file.path}")
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Configuration
|
|
60
|
+
|
|
61
|
+
Create a `config.yaml`:
|
|
62
|
+
|
|
63
|
+
```yaml
|
|
64
|
+
input:
|
|
65
|
+
directories:
|
|
66
|
+
- path: /path/to/documents
|
|
67
|
+
extensions: [".pdf", ".docx", ".pptx", ".xlsx"]
|
|
68
|
+
exclude: ["*.tmp", "~$*"]
|
|
69
|
+
|
|
70
|
+
output:
|
|
71
|
+
directory: ./extracted
|
|
72
|
+
frontmatter: true
|
|
73
|
+
metadata_fields:
|
|
74
|
+
- title
|
|
75
|
+
- source
|
|
76
|
+
- original_path
|
|
77
|
+
- file_type
|
|
78
|
+
- extracted_at
|
|
79
|
+
- page_count
|
|
80
|
+
|
|
81
|
+
backends:
|
|
82
|
+
pdf: native # pdfplumber + pypdf
|
|
83
|
+
docx: native # defusedxml
|
|
84
|
+
pptx: native # python-pptx
|
|
85
|
+
xlsx: native # openpyxl
|
|
86
|
+
default: native # fallback for unknown extensions
|
|
87
|
+
|
|
88
|
+
extraction:
|
|
89
|
+
timeout_seconds: 120
|
|
90
|
+
max_file_size_mb: 100
|
|
91
|
+
xlsx_max_rows_per_sheet: 500
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Backend Selection
|
|
95
|
+
|
|
96
|
+
| Backend | Extensions | Dependencies | Quality |
|
|
97
|
+
|---------|-----------|--------------|---------|
|
|
98
|
+
| `native` | .pdf, .docx, .pptx, .xlsx | Core (included) | Good for text-heavy documents |
|
|
99
|
+
| `markitdown` | Any | `[markitdown]` extra | Good fallback for HTML, CSV, etc. |
|
|
100
|
+
| `docling` | Any | `[docling]` extra | Best for complex layouts, tables |
|
|
101
|
+
|
|
102
|
+
## CLI Reference
|
|
103
|
+
|
|
104
|
+
| Command | Description |
|
|
105
|
+
|---------|-------------|
|
|
106
|
+
| `obsidian-import convert <path>` | Extract a single file |
|
|
107
|
+
| `obsidian-import discover --config <yaml>` | List matching files |
|
|
108
|
+
| `obsidian-import batch --config <yaml>` | Extract all discovered files |
|
|
109
|
+
| `obsidian-import doctor` | Check backend availability |
|
|
110
|
+
|
|
111
|
+
## Output Format
|
|
112
|
+
|
|
113
|
+
Extracted files are written as Obsidian-flavored markdown with YAML frontmatter:
|
|
114
|
+
|
|
115
|
+
```markdown
|
|
116
|
+
---
|
|
117
|
+
title: Annual Report
|
|
118
|
+
source: obsidian-import
|
|
119
|
+
original_path: /documents/report.pdf
|
|
120
|
+
file_type: pdf
|
|
121
|
+
extracted_at: 2026-03-09T10:30:00Z
|
|
122
|
+
page_count: 12
|
|
123
|
+
---
|
|
124
|
+
|
|
125
|
+
# Annual Report
|
|
126
|
+
|
|
127
|
+
## Page 1
|
|
128
|
+
|
|
129
|
+
Content extracted from the first page...
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Related Packages
|
|
133
|
+
|
|
134
|
+
- [obsidian-export](https://github.com/neuralsignal/obsidian-export) — Convert Obsidian notes to PDF/DOCX
|
|
135
|
+
- [agentic-brain](https://github.com/neuralsignal/agentic-brain) — Agentic knowledge management (consumes both packages)
|
|
136
|
+
|
|
137
|
+
## License
|
|
138
|
+
|
|
139
|
+
MIT
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""obsidian-import: Extract files into Obsidian-flavored Markdown.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
extract_file(path, config) -> ExtractedDocument
|
|
5
|
+
extract_text(path, config) -> str
|
|
6
|
+
discover_files(config) -> Iterator[DiscoveredFile]
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Iterator
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from obsidian_import.config import ImportConfig
|
|
15
|
+
from obsidian_import.discovery import DiscoveredFile
|
|
16
|
+
from obsidian_import.discovery import discover_files as _discover_files
|
|
17
|
+
from obsidian_import.output import ExtractedDocument
|
|
18
|
+
from obsidian_import.registry import extract_with_backend
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def extract_file(path: Path, config: ImportConfig) -> ExtractedDocument:
    """Extract a single file to Obsidian-flavored markdown.

    The markdown body is obtained from :func:`extract_text`, so both public
    entry points go through the same backend call (including the XLSX row
    limit) instead of each keeping its own copy of that logic.

    Args:
        path: File to extract. Its lower-cased suffix is used as the file type.
        config: Import configuration; forwarded unchanged to ``extract_text``.

    Returns:
        An ``ExtractedDocument`` holding the source path, the extracted
        markdown, the file stem as title, the extension without its leading
        dot as ``file_type``, and the page count estimated from the markdown
        (``None`` when no estimate is available).
    """
    extension = path.suffix.lower()

    # Single source of truth for backend dispatch lives in extract_text.
    markdown = extract_text(path, config)

    return ExtractedDocument(
        source_path=path,
        markdown=markdown,
        title=path.stem,
        file_type=extension.lstrip("."),
        page_count=_estimate_page_count(markdown, extension),
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def discover_files(config: ImportConfig) -> Iterator[DiscoveredFile]:
    """Return the files selected by the input section of ``config``.

    Public wrapper: the call is forwarded as-is to
    ``obsidian_import.discovery.discover_files`` and its result is handed
    back without modification.
    """
    discovered = _discover_files(config)
    return discovered
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def extract_text(path: Path, config: ImportConfig) -> str:
    """Return the raw markdown for ``path`` without frontmatter or metadata.

    Args:
        path: File to extract; the lower-cased suffix decides whether the
            XLSX row limit is passed along.
        config: Supplies the backend mapping, the timeout and the XLSX
            row limit.

    Returns:
        Whatever ``extract_with_backend`` returns for this file.
    """
    extraction = config.extraction

    # Only spreadsheets receive the per-sheet row cap as an extra keyword.
    backend_options: dict[str, object] = (
        {"max_rows_per_sheet": extraction.xlsx_max_rows_per_sheet}
        if path.suffix.lower() == ".xlsx"
        else {}
    )

    return extract_with_backend(
        path,
        backends=config.backends,
        timeout_seconds=extraction.timeout_seconds,
        **backend_options,
    )
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _estimate_page_count(markdown: str, extension: str) -> int | None:
|
|
71
|
+
"""Estimate page count from extracted markdown.
|
|
72
|
+
|
|
73
|
+
For PDFs, count '## Page N' headings. For other formats, return None.
|
|
74
|
+
"""
|
|
75
|
+
if extension == ".pdf":
|
|
76
|
+
count = 0
|
|
77
|
+
for line in markdown.splitlines():
|
|
78
|
+
if line.startswith("## Page "):
|
|
79
|
+
count += 1
|
|
80
|
+
return count if count > 0 else None
|
|
81
|
+
return None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Backend modules for obsidian-import extractors."""
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""High-quality document extraction using docling.
|
|
2
|
+
|
|
3
|
+
Requires the [docling] extra: pip install obsidian-import[docling]
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from obsidian_import.exceptions import BackendNotAvailableError
|
|
11
|
+
from obsidian_import.timeout import run_with_timeout
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def extract(path: Path, timeout_seconds: int) -> str:
    """Convert ``path`` to markdown with docling.

    Args:
        path: Document to convert.
        timeout_seconds: Limit handed to ``run_with_timeout``.

    Returns:
        The markdown exported by docling, or a short italic placeholder
        naming the file when the export is empty or whitespace only.

    Raises:
        BackendNotAvailableError: If docling cannot be imported.
    """
    try:
        from docling.document_converter import DocumentConverter  # noqa: F811
    except ImportError as exc:
        raise BackendNotAvailableError(
            "docling is not installed. Install with: pip install obsidian-import[docling]"
        ) from exc

    def _convert() -> str:
        markdown = DocumentConverter().convert(str(path)).document.export_to_markdown()
        has_content = bool(markdown) and bool(markdown.strip())
        if has_content:
            return markdown
        return f"*No text content extracted from `{path.name}`.*"

    return run_with_timeout(_convert, timeout_seconds, "docling", path)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Fallback extractor using markitdown for unrecognized formats.
|
|
2
|
+
|
|
3
|
+
Requires the [markitdown] extra: pip install obsidian-import[markitdown]
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from obsidian_import.exceptions import BackendNotAvailableError
|
|
11
|
+
from obsidian_import.timeout import run_with_timeout
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def extract(path: Path, timeout_seconds: int) -> str:
    """Convert ``path`` to markdown with markitdown.

    Args:
        path: Document to convert.
        timeout_seconds: Limit handed to ``run_with_timeout``.

    Returns:
        The ``text_content`` of the conversion, or a short italic
        placeholder naming the file when that text is empty or whitespace
        only.

    Raises:
        BackendNotAvailableError: If markitdown cannot be imported.
    """
    try:
        from markitdown import MarkItDown  # noqa: F811
    except ImportError as exc:
        raise BackendNotAvailableError(
            "markitdown is not installed. Install with: pip install obsidian-import[markitdown]"
        ) from exc

    def _convert() -> str:
        content = MarkItDown().convert(str(path)).text_content
        has_content = bool(content) and bool(content.strip())
        if has_content:
            return content
        return f"*No text content extracted from `{path.name}`.*"

    return run_with_timeout(_convert, timeout_seconds, "markitdown", path)
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""DOCX text extraction using defusedxml + zipfile.
|
|
2
|
+
|
|
3
|
+
Opens the DOCX as a ZIP archive, parses word/document.xml to extract
|
|
4
|
+
text with structure preservation (headings, paragraphs, tables).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import zipfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from xml.etree.ElementTree import Element
|
|
12
|
+
|
|
13
|
+
from obsidian_import.exceptions import ExtractionError
|
|
14
|
+
from obsidian_import.timeout import run_with_timeout
|
|
15
|
+
|
|
16
|
+
_NS = {
|
|
17
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
|
18
|
+
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract(path: Path, timeout_seconds: int) -> str:
    """Return the markdown rendering of the DOCX file at ``path``.

    The actual work is done by ``_extract_docx``; it is invoked through
    ``run_with_timeout`` with ``timeout_seconds`` and the label ``"DOCX"``.
    """

    def _task() -> str:
        return _extract_docx(path)

    return run_with_timeout(_task, timeout_seconds, "DOCX", path)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _extract_docx(path: Path) -> str:
    """Read ``word/document.xml`` from a DOCX archive and render it as markdown.

    The output starts with a ``# <file stem>`` title. Each direct child of
    the document body is then rendered in order: paragraphs through
    ``_extract_paragraph`` and tables through ``_extract_table``. Other
    children, and children that render to an empty string, are skipped.

    Raises:
        ExtractionError: If the file is not a ZIP archive or does not
            contain ``word/document.xml``.
    """
    from defusedxml.ElementTree import fromstring

    archive_path = str(path)
    if not zipfile.is_zipfile(archive_path):
        raise ExtractionError(f"Not a valid DOCX (ZIP) file: {path}")

    with zipfile.ZipFile(archive_path, "r") as archive:
        if "word/document.xml" not in archive.namelist():
            raise ExtractionError(f"No word/document.xml found in: {path}")
        raw_xml = archive.read("word/document.xml")

    body = fromstring(raw_xml).find(f"{{{_NS['w']}}}body")
    if body is None:
        return f"# {path.stem}\n\n*No body content found.*"

    # Map the local tag name of a body child to the function that renders it.
    renderers = {"p": _extract_paragraph, "tbl": _extract_table}

    blocks: list[str] = [f"# {path.stem}"]
    for child in body:
        render = renderers.get(_local_name(child))
        if render is None:
            continue
        rendered = render(child)
        if rendered:
            blocks.append(rendered)

    return "\n\n".join(blocks)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _local_name(element: Element) -> str:
|
|
63
|
+
"""Get the local name of an XML element (strip namespace)."""
|
|
64
|
+
tag = element.tag
|
|
65
|
+
if "}" in tag:
|
|
66
|
+
return tag.split("}", 1)[1]
|
|
67
|
+
return tag
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _extract_paragraph(para: Element) -> str:
    """Render a ``w:p`` element as a single piece of markdown text.

    The text of every ``w:t`` found inside the paragraph's ``w:r`` runs is
    concatenated and stripped. If the paragraph style id is ``Heading<N>``
    with a positive integer ``N``, the text is prefixed with ``N + 1`` hash
    characters, capped at six because markdown defines no deeper heading.

    Args:
        para: The paragraph element.

    Returns:
        The paragraph text, with a heading prefix when applicable, or an
        empty string when the paragraph contains no text.
    """
    w = _NS["w"]

    heading_level = 0
    ppr = para.find(f"{{{w}}}pPr")
    pstyle = ppr.find(f"{{{w}}}pStyle") if ppr is not None else None
    if pstyle is not None:
        style_val = pstyle.get(f"{{{w}}}val", "")
        if style_val.startswith("Heading"):
            try:
                heading_level = int(style_val.replace("Heading", ""))
            except ValueError:
                # e.g. a style id with a non-numeric suffix: treat as body text.
                heading_level = 0

    text = "".join(
        t.text
        for run in para.iter(f"{{{w}}}r")
        for t in run.iter(f"{{{w}}}t")
        if t.text
    ).strip()
    if not text:
        return ""

    if heading_level > 0:
        # Shift by one level (the document title emitted by _extract_docx
        # uses "#"), but never beyond six hashes: seven or more would be
        # rendered as literal text instead of a heading.
        depth = min(heading_level + 1, 6)
        return f"{'#' * depth} {text}"

    return text
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _extract_table(tbl: Element) -> str:
    """Render a ``w:tbl`` element as a markdown table.

    Each ``w:tr`` becomes a row and each ``w:tc`` a cell. A cell's text is
    the space-joined output of ``_extract_paragraph`` for its paragraphs,
    with newlines replaced by spaces and literal ``|`` characters escaped
    so they cannot be mistaken for column separators. Short rows are padded
    with empty cells to the width of the widest row; the first row is used
    as the header.

    Args:
        tbl: The table element.

    Returns:
        The markdown table, or an empty string when no row has any cell.
    """
    w = _NS["w"]
    rows: list[list[str]] = []

    # NOTE(review): iter() visits all descendants, so rows of a table nested
    # inside a cell are visited here as well - confirm this is intended.
    for tr in tbl.iter(f"{{{w}}}tr"):
        cells: list[str] = []
        for tc in tr.iter(f"{{{w}}}tc"):
            paragraphs = (_extract_paragraph(p) for p in tc.iter(f"{{{w}}}p"))
            cell = " ".join(text for text in paragraphs if text)
            cell = cell.replace("\n", " ").strip()
            # An unescaped pipe inside a cell would split it into two columns.
            cells.append(cell.replace("|", "\\|"))
        if cells:
            rows.append(cells)

    if not rows:
        return ""

    max_cols = max(len(row) for row in rows)
    for row in rows:
        row.extend([""] * (max_cols - len(row)))

    header, *data_rows = rows
    md = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * max_cols) + " |",
    ]
    md.extend("| " + " | ".join(row) + " |" for row in data_rows)

    return "\n".join(md)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""PDF text extraction using pdfplumber + pypdf.
|
|
2
|
+
|
|
3
|
+
Extracts text with layout preservation, tables as markdown, and form field metadata.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from obsidian_import.timeout import run_with_timeout
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract(path: Path, timeout_seconds: int) -> str:
    """Return the markdown rendering of the PDF file at ``path``.

    The actual work is done by ``_extract_pdf``; it is invoked through
    ``run_with_timeout`` with ``timeout_seconds`` and the label ``"PDF"``.
    """

    def _task() -> str:
        return _extract_pdf(path)

    return run_with_timeout(_task, timeout_seconds, "PDF", path)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _extract_pdf(path: Path) -> str:
    """Extract metadata, form fields, tables and text from a PDF as markdown.

    Output layout, with sections separated by blank lines:

    1. ``# <title>`` - the metadata title, or the file stem when absent.
    2. ``**Author:**`` / ``**Created:**`` lines when the metadata has them.
    3. A ``## Form Fields`` list when the reader reports form fields.
    4. One ``## Page N`` section per page holding its tables followed by its
       text. Pages that yield neither a table nor text are omitted.

    Args:
        path: PDF file to read.

    Returns:
        The assembled markdown document.
    """
    import pdfplumber
    from pypdf import PdfReader

    reader = PdfReader(str(path))
    meta = reader.metadata

    title = (meta.title if meta else None) or path.stem
    sections: list[str] = [f"# {title}"]
    if meta:
        if meta.author:
            sections.append(f"**Author:** {meta.author}")
        if meta.creation_date:
            sections.append(f"**Created:** {meta.creation_date}")

    fields = reader.get_fields()
    if fields:
        field_lines = ["", "## Form Fields", ""]
        for name, field in fields.items():
            field_type = field.get("/FT", "unknown")
            value = field.get("/V", "")
            field_lines.append(f"- **{name}** ({field_type}): {value}")
        sections.append("\n".join(field_lines))

    with pdfplumber.open(str(path)) as pdf:
        for number, page in enumerate(pdf.pages, 1):
            page_sections: list[str] = [f"\n## Page {number}\n"]

            for table in page.extract_tables() or []:
                # Skip tables without a usable header row.
                if table and table[0]:
                    page_sections.append(_render_pdf_table(table))

            text = page.extract_text()
            if text:
                page_sections.append(text.strip())

            # Only the heading present means the page produced nothing.
            if len(page_sections) > 1:
                sections.append("\n".join(page_sections))

    return "\n\n".join(sections)


def _render_pdf_table(table: list[list[str | None]]) -> str:
    """Render one extracted table (first row is the header) as markdown.

    Header and body cells get the same clean-up: ``None`` becomes an empty
    string, surrounding whitespace is stripped, embedded newlines become
    spaces and literal ``|`` characters are escaped, so a cell can neither
    spill onto another line nor split into extra columns. Body rows are
    padded or truncated to the header width.
    """

    def clean(cell: object) -> str:
        return str(cell or "").strip().replace("\n", " ").replace("|", "\\|")

    headers = [clean(cell) for cell in table[0]]
    width = len(headers)

    lines = [
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    for row in table[1:]:
        cells = [clean(cell) for cell in row][:width]
        cells.extend([""] * (width - len(cells)))
        lines.append("| " + " | ".join(cells) + " |")

    return "\n".join(lines)
|