repulp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- repulp/__init__.py +172 -0
- repulp/cache.py +88 -0
- repulp/cleaner.py +99 -0
- repulp/cli.py +492 -0
- repulp/config.py +69 -0
- repulp/converter.py +151 -0
- repulp/engine.py +243 -0
- repulp/extractor.py +168 -0
- repulp/fetcher.py +62 -0
- repulp/formatter.py +56 -0
- repulp/frontmatter.py +67 -0
- repulp/py.typed +0 -0
- repulp/watcher.py +168 -0
- repulp-0.1.0.dist-info/METADATA +366 -0
- repulp-0.1.0.dist-info/RECORD +18 -0
- repulp-0.1.0.dist-info/WHEEL +4 -0
- repulp-0.1.0.dist-info/entry_points.txt +2 -0
- repulp-0.1.0.dist-info/licenses/LICENSE +21 -0
repulp/__init__.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""repulp — Parallel batch document conversion, watch mode, and structured extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
import tempfile
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, Union
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def convert(
    source: Union[str, Path],
    clean: bool = True,
    frontmatter: bool = False,
    format: str = "md",
) -> "ConversionResult":
    """Convert a file, URL, or stdin stream to Markdown.

    Args:
        source: File path, URL (http/https), or "-" for stdin.
        clean: Post-process the markdown output.
        frontmatter: Inject YAML frontmatter with metadata.
        format: Output format - "md", "text", or "json".

    Returns:
        ConversionResult with markdown content and metadata.
    """
    from repulp.converter import convert_file, convert_url, ConversionResult
    from repulp.fetcher import is_url
    from repulp.frontmatter import inject_frontmatter
    from repulp.formatter import format_output

    src = str(source)

    if src == "-":
        # Spool stdin to a temporary .html file so the file-based converter
        # can handle it; delete=False so the path survives close() on Windows.
        payload = sys.stdin.buffer.read()
        spool = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
        spool.write(payload)
        spool.close()
        spool_path = Path(spool.name)
        try:
            raw = convert_file(spool_path, clean=clean)
            # Re-label the result so callers see "stdin", not the temp path.
            result = ConversionResult(
                source_path=Path("stdin"),
                markdown=raw.markdown,
                success=raw.success,
                error=raw.error,
            )
        finally:
            spool_path.unlink(missing_ok=True)
    elif is_url(src):
        result = convert_url(src, clean=clean)
    else:
        result = convert_file(Path(src), clean=clean)

    if result.success:
        if frontmatter:
            result = ConversionResult(
                source_path=result.source_path,
                markdown=inject_frontmatter(result.markdown, src),
                success=True,
            )
        if format != "md":
            result = ConversionResult(
                source_path=result.source_path,
                markdown=format_output(result.markdown, format, src),
                success=True,
            )

    return result
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def batch(
    source: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    workers: Optional[int] = None,
    recursive: bool = False,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    clean: bool = True,
    incremental: bool = False,
) -> "BatchResult":
    """Convert all supported files in a directory using parallel workers.

    Args:
        source: Directory to scan.
        output_dir: Output directory for .md files.
        workers: Number of parallel workers. None = auto.
        recursive: Scan subdirectories.
        include: Glob patterns to include.
        exclude: Glob patterns to exclude.
        clean: Post-process markdown.
        incremental: Skip unchanged files.

    Returns:
        BatchResult with conversion results and statistics.
    """
    from repulp.engine import batch_convert

    return batch_convert(
        Path(source),
        # BUG FIX: output_dir was accepted but never forwarded, so the
        # caller's output directory was silently ignored.
        # NOTE(review): assumes engine.batch_convert accepts output_dir,
        # mirroring watcher.watch_directory — confirm against engine.py.
        output_dir=Path(output_dir) if output_dir else None,
        workers=workers,
        recursive=recursive,
        include=include,
        exclude=exclude,
        clean=clean,
        incremental=incremental,
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def extract_tables(
    source: Union[str, Path],
    format: str = "dict",
    clean: bool = True,
) -> list:
    """Extract tables from a document as structured data.

    Args:
        source: File path or URL.
        format: "dict" (list[dict]), "csv" (strings), "dataframe" (pandas), "markdown" (raw).
        clean: Post-process markdown before extraction.

    Returns:
        List of tables in the requested format.
    """
    from repulp.extractor import extract_tables_structured

    converted = convert(source, clean=clean)
    if converted.success:
        return extract_tables_structured(converted.markdown, format=format)
    # Failed conversions yield no tables rather than raising.
    return []
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def watch(
    source: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    recursive: bool = True,
    include: Optional[list[str]] = None,
    exclude: Optional[list[str]] = None,
    clean: bool = True,
    debounce_ms: int = 500,
    on_change=None,
    on_command: Optional[str] = None,
) -> None:
    """Watch a directory and auto-convert files on change.

    Args:
        source: Directory to watch.
        output_dir: Where to write .md files.
        recursive: Watch subdirectories.
        include: Glob patterns to include.
        exclude: Glob patterns to exclude.
        clean: Post-process markdown.
        debounce_ms: Debounce interval in milliseconds.
        on_change: Callback(WatchEvent) after each conversion.
        on_command: Shell command to run after each conversion.
    """
    from repulp.watcher import watch_directory

    resolved_output = Path(output_dir) if output_dir else None
    watch_directory(
        Path(source),
        output_dir=resolved_output,
        recursive=recursive,
        include=include,
        exclude=exclude,
        clean=clean,
        debounce_ms=debounce_ms,
        on_change=on_change,
        on_command=on_command,
    )
|
repulp/cache.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Incremental build cache for repulp.
|
|
2
|
+
|
|
3
|
+
Tracks SHA256 hashes of source files so that batch_convert() can skip
|
|
4
|
+
files that have not changed since the last successful conversion.
|
|
5
|
+
The cache is persisted as a JSON file (typically `.repulp.cache` inside
|
|
6
|
+
the source directory).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import json
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Optional
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ConversionCache:
    """File-hash cache that detects which source files have changed.

    Keys are resolved absolute path strings; values are hex-encoded SHA256
    digests of the file contents at the time of last successful conversion.
    The cache is persisted to disk as JSON via :meth:`save`.
    """

    def __init__(self, cache_path: Path) -> None:
        """Load the cache from *cache_path* if it exists; start empty otherwise."""
        self._cache_path = Path(cache_path)
        self._hashes: dict[str, str] = {}

        if self._cache_path.exists():
            try:
                loaded = json.loads(self._cache_path.read_text(encoding="utf-8"))
                # Guard against a corrupted or foreign cache file: any JSON
                # document that is not an object (e.g. a list) would break
                # dict operations in is_changed()/mark_converted() later.
                if isinstance(loaded, dict):
                    self._hashes = loaded
            except (json.JSONDecodeError, OSError):
                self._hashes = {}

    @staticmethod
    def _hash_file(path: Path) -> Optional[str]:
        """Return the SHA256 hex digest of *path*'s contents, or None if unreadable."""
        try:
            # PermissionError is a subclass of OSError, so catching OSError
            # alone covers missing files and permission failures alike.
            data = Path(path).resolve().read_bytes()
        except OSError:
            return None
        return hashlib.sha256(data).hexdigest()

    def is_changed(self, path: Path) -> bool:
        """Return True if *path* has changed since the last cached conversion.

        A file is considered changed when:
        - It has never been cached before.
        - Its current SHA256 hash differs from the stored hash.
        - The file no longer exists (hash returns None).
        """
        current_hash = self._hash_file(path)
        if current_hash is None:
            return True
        key = str(Path(path).resolve())
        return self._hashes.get(key) != current_hash

    def mark_converted(self, path: Path) -> None:
        """Record the current SHA256 hash for *path* as the latest converted state."""
        file_hash = self._hash_file(path)
        # Unreadable files are simply not recorded, so they stay "changed".
        if file_hash is not None:
            self._hashes[str(Path(path).resolve())] = file_hash

    def partition(self, files: list[Path]) -> tuple[list[Path], list[Path]]:
        """Split *files* into (changed, unchanged) based on cached hashes.

        Args:
            files: List of file paths to check.

        Returns:
            A 2-tuple of (changed_files, unchanged_files).
        """
        changed: list[Path] = []
        unchanged: list[Path] = []
        for f in files:
            (changed if self.is_changed(f) else unchanged).append(f)
        return changed, unchanged

    def save(self) -> None:
        """Persist the hash cache to disk as JSON (creating parent dirs)."""
        self._cache_path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(self._hashes, indent=2, sort_keys=True) + "\n"
        self._cache_path.write_text(payload, encoding="utf-8")
|
repulp/cleaner.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _normalize_blank_lines(text: str) -> str:
|
|
7
|
+
return re.sub(r"\n{3,}", "\n\n", text)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _strip_trailing_whitespace(text: str) -> str:
|
|
11
|
+
return "\n".join(line.rstrip() for line in text.split("\n"))
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _ensure_heading_spacing(text: str) -> str:
|
|
15
|
+
lines = text.split("\n")
|
|
16
|
+
result: list[str] = []
|
|
17
|
+
for i, line in enumerate(lines):
|
|
18
|
+
is_heading = bool(re.match(r"^#{1,6}\s", line))
|
|
19
|
+
if is_heading and i > 0 and result and result[-1] != "":
|
|
20
|
+
result.append("")
|
|
21
|
+
result.append(line)
|
|
22
|
+
if is_heading and i < len(lines) - 1 and lines[i + 1] != "":
|
|
23
|
+
result.append("")
|
|
24
|
+
return "\n".join(result)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _fix_table_alignment(text: str) -> str:
|
|
28
|
+
lines = text.split("\n")
|
|
29
|
+
i = 0
|
|
30
|
+
result_lines: list[str] = []
|
|
31
|
+
|
|
32
|
+
while i < len(lines):
|
|
33
|
+
if "|" in lines[i] and i + 1 < len(lines) and re.match(
|
|
34
|
+
r"^\|[\s\-:|]+\|$", lines[i + 1].strip()
|
|
35
|
+
):
|
|
36
|
+
table_lines: list[str] = []
|
|
37
|
+
while i < len(lines) and "|" in lines[i]:
|
|
38
|
+
table_lines.append(lines[i])
|
|
39
|
+
i += 1
|
|
40
|
+
result_lines.extend(_format_table(table_lines))
|
|
41
|
+
else:
|
|
42
|
+
result_lines.append(lines[i])
|
|
43
|
+
i += 1
|
|
44
|
+
|
|
45
|
+
return "\n".join(result_lines)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _format_table(table_lines: list[str]) -> list[str]:
|
|
49
|
+
rows: list[list[str]] = []
|
|
50
|
+
separator_idx = 1
|
|
51
|
+
|
|
52
|
+
for line in table_lines:
|
|
53
|
+
cells = [c.strip() for c in line.strip().strip("|").split("|")]
|
|
54
|
+
rows.append(cells)
|
|
55
|
+
|
|
56
|
+
if len(rows) < 2:
|
|
57
|
+
return table_lines
|
|
58
|
+
|
|
59
|
+
num_cols = max(len(row) for row in rows)
|
|
60
|
+
for row in rows:
|
|
61
|
+
while len(row) < num_cols:
|
|
62
|
+
row.append("")
|
|
63
|
+
|
|
64
|
+
col_widths = [
|
|
65
|
+
max(len(rows[r][c]) for r in range(len(rows)) if r != separator_idx)
|
|
66
|
+
for c in range(num_cols)
|
|
67
|
+
]
|
|
68
|
+
col_widths = [max(w, 3) for w in col_widths]
|
|
69
|
+
|
|
70
|
+
formatted: list[str] = []
|
|
71
|
+
for r, row in enumerate(rows):
|
|
72
|
+
if r == separator_idx:
|
|
73
|
+
cells = ["-" * w for w in col_widths]
|
|
74
|
+
formatted.append("| " + " | ".join(cells) + " |")
|
|
75
|
+
else:
|
|
76
|
+
cells = [row[c].ljust(col_widths[c]) for c in range(num_cols)]
|
|
77
|
+
formatted.append("| " + " | ".join(cells) + " |")
|
|
78
|
+
|
|
79
|
+
return formatted
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _strip_artifacts(text: str) -> str:
|
|
83
|
+
text = text.replace("\x0c", "\n")
|
|
84
|
+
text = text.replace("\x00", "")
|
|
85
|
+
text = re.sub(r"[\x01-\x08\x0b\x0e-\x1f\x7f]", "", text)
|
|
86
|
+
return text
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def clean_markdown(text: str) -> str:
    """Run the full markdown cleanup pipeline over *text*.

    Steps, in order: strip control-character artifacts, trim trailing
    whitespace, space out headings, collapse excess blank lines, re-align
    pipe tables, then strip outer whitespace. Empty input yields "".
    """
    if not text:
        return ""

    pipeline = (
        _strip_artifacts,
        _strip_trailing_whitespace,
        _ensure_heading_spacing,
        _normalize_blank_lines,
        _fix_table_alignment,
    )
    for step in pipeline:
        text = step(text)

    return text.strip()
|